In [None]:
from tree_sitter import Language, Parser
from tree_sitter import Node
from datasets import load_dataset
import tree_sitter_python as tspython
from datasets import Dataset
import pandas as pd

Все необходимые библиотеки установлены!


# Подзадача 1: Подготовка набора данных

#### 1.1. Установите и настройте библиотеку tree-sitter для синтаксического анализа кода на Python и необходимые файлы для языка (пакеты tree-sitter и tree-sitter-python).

In [2]:
PYTHON_LANGUAGE = Language(tspython.language())
parser = Parser(PYTHON_LANGUAGE)

#### 1.2 Загрузите данные из CodeSearchNet и проанализируйте их структуру (используйте load_dataset из библиотеки datasets из экосистемы HuggingFace, потребуется использовать флаг trust_remote_code=True, не забудьте выбрать язык python, в качестве аргумента splitможно сразу передать test). В процессе выполнения задания можно ограничить количество примеров, например, до первой 1000.

In [3]:
dataset = load_dataset("code_search_net", "python", split="test", trust_remote_code=True)
subset = dataset.select(range(1000))

In [4]:
data = subset

#### С помощью библиотеки tree-sitter выполните синтаксический разбор полей whole_func_string и реализуйте извлечения следующих элементов из построенных деревьев:
- имя функции
- тело функции без комментариев и документации
- тело функции с комментариями с документацией (docstrings)

In [43]:
def extract_function_elements(code):
    tree = parser.parse(code.encode("utf8"))
    root_node = tree.root_node

    def extract_name_and_body(node: Node):
        if node.type == "function_definition":
            # Имя функции
            name_node = node.child_by_field_name("name")
            function_name = name_node.text.decode("utf8") if name_node else None

            # Тело функции с комментариями
            body_node = node.child_by_field_name("body")
            function_body_with_comments = code[body_node.start_byte:body_node.end_byte] if body_node else None

            # Инициализация docstring и комментариев
            docstring = None
            comments = []

            # Проверка строки документации как дочернего узла функции
            string_node = node.child_by_field_name("string")
            if string_node:
                docstring = code[string_node.start_byte:string_node.end_byte].strip()

            # Если строки документации нет, проверяем первый узел тела
            if not docstring and body_node and body_node.child_count > 0:
                for first_child in body_node.children:
                    if first_child.type == "string":
                        docstring = code[first_child.start_byte:first_child.end_byte].strip()
                    else:
                        for first_child in first_child.children:
                            if first_child.type == "string":
                                docstring = code[first_child.start_byte:first_child.end_byte].strip()



            # Извлечение комментариев из тела функции
            def extract_comments(node: Node):
                for child in node.children:
                    if child.type == "comment":
                        comments.append(code[child.start_byte:child.end_byte].strip())
                    else:
                        extract_comments(child)

            if body_node:
                extract_comments(body_node)

            # Удаление комментариев и строк документации с сохранением структуры
            def remove_comments_and_docstrings_with_formatting(node: Node):
                result = []
                for child in node.children:
                    if child.type in {"comment", "string"}:
                        continue
                    elif len(child.children) > 0:
                        result.append(remove_comments_and_docstrings_with_formatting(child))
                    else:
                        result.append(code[child.start_byte:child.end_byte])
                return "".join(result)

            function_body_without_comments = (
                remove_comments_and_docstrings_with_formatting(body_node).strip() if body_node else None
            )

            function_body_without_comments = function_body_with_comments.replace(docstring, "")
            for comment in comments:
                function_body_without_comments = function_body_without_comments.replace(comment, "")

            return function_name, function_body_with_comments, function_body_without_comments, docstring.replace('"', '').strip(), comments

        for child in node.children:
            result = extract_name_and_body(child)
            if result:
                return result
        return None

    return extract_name_and_body(root_node)


#### Добавьте дополнительные поля в исходный датасет с вышеуказанными элементами

In [45]:
processed_data = []
for example in subset:
    code = example["whole_func_string"]
    result = extract_function_elements(code)
    if result:
        function_name, function_body_with_comments, function_body_without_comments, docstring, comments = result
        example["extracted_func_name"] = function_name
        example["function_body_with_comments"] = function_body_with_comments
        example["function_body_without_comments"] = function_body_without_comments
        example["docstring"] = docstring
        example["comments"] = comments
    processed_data.append(example)

df = pd.DataFrame(processed_data)

#### Приведите примеры извлеченных функций

In [46]:
df

Unnamed: 0,repository_name,func_path_in_repository,func_name,whole_func_string,language,func_code_string,func_code_tokens,func_documentation_string,func_documentation_tokens,split_name,func_code_url,extracted_func_name,function_body_with_comments,function_body_without_comments,docstring,comments
0,soimort/you-get,src/you_get/extractors/youtube.py,YouTube.get_vid_from_url,"def get_vid_from_url(url):\n """"""Extract...",python,"def get_vid_from_url(url):\n """"""Extract...","[def, get_vid_from_url, (, url, ), :, return, ...",Extracts video ID from URL.,"[Extracts, video, ID, from, URL, .]",test,https://github.com/soimort/you-get/blob/b746ac...,get_vid_from_url,"""""""Extracts video ID from URL.\n """"""\n ...","\n return match1(url, r'youtu\.be/([^?/...",Extracts video ID from URL.,[]
1,soimort/you-get,src/you_get/extractors/miomio.py,sina_xml_to_url_list,"def sina_xml_to_url_list(xml_data):\n """"""st...",python,"def sina_xml_to_url_list(xml_data):\n """"""st...","[def, sina_xml_to_url_list, (, xml_data, ), :,...",str->list\n Convert XML to URL List.\n F...,"[str, -, >, list, Convert, XML, to, URL, List,...",test,https://github.com/soimort/you-get/blob/b746ac...,sina_xml_to_url_list,"""""""str->list\n Convert XML to URL List.\n ...",\n rawurl = []\n dom = parseString(xml_d...,str->list\n Convert XML to URL List.\n F...,[]
2,soimort/you-get,src/you_get/extractors/fc2video.py,makeMimi,"def makeMimi(upid):\n """"""From http://cdn37....",python,"def makeMimi(upid):\n """"""From http://cdn37....","[def, makeMimi, (, upid, ), :, strSeed, =, ""gG...",From http://cdn37.atwikiimg.com/sitescript/pub...,"[From, http, :, //, cdn37, ., atwikiimg, ., co...",test,https://github.com/soimort/you-get/blob/b746ac...,makeMimi,"""""""From http://cdn37.atwikiimg.com/sitescript/...","\n strSeed = ""gGddgPfeaf_gzyr""\n prehash...",From http://cdn37.atwikiimg.com/sitescript/pub...,[]
3,soimort/you-get,src/you_get/extractors/fc2video.py,fc2video_download,"def fc2video_download(url, output_dir = '.', m...",python,"def fc2video_download(url, output_dir = '.', m...","[def, fc2video_download, (, url, ,, output_dir...",wrapper,[wrapper],test,https://github.com/soimort/you-get/blob/b746ac...,fc2video_download,"""""""wrapper""""""\n #'http://video.fc2.com/en/c...",\n \n \n \n \n hostname = urlpa...,wrapper,[#'http://video.fc2.com/en/content/20151021bTV...
4,soimort/you-get,src/you_get/extractors/dailymotion.py,dailymotion_download,"def dailymotion_download(url, output_dir='.', ...",python,"def dailymotion_download(url, output_dir='.', ...","[def, dailymotion_download, (, url, ,, output_...",Downloads Dailymotion videos by URL.,"[Downloads, Dailymotion, videos, by, URL, .]",test,https://github.com/soimort/you-get/blob/b746ac...,dailymotion_download,"""""""Downloads Dailymotion videos by URL.\n ""...",\n\n html = get_content(rebuilt_url(url))\n...,Downloads Dailymotion videos by URL.,[]
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,tensorflow/probability,tensorflow_probability/python/sts/semilocal_li...,semilocal_linear_trend_transition_matrix,def semilocal_linear_trend_transition_matrix(a...,python,def semilocal_linear_trend_transition_matrix(a...,"[def, semilocal_linear_trend_transition_matrix...",Build the transition matrix for a semi-local l...,"[Build, the, transition, matrix, for, a, semi,...",test,https://github.com/tensorflow/probability/blob...,semilocal_linear_trend_transition_matrix,"""""""Build the transition matrix for a semi-loca...",\n \n \n \n \n \n \n \n \n\n fixed_en...,Build the transition matrix for a semi-local l...,[# We want to write the following 2 x 2 matrix...
996,tensorflow/probability,tensorflow_probability/python/sts/semilocal_li...,semilocal_linear_trend_transition_noise,def semilocal_linear_trend_transition_noise(le...,python,def semilocal_linear_trend_transition_noise(le...,"[def, semilocal_linear_trend_transition_noise,...",Build the transition noise model for a semi-lo...,"[Build, the, transition, noise, model, for, a,...",test,https://github.com/tensorflow/probability/blob...,semilocal_linear_trend_transition_noise,"""""""Build the transition noise model for a semi...",\n\n \n \n broadcast_batch_shape = dist_uti...,Build the transition noise model for a semi-lo...,"[# At each timestep, the stochasticity of `lev..."
997,tensorflow/probability,tensorflow_probability/python/mcmc/sample_halt...,sample_halton_sequence,"def sample_halton_sequence(dim,\n ...",python,"def sample_halton_sequence(dim,\n ...","[def, sample_halton_sequence, (, dim, ,, num_r...","r""""""Returns a sample from the `dim` dimensiona...","[r, Returns, a, sample, from, the, dim, dimens...",test,https://github.com/tensorflow/probability/blob...,sample_halton_sequence,"r""""""Returns a sample from the `dim` dimensiona...",\n if dim < 1 or dim > _MAX_DIMENSION:\n r...,rReturns a sample from the `dim` dimensional H...,"[# Here and in the following, the shape layout..."
998,tensorflow/probability,tensorflow_probability/python/mcmc/sample_halt...,_randomize,"def _randomize(coeffs, radixes, seed=None):\n ...",python,"def _randomize(coeffs, radixes, seed=None):\n ...","[def, _randomize, (, coeffs, ,, radixes, ,, se...",Applies the Owen (2017) randomization to the c...,"[Applies, the, Owen, (, 2017, ), randomization...",test,https://github.com/tensorflow/probability/blob...,_randomize,"""""""Applies the Owen (2017) randomization to th...",\n given_dtype = coeffs.dtype\n coeffs = tf....,Applies the Owen (2017) randomization to the c...,[]


#### Для проверки корректности извлечения сравните извлеченные имена с полем func_name

In [8]:
matchesName = (df["func_name"].str.split(".").str[-1] == df["extracted_func_name"]).sum()
matchesDoc = (df["func_documentation_string"] == df["docstring"]).sum()

print(f"Количество совпадений имен: {matchesName}")
print(f"Количество совпадений документации: {matchesDoc}")

Количество совпадений имен: 1000
Количество совпадений документации: 920


In [146]:
df[df["func_documentation_string"] != df["docstring"]][['func_documentation_string', 'docstring']]

Unnamed: 0,func_documentation_string,docstring
6,video page,'Cannot find any URL of such class!'
7,course page,'No part found!'
12,Get item_id,'''Get item_id'''
21,Source: Android mobile,'''Source: Android mobile'''
24,try:\n # normal Vimeo video\n ht...,'''\n try:\n # normal Vimeo video\n ...
...,...,...
971,Batched KL divergence `KL(a || b)` for Indepen...,Batched KL divergence `KL(a || b)` for Indepen...
974,"r""""""A lower bound on the entropy of this mixtu...",rA lower bound on the entropy of this mixture ...
980,Batchwise KL divergence KL(d1 || d2) with d1 a...,Batchwise KL divergence KL(d1 || d2) with d1 a...
992,Calculate the batched KL divergence KL(n_a || ...,Calculate the batched KL divergence KL(n_a || ...


В основном проблема из-за ковычек, думаю это не будет большой проблемой, при предскозании имени

#### Пример на sina_xml_to_url_list

In [44]:
code_example = """
def sina_xml_to_url_list(xml_data):
    \"\"\"str->list
    Convert XML to URL List.
    From Biligrab.
    \"\"\"
    rawurl = []
    # Comment1
    # Comment 2
    dom = parseString(xml_data)
    for node in dom.getElementsByTagName('durl'):
        url = node.getElementsByTagName('url')[0]  # Comment 3
        rawurl.append(url.childNodes[0].data)
    return rawurl
"""

# Извлечение элементов
result = extract_function_elements(code_example)
print("Function Name:", result[0])
print("=" * 40)
print("Function Body With Comments:\n", result[1])
print("=" * 40)
print("Function Body Without Comments:\n", result[2])
print("=" * 40)
print("Doc:\n", result[3])
print("=" * 40)
print("Comments:\n", result[4])

Function Name: sina_xml_to_url_list
Function Body With Comments:
 """str->list
    Convert XML to URL List.
    From Biligrab.
    """
    rawurl = []
    # Comment1
    # Comment 2
    dom = parseString(xml_data)
    for node in dom.getElementsByTagName('durl'):
        url = node.getElementsByTagName('url')[0]  # Comment 3
        rawurl.append(url.childNodes[0].data)
    return rawurl
Function Body Without Comments:
 
    rawurl = []
    
    
    dom = parseString(xml_data)
    for node in dom.getElementsByTagName('durl'):
        url = node.getElementsByTagName('url')[0]  
        rawurl.append(url.childNodes[0].data)
    return rawurl
Doc:
 str->list
    Convert XML to URL List.
    From Biligrab.
Comments:
 ['# Comment1', '# Comment 2', '# Comment 3']


# Подзадача 2: Использование предобученных моделей для предсказания имен функций

## Использование предобученных моделей только на исходном коде

#### Загрузите предобученную модель (например, CodeT5+)

In [30]:
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
import torch

# Загрузка модели и токенизатора CodeBERT
model_name = "Salesforce/codet5-base"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

tokenizer_config.json:   0%|          | 0.00/1.48k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/703k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/294k [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/2.00 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/12.5k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.57k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/892M [00:00<?, ?B/s]

#### Используйте подготовленный на первом этапе датасет, содержащий только тела функций без каких либо комментариев и их имена, для предсказания имен функций.

In [None]:
def prepare_input(function_body):
    return "def <extra_id_0>(): \n" + function_body

def predict_function_name(function_body):
    input_text = prepare_input(function_body)
    inputs = tokenizer(input_text, return_tensors="pt", truncation=True, padding=True)
    
    outputs = model.generate(**inputs, max_length=10, num_beams=5)
    predicted_name = tokenizer.decode(outputs[0], skip_special_tokens=True)
    
    return predicted_name.replace("<extra_id_0>", "").strip()

In [61]:
df['predicted_func_name'] = df['function_body_without_comments'].apply(predict_function_name)
df

Unnamed: 0,repository_name,func_path_in_repository,func_name,whole_func_string,language,func_code_string,func_code_tokens,func_documentation_string,func_documentation_tokens,split_name,func_code_url,extracted_func_name,function_body_with_comments,function_body_without_comments,docstring,comments,predicted_func_name
0,soimort/you-get,src/you_get/extractors/youtube.py,YouTube.get_vid_from_url,"def get_vid_from_url(url):\n """"""Extract...",python,"def get_vid_from_url(url):\n """"""Extract...","[def, get_vid_from_url, (, url, ), :, return, ...",Extracts video ID from URL.,"[Extracts, video, ID, from, URL, .]",test,https://github.com/soimort/you-get/blob/b746ac...,get_vid_from_url,"""""""Extracts video ID from URL.\n """"""\n ...","\n return match1(url, r'youtu\.be/([^?/...",Extracts video ID from URL.,[],match_youtu_be
1,soimort/you-get,src/you_get/extractors/miomio.py,sina_xml_to_url_list,"def sina_xml_to_url_list(xml_data):\n """"""st...",python,"def sina_xml_to_url_list(xml_data):\n """"""st...","[def, sina_xml_to_url_list, (, xml_data, ), :,...",str->list\n Convert XML to URL List.\n F...,"[str, -, >, list, Convert, XML, to, URL, List,...",test,https://github.com/soimort/you-get/blob/b746ac...,sina_xml_to_url_list,"""""""str->list\n Convert XML to URL List.\n ...",\n rawurl = []\n dom = parseString(xml_d...,str->list\n Convert XML to URL List.\n F...,[],get_rawurl
2,soimort/you-get,src/you_get/extractors/fc2video.py,makeMimi,"def makeMimi(upid):\n """"""From http://cdn37....",python,"def makeMimi(upid):\n """"""From http://cdn37....","[def, makeMimi, (, upid, ), :, strSeed, =, ""gG...",From http://cdn37.atwikiimg.com/sitescript/pub...,"[From, http, :, //, cdn37, ., atwikiimg, ., co...",test,https://github.com/soimort/you-get/blob/b746ac...,makeMimi,"""""""From http://cdn37.atwikiimg.com/sitescript/...","\n strSeed = ""gGddgPfeaf_gzyr""\n prehash...",From http://cdn37.atwikiimg.com/sitescript/pub...,[],generate_prehash
3,soimort/you-get,src/you_get/extractors/fc2video.py,fc2video_download,"def fc2video_download(url, output_dir = '.', m...",python,"def fc2video_download(url, output_dir = '.', m...","[def, fc2video_download, (, url, ,, output_dir...",wrapper,[wrapper],test,https://github.com/soimort/you-get/blob/b746ac...,fc2video_download,"""""""wrapper""""""\n #'http://video.fc2.com/en/c...",\n \n \n \n \n hostname = urlpa...,wrapper,[#'http://video.fc2.com/en/content/20151021bTV...,fc2video_download_by
4,soimort/you-get,src/you_get/extractors/dailymotion.py,dailymotion_download,"def dailymotion_download(url, output_dir='.', ...",python,"def dailymotion_download(url, output_dir='.', ...","[def, dailymotion_download, (, url, ,, output_...",Downloads Dailymotion videos by URL.,"[Downloads, Dailymotion, videos, by, URL, .]",test,https://github.com/soimort/you-get/blob/b746ac...,dailymotion_download,"""""""Downloads Dailymotion videos by URL.\n ""...",\n\n html = get_content(rebuilt_url(url))\n...,Downloads Dailymotion videos by URL.,[],get_site_infoif
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,tensorflow/probability,tensorflow_probability/python/sts/semilocal_li...,semilocal_linear_trend_transition_matrix,def semilocal_linear_trend_transition_matrix(a...,python,def semilocal_linear_trend_transition_matrix(a...,"[def, semilocal_linear_trend_transition_matrix...",Build the transition matrix for a semi-local l...,"[Build, the, transition, matrix, for, a, semi,...",test,https://github.com/tensorflow/probability/blob...,semilocal_linear_trend_transition_matrix,"""""""Build the transition matrix for a semi-loca...",\n \n \n \n \n \n \n \n \n\n fixed_en...,Build the transition matrix for a semi-local l...,[# We want to write the following 2 x 2 matrix...,get_autoregressive_
996,tensorflow/probability,tensorflow_probability/python/sts/semilocal_li...,semilocal_linear_trend_transition_noise,def semilocal_linear_trend_transition_noise(le...,python,def semilocal_linear_trend_transition_noise(le...,"[def, semilocal_linear_trend_transition_noise,...",Build the transition noise model for a semi-lo...,"[Build, the, transition, noise, model, for, a,...",test,https://github.com/tensorflow/probability/blob...,semilocal_linear_trend_transition_noise,"""""""Build the transition noise model for a semi...",\n\n \n \n broadcast_batch_shape = dist_uti...,Build the transition noise model for a semi-lo...,"[# At each timestep, the stochasticity of `lev...",multivariate_normal_diag_
997,tensorflow/probability,tensorflow_probability/python/mcmc/sample_halt...,sample_halton_sequence,"def sample_halton_sequence(dim,\n ...",python,"def sample_halton_sequence(dim,\n ...","[def, sample_halton_sequence, (, dim, ,, num_r...","r""""""Returns a sample from the `dim` dimensiona...","[r, Returns, a, sample, from, the, dim, dimens...",test,https://github.com/tensorflow/probability/blob...,sample_halton_sequence,"r""""""Returns a sample from the `dim` dimensiona...",\n if dim < 1 or dim > _MAX_DIMENSION:\n r...,rReturns a sample from the `dim` dimensional H...,"[# Here and in the following, the shape layout...",_sample ( num_results sequence
998,tensorflow/probability,tensorflow_probability/python/mcmc/sample_halt...,_randomize,"def _randomize(coeffs, radixes, seed=None):\n ...",python,"def _randomize(coeffs, radixes, seed=None):\n ...","[def, _randomize, (, coeffs, ,, radixes, ,, se...",Applies the Owen (2017) randomization to the c...,"[Applies, the, Owen, (, 2017, ), randomization...",test,https://github.com/tensorflow/probability/blob...,_randomize,"""""""Applies the Owen (2017) randomization to th...",\n given_dtype = coeffs.dtype\n coeffs = tf....,Applies the Owen (2017) randomization to the c...,[],_get_permuted_coeffs


In [67]:
import evaluate

exact_match = evaluate.load('exact_match')
rouge = evaluate.load('rouge')

exact_match_result = exact_match.compute(
    references=df['extracted_func_name'].tolist(),
    predictions=df['predicted_func_name'].tolist()
)

rouge_result = rouge.compute(
    references=df['extracted_func_name'].tolist(),
    predictions=df['predicted_func_name'].tolist()
)

print("Exact Match:", exact_match_result)
print("ROUGE:", rouge_result)

Exact Match: {'exact_match': 0.075}
ROUGE: {'rouge1': 0.3679215007215002, 'rouge2': 0.16248452380952388, 'rougeL': 0.3651315656565655, 'rougeLsum': 0.36552857142857087}


## Использование предобученных моделей на исходном коде, документации и комментариях

In [69]:
df_with_doc = df[['function_body_with_comments', 'extracted_func_name']]
df_with_doc

Unnamed: 0,function_body_with_comments,extracted_func_name
0,"""""""Extracts video ID from URL.\n """"""\n ...",get_vid_from_url
1,"""""""str->list\n Convert XML to URL List.\n ...",sina_xml_to_url_list
2,"""""""From http://cdn37.atwikiimg.com/sitescript/...",makeMimi
3,"""""""wrapper""""""\n #'http://video.fc2.com/en/c...",fc2video_download
4,"""""""Downloads Dailymotion videos by URL.\n ""...",dailymotion_download
...,...,...
995,"""""""Build the transition matrix for a semi-loca...",semilocal_linear_trend_transition_matrix
996,"""""""Build the transition noise model for a semi...",semilocal_linear_trend_transition_noise
997,"r""""""Returns a sample from the `dim` dimensiona...",sample_halton_sequence
998,"""""""Applies the Owen (2017) randomization to th...",_randomize


In [71]:
df_with_doc['predicted_func_name'] = df_with_doc['function_body_with_comments'].apply(predict_function_name)
df_with_doc

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_with_doc['predicted_func_name'] = df_with_doc['function_body_with_comments'].apply(predict_function_name)


Unnamed: 0,function_body_with_comments,extracted_func_name,predicted_func_name
0,"""""""Extracts video ID from URL.\n """"""\n ...",get_vid_from_url,get_video_id_from
1,"""""""str->list\n Convert XML to URL List.\n ...",sina_xml_to_url_list,parse_xml_to_url
2,"""""""From http://cdn37.atwikiimg.com/sitescript/...",makeMimi,makeMimiLocal
3,"""""""wrapper""""""\n #'http://video.fc2.com/en/c...",fc2video_download,fc2video_download_by
4,"""""""Downloads Dailymotion videos by URL.\n ""...",dailymotion_download,download_urls_by_url
...,...,...,...
995,"""""""Build the transition matrix for a semi-loca...",semilocal_linear_trend_transition_matrix,build_semi_local_
996,"""""""Build the transition noise model for a semi...",semilocal_linear_trend_transition_noise,SemiLocalLinearTrend
997,"r""""""Returns a sample from the `dim` dimensiona...",sample_halton_sequence,getHaltonSamplethe
998,"""""""Applies the Owen (2017) randomization to th...",_randomize,_apply_ownen_2017


In [73]:
exact_match_result = exact_match.compute(
    references=df_with_doc['extracted_func_name'].tolist(),
    predictions=df_with_doc['predicted_func_name'].tolist()
)

rouge_result = rouge.compute(
    references=df_with_doc['extracted_func_name'].tolist(),
    predictions=df_with_doc['predicted_func_name'].tolist()
)

print("Exact Match:", exact_match_result)
print("ROUGE:", rouge_result)

Exact Match: {'exact_match': 0.029}
ROUGE: {'rouge1': 0.4327742063492059, 'rouge2': 0.1904126984126986, 'rougeL': 0.4293946248196242, 'rougeLsum': 0.4292782828282824}


# Отчет

Метрика ROUGE, которая свидетельствует о хорошей схожести текста. Что говорит что в целом модуль понимает название функции. Метрика для кода с комментариями лучше, так как в комментариях и документации как раз содержится описание того что происходит, поэтому модель лучше предсказует смысл

Однака есть проблшемы с метриками полного совпадения. Думаю ее можно решить дообучив модель именно для питона, так как просматрев результаты видно что для некоторых модель выдает правильное имя, но в CamelCase. Так же думаю повлияло отсутствие параметров функции в теле для анализа, так в них могут содержаться нужные название основных объектов с которыми взаимодействует модель

# Для Java

In [109]:
import tree_sitter_java as tsjava

JAVA_LANGUAGE = Language(tsjava.language())
parser = Parser(JAVA_LANGUAGE)

In [110]:
dataset = load_dataset("code_search_net", "java", split="test", trust_remote_code=True)
subset = dataset.select(range(1000))

java.zip:   2%|1         | 21.0M/1.06G [00:00<?, ?B/s]

Generating train split:   0%|          | 0/454451 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/26909 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/15328 [00:00<?, ? examples/s]

In [None]:
def replace_function_name_with_marker(func_code_string, new_name="<extra_id_0>"):
    tree = parser.parse(bytes(func_code_string, "utf8"))
    
    cursor = tree.walk()
    
    while cursor.goto_first_child():
        node = cursor.node
        
        if node.type in ['method_declaration', 'function_declaration']:
            function_name_node = node.child_by_field_name('name')
            if function_name_node:

                function_name = function_name_node.text.decode('utf-8')
                start_byte = function_name_node.start_byte
                end_byte = function_name_node.end_byte
                
                modified_func_code = func_code_string[:start_byte] + new_name + func_code_string[end_byte:]
                return modified_func_code, function_name

        cursor.goto_next_sibling()

    return func_code_string, "" 

In [151]:
processed_data = []

for example in subset:
    code = example["whole_func_string"]
    result, function_name = replace_function_name_with_marker(code)
    if result:
        example["test_func"] = result
        example["test_func_name"] = function_name
    processed_data.append(example)

df = pd.DataFrame(processed_data)
df = df[["test_func", "test_func_name"]]
df

Unnamed: 0,test_func,test_func_name
0,"protected final void <extra_id_0>(U value, boo...",fastPathOrderedEmit
1,@CheckReturnValue\n @NonNull\n @Schedule...,amb
2,"@SuppressWarnings(""unchecked"")\n @CheckRetu...",ambArray
3,"@SuppressWarnings({ ""unchecked"", ""rawtypes"" })...",concat
4,"@SuppressWarnings({ ""unchecked"", ""rawtypes"" })...",concat
...,...,...
995,"@SuppressWarnings(""ConstantConditions"") // Gua...",create
996,"@SuppressWarnings(""ConstantConditions"") // Gua...",createWithScheduler
997,"@SuppressWarnings(""ConstantConditions"") // Gua...",create
998,"static <ResponseT, ReturnT> HttpServiceMethod<...",parseAnnotations


In [None]:
import re

def remove_special_characters(text):
    cleaned_text = re.sub(r'[^a-zA-Z0-9]', '', text)
    return cleaned_text

def predict_function_name(function_body):
    inputs = tokenizer(function_body, return_tensors="pt", truncation=True, padding=True)
    
    outputs = model.generate(**inputs, max_length=10, num_beams=5)
    predicted_name = tokenizer.decode(outputs[0], skip_special_tokens=True)
    
    return remove_special_characters(predicted_name)

In [171]:
df['predicted_func_name'] = df['test_func'].apply(predict_function_name)
df

Unnamed: 0,test_func,test_func_name,predicted_func_name
0,"protected final void <extra_id_0>(U value, boo...",fastPathOrderedEmit,drainLoopifq
1,@CheckReturnValue\n @NonNull\n @Schedule...,amb,amb
2,"@SuppressWarnings(""unchecked"")\n @CheckRetu...",ambArray,ofObservableT
3,"@SuppressWarnings({ ""unchecked"", ""rawtypes"" })...",concat,concatMapDelayError
4,"@SuppressWarnings({ ""unchecked"", ""rawtypes"" })...",concat,concatMap
...,...,...,...
995,"@SuppressWarnings(""ConstantConditions"") // Gua...",create,getGsonConverterFactory
996,"@SuppressWarnings(""ConstantConditions"") // Gua...",createWithScheduler,newRxJavaCallAdapterFactory
997,"@SuppressWarnings(""ConstantConditions"") // Gua...",create,getJacksonConverterFactory
998,"static <ResponseT, ReturnT> HttpServiceMethod<...",parseAnnotations,createHttpServiceMethodif


In [172]:
import evaluate

exact_match = evaluate.load('exact_match')
rouge = evaluate.load('rouge')

exact_match_result = exact_match.compute(
    references=df['test_func_name'].tolist(),
    predictions=df['predicted_func_name'].tolist()
)

rouge_result = rouge.compute(
    references=df['test_func_name'].tolist(),
    predictions=df['predicted_func_name'].tolist()
)

print("Exact Match:", exact_match_result)
print("ROUGE:", rouge_result)

Exact Match: {'exact_match': 0.409}
ROUGE: {'rouge1': 0.41, 'rouge2': 0.0, 'rougeL': 0.41, 'rougeLsum': 0.408}


Для Java метрики получились лучше, особенно exact_match. Думаю это связано с тем что Java более многословный язык, следовательно у модели больше данных для предсказания функции