# Function Summarization Evaluation

*Note: This notebook is modified from the CodeSearchNet [ExploreData.ipynb](https://github.com/github/CodeSearchNet/blob/master/notebooks/ExploreData.ipynb) notebook

In [119]:
%pip install datasets transformers sentence_transformers pandas

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
Note: you may need to restart the kernel to use updated packages.


In [118]:
from datasets import load_dataset
from transformers import T5ForConditionalGeneration, RobertaTokenizer
from sentence_transformers import SentenceTransformer, util

import pandas as pd

In [98]:
dataset = load_dataset("code_x_glue_ct_code_to_text", "python")

Downloading and preparing dataset code_x_glue_ct_code_to_text/python to /Users/cjwilliams/.cache/huggingface/datasets/code_x_glue_ct_code_to_text/python/0.0.0/f8b7e9d51f609a87e7ec7c7431706d4ee0b402e3398560410313d4acc67060a0...


Downloading data files:   0%|          | 0/2 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/941M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/12.4M [00:00<?, ?B/s]

Extracting data files:   0%|          | 0/2 [00:00<?, ?it/s]

Downloading data files:   0%|          | 0/2 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/2 [00:00<?, ?it/s]

Downloading data files:   0%|          | 0/2 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/2 [00:00<?, ?it/s]

Generating train split:   0%|          | 0/251820 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/13914 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/14918 [00:00<?, ? examples/s]

Dataset code_x_glue_ct_code_to_text downloaded and prepared to /Users/cjwilliams/.cache/huggingface/datasets/code_x_glue_ct_code_to_text/python/0.0.0/f8b7e9d51f609a87e7ec7c7431706d4ee0b402e3398560410313d4acc67060a0. Subsequent calls will reuse this data.


  0%|          | 0/3 [00:00<?, ?it/s]

In [100]:
dataset

DatasetDict({
    train: Dataset({
        features: ['id', 'repo', 'path', 'func_name', 'original_string', 'language', 'code', 'code_tokens', 'docstring', 'docstring_tokens', 'sha', 'url'],
        num_rows: 251820
    })
    validation: Dataset({
        features: ['id', 'repo', 'path', 'func_name', 'original_string', 'language', 'code', 'code_tokens', 'docstring', 'docstring_tokens', 'sha', 'url'],
        num_rows: 13914
    })
    test: Dataset({
        features: ['id', 'repo', 'path', 'func_name', 'original_string', 'language', 'code', 'code_tokens', 'docstring', 'docstring_tokens', 'sha', 'url'],
        num_rows: 14918
    })
})

In [104]:
df = pd.concat([pd.DataFrame(dataset['train']), pd.DataFrame(dataset['validation']), pd.DataFrame(dataset['test'])])
# df = pd.DataFrame(dataset['train'][:20])

In [108]:
tokenizer = RobertaTokenizer.from_pretrained("Salesforce/codet5-base")
model = T5ForConditionalGeneration.from_pretrained("Salesforce/codet5-base-multi-sum")
cosine_model = SentenceTransformer("sentence-transformers/multi-qa-distilbert-cos-v1")
finetuned_model = T5ForConditionalGeneration.from_pretrained(
    "stmnk/codet5-small-code-summarization-python")

In [109]:
def summarize(code_tokens):
    input_ids = tokenizer(' '.join(code_tokens), return_tensors='pt').input_ids
    generated_ids = model.generate(input_ids, max_length=200)
    return tokenizer.decode(generated_ids[0], skip_special_tokens=True)

In [110]:
def summarize_finetuned(code_tokens):
    input_ids = tokenizer(' '.join(code_tokens), return_tensors='pt').input_ids
    generated_ids = finetuned_model.generate(input_ids, max_length=200)
    return tokenizer.decode(generated_ids[0], skip_special_tokens=True)

In [111]:
df['summarization'] = df['code_tokens'].apply(summarize)
df['summarization_finetuned'] = df['code_tokens'].apply(summarize_finetuned)

In [112]:
def compare(summarization, docstring):
    embedding_1 = cosine_model.encode(summarization)
    embedding_2 = cosine_model.encode([docstring, ""])
    score = util.dot_score(embedding_1, embedding_2)[0].cpu().tolist()[0]
    return abs(score)

In [113]:
df['score'] = df.apply(lambda x: compare(x.summarization, ' '.join(x.docstring_tokens)), axis=1)
df['finetuned_score'] = df.apply(
    lambda x: compare(x.summarization_finetuned, ' '.join(x.docstring_tokens)), axis=1)

In [116]:
df['score'].mean()

0.5839301273226738

In [117]:
df['finetuned_score'].mean()

0.4003072716295719