# Evaluate Phi-1.5 on code completion

In [None]:
from google.colab import drive
drive.mount('/content/drive')

In [None]:
!pip install fuzzywuzzy
!pip install python-Levenshtein
!pip install tree-sitter
!pip install rouge-score

In [3]:
from transformers import AutoTokenizer, AutoModelForCausalLM
import datetime
import torch
import random
import numpy as np
import json
import re
import pandas as pd
from tqdm import tqdm
from fuzzywuzzy import fuzz
import Levenshtein
import nltk
from rouge_score import rouge_scorer
import tree_sitter

In [None]:
# Make CUDA the default device for newly created tensors and modules.
# NOTE(review): presumably this is why prompts are never moved to the GPU
# explicitly later in the notebook - confirm.
torch.set_default_device("cuda")

print("Loading model...")
# Timestamps taken before and after loading are used only for the report below.
time = datetime.datetime.now()
# Load the Phi-1.5 checkpoint and its matching tokenizer from the Hugging Face hub.
model = AutoModelForCausalLM.from_pretrained("microsoft/phi-1_5", torch_dtype="auto")
tokenizer = AutoTokenizer.from_pretrained("microsoft/phi-1_5")
time1 = datetime.datetime.now()
print(f"Model loaded. Time to load the model: {time1 - time}")

In [5]:
def replace_tags(code):
    """
    Substitutes the CodeXGLUE placeholder tags found in a code string.
    Adapted from the CodeXGLUE line-completion evaluator:
    https://github.com/microsoft/CodeXGLUE/blob/main/Code-Code/CodeCompletion-line/evaluator/evaluator.py

    Parameters:
        code (str): Source code that may contain placeholder tags.

    Returns:
        str: The code with every tag replaced by a literal, an empty string or a space.
    """
    # Bare literal placeholders: numbers become "0", strings and chars are dropped
    for placeholder, literal in (("<NUM_LIT>", "0"), ("<STR_LIT>", ""), ("<CHAR_LIT>", "")):
        code = code.replace(placeholder, literal)

    # Placeholders that carry their own value, e.g. <STR_LIT:foo> -> foo
    valued_tag = re.compile(r"<(STR|NUM|CHAR)_LIT:(.*?)>", re.S)
    for kind, value in valued_tag.findall(code):
        code = code.replace(f"<{kind}_LIT:{value}>", value)

    # Whatever upper-case tag is left (e.g. <EOL>) turns into a single space
    for tag_name in re.findall(r'<([A-Z][^<>]*)>', code):
        code = code.replace(f'<{tag_name}>', ' ')

    return code

def read_jsonl_file(file_path):
    """
    Reads a JSONL file and replaces special tags in the 'signature' and 'body' fields of each JSON object.

    Parameters:
        file_path (str): The path to the JSONL file.

    Returns:
        list: A list of dictionaries, one per non-blank line, with the tags in
            'signature' and 'body' already replaced.

    Raises:
        KeyError: If a record has no 'signature' or 'body' field.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    data = []
    # Decode explicitly as UTF-8 instead of relying on the platform's default encoding.
    with open(file_path, 'r', encoding='utf-8') as f:
        for line in f:
            # Tolerate blank lines (e.g. a trailing newline at the end of the file).
            if not line.strip():
                continue
            json_obj = json.loads(line)
            for field in ('signature', 'body'):
                json_obj[field] = replace_tags(json_obj[field])
            data.append(json_obj)
    return data

# Python samples: CodeXGLUE test split with placeholder tags already replaced.
file_path = '/content/drive/MyDrive/CodeCompletion/CodeXGlue/test.jsonl'
codexglue_test = read_jsonl_file(file_path)
print(f'{codexglue_test[0]}\n')

# Function-type flag columns; they are stored as strings in both data frames below.
columns_to_convert = ['is_single_expression', 'is_test', '0-20', '100+', '20-50', '50-100']

# Kotlin functions (signature/body plus the function-type flags).
file_path = '/content/drive/MyDrive/CodeCompletion/functions_df_inputs_outputs.csv'
functions_df = pd.read_csv(file_path)
functions_df[columns_to_convert] = functions_df[columns_to_convert].astype(str)
print(f'{functions_df.iloc[0]}\n')

# Second Kotlin data frame, later used as the "context" dataset.
file_path = '/content/drive/MyDrive/CodeCompletion/context_functions_df.csv'
context_functions_df = pd.read_csv(file_path)
context_functions_df[columns_to_convert] = context_functions_df[columns_to_convert].astype(str)
print(f'{context_functions_df.iloc[0]}\n')

{'signature': 'def debug(user, message):', 'body': 'message_user(user, message, constants.DEBUG) ', 'docstring': 'Adds a message with the ``DEBUG`` level.\n\n:param user: User instance\n:param message: Message to show', 'id': 'f4:m0'}

Unnamed: 0                                                              0
function_id                                                         27692
signature               private fun bitIndex(elementIndex: Int, bitOff...
body                    =\n        elementIndex * ELEMENT_SIZE + bitOf...
is_single_expression                                                 True
is_test                                                             False
0-20                                                                False
100+                                                                False
20-50                                                               False
50-100                                                               True
Name: 0, dtype: object



In [None]:
def extract_function_python(code, signature):
    """
    Extracts the first function definition from generated Python code using a regex.

    The match starts at the first `def name(...):` and stops right before the next
    `def` keyword (or at the end of the text). Regex is the fastest approach, but
    it has limitations: nested functions are cut off at the inner `def`, and
    signatures that span several lines are not matched.

    Parameters:
        code (str): The generated text to search.
        signature (str): The signature the completion started from.

    Returns:
        str: The matched function, or the whole input when nothing matches or
            when nothing but whitespace follows the signature in the match.
    """
    # \b makes sure only the `def` keyword ends the match, not identifiers that
    # merely contain "def" (e.g. `default_value`, `undefined`).
    pattern = r"\bdef\s+(\w+)\s*\(.*?\):[\s\S]*?(?=\bdef\b|\Z)"

    match = re.search(pattern, code)
    if match is None:
        return str(code)

    extracted = match.group(0)
    # An empty body means the regex captured only the signature: fall back to the full text.
    if not extracted.split(signature, 1)[-1].strip():
        return str(code)
    return extracted

# Output path of the compiled grammar library and location of the Kotlin grammar sources.
LANGUAGE_BUILDER_PATH = '/content/drive/MyDrive/CodeCompletion/parser/build/my-language.so'
REPO_PATHS = '/content/drive/MyDrive/CodeCompletion/parser/tree-sitter-kotlin'

# Build the grammar library, load the Kotlin language from it and create the
# parser used by extract_function_kotlin.
# NOTE(review): `Language.build_library` appears to exist only in older
# py-tree-sitter releases; the unpinned `pip install tree-sitter` may install a
# version without it - confirm and pin the version if so.
tree_sitter.Language.build_library(LANGUAGE_BUILDER_PATH, [REPO_PATHS])
KOTLIN_LANGUAGE = tree_sitter.Language(LANGUAGE_BUILDER_PATH, 'kotlin')
parser_kotlin = tree_sitter.Parser()
parser_kotlin.set_language(KOTLIN_LANGUAGE)

def extract_function_kotlin(code, signature):
    """
    Extracts the first function definition from generated Kotlin code.

    A regex handles block-bodied functions (`fun name(...) { ... }`), cutting the
    text right before the next `fun` keyword. When the regex finds nothing, or
    finds only the signature, the tree-sitter Kotlin parser is used instead.

    Parameters:
        code (str): The generated text to search.
        signature (str): The signature the completion started from.

    Returns:
        str: The extracted function, or the whole input when neither the regex
            nor the parser yields a function.
    """
    # \b makes sure only the `fun` keyword ends the match, not identifiers that
    # merely contain "fun" (e.g. `functionCall`, `refund`).
    pattern = r"\bfun\s+(\w+)\s*\(.*?\)\s*\{[\s\S]*?(?=\bfun\b|\Z)"
    match = re.search(pattern, code)
    if match and match.group(0).split(signature, 1)[-1].strip():
        return match.group(0)

    # Fallback: accept the first top-level node if the parser sees a function there.
    tree = parser_kotlin.parse(bytes(code, "utf8"))
    top_level_nodes = tree.root_node.children
    # The children list is empty for empty input, so guard before indexing.
    if top_level_nodes and top_level_nodes[0].type == 'function_declaration':
        return top_level_nodes[0].text.decode('utf-8')
    return str(code)

In [None]:
# Example of usage: the text holds several function definitions; only the one
# that starts with the given signature should be returned.
code = """
def debug(user, message): print(f'{user} - {message}')
def _check(path):  messageFiles = path.globChildren('*') for message in messageFiles:  if message.basename().endswith(''):  continue  receiver.message(message.getContent()) message.remove() return functools.partial(_check, path) def has_resuming(): return False
"""

# Extract the first function definition
extracted_fun = extract_function_python(code, 'def debug(user, message):')
print(extracted_fun)

# Preparation for evaluation

### Compute average number of tokens
Compute the mean number of body tokens for each function type, i.e. for each combination of `is_single_expression`, `is_test` and the signature-length bucket (`0-20`, `20-50`, `50-100`, `100+`).

This code tokenizes text in the 'body' column, calculates the mean token count for each group defined by boolean columns, and aggregates the results.


In [8]:
# Flag columns that together identify a function type
bool_cols = ['is_single_expression', 'is_test', '0-20', '100+', '20-50', '50-100']

# Token ids of every body (no special tokens) and their count
functions_df['tokenized_body'] = functions_df['body'].apply(
    lambda body: tokenizer.encode(body, add_special_tokens=False)
)
functions_df['num_tokens'] = functions_df['tokenized_body'].apply(len)

# String-typed flags give stable, easily comparable group keys
functions_df[bool_cols] = functions_df[bool_cols].astype(str)

# One row per function type with the mean body length, truncated to an integer
mean_tokens_by_prompt = (
    functions_df.groupby(bool_cols)['num_tokens'].mean().reset_index()
)
mean_tokens_by_prompt['num_tokens'] = mean_tokens_by_prompt['num_tokens'].astype(int)

print(mean_tokens_by_prompt)

Token indices sequence length is longer than the specified maximum sequence length for this model (12997 > 2048). Running this sequence through the model will result in indexing errors


   is_single_expression is_test   0-20   100+  20-50 50-100  num_tokens
0                 False   False  False  False  False   True          88
1                 False   False  False  False   True  False          62
2                 False   False  False   True  False  False         155
3                 False   False   True  False  False  False          84
4                 False    True  False  False  False   True         234
5                 False    True  False  False   True  False         160
6                 False    True  False   True  False  False         196
7                 False    True   True  False  False  False         125
8                  True   False  False  False  False   True          27
9                  True   False  False  False   True  False          12
10                 True   False  False   True  False  False          56
11                 True   False   True  False  False  False          11
12                 True    True  False  False  False   True     

In [9]:
def get_mean_tokens(row):
    """
    Looks up the mean body length (in tokens) of the function type a row belongs to.

    Parameters:
        row (Series): A row holding the function-type flag columns listed in `bool_cols`.

    Returns:
        int: The `num_tokens` value of the matching row in `mean_tokens_by_prompt`.
    """
    flags = row[bool_cols].values.flatten().tolist()
    # Keep only the group whose flag values all equal the flags of this row
    same_type = (mean_tokens_by_prompt[bool_cols] == flags).all(axis=1)
    return mean_tokens_by_prompt.loc[same_type, 'num_tokens'].iloc[0]

print("Mean token count for the sample:", get_mean_tokens(functions_df.iloc[0]))

Mean token count for the sample: 27


### Functions for constructing correct prompt

In [10]:
def create_prompt_codex(dataset, index, num_examples, context=None, language='Python'):
    """
    Creates a few-shot code completion prompt for a CodeXGLUE sample.

    The few-shot examples are drawn at random from the other samples of the
    dataset. The target sample itself is never used as an example, because its
    body is the reference answer and would leak into the prompt.

    Parameters:
        dataset (list): A list of dictionaries with 'signature' and 'body' keys.
        index (int): The index of the sample to complete.
        num_examples (int): The number of examples to include in the prompt. Fewer
            are used when the dataset does not hold enough other samples.
        context (str, optional): Additional context to include in the prompt (default is None).
        language (str, optional): The programming language of the code examples (default is 'Python').

    Returns:
        str: The generated prompt, ending with "Code so far: <signature>".
    """
    # Draw one extra candidate so that the target can be dropped if it was picked;
    # the remaining prefix is still a uniform random sample of the other indices.
    drawn = random.sample(range(len(dataset)), min(num_examples + 1, len(dataset)))
    indices = [i for i in drawn if i != index][:num_examples]

    prefix = f'\nComplete code\nLanguage: {language}\n'
    shots = '\n'.join(f"Example: {dataset[i]['signature']} {dataset[i]['body']}" for i in indices)

    data = dataset[index]
    if context:
        return f"{prefix}\n{shots}\n{context}\nCode so far: {data['signature']}"
    return f"{prefix}\n{shots}\nCode so far: {data['signature']}"

In [11]:
def create_prompt_kotlin(dataset, index, num_examples, context=None, language='Kotlin'):
    """
    Creates a few-shot prompt for Kotlin code completion.

    The few-shot examples are drawn at random from the other rows of the
    dataset. The target row itself is never used as an example, because its
    body is the reference answer and would leak into the prompt.

    Parameters:
        dataset (DataFrame): A DataFrame with 'signature' and 'body' columns.
        index (int): The position (not the label) of the row to complete.
        num_examples (int): The number of examples to include in the prompt. Fewer
            are used when the dataset does not hold enough other rows.
        context (str, optional): Additional context to include in the prompt
            (default is None). A NaN value is treated as no context.
        language (str, optional): The programming language of the code examples (default is 'Kotlin').

    Returns:
        str: The generated prompt, ending with "Code so far: <signature>".
    """
    # Draw one extra candidate so that the target can be dropped if it was picked;
    # the remaining prefix is still a uniform random sample of the other positions.
    drawn = random.sample(range(len(dataset)), min(num_examples + 1, len(dataset)))
    indices = [i for i in drawn if i != index][:num_examples]

    prefix = f'Complete code\nLanguage: {language}\n'
    shots = '\n'.join(
        f"Example: {dataset.iloc[i]['signature']} {dataset.iloc[i]['body']}" for i in indices
    )

    # An empty CSV cell is read by pandas as float NaN, which is truthy and would
    # otherwise be rendered into the prompt as the text "nan".
    if isinstance(context, float) and context != context:
        context = None

    data = dataset.iloc[index]
    if context:
        return f"{prefix}\n{shots}\n{context}\nCode so far: {data['signature']}"
    return f"{prefix}\n{shots}\nCode so far: {data['signature']}"

### Function for metrics computing **(edit_sim, CHRF Score, ROUGE-1 Score)**

In [12]:
# ROUGE-1 over the model tokenizer's tokens; stemming is off because whole tokens are compared
scorer = rouge_scorer.RougeScorer(['rouge1'], use_stemmer=False, tokenizer=tokenizer)

def compute_metrics(ref, text):
    """
    Scores a generated text against its reference.

    Parameters:
        ref (str): The reference text.
        text (str): The generated text.

    Returns:
        tuple: (edit_sim, chrf_score, rouge1_precision, rouge1_recall, rouge1_fmeasure).
    """
    edit_sim = fuzz.ratio(text, ref)

    # The chrF helper returns several statistics; index 2 is the f-score
    chrf_stats = nltk.translate.chrf_score.chrf_precision_recall_fscore_support(ref, text, 4)
    chrf_score = chrf_stats[2]

    rouge1 = scorer.score(ref, text)['rouge1']
    return (edit_sim, chrf_score, rouge1.precision, rouge1.recall, rouge1.fmeasure)

def print_metrics(metrics, generated=None, reference=None):
    """
    Prints the metrics returned by compute_metrics, one per line.

    Before the ROUGE values, a preview of the first 25 token ids of the generated
    and the reference code is printed when both texts are available.

    Parameters:
        metrics (tuple): (edit_sim, chrf_score, precision, recall, f-measure).
        generated (str, optional): The generated code. Defaults to the notebook-level
            variable `generated_code` when it exists.
        reference (str, optional): The reference code. Defaults to the notebook-level
            variable `reference_code` when it exists.
    """
    # Backward compatibility: existing calls pass only `metrics` and rely on the
    # notebook-level variables; the preview is skipped when neither source exists.
    if generated is None:
        generated = globals().get('generated_code')
    if reference is None:
        reference = globals().get('reference_code')
    show_tokens = generated is not None and reference is not None

    metrics_names = ['edit_sim', 'chrf_score', 'precision', 'recall', 'f-measure']
    print("Results: ")
    for i, (name, value) in enumerate(zip(metrics_names, metrics)):
        # Position 2 is the first ROUGE value: show the tokens ROUGE is computed on.
        if i == 2 and show_tokens:
            token = scorer._tokenizer
            print(f"Generated tokens: \n\tGenerated code tokenized: {token(generated)['input_ids'][:25]}... \n\tReference code tokenized: {token(reference)['input_ids'][:25]}...")
        print(f"{name}: {value}")

In [13]:
# Example of computing metrics: both snippets share the signature but differ in
# the body, so every score should land strictly between its minimum and maximum.
generated_code = '''
def loads(s, strip_comments=False, **kw):
    return [Node.create(s, **kw) for s in s.split('\n') if s]
'''

reference_code = '''
def loads(s, strip_comments=False, **kw):
    kw[''] = strip_comments
    return [parse_node(ss.strip(), **kw) for ss in s.split(';') if ss.strip()]
'''

# compute_metrics takes the reference first and the generated text second.
metrics = compute_metrics(reference_code, generated_code)
print_metrics(metrics)

Results: 
edit_sim: 76
chrf_score: 0.5407303370786517
Generated tokens: 
	Generated code tokenized: [198, 4299, 15989, 7, 82, 11, 10283, 62, 15944, 28, 25101, 11, 12429, 46265, 2599, 198, 50284, 7783, 685, 19667, 13, 17953, 7, 82, 11]... 
	Reference code tokenized: [198, 4299, 15989, 7, 82, 11, 10283, 62, 15944, 28, 25101, 11, 12429, 46265, 2599, 198, 50284, 46265, 58, 7061, 60, 796, 10283, 62, 15944]...
precision: 0.8536585365853658
recall: 0.625
f-measure: 0.7216494845360826


# Evaluation (code without description)

In [14]:
# Kotlin functions: at most the first 10,000 rows.
kotlin_data = functions_df[0:10000]

# Kotlin "context" data frame, used in full.
kotlin_data_context = context_functions_df

# Python (CodeXGLUE) samples: at most the first 10,000 entries.
python_data = codexglue_test[0:10000]

## Evaluate code completion for Python (10,000 samples)
The code below runs the evaluation twice, once without context and once with the docstring as context. It then computes the mean of each code-generation metric for both runs, prints the means and exports them to CSV files for further analysis.

In [None]:
def evaluation_pipeline_python(args, dataset, descr=False):
    """
    Evaluates Python code generation using a pre-trained model.

    For every sample a few-shot prompt is built, the model completes it, the
    first function is extracted from the completion and its body is scored
    against the reference body.

    Parameters:
        args (dict): Generation parameters passed to `model.generate`. 'max_length'
            is interpreted as the number of tokens to generate on top of the prompt.
        dataset (list): List of dictionaries containing code examples.
        descr (bool, optional): Whether to include function descriptions as context (default is False).

    Returns:
        DataFrame: The dataset as a DataFrame with one extra column per metric.
    """
    args_copy = args.copy()
    result_df = pd.DataFrame(dataset)
    new_columns = ['edit_sim', 'chrf_score', 'precision', 'recall', 'f-measure']
    result_df[new_columns] = 0.0

    for i in tqdm(range(len(dataset))):
        sample = dataset[i]
        context = sample['docstring'] if descr else None
        prompt = create_prompt_codex(dataset, i, 2, context)
        inputs = tokenizer.encode(prompt, return_tensors="pt")

        # `max_length` counts the prompt too, so extend it by the prompt length
        args_copy['max_length'] = len(inputs[0]) + args['max_length']

        outputs = model.generate(inputs, **args_copy)

        # Only the first returned sequence is scored. Special tokens such as the
        # end-of-text marker are dropped so that they cannot end up in the scored body.
        text = tokenizer.decode(outputs[0], skip_special_tokens=True)
        function = text.split("Code so far: ", 1)[-1]
        function = extract_function_python(function, sample['signature'])
        function_body = function.split(sample['signature'], 1)[-1]
        metrics = compute_metrics(sample['body'], function_body)

        result_df.loc[i, new_columns] = metrics

    return result_df

# 'max_length' is the budget of generated tokens: the pipeline adds the prompt length to it.
config = {'max_length': 75, 'do_sample': True, 'temperature': 0.7, 'top_k': 5, 'top_p': 0.8, 'num_return_sequences': 1}
# Same samples and settings, first without and then with the docstring as context.
result_no_context = evaluation_pipeline_python(config, python_data)
result_context = evaluation_pipeline_python(config, python_data, descr=True)

In [26]:
# Column-wise mean of every metric for the run without context.
mean_values_no_context = result_no_context[['edit_sim', 'chrf_score', 'precision', 'recall', 'f-measure']].mean(axis=0)
print(mean_values_no_context)
mean_values_no_context.to_csv('result_no_context.csv')

# Same for the run that used the docstring as context.
mean_values_context = result_context[['edit_sim', 'chrf_score', 'precision', 'recall', 'f-measure']].mean(axis=0)
print(mean_values_context)
mean_values_context.to_csv('result_context.csv')

edit_sim      31.950000
chrf_score     0.168040
precision      0.287897
recall         0.303383
f-measure      0.224306
dtype: float64
edit_sim      29.230000
chrf_score     0.140260
precision      0.214698
recall         0.273709
f-measure      0.182082
dtype: float64


In [21]:
def evaluation_pipeline_kotlin(args, dataset, descr=False):
    """
    Evaluates Kotlin code generation using a pre-trained model.

    For every row a few-shot prompt is built, the model completes it, the first
    function is extracted from the completion and its body is scored against
    the reference body.

    Parameters:
        args (dict): Generation parameters passed to `model.generate`. 'max_length'
            is overwritten for every row (prompt length + mean body length of the
            function type + 50).
        dataset (DataFrame): DataFrame containing Kotlin code examples.
        descr (bool, optional): Whether to include function descriptions as context (default is False).

    Returns:
        DataFrame: A copy of the dataset with one extra column per metric.
    """
    args_copy = args.copy()
    result_df = dataset.copy()
    new_columns = ['edit_sim', 'chrf_score', 'precision', 'recall', 'f-measure']
    result_df[new_columns] = 0.0
    # Rows are read by position, so results must be written by position as well;
    # label-based writes would hit the wrong rows when the index is not 0..n-1.
    metric_positions = [result_df.columns.get_loc(column) for column in new_columns]

    for i in tqdm(range(len(dataset))):
        row = dataset.iloc[i]
        context = row['docstring'] if descr else None
        prompt = create_prompt_kotlin(dataset, i, 2, context)
        inputs = tokenizer.encode(prompt, return_tensors="pt")

        # Using mean_tokens_by_prompt generated above, we got much better results (+8% for each metric)
        length = get_mean_tokens(row)
        args_copy['max_length'] = len(inputs[0]) + length + 50
        outputs = model.generate(inputs, **args_copy)

        # Only the first returned sequence is scored. Special tokens such as the
        # end-of-text marker are dropped so that they cannot end up in the scored body.
        text = tokenizer.decode(outputs[0], skip_special_tokens=True)
        function = text.split("Code so far: ", 1)[-1]
        function = extract_function_kotlin(function, row['signature'])
        function_body = function.split(row['signature'], 1)[-1]
        metrics = compute_metrics(row['body'], function_body)

        result_df.iloc[i, metric_positions] = list(metrics)

    return result_df

## Hyperparameter Search for Code Generation
This script conducts a hyperparameter search to optimize the performance of a code generation model. It iterates over different combinations of parameters for both beam search and sampling methods, evaluates each combination, and identifies the configuration with the highest edit similarity score.

In [None]:
import itertools

# 'max_length' is a placeholder: the Kotlin pipeline recomputes it for every sample.
existing_config = {'max_length': 0}

possible_beam_params = {'num_beams': [2, 3, 4, 5]}
possible_beam_search_params = {'num_return_sequences': [1, 2, 3], 'length_penalty': [0.8, 0.9, 1.0]}

possible_sampling_params = {'do_sample': [True], 'temperature': [0.7, 0.8, 0.9]}
possible_sampling_search_params = {'top_k': [5, 10, 20], 'top_p': [0.7, 0.8, 0.9], 'num_return_sequences': [1]}

best_score = float('-inf')
best_config = {}


def _candidate_configs(*param_grids):
    """Yields one generation config per combination of the values in the given grids."""
    grid = {}
    for param_grid in param_grids:
        grid.update(param_grid)
    for values in itertools.product(*grid.values()):
        candidate = existing_config.copy()
        candidate.update(zip(grid.keys(), values))
        yield candidate


# Beam-search configurations first, then sampling configurations
search_space = itertools.chain(
    _candidate_configs(possible_beam_params, possible_beam_search_params),
    _candidate_configs(possible_sampling_params, possible_sampling_search_params),
)

for config in search_space:
    # Beam search cannot return more sequences than it keeps beams; such a
    # combination is rejected by `generate`, so skip it instead of crashing.
    if not config.get('do_sample') and config['num_return_sequences'] > config.get('num_beams', 1):
        print(f"Skipping invalid config: {config}")
        continue
    print(config)

    # The configuration with the highest mean edit similarity on the first 100 functions wins
    result = evaluation_pipeline_kotlin(config, functions_df[0:100])
    edit_sim = result['edit_sim'].mean()
    print(f"\nEdit sim: {edit_sim}")
    if edit_sim > best_score:
        best_score = edit_sim
        best_config = config
        print(f"New best value: {edit_sim} with config {best_config}")

# Best config is {'max_length': 0, 'do_sample': True, 'temperature': 0.7, 'top_k': 5, 'top_p': 0.8, 'num_return_sequences': 1}

## Evaluate every type of Kotlin function (4_000 = 250 * 16 samples)
This script analyzes code generation metrics across different function types. It computes the mean values of these metrics for each function type, based on specified criteria, and stores the results in a DataFrame for further analysis.

In [None]:
# Best configuration found by the hyperparameter search
config = {'max_length': 0, 'do_sample': True, 'temperature': 0.7, 'top_k': 5, 'top_p': 0.8, 'num_return_sequences': 1}

fun_type_col = ['is_single_expression', 'is_test', '0-20', '100+', '20-50', '50-100']
metrics = ['edit_sim', 'chrf_score', 'precision', 'recall', 'f-measure']

# One result row per function type: the type flags followed by the mean metrics
results_by_type = pd.DataFrame(columns=fun_type_col + metrics)
grouped = mean_tokens_by_prompt.groupby(fun_type_col)

for name, _ in grouped:
    # At most 250 functions of this type, re-indexed from 0
    of_this_type = (functions_df[fun_type_col] == name).all(axis=1)
    df = functions_df[of_this_type].head(250).reset_index(drop=True)

    result = evaluation_pipeline_kotlin(config, df)
    mean_values = result[metrics].mean(axis=0).tolist()

    results_by_type.loc[len(results_by_type)] = [*name, *mean_values]

In [30]:
# Show the mean metrics per function type and save them as CSV.
print(results_by_type)
results_by_type.to_csv('results_by_type1.csv')

   is_single_expression is_test   0-20   100+  20-50 50-100  edit_sim  \
0                 False   False  False  False  False   True     38.08   
1                 False   False  False  False   True  False     29.60   
2                 False   False  False   True  False  False     36.92   
3                 False   False   True  False  False  False     32.32   
4                 False    True  False  False  False   True     31.88   
5                 False    True  False  False   True  False     31.64   
6                 False    True  False   True  False  False     29.96   
7                 False    True   True  False  False  False     21.72   
8                  True   False  False  False  False   True     35.88   
9                  True   False  False  False   True  False     26.44   
10                 True   False  False   True  False  False     33.16   
11                 True   False   True  False  False  False     27.24   
12                 True    True  False  False  Fals