In [3]:
# Install the notebook's dependencies from requirements.txt via the %pip magic.
# pip's own output notes that the kernel may need a restart before updated packages are usable.
%pip install -r requirements.txt

Defaulting to user installation because normal site-packages is not writeable
You should consider upgrading via the '/usr/local/bin/python3.9 -m pip install --upgrade pip' command.[0m
Note: you may need to restart the kernel to use updated packages.


Import all necessary packages

In [4]:
from transformers import AutoModelForCausalLM, AutoTokenizer, AutoModelForSeq2SeqLM
from tqdm.auto import tqdm

import numpy as np

import json

As the repository, I chose my solution to a code-completion test task that I completed six months ago. You can access the repository [here](https://github.com/PioneerAlexander/Improving-LLMs-on-underrepresented-programming-languages?tab=readme-ov-file).

In [3]:
!wget https://github.com/PioneerAlexander/Improving-LLMs-on-underrepresented-programming-languages/archive/refs/heads/main.zip
!unzip main.zip

%mv Improving-LLMs-on-underrepresented-programming-languages-main repo

--2024-11-02 17:45:06--  https://github.com/PioneerAlexander/Improving-LLMs-on-underrepresented-programming-languages/archive/refs/heads/main.zip
Resolving github.com (github.com)... 140.82.121.3
Connecting to github.com (github.com)|140.82.121.3|:443... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://codeload.github.com/PioneerAlexander/Improving-LLMs-on-underrepresented-programming-languages/zip/refs/heads/main [following]
--2024-11-02 17:45:06--  https://codeload.github.com/PioneerAlexander/Improving-LLMs-on-underrepresented-programming-languages/zip/refs/heads/main
Resolving codeload.github.com (codeload.github.com)... 140.82.121.10
Connecting to codeload.github.com (codeload.github.com)|140.82.121.10|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: unspecified [application/zip]
Saving to: ‘main.zip’

main.zip                [ <=>                ] 479.79K  --.-KB/s    in 0.04s   

2024-11-02 17:45:06 (12.4 MB/s) - ‘main.zip’ s

## Preprocess the code before splitting it into three parts.

In [4]:
# Project-local helpers used to prune and flatten the downloaded repository.
from code_preprocess_utils import delete_non_py_files, move_files_and_cleanup

# Root of the unpacked repository; the same path is later passed to split_python_files.
directory_path = 'repo/'

# Pruning runs before flattening, so only the files that were kept get moved.
delete_non_py_files(directory_path) # keep only non-empty Python files (logs one Deleted/Skipped line per file)
move_files_and_cleanup(directory_path) # move files to the root directory and delete empty directories

Deleted: repo/README.md
Deleted: repo/requirements.txt
Deleted: repo/kt_filenames.txt
Deleted: repo/src/__init__.py
Skipped: repo/src/parser/utils.py
Skipped: repo/src/parser/parser.py
Deleted: repo/src/parser/__init__.py
Skipped: repo/src/dataset/train_test_dataset_split.py
Skipped: repo/src/dataset/KotlinCodeCompletionDataset.py
Skipped: repo/src/dataset/preprocess.py
Skipped: repo/src/dataset/CodeXGLUETestDataset.py
Deleted: repo/src/dataset/__init__.py
Skipped: repo/src/dataset/FinetuningDataset.py
Skipped: repo/src/model/save_phi-1_5_pretrained.py
Skipped: repo/src/model/utils.py
Skipped: repo/src/model/eval_model.py
Skipped: repo/src/model/metrics.py
Skipped: repo/src/model/finetune_model_peft.py
Deleted: repo/src/model/__init__.py
Deleted: repo/configs/finetune_phi-1_5.yaml
Deleted: repo/test/__init__.py
Skipped: repo/test/dataset/test_preprocess.py
Skipped: repo/test/dataset/test_dataset.py
Deleted: repo/test/dataset/__init__.py
Moved: repo/src/parser/utils.py -> repo/
Moved: r

In [6]:
# Number of lines from file to be used as context.
# Passed as `context_length` to split_python_files when the corpus is built.
CONTEXT_LENGTH = 8

In [7]:
from split_python_files import split_python_files

# Build the list of samples from the files under directory_path; each sample is
# later read through its 'prefix', 'middle' and 'suffix' keys.
corpus = split_python_files(
    directory_path,
    num_splits=4,
    context_length=CONTEXT_LENGTH,
)

# Keep a copy of the samples on disk next to the notebook.
with open('corpus.json', 'w', encoding='utf-8') as corpus_file:
    json.dump(corpus, corpus_file, ensure_ascii=False, indent=4)

In [8]:
# Quick sanity check on how many samples were produced.
print("Collected corpus length:", len(corpus))

Collected corpus length: 48


In [None]:
# Checkpoint identifier handed to from_pretrained, and the device the model is moved to.
checkpoint = "codellama/CodeLlama-7b-hf"
device = "cuda"

# Tokenizer and weights are both loaded from the same checkpoint.
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = (
    AutoModelForCausalLM
    .from_pretrained(checkpoint)
    .to(device)
)

In [10]:
special_tokens = tokenizer.all_special_tokens

print(f"Model {checkpoint} special_tokens: {special_tokens}")


def _special_token_with_marker(tokens, marker):
    """Return the single entry of ``tokens`` that ends with ``marker``.

    Args:
        tokens: iterable of special-token strings.
        marker: suffix identifying the wanted token, e.g. ``"<PRE>"``.

    Raises:
        ValueError: if no token, or more than one token, ends with ``marker``.
    """
    matches = [token for token in tokens if token.endswith(marker)]
    if len(matches) != 1:
        raise ValueError(
            f"expected exactly one special token ending with {marker!r}, found {matches!r}"
        )
    return matches[0]


# Select the infilling sentinels by content rather than by list position, so a
# different ordering of all_special_tokens cannot silently swap or shift them.
prefix_token = _special_token_with_marker(special_tokens, "<PRE>")
middle_token = _special_token_with_marker(special_tokens, "<MID>")
suffix_token = _special_token_with_marker(special_tokens, "<SUF>")

Model codellama/CodeLlama-7b-hf special_tokens: ['<s>', '</s>', '<unk>', '▁<PRE>', '▁<MID>', '▁<SUF>', '▁<EOT>']


In [11]:
def generate_model_completions(corpus):
    """
    Run the model on every Fill-In-Middle sample and collect the decoded outputs.

    Each item of ``corpus`` is read through its ``prefix``, ``suffix`` and
    ``middle`` keys. Prefix and suffix are wrapped in the sentinel tokens to
    form the prompt; ``middle`` is only used to size the generation budget.

    Returns a list with one decoded string per sample, in corpus order.
    Special tokens are kept in the decoded text (``skip_special_tokens=False``).
    """

    def _complete(sample):
        prompt = f"{prefix_token} {sample['prefix']} {suffix_token} {sample['suffix']} {middle_token}"
        inputs = tokenizer(prompt, return_tensors="pt").to(device)

        # Total length budget = prompt tokens + tokens of the reference middle.
        # NOTE(review): encode() may add special tokens (e.g. BOS) to the count,
        # which would make the budget slightly larger than the middle - confirm.
        prompt_len = len(inputs["input_ids"][0])
        middle_len = len(tokenizer.encode(sample["middle"], return_tensors="pt")[0])

        outputs = model.generate(**inputs, max_length=prompt_len + middle_len)
        return tokenizer.decode(outputs[0], skip_special_tokens=False)

    return [_complete(sample) for sample in tqdm(corpus)]

In [None]:
# Run generation over the whole corpus; the result holds one decoded string per sample.
model_completions = generate_model_completions(corpus)

In [13]:
# Reference answers, in corpus order.
expected_completions = [sample["middle"] for sample in corpus]

# The sentinel is matched without its first character (middle_token[1:]);
# the text after its last occurrence is taken as the model's answer.
mid_marker = middle_token[1:]
pairs = [
    {
        "model_completion": generated.rpartition(mid_marker)[2],
        "expected_completion": expected,
    }
    for generated, expected in zip(model_completions, expected_completions)
]

# NOTE: opening with "w" rewrites pairs.json from scratch, so any fields added
# to the file by hand afterwards are lost when this cell is re-run.
with open("pairs.json", "w", encoding="utf-8") as pairs_file:
    json.dump(pairs, pairs_file, ensure_ascii=False, indent=4)

The pairs.json file contains the model completions and the expected completions for the Fill-in-the-Middle task. I rated each model response on a scale from 0 to 5, where 0 is the worst and 5 is the best, and added this rating to each entry of pairs.json under the `score` key.

In [5]:
from metrics import bleu_score, exact_match, edit_similarity, chrf_score

# One accumulator for the manual ratings and one per automatic metric.
my_judgement_scores, bleu_scores, exact_match_scores, edit_similarity_scores, chrf_scores = (
    [], [], [], [], []
)

# Read the pairs back from disk rather than reusing the in-memory list.
with open("pairs.json", "r", encoding="utf-8") as pairs_file:
    pairs = json.load(pairs_file)

In [6]:

# Reset the accumulators here so the cell can be re-run safely: appending to
# lists created in another cell duplicates every score on a second run, and
# fails outright once those names have been rebound to NumPy arrays.
my_judgement_scores = []
bleu_scores = []
exact_match_scores = []
edit_similarity_scores = []
chrf_scores = []

for pair in tqdm(pairs):
    expected_completion = pair["expected_completion"]
    model_completion = pair["model_completion"]

    bleu_scores.append(bleu_score(expected_completion, model_completion))
    # exact_match and chrf_score results are indexed by key; only that entry is kept.
    exact_match_scores.append(exact_match(expected_completion, model_completion)["exact_match"])
    edit_similarity_scores.append(edit_similarity(expected_completion, model_completion))
    chrf_scores.append(chrf_score(expected_completion, model_completion)["score"])
    # Manual 0-5 rating stored with each pair under "score".
    my_judgement_scores.append(pair["score"])

The hypothesis contains 0 counts of 3-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 4-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 2-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
100%|██████████| 48/48 [02:07<00:00,  2.66s/it]


In [7]:
# Convert the accumulators to arrays (chrf_scores is left as a plain list).
my_judgement_scores = np.array(my_judgement_scores)
bleu_scores = np.array(bleu_scores)
exact_match_scores = np.array(exact_match_scores)
edit_similarity_scores = np.array(edit_similarity_scores)

print("exact_match_scores", exact_match_scores)

# Correlate each automatic metric with the manual ratings; entry [0, 1] of the
# matrix returned by np.corrcoef is the coefficient between the two inputs.
for metric_label, metric_values in (
    ("BLEU score", bleu_scores),
    ("Exact Match", exact_match_scores),
    ("Edit Similarity", edit_similarity_scores),
    ("CHRF score", chrf_scores),
):
    correlation = np.corrcoef(my_judgement_scores, metric_values)[0, 1]
    print(f"Correlation between my judgement and {metric_label}: {correlation}")

exact_match_scores [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
Correlation between my judgement and BLEU score: 0.5823924405943129
Correlation between my judgement and Exact Match: nan
Correlation between my judgement and Edit Similarity: 0.7643977832029185
Correlation between my judgement and CHRF score: 0.8524125158512138


  c /= stddev[:, None]
  c /= stddev[None, :]


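Of course, the correlation between any vector and a vector of zeros is undefined: a constant vector has zero standard deviation, so computing the correlation coefficient divides by zero, which results in nan (and in the runtime warning shown above). The exact match score is 0 for all pairs because, even when the model generated the same code as the expected completion, the two strings still differed by a newline character at the start.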

In [8]:
def _as_builtin(value):
    """Return ``value`` as a built-in Python number if it is a NumPy scalar, else unchanged."""
    return value.item() if isinstance(value, np.generic) else value


# Every score sequence has to line up one-to-one with `pairs`.
for scores_name, scores in (
    ("bleu_scores", bleu_scores),
    ("exact_match_scores", exact_match_scores),
    ("edit_similarity_scores", edit_similarity_scores),
    ("chrf_scores", chrf_scores),
):
    assert len(scores) == len(pairs), (
        f"{scores_name} has {len(scores)} entries, expected {len(pairs)}"
    )

# enumerate() gives each pair's position directly. pairs.index(pair) instead ran
# a linear dict-equality search four times per pair and resolved to the first
# *equal* dict, which need not be the current one.
for index, pair in enumerate(pairs):
    # NumPy scalars are unwrapped so json.dump never depends on their dtype.
    pair["bleu_score"] = _as_builtin(bleu_scores[index])
    pair["exact_match_score"] = _as_builtin(exact_match_scores[index])
    pair["edit_similarity_score"] = _as_builtin(edit_similarity_scores[index])
    pair["chrf_score"] = _as_builtin(chrf_scores[index])

with open("resulting_dataset.json", "w", encoding="utf-8") as f:
    json.dump(pairs, f, ensure_ascii=False, indent=4)