In [1]:
!python --version

Python 3.10.12


In [None]:
!pip3 install torch fuzzywuzzy bleu python-Levenshtein accelerate transformers einops datasets peft bitsandbytes wandb

In [3]:
import torch
import wandb
import torch.cuda
from datasets import load_dataset, Dataset
from peft import LoraConfig, get_peft_model, PeftModel
from transformers import AutoModelForCausalLM, AutoTokenizer, TrainingArguments, Trainer, DataCollatorForLanguageModeling, BitsAndBytesConfig, TextStreamer, EarlyStoppingCallback
import pandas as pd
import os

In [4]:
print(torch.cuda.is_available())
if torch.cuda.is_available():
  device = torch.device('cuda')
print(device)

True
cuda


# Model implementation and testing

# Testing the original model

In [None]:
model_original = AutoModelForCausalLM.from_pretrained("microsoft/phi-1_5", torch_dtype="auto").to(device)

In [None]:
tokenizer = AutoTokenizer.from_pretrained("microsoft/phi-1_5")
wandb.login(key = "")

In [None]:
run_wandb = wandb.init(project='Code fine tuning', job_type="training", anonymous="allow")

In [None]:
model_original

In [22]:
prompts_list = [
    f"Write a sample program in Python",
    f"Write a program to sum two numbers in Python",
    f'''def print_prime(n):
        """
        Print all primes between 1 and n
        """''',
    f'''def close(self):
        """
        Close the socket.
        """''',
    f'''def _init_client(self, from_archive=False):
        """
        Init client
        """''',
    f'''def setUp(self):
        """
        Initialize the parser, required arguments
        """''',
    f"Write a sample program in Kotlin",
    f"Write a program to sum two numbers in Kotlin",
    f'''Complete the following function in Kotlin:
    fun isPrime(number: Int): Boolean''',
    f'''Complete the following function in Kotlin:
    fun sumTwoNumbers(number1: Int, number2: Int): Int''',
    # in training set
    f'''complete the following code in Kotlin
        val x = 1;
        val s = "";

        when (x) ???
        ???

        мal z = 1''',
    # in training set
    f'''
    Complete the following two fucntions in Kotlin:
    fun test4()

    fun test5()
    ''',
    # in training set
    f'''
    class KTypeProjectionTest
    ''',
]

In [None]:
#testing prompts:
for prompt in prompts_list:
    print(prompt + "\n")
    inputs = tokenizer([prompt], return_tensors="pt", return_attention_mask=False).to(device)

    outputs = model_original.generate(**inputs, max_length=150)
    text = tokenizer.batch_decode(outputs)[0]
    print(text + "\n")
    print("-"*10)
    for i in range(10):
      print("\n")

In [None]:
!git lfs clone https://huggingface.co/datasets/microsoft/codexglue_method_generation

In [19]:
python_test = pd.read_json(path_or_buf='/content/codexglue_method_generation/test.jsonl', lines=True)

In [None]:
python_test.head()

In [None]:
# downloading and testingevaluator (didn't prove useful after experimenting)
!mkdir /content/evaluator/
!wget https://github.com/microsoft/CodeXGLUE/raw/main/Code-Code/Method-Generation/evaluator/bleu.py -P /content/evaluator/
!wget https://github.com/microsoft/CodeXGLUE/raw/main/Code-Code/Method-Generation/evaluator/answers.txt -P /content/evaluator/
!wget https://github.com/microsoft/CodeXGLUE/raw/main/Code-Code/Method-Generation/evaluator/evaluator.py -P /content/evaluator/
!wget https://github.com/microsoft/CodeXGLUE/raw/main/Code-Code/Method-Generation/evaluator/predictions.txt -P /content/evaluator/

In [None]:
!python /content/evaluator/evaluator.py -a=/content/evaluator/answers.txt -p=/content/evaluator/predictions.txt

In [36]:
del model_original

# Dataset parsing

In [None]:
!mkdir /content/kotlin_files/
!wget https://github.com/RoGellert/JBInternshipTask/raw/main/kotlin_files.zip
!unzip /content/kotlin_files.zip -d /content/kotlin_files/

In [8]:
from sklearn.model_selection import train_test_split
directory = '/content/kotlin_files'

files = []
dummy = []
for filename in os.listdir(directory):
  files.append(filename)

print(len(files))
dummy = [0] * len(files)

Files_train, Files_test, dummy_train, dummy_test = train_test_split(files, dummy, test_size=0.22, random_state=125)
print(len(Files_train))
print(len(Files_test))
print(Files_train[0])
print(Files_test[0])

54589
42579
12010
13341kt3087.kt
19374inlineClassPrimaryVal.kt


In [None]:
Files_train

In [9]:
kotlin_data = {"code": []}
for filename in Files_train:
    with open(directory+"/"+filename, 'r') as file:
      data = file.read().replace("\n", "<EOL>")
      i = 0
      add = 2048
      while i < len(data):
        kotlin_data["code"].append(data[i:i+add+1])
        i += add

In [10]:
len(kotlin_data["code"])

75317

In [11]:
kotlin_dataset = Dataset.from_dict(kotlin_data)

In [29]:
kotlin_dataset

Dataset({
    features: ['code'],
    num_rows: 75317
})

# Fine-tuning

In [None]:
fine_tuned_model = "fine_tuned_model"

In [13]:
bitsandbytes= BitsAndBytesConfig(load_in_4bit=True, bnb_4bit_use_double_quant=True, bnb_4bit_quant_type="nf4", bnb_4bit_compute_dtype=torch.float16
)

model = AutoModelForCausalLM.from_pretrained(
    "microsoft/phi-1_5", device_map={"":0},
    quantization_config= bitsandbytes, trust_remote_code= True
)
tokenizer.pad_token = tokenizer.eos_token



config.json:   0%|          | 0.00/736 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/2.84G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/74.0 [00:00<?, ?B/s]

In [14]:
config = LoraConfig(
    r=16,
    lora_alpha=16,
    target_modules=["fc1", "fc2","Wqkv", "out_proj"],
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM"
)

model = get_peft_model(model, config).to(device)
model.print_trainable_parameters()

trainable params: 7,864,320 || all params: 1,426,135,040 || trainable%: 0.5514428703750243


In [None]:
model

In [15]:
tokenized_training_data = kotlin_dataset.map(lambda x: tokenizer(x["code"], padding=True), batched=True)

Map:   0%|          | 0/75317 [00:00<?, ? examples/s]

In [None]:
tokenized_training_data

In [16]:
training_arguments = TrainingArguments(
        output_dir="output",
        per_device_train_batch_size=2,
        gradient_accumulation_steps=2,
        learning_rate=2e-4,
        lr_scheduler_type="cosine",
        save_strategy="epoch",
        logging_steps=30,
        max_steps=4000,
        num_train_epochs=2,
        report_to="wandb",
    )
trainer = Trainer(
    model=model,
    train_dataset=tokenized_training_data["input_ids"],
    args=training_arguments,
    data_collator=DataCollatorForLanguageModeling(tokenizer, mlm=False)
)

max_steps is given, it will override any value given in num_train_epochs


In [17]:
#Training
trainer.train()
trainer.model.save_pretrained(fine_tuned_model)
model.config.use_cache = True
#model.eval()

Step,Training Loss
30,1.5697
60,1.2359
90,1.2181
120,1.275
150,1.218
180,1.1901
210,1.1551
240,1.1989
270,1.2301
300,1.2145




NameError: name 'fine_tuned_model' is not defined

In [None]:
for prompt in prompts_list:
    print(prompt + "\n")
    inputs = tokenizer([prompt], return_tensors="pt", return_attention_mask=False).to(device)

    outputs = model.generate(**inputs, max_length=150)
    text = tokenizer.batch_decode(outputs)[0].replace("<EOL>", "\n")
    print(text + "\n")
    print("-"*10)
    for i in range(10):
      print("\n")

In [None]:
#from huggingface_hub import notebook_login

In [None]:
# !huggingface-cli login
# model.push_to_hub(hf_model_repo_before_merging, use_temp_dir=False)
# tokenizer.push_to_hub(hf_model_repo_before_merging, use_temp_dir=False)