In [None]:
!pip install "unsloth[colab-new] @ git+https://github.com/unslothai/unsloth.git"
!pip install xformers trl peft bitsandbytes evaluate wandb
!pip install -U accelerate
!pip install -i https://pypi.org/simple/ bitsandbytes

In [None]:
!pip install bert_score rouge_score sacrebleu

In [1]:
import os

import evaluate
import pandas as pd
import transformers
import wandb
from tqdm import tqdm
from unsloth import FastLanguageModel

2024-04-28 10:55:00.065214: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-04-28 10:55:00.065362: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-04-28 10:55:00.187071: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


In [2]:
params = {
    "model": "unsloth/llama-3-8b-bnb-4bit",
    "max_new_tokens": 2048,
    "max_seq_length": 8192,
    "temperature": 0.3,
    "top_p": 0.9,
    "random_seed": 42,
}

In [None]:
!wandb login TOKEN

In [3]:
def load_model(model_name):
    # tokenizer = AutoTokenizer.from_pretrained(model_name)
    model, tokenizer = FastLanguageModel.from_pretrained(
        model_name, max_seq_length=params["max_seq_length"], dtype=None, load_in_4bit=True
    )
    return model, tokenizer

In [4]:
model, tokenizer = load_model(params["model"])

==((====))==  Unsloth: Fast Llama patching release 2024.4
   \\   /|    GPU: Tesla P100-PCIE-16GB. Max memory: 15.895 GB. Platform = Linux.
O^O/ \_/ \    Pytorch: 2.2.2+cu121. CUDA = 6.0. CUDA Toolkit = 12.1.
\        /    Bfloat16 = FALSE. Xformers = 0.0.25.post1. FA = False.
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [5]:
wandb.init(
    project="PRGen",
    name=f"Experiment {params['model']} benchmark",
    config=params,
)

[34m[1mwandb[0m: Currently logged in as: [33msamoed-roman[0m. Use [1m`wandb login --relogin`[0m to force relogin


In [6]:
df = None
path = "/kaggle/input/preprocesseddataset/merged_prs_context_size_6144"
for file in tqdm(os.listdir(path)):
    if not file.endswith(".parquet"):
        continue
    if df is None:
        df = pd.read_parquet(f"{path}/{file}")
    else:
        df = pd.concat([df, pd.read_parquet(f"{path}/{file}")], ignore_index=True)
df = df.reset_index(drop=True)
df.head()

100%|██████████| 85/85 [00:59<00:00,  1.44it/s]


Unnamed: 0,title,diff,body,url,created_at,closed_at,merged_at,updated_at
0,Add acme library usage example (http-01),diff --git a/acme/examples/http01_example.py b...,Here is a usage example of the ACME API explor...,https://api.github.com/repos/certbot/certbot/p...,2018-01-26T08:48:31Z,2019-02-23T02:02:43Z,2019-02-23T02:02:43Z,2022-09-10T18:42:56Z
1,Land release changes,diff --git a/acme/setup.py b/acme/setup.py\nin...,This PR should be merged and not squashed to k...,https://api.github.com/repos/certbot/certbot/p...,2018-01-25T21:51:18Z,2018-01-25T22:20:44Z,2018-01-25T22:20:44Z,2018-01-25T22:21:13Z
2,Land release changes,diff --git a/letsencrypt-auto b/letsencrypt-au...,This PR should be merged and not squashed to k...,https://api.github.com/repos/certbot/certbot/p...,2018-01-25T21:51:18Z,2018-01-25T22:20:44Z,2018-01-25T22:20:44Z,2018-01-25T22:21:13Z
3,Update certbot-auto and help,diff --git a/certbot-auto b/certbot-auto\ninde...,,https://api.github.com/repos/certbot/certbot/p...,2018-01-25T21:50:42Z,2018-01-25T23:29:39Z,2018-01-25T23:29:39Z,2018-01-25T23:29:42Z
4,Update certbot-auto and help,diff --git a/letsencrypt-auto-source/letsencry...,,https://api.github.com/repos/certbot/certbot/p...,2018-01-25T21:50:42Z,2018-01-25T23:29:39Z,2018-01-25T23:29:39Z,2018-01-25T23:29:42Z


In [14]:
pipeline = transformers.pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
)

In [8]:
system_prompt = "<|start_header_id|>You a helpful assistant that generates a text description of a pull request. Based on the diff of pull request, generate a text description of the pull request. Your task is to provide a concise summary of the changes. This summary will be used as description of pull request.<|end_header_id|><|eot_id|>"
user_prompt = "<|start_header_id|>{}<|end_header_id|><|eot_id|>"

In [9]:
df["body"] = df["body"].fillna("")

In [10]:
prompts = (
    df.sample(frac=0.1, random_state=params["random_seed"])
    .apply(
        lambda x: {
            "prompt": system_prompt + user_prompt.format(x["diff"]),
            "completion": x["title"] + "\n" + x.get("body", ""),
        },
        axis=1,
    )
    .tolist()
)

In [13]:
terminators = [tokenizer.eos_token_id, tokenizer.convert_tokens_to_ids("<|eot_id|>")]

In [15]:
bertscore = evaluate.load("bertscore")
rouge = evaluate.load("rouge")
chrf = evaluate.load("chrf")

In [16]:
def compute_metrics(eval_pred: tuple[list[str], list[str]]):
    predictions, labels = eval_pred

    rouge_score = rouge.compute(predictions=predictions, references=labels)
    bert_score = bertscore.compute(predictions=predictions, references=[[l] for l in labels], lang="en")

    chrf_score = chrf.compute(predictions=predictions, references=labels)

    processed_bert_score = {}
    for key, value in bert_score.items():
        if key == "hashcode":
            processed_bert_score["bert_hashcode"] = value
            continue
        for i, v in enumerate(value):
            processed_bert_score[f"bert_{key}_{i}"] = v

    return rouge_score | processed_bert_score | chrf_score

In [17]:
generated = []
target = []
for prompt in tqdm(prompts):
    try:
        outputs = pipeline(
            prompt["prompt"],
            max_new_tokens=params["max_new_tokens"],
            eos_token_id=terminators,
            do_sample=True,
            temperature=params["temperature"],
            top_p=params["top_p"],
            return_full_text=False,
        )
        generated.append(outputs[0]["generated_text"])
        target.append(prompt["completion"])
    except Exception as e:
        print(e)

 20%|██        | 2/10 [03:38<14:33, 109.23s/it]Token indices sequence length is longer than the specified maximum sequence length for this model (16158 > 8192). Running this sequence through the model will result in indexing errors
Unsloth: Input IDs of length 16158 > the model's max sequence length of 8192.
We shall truncate it ourselves. It's imperative if you correct this issue first.
 20%|██        | 2/10 [03:38<14:35, 109.47s/it]


RuntimeError: The size of tensor a (8192) must match the size of tensor b (16158) at non-singleton dimension 1

In [24]:
wandb.log(compute_metrics((generated, [p["completion"] for p in prompts])))

tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/482 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.42G [00:00<?, ?B/s]

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
