In [1]:
import pandas as pd
import torch
import transformers
from nltk.translate.bleu_score import SmoothingFunction, sentence_bleu
from rouge_score import rouge_scorer
from transformers import AutoTokenizer

  from .autonotebook import tqdm as notebook_tqdm


ModuleNotFoundError: No module named 'rouge_score'

In [8]:

# Dataset location (hf:// path) and the JSONL file backing each split.
dataset_root = "hf://datasets/timdettmers/openassistant-guanaco/"
splits = {
    "train": "openassistant_best_replies_train.jsonl",
    "test": "openassistant_best_replies_eval.jsonl",
}

# Read the train split first, then the test split, as line-delimited JSON.
df_train, df_test = (
    pd.read_json(dataset_root + splits[split_name], lines=True)
    for split_name in ("train", "test")
)

def extract_prompt_and_reference(row):
    """Split one conversation string into its first prompt and first reply.

    The text is expected to look like
    ``### Human: <prompt>### Assistant: <reply>[### Human: ...]``.

    Args:
        row: Raw conversation text.

    Returns:
        A ``(prompt, reference)`` tuple of stripped strings. ``prompt`` is the
        text before the first ``### Assistant:`` marker with a leading
        ``### Human:`` tag removed, so callers can add their own prompt
        template without duplicating the tag. ``reference`` is the first
        assistant reply only, cut before any following ``### Human:`` turn;
        it is ``""`` when the text contains no assistant marker.
    """
    human_tag = "### Human:"
    assistant_tag = "### Assistant:"

    # Everything before the first assistant marker is the prompt side.
    prompt, _, remainder = row.partition(assistant_tag)
    prompt = prompt.strip()
    if prompt.startswith(human_tag):
        prompt = prompt[len(human_tag):].strip()

    # Keep only the first reply: multi-turn conversations continue with
    # another human turn that must not leak into the reference.
    reference = remainder.partition(human_tag)[0].strip()
    return prompt, reference

# Add "prompt" and "reference" columns to both splits from their raw "text".
for frame in (df_train, df_test):
    split_rows = frame["text"].apply(
        lambda text: pd.Series(extract_prompt_and_reference(text))
    )
    frame[["prompt", "reference"]] = split_rows

# Checkpoint Evaluation
The first thing to do is to understand how our practice model is set up. Following what was done on Hugging Face, the model uses https://huggingface.co/TinyLlama/TinyLlama-1.1B-Chat-v0.1 as its base, so before fine-tuning we want to understand how that base model performs.

In [None]:
# Checkpoint under evaluation and its tokenizer.
model = "PY007/TinyLlama-1.1B-Chat-v0.1"
tokenizer = AutoTokenizer.from_pretrained(model)
device = "cuda" if torch.cuda.is_available() else "cpu"

pipeline = transformers.pipeline(
    "text-generation",
    model=model,
    # Half precision only on GPU; fall back to float32 on CPU, matching the
    # dtype selection used by the baseline cell further down.
    torch_dtype=torch.float16 if device == "cuda" else torch.float32,
    device=0 if device == "cuda" else -1,  # GPU: device=0, CPU: device=-1
)

# Smoke test: generate one sampled completion for a hand-written prompt.
prompt = "which anime is the most important one"
formatted_prompt = f"### Human: {prompt} ### Assistant:"
sequences = pipeline(
    formatted_prompt,
    do_sample=True,
    top_k=10,
    num_return_sequences=1,
    repetition_penalty=1.5,
    eos_token_id=tokenizer.eos_token_id,
    truncation=True,
    max_length=500,
)
for seq in sequences:
    print(seq["generated_text"])

In [None]:
from tqdm import tqdm

# Sampling is stochastic; seed the RNGs so the evaluation scores computed
# from these generations can be reproduced on a re-run.
transformers.set_seed(42)

human_tag = "### Human:"
generated_responses = []
for prompt in tqdm(df_test["prompt"], desc="Generating responses"):
    # Prompts extracted from the dataset text may already start with the
    # human tag; only add it when missing so it is never duplicated.
    human_turn = prompt if prompt.startswith(human_tag) else f"{human_tag} {prompt}"
    formatted_prompt = f"{human_turn} ### Assistant:"
    sequences = pipeline(
        formatted_prompt,
        do_sample=True,
        top_k=10,
        num_return_sequences=1,
        repetition_penalty=1.5,
        eos_token_id=tokenizer.eos_token_id,
        truncation=True,
        max_length=500,
    )
    # Store the full generated text (prompt included); an empty result
    # list is recorded as an empty string.
    generated_responses.append(sequences[0]["generated_text"] if sequences else "")
    
def clean_generated_response(response):
    """Return the text following the first ``### Assistant:`` marker.

    The result is stripped of surrounding whitespace and ends at the next
    ``### Assistant:`` marker, if any. When the marker is absent the
    function returns an empty string.
    """
    segments = response.split("### Assistant:")
    if len(segments) < 2:
        return ""
    return segments[1].strip()

# Keep only the assistant part of every raw generation.
df_test["generated"] = list(map(clean_generated_response, generated_responses))



def calculate_bleu(reference, candidate):
    """Compute sentence-level BLEU of ``candidate`` against one reference.

    Both strings are tokenized on whitespace.

    Args:
        reference: Ground-truth text.
        candidate: Generated text to score.

    Returns:
        The smoothed BLEU score, or ``0.0`` when either text has no tokens.
    """
    reference_tokens = reference.split()
    candidate_tokens = candidate.split()

    # Nothing to compare: score zero without calling into NLTK.
    if not reference_tokens or not candidate_tokens:
        return 0.0

    # Without smoothing, a single n-gram order with no overlap drives the
    # whole sentence score to ~0 (and NLTK warns); method1 smoothing keeps
    # partial matches meaningful for short free-form answers.
    smoothing = SmoothingFunction().method1
    return sentence_bleu(
        [reference_tokens], candidate_tokens, smoothing_function=smoothing
    )

# Score every test row with BLEU and report the mean.
bleu_per_row = df_test.apply(
    lambda record: calculate_bleu(record["reference"], record["generated"]),
    axis=1,
)
df_test["bleu"] = bleu_per_row
mean_bleu = df_test["bleu"].mean()
print("Average BLEU score:", mean_bleu)


# Shared ROUGE scorer (unigram, bigram and longest-common-subsequence
# variants, with stemming enabled).
scorer = rouge_scorer.RougeScorer(
    ["rouge1", "rouge2", "rougeL"],
    use_stemmer=True,
)

def calculate_rouge(reference, candidate):
    """Score ``candidate`` against ``reference`` with the shared scorer.

    Returns:
        A 3-tuple with the F-measures for rouge1, rouge2 and rougeL,
        in that order.
    """
    result = scorer.score(reference, candidate)
    return tuple(
        result[metric].fmeasure for metric in ("rouge1", "rouge2", "rougeL")
    )

# Score every test row with ROUGE, one column per variant.
rouge_per_row = df_test.apply(
    lambda record: pd.Series(
        calculate_rouge(record["reference"], record["generated"])
    ),
    axis=1,
)
df_test[["rouge1", "rouge2", "rougeL"]] = rouge_per_row

# Report the mean of each ROUGE column.
for label, column in (
    ("Average ROUGE-1:", "rouge1"),
    ("Average ROUGE-2:", "rouge2"),
    ("Average ROUGE-L:", "rougeL"),
):
    print(label, df_test[column].mean())



# Baseline 

In [None]:
# Baseline: load the chat checkpoint and sample a single reply.
model = "PY007/TinyLlama-1.1B-Chat-v0.1"
tokenizer = AutoTokenizer.from_pretrained(model)
device = "cuda" if torch.cuda.is_available() else "cpu"

# Half precision and device index 0 on GPU; float32 and -1 on CPU.
if device == "cuda":
    pipeline_dtype, pipeline_device = torch.float16, 0
else:
    pipeline_dtype, pipeline_device = torch.float32, -1

pipeline = transformers.pipeline(
    "text-generation",
    model=model,
    torch_dtype=pipeline_dtype,
    device=pipeline_device,
)

prompt = "What do you think of Pokemon?"
formatted_prompt = f"### Human: {prompt}### Assistant:"

# Sampling settings for the baseline generation.
generation_kwargs = {
    "do_sample": True,
    "top_k": 50,
    "top_p": 0.7,
    "num_return_sequences": 1,
    "repetition_penalty": 1.1,
    "max_new_tokens": 500,
}
sequences = pipeline(formatted_prompt, **generation_kwargs)

for seq in sequences:
    print(f"Result: {seq['generated_text']}")