In [7]:
!pip install rouge-score

  pid, fd = os.forkpty()
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Collecting rouge-score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25ldone
Building wheels for collected packages: rouge-score
  Building wheel for rouge-score (setup.py) ... [?25ldone
[?25h  Created wheel for rouge-score: filename=rouge_score-0.1.2-py3-none-any.whl size=24934 sha256=a4f40eecb0b74fc391378b60259ddf225dc0fdad668217c90dfd7433ab2fe58f
  Stored in directory: /root/.cache/pip/wheels/5f/dd/89/461065a73be61a532ff8599a28e9beef17985c9e9c31e541b4
Successfully built rouge-score
Installing collected packages: rouge-score
Successfully installed rouge-score-0.1.2


In [28]:
from transformers import AutoTokenizer
from transformers import AutoModelForCausalLM, AutoTokenizer
from datasets import Dataset
from tqdm import tqdm

import transformers
import torch
import pandas as pd
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
from rouge_score import rouge_scorer


In [5]:

splits = {'train': 'openassistant_best_replies_train.jsonl', 'test': 'openassistant_best_replies_eval.jsonl'}
df_train = pd.read_json("hf://datasets/timdettmers/openassistant-guanaco/" + splits["train"], lines=True)
df_test = pd.read_json("hf://datasets/timdettmers/openassistant-guanaco/" + splits["test"], lines=True)

def extract_prompt_and_reference(row):
    parts = row.split("### Assistant:")
    # Human prompt AND Assistant response is our output target
    prompt = parts[0].strip()  
    reference = parts[1].strip() if len(parts) > 1 else "" 
    return prompt, reference

df_train[["prompt", "reference"]] = df_train["text"].apply(lambda x: pd.Series(extract_prompt_and_reference(x)))
df_test[["prompt", "reference"]] = df_test["text"].apply(lambda x: pd.Series(extract_prompt_and_reference(x)))

# Checkpoint Evaluation
the first thing to do is to understand how our practice model is set up, following what they did on huggingface the model uses https://huggingface.co/TinyLlama/TinyLlama-1.1B-Chat-v0.1 as a base, so before finetuning we want to understand how the model performs

In [18]:
model = "PY007/TinyLlama-1.1B-Chat-v0.1"
tokenizer = AutoTokenizer.from_pretrained(model)
device = "cuda" if torch.cuda.is_available() else "cpu"
print(device)

pipeline = transformers.pipeline(
    "text-generation",
    model=model,
    torch_dtype=torch.float16,
    device=0 if device == "cuda" else -1,  # GPU: device=0, CPU: device=-1
)

prompt = "which anime is the most important one"
formatted_prompt = f"### Human: {prompt} ### Assistant:"
sequences = pipeline(
    formatted_prompt,
    do_sample=True,
    top_k=10,
    num_return_sequences=1,
    repetition_penalty=1.5,
    eos_token_id=tokenizer.eos_token_id,
    truncation=True,
    max_length=500,
)
for seq in sequences:
    print(seq["generated_text"])

cuda


Both `max_new_tokens` (=32) and `max_length`(=500) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)


### Human: which anime is the most important one ### Assistant: The Dragon Ball Z series.### Speaker 2: That's a tough question, there are so many great ones out there!
In


In [29]:
dataset = Dataset.from_pandas(df_test)


def generate_response(example):
    # Formatta il prompt
    formatted_prompt = f"### Human: {example['prompt']} ### Assistant:"
    
    # Calcola la lunghezza del prompt in token
    input_ids = tokenizer(formatted_prompt, return_tensors="pt")["input_ids"]
    prompt_length = input_ids.shape[1]  # Numero di token nel prompt
    
    # Imposta max_length come la lunghezza del prompt + margine per la generazione
    max_length = prompt_length + 100  # Aggiungi un margine di 100 token per la generazione
    
    # Genera la risposta
    sequences = pipeline(
        formatted_prompt,
        do_sample=True,
        top_k=10,
        num_return_sequences=1,
        repetition_penalty=1.5,
        eos_token_id=tokenizer.eos_token_id,
        truncation=True,
        max_length=max_length,  # Lunghezza dinamica
        max_new_tokens=None  # Imposta esplicitamente a None per evitare conflitti
    )
    
    # Pulisci il testo generato
    generated_text = sequences[0]["generated_text"] if sequences else ""
    parts = generated_text.split("### Assistant:")
    cleaned_response = parts[1].strip() if len(parts) > 1 else ""
    return {"generated": cleaned_response}

dataset = dataset.map(generate_response, batched=False)


# Funzione per calcolare BLEU
def calculate_bleu(reference, candidate):
    reference_tokens = reference.split()
    candidate_tokens = candidate.split()
    # Usa SmoothingFunction per evitare warning
    smoothing_function = SmoothingFunction().method1
    return sentence_bleu([reference_tokens], candidate_tokens, smoothing_function=smoothing_function)


# Funzione per calcolare ROUGE
scorer = rouge_scorer.RougeScorer(["rouge1", "rouge2", "rougeL"], use_stemmer=True)

def calculate_rouge_score(example):
    scores = scorer.score(example['reference'], example['generated'])
    return {
        "rouge1": scores["rouge1"].fmeasure,
        "rouge2": scores["rouge2"].fmeasure,
        "rougeL": scores["rougeL"].fmeasure,
    }
# Calcola BLEU
dataset = dataset.map(calculate_bleu_score)

# Calcola ROUGE
dataset = dataset.map(calculate_rouge_score)

average_bleu = sum(dataset["bleu"]) / len(dataset)
average_rouge1 = sum(dataset["rouge1"]) / len(dataset)
average_rouge2 = sum(dataset["rouge2"]) / len(dataset)
average_rougeL = sum(dataset["rougeL"]) / len(dataset)

print("Average BLEU:", average_bleu)
print("Average ROUGE-1:", average_rouge1)
print("Average ROUGE-2:", average_rouge2)
print("Average ROUGE-L:", average_rougeL)


Map:   0%|          | 0/518 [00:00<?, ? examples/s]

Map:   0%|          | 0/518 [00:00<?, ? examples/s]

Average BLEU: 0.1623378038711583
Average ROUGE-1: 0.15047413918509633
Average ROUGE-2: 0.016966875004574115
Average ROUGE-L: 0.09186045851931779


# Baseline 

In [None]:
model = "PY007/TinyLlama-1.1B-Chat-v0.1"
tokenizer = AutoTokenizer.from_pretrained(model)
device = "cuda" if torch.cuda.is_available() else "cpu"

pipeline = transformers.pipeline(
    "text-generation",
    model=model,
    torch_dtype=torch.float16 if device == "cuda" else torch.float32,
    device=0 if device == "cuda" else -1,  # GPU: device=0, CPU: device=-1
)


prompt = "What do you think of Pokemon?"
formatted_prompt = (
    f"### Human: {prompt}### Assistant:"
)


sequences = pipeline(
    formatted_prompt,
    do_sample=True,
    top_k=50,
    top_p = 0.7,
    num_return_sequences=1,
    repetition_penalty=1.1,
    max_new_tokens=500,
)
for seq in sequences:
    print(f"Result: {seq['generated_text']}")