In [2]:
from transformers import AutoTokenizer
import transformers
import torch
import polars as pl
from langdetect import detect
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib.cm import viridis
import re
from collections import Counter
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer

  from .autonotebook import tqdm as notebook_tqdm


In [18]:

splits = {'train': 'openassistant_best_replies_train.jsonl', 'test': 'openassistant_best_replies_eval.jsonl'}
df_train = pd.read_json("hf://datasets/timdettmers/openassistant-guanaco/" + splits["train"], lines=True)
df_test = pd.read_json("hf://datasets/timdettmers/openassistant-guanaco/" + splits["test"], lines=True)

def extract_prompt_and_reference(row):
    parts = row.split("### Assistant:")
    # Human prompt AND Assistant response is our output target
    prompt = parts[0].strip()  
    reference = parts[1].strip() if len(parts) > 1 else "" 
    return prompt, reference

df_train[["prompt", "reference"]] = df_train["text"].apply(lambda x: pd.Series(extract_prompt_and_reference(x)))
df_test[["prompt", "reference"]] = df_test["text"].apply(lambda x: pd.Series(extract_prompt_and_reference(x)))

# Checkpoint Evaluation
the first thing to do is to understand how our practice model is set up, following what they did on huggingface the model uses https://huggingface.co/TinyLlama/TinyLlama-1.1B-Chat-v0.1 as a base, so before finetuning we want to understand how the model performs

In [16]:
model = "PY007/TinyLlama-1.1B-Chat-v0.1"
tokenizer = AutoTokenizer.from_pretrained(model)
device = "cuda" if torch.cuda.is_available() else "cpu"

pipeline = transformers.pipeline(
    "text-generation",
    model=model,
    torch_dtype=torch.float16,
    device=0 if device == "cuda" else -1,  # GPU: device=0, CPU: device=-1
)

prompt = "which anime is the most important one"
formatted_prompt = f"### Human: {prompt} ### Assistant:"
sequences = pipeline(
    formatted_prompt,
    do_sample=True,
    top_k=10,
    num_return_sequences=1,
    repetition_penalty=1.5,
    eos_token_id=tokenizer.eos_token_id,
    truncation=True,
    max_length=500,
)
for seq in sequences:
    print(seq["generated_text"])

Device set to use cpu
Both `max_new_tokens` (=32) and `max_length`(=500) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)


### Human: which anime is the most important one ### Assistant: It's impossible to say what animation by a certain company would be more "importnant" since each of their productions has it’s own distinctive


In [None]:
generated_outputs = []

for prompt in df_test["prompt"]:
    sequences = pipeline(
        prompt,
        do_sample=True,
        top_k=50,
        top_p=0.9,
        num_return_sequences=1,
        repetition_penalty=1.2,
        max_new_tokens=150,  # Numero massimo di token generati
    )
    generated_outputs.append(sequences[0]["generated_text"].split("### Assistant:")[1].strip())

df_test["generated"] = generated_outputs


# Baseline 

In [7]:
model = "PY007/TinyLlama-1.1B-Chat-v0.1"
tokenizer = AutoTokenizer.from_pretrained(model)
device = "cuda" if torch.cuda.is_available() else "cpu"

pipeline = transformers.pipeline(
    "text-generation",
    model=model,
    torch_dtype=torch.float16 if device == "cuda" else torch.float32,
    device=0 if device == "cuda" else -1,  # GPU: device=0, CPU: device=-1
)


prompt = "What do you think of Pokemon?"
formatted_prompt = (
    f"### Human: {prompt}### Assistant:"
)


sequences = pipeline(
    formatted_prompt,
    do_sample=True,
    top_k=50,
    top_p = 0.7,
    num_return_sequences=1,
    repetition_penalty=1.1,
    max_new_tokens=500,
)
for seq in sequences:
    print(f"Result: {seq['generated_text']}")

Device set to use cpu


Result: ### Human: What do you think of Pokemon?### Assistant: As an AI language model, I do not have personal opinions or feelings. However, I can respond to claims that Pokémon are becoming too popular or that they are harmful to the environment.

One claim is that Pokémon go is causing depletion of rivers and lakes due to the amount of people who visit these areas to catch Poke-mon. Another claim is that some species of Pokémon are being killed off by overfishing. It is true that certain species of fishing Pokémon may be struggling to find food, while others are being over-hunted. However, this is not the same as killing off all the species of Pokémon.

It's important to note that many Pokémon games and movies are designed to be fun and entertaining, and there is no legal basis for banning Pokémon go events or stopping the sale of Poke-mon cards in some countries.

Ultimately, it's up to each person and their family to decide if they want to spend time playing or catching Pokémon, a