In [None]:
%pip install -qqq torch --progress-bar off
%pip install -qqq transformers --progress-bar off
%pip install -qqq datasets --progress-bar off
%pip install -qqq accelerate==0.34.2--progress-bar off
%pip install -qqq bitsandbytes --progress-bar off
%pip install -qqq peft --progress-bar off
%pip install -qqq trl --progress-bar off
%pip install -qqq colored --progress-bar off
%pip install -qqq huggingface_hub --progress-bar off
%pip install -qqq seaborn --progress-bar off

In [1]:
import random
from textwrap import dedent
from typing import Dict, List

In [2]:
import matplotlib as mpl
import matplotlib.colors as colors
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import torch
from colored import Back, Fore, Style
from datasets import Dataset, load_dataset
from matplotlib.ticker import PercentFormatter

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
from peft import (
    LoraConfig,
    PeftModel,
    TaskType,
    get_peft_model,
    prepare_model_for_kbit_training,
)

In [4]:
from sklearn.model_selection import train_test_split
from torch.utils.data import DataLoader
from tqdm import tqdm
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    pipeline,
)
from trl import DataCollatorForCompletionOnlyLM, SFTConfig, SFTTrainer

In [5]:
%matplotlib inline
%config InlineBackend.figure_format='retina'

COLORS = ["#bae1ff", "#ffb3ba", "#ffdfba", "#ffffba", "#baffc9"]

sns.set(style="whitegrid", palette="muted", font_scale=1.2)
sns.set_palette(sns.color_palette(COLORS))

cmap = colors.LinearSegmentedColormap.from_list("custom_cmap", COLORS[:2])

MY_STYLE = {
    "figure.facecolor": "black",
    "axes.facecolor": "black",
    "axes.edgecolor": "white",
    "axes.labelcolor": "white",
    "axes.linewidth": 0.5,
    "text.color": "white",
    "xtick.color": "white",
    "ytick.color": "white",
    "grid.color": "gray",
    "grid.linestyle": "--",
    "grid.linewidth": 0.5,
    "axes.grid": True,
    "xtick.labelsize": "medium",
    "ytick.labelsize": "medium",
    "axes.titlesize": "large",
    "axes.labelsize": "large",
    "lines.color": COLORS[0],
    "patch.edgecolor": "white",
}

mpl.rcParams.update(MY_STYLE)

SEED = 42

In [6]:
def seed_everything(seed: int):
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)

In [7]:
seed_everything(SEED)
PAD_TOKEN = "<|pad|>"
MODEL_NAME = "meta-llama/Llama-3.1-8B-Instruct"
NEW_MODEL = "Llama-3-8B-Finetuning"

In [8]:
from huggingface_hub import login, whoami

# Paste your token here
login_token = "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"

# Login to Hugging Face
login(login_token)

user_info = whoami()
print(f"Logged in as: {user_info['name']}")

The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well.
Token is valid (permission: fineGrained).
Your token has been saved to /home/azureuser/.cache/huggingface/token
Login successful
Logged in as: priyadharshiniResolve


In [9]:
quantization_config = BitsAndBytesConfig(
    load_in_4bit=True, bnb_4bit_quant_type="nf4", bnb_4bit_compute_dtype=torch.bfloat16
)

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, use_fast=True)
tokenizer.add_special_tokens({"pad_token": PAD_TOKEN})
tokenizer.padding_side = "right"

model = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME,
    quantization_config=quantization_config,
    #     attn_implementation="flash_attention_2",
    #     attn_implementation="sdpa",
    device_map="auto",
)
model.resize_token_embeddings(len(tokenizer), pad_to_multiple_of=8)


Loading checkpoint shards: 100%|██████████| 4/4 [02:20<00:00, 35.06s/it]
The new embeddings will be initialized from a multivariate normal distribution that has old embeddings' mean and covariance. As described in this article: https://nlp.stanford.edu/~johnhew/vocab-expansion.html. To disable this, use `mean_resizing=False`
The new lm_head weights will be initialized from a multivariate normal distribution that has old embeddings' mean and covariance. As described in this article: https://nlp.stanford.edu/~johnhew/vocab-expansion.html. To disable this, use `mean_resizing=False`


Embedding(128264, 4096)

In [10]:
model.config

LlamaConfig {
  "_attn_implementation_autoset": true,
  "_name_or_path": "meta-llama/Llama-3.1-8B-Instruct",
  "architectures": [
    "LlamaForCausalLM"
  ],
  "attention_bias": false,
  "attention_dropout": 0.0,
  "bos_token_id": 128000,
  "eos_token_id": [
    128001,
    128008,
    128009
  ],
  "head_dim": 128,
  "hidden_act": "silu",
  "hidden_size": 4096,
  "initializer_range": 0.02,
  "intermediate_size": 14336,
  "max_position_embeddings": 131072,
  "mlp_bias": false,
  "model_type": "llama",
  "num_attention_heads": 32,
  "num_hidden_layers": 32,
  "num_key_value_heads": 8,
  "pretraining_tp": 1,
  "quantization_config": {
    "_load_in_4bit": true,
    "_load_in_8bit": false,
    "bnb_4bit_compute_dtype": "bfloat16",
    "bnb_4bit_quant_storage": "uint8",
    "bnb_4bit_quant_type": "nf4",
    "bnb_4bit_use_double_quant": false,
    "llm_int8_enable_fp32_cpu_offload": false,
    "llm_int8_has_fp16_weight": false,
    "llm_int8_skip_modules": null,
    "llm_int8_threshold": 6.

In [11]:
tokenizer.bos_token, tokenizer.bos_token_id

('<|begin_of_text|>', 128000)

In [12]:
# %%
tokenizer.eos_token, tokenizer.eos_token_id

('<|eot_id|>', 128009)

In [13]:
# %%
tokenizer.pad_token, tokenizer.pad_token_id

('<|pad|>', 128256)

In [14]:
# %%
tokenizer.convert_tokens_to_ids(PAD_TOKEN)

128256

In [43]:
from datasets import load_dataset, DatasetDict 
import pandas as pd

In [44]:
# %%
# Load SQuAD dataset
dataset = load_dataset("squad")

# Split the original training dataset into 90% train and 10% validation
train_valid_split = dataset["train"].train_test_split(test_size=0.1, seed=42)

# Rename the split keys for clarity
new_dataset = DatasetDict({
    "train": train_valid_split["train"],  # 90% of original train set
    "validation": train_valid_split["test"],  # 10% of original train set
    "test": dataset["validation"]  # Original validation set becomes test set
}) 

# %%
new_dataset['train']

Dataset({
    features: ['id', 'title', 'context', 'question', 'answers'],
    num_rows: 78839
})

In [45]:
df_train = new_dataset['train'].to_pandas()
df_train = pd.DataFrame(df_train)
df_train['answers'] = df_train['answers'].apply(lambda x: x['text'][0] if isinstance(x, dict) and 'text' in x else x)
df_train = df_train.drop(columns=['id','title'])


In [46]:
df_valid = new_dataset['validation'].to_pandas()
df_valid = pd.DataFrame(df_valid)
df_valid['answers'] = df_valid['answers'].apply(lambda x: x['text'][0] if isinstance(x, dict) and 'text' in x else x)
df_valid = df_valid.drop(columns=['id','title'])


In [47]:
df_test = new_dataset['test'].to_pandas()
df_test = pd.DataFrame(df_test)
df_test['answers'] = df_test['answers'].apply(lambda x: x['text'][0] if isinstance(x, dict) and 'text' in x else x)
df_test = df_test.drop(columns=['id','title'])


In [48]:
print("train dataset shape:",df_train.shape)
print("Valid dataset shape:",df_valid.shape)

train dataset shape: (78839, 3)
Valid dataset shape: (8760, 3)


In [49]:
df_train.isnull().value_counts()

# %%
df_valid.isnull().value_counts()

context  question  answers
False    False     False      8760
Name: count, dtype: int64

In [50]:
def format_example(row: dict):
    prompt = dedent(
        f"""
    {row["question"]}

    Information:

    ```
    {row["context"]}
    ```
    """
    )
    messages = [
        {
            "role": "system",
            "content": "Use only the information to answer the question",
        },
        {"role": "user", "content": prompt},
        {"role": "assistant", "content": row["answers"]},
    ]
    return tokenizer.apply_chat_template(messages, tokenize=False)

df_train["text"] = df_train.apply(format_example, axis=1)

In [51]:
def count_tokens(row: Dict) -> int:
    return len(
        tokenizer(
            row["text"],
            add_special_tokens=True,
            return_attention_mask=False,
        )["input_ids"]
    )
df_train["token_count"] = df_train.apply(count_tokens, axis=1)

In [52]:
def format_example(row: dict):
    prompt = dedent(
        f"""
    {row["question"]}

    Information:

    ```
    {row["context"]}
    ```
    """
    )
    messages = [
        {
            "role": "system",
            "content": "Use only the information to answer the question",
        },
        {"role": "user", "content": prompt},
        {"role": "assistant", "content": row["answers"]},
    ]
    return tokenizer.apply_chat_template(messages, tokenize=False)

df_valid["text"] = df_valid.apply(format_example, axis=1)

In [53]:
def count_tokens(row: Dict) -> int:
    return len(
        tokenizer(
            row["text"],
            add_special_tokens=True,
            return_attention_mask=False,
        )["input_ids"]
    )
df_valid["token_count"] = df_valid.apply(count_tokens, axis=1)

In [54]:
df_train.to_json("train.json", orient="records", lines=True)
df_valid.to_json("val.json", orient="records", lines=True)

# %%
df_test.to_json("test.json", orient="records", lines=True)

In [55]:
test_dataset = load_dataset(
    "json",
    data_files={"test": "test.json"},
)

Generating test split: 10570 examples [00:00, 108718.02 examples/s]


In [56]:
dataset = load_dataset(
    "json",
    data_files={"train": "train.json", "validation": "val.json"},
)

Generating train split: 78839 examples [00:01, 49001.67 examples/s]
Generating validation split: 8760 examples [00:00, 48090.06 examples/s]


In [57]:
dataset

# %%
print(dataset["train"][0]["text"])

<|begin_of_text|><|start_header_id|>system<|end_header_id|>

Cutting Knowledge Date: December 2023
Today Date: 26 Jul 2024

Use only the information to answer the question<|eot_id|><|start_header_id|>user<|end_header_id|>

To show their strength in the international Communist movement, what did China do?

Information:

```
After the formation of the People's Republic of China in 1949, the Chinese government named the Western nations, led by the United States, as the biggest threat to its national security. Basing this judgment on China's century of humiliation beginning in the early 19th century, American support for the Nationalists during the Chinese Civil War, and the ideological struggles between revolutionaries and reactionaries, the Chinese leadership believed that China would become a critical battleground in the United States' crusade against Communism. As a countermeasure and to elevate China's standing among the worldwide Communist movements, the Chinese leadership adopted a 

In [58]:
pipe = pipeline(
    task="text-generation",
    model=model,
    tokenizer=tokenizer,
    max_new_tokens=128,
    return_full_text=False,
)

In [59]:
def create_test_prompt(data_row):
    prompt = dedent(
        f"""
    {data_row["question"]}

    Information:

    ```
    {data_row["context"]}
    ```
    """
    )
    messages = [
        {
            "role": "system",
            "content": "Use only the information to answer the question",
        },
        {"role": "user", "content": prompt},
    ]
    return tokenizer.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )

# %%
row = dataset["validation"][0]
prompt = create_test_prompt(row)
print(prompt)

<|begin_of_text|><|start_header_id|>system<|end_header_id|>

Cutting Knowledge Date: December 2023
Today Date: 26 Jul 2024

Use only the information to answer the question<|eot_id|><|start_header_id|>user<|end_header_id|>

What percentage of Egyptians polled support death penalty for those leaving Islam?

Information:

```
The Pew Forum on Religion & Public Life ranks Egypt as the fifth worst country in the world for religious freedom. The United States Commission on International Religious Freedom, a bipartisan independent agency of the US government, has placed Egypt on its watch list of countries that require close monitoring due to the nature and extent of violations of religious freedom engaged in or tolerated by the government. According to a 2010 Pew Global Attitudes survey, 84% of Egyptians polled supported the death penalty for those who leave Islam; 77% supported whippings and cutting off of hands for theft and robbery; and 82% support stoning a person who commits adultery.
`

In [33]:
%%time
outputs = pipe(prompt)
response = f"""
answer:     {row["answers"]}
prediction: {outputs[0]["generated_text"]}
"""
print(response)



answer:     84%
prediction: 84% of Egyptians polled supported the death penalty for those who leave Islam.

CPU times: user 1.85 s, sys: 356 ms, total: 2.21 s
Wall time: 4.37 s


In [34]:
row = dataset["validation"][1]
prompt = create_test_prompt(row)
print(prompt)

<|begin_of_text|><|start_header_id|>system<|end_header_id|>

Cutting Knowledge Date: December 2023
Today Date: 26 Jul 2024

Use only the information to answer the question<|eot_id|><|start_header_id|>user<|end_header_id|>

Ann Arbor ranks 1st among what goods sold?

Information:

```
The Ann Arbor Hands-On Museum is located in a renovated and expanded historic downtown fire station. Multiple art galleries exist in the city, notably in the downtown area and around the University of Michigan campus. Aside from a large restaurant scene in the Main Street, South State Street, and South University Avenue areas, Ann Arbor ranks first among U.S. cities in the number of booksellers and books sold per capita. The Ann Arbor District Library maintains four branch outlets in addition to its main downtown building. The city is also home to the Gerald R. Ford Presidential Library.
```<|eot_id|><|start_header_id|>assistant<|end_header_id|>




In [35]:
%%time
outputs = pipe(prompt)
response = f"""
answer:     {row["answers"]}
prediction: {outputs[0]["generated_text"]}
"""
print(response)


answer:     books
prediction: Ann Arbor ranks 1st among U.S. cities in the number of booksellers and books sold per capita.

CPU times: user 1.3 s, sys: 16.2 ms, total: 1.31 s
Wall time: 1.31 s


In [36]:
rows = []
for row in tqdm(dataset["validation"]):
    prompt = create_test_prompt(row)
    outputs = pipe(prompt)
    rows.append(
        {
            "question": row["question"],
            "context": row["context"],
            "prompt": prompt,
            "answer": row["answer"],
            "untrained_prediction": outputs[0]["generated_text"],
        }
    )

predictions_df = pd.DataFrame(rows)

  0%|          | 0/8760 [00:00<?, ?it/s]


KeyError: 'answer'

In [37]:
len(tokenizer)

# %%
response_template = "<|end_header_id|>"
collator = DataCollatorForCompletionOnlyLM(response_template, tokenizer=tokenizer)

examples = [dataset["train"][0]["text"]]
encodings = [tokenizer(e) for e in examples]

dataloader = DataLoader(encodings, collate_fn=collator, batch_size=1)

In [38]:
batch = next(iter(dataloader))
batch.keys()

# %%
batch["labels"]

tensor([[  -100,   -100,   -100,   -100,   -100,   -100,   -100,   -100,   -100,
           -100,   -100,   -100,   -100,   -100,   -100,   -100,   -100,   -100,
           -100,   -100,   -100,   -100,   -100,   -100,   -100,   -100,   -100,
           -100,   -100,   -100,   -100,   -100,   -100,   -100,   -100,   -100,
           -100,   -100,   -100,   -100,   -100,   -100,   -100,   -100,   -100,
           -100,   -100,   -100,   -100,   -100,   -100,   -100,   -100,   -100,
           -100,   -100,   -100,   -100,   -100,   -100,   -100,   -100,   -100,
           -100,   -100,   -100,   -100,   -100,   -100,   -100,   -100,   -100,
           -100,   -100,   -100,   -100,   -100,   -100,   -100,   -100,   -100,
           -100,   -100,   -100,   -100,   -100,   -100,   -100,   -100,   -100,
           -100,   -100,   -100,   -100,   -100,   -100,   -100,   -100,   -100,
           -100,   -100,   -100,   -100,   -100,   -100,   -100,   -100,   -100,
           -100,   -100,   -

In [39]:
model

# %%
lora_config = LoraConfig(
    r=32,
    lora_alpha=16,
    target_modules=[
        "self_attn.q_proj",
        "self_attn.k_proj",
        "self_attn.v_proj",
        "self_attn.o_proj",
        "mlp.gate_proj",
        "mlp.up_proj",
        "mlp.down_proj",
    ],
    lora_dropout=0.05,
    bias="none",
    task_type=TaskType.CAUSAL_LM,
)
model = prepare_model_for_kbit_training(model)
model = get_peft_model(model, lora_config)

# %%
model.print_trainable_parameters()

trainable params: 83,886,080 || all params: 8,114,212,864 || trainable%: 1.0338


In [59]:
OUTPUT_DIR = "experiments"

# %%
sft_config = SFTConfig(
    output_dir=OUTPUT_DIR,
    dataset_text_field="text",
    max_seq_length=512,
    num_train_epochs=1,
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    gradient_accumulation_steps=4,
    optim="paged_adamw_8bit",
    eval_strategy="steps",
    eval_steps=0.2,
    save_steps=0.2,
    logging_steps=10,
    learning_rate=1e-4,
    fp16=True,  # or bf16=True,
    save_strategy="steps",
    warmup_ratio=0.1,
    save_total_limit=2,
    lr_scheduler_type="constant",
    report_to="none",
    save_safetensors=True,
    dataset_kwargs={
        "add_special_tokens": False,  # We template with special tokens
        "append_concat_token": False,  # No need to add additional separator token
    },
    seed=SEED,
)



In [60]:
trainer = SFTTrainer(
    model=model,
    args=sft_config,
    train_dataset=dataset["train"],
    eval_dataset=dataset["validation"],
    tokenizer=tokenizer,
    data_collator=collator,
)


In [47]:
import os
os.environ["TOKENIZERS_PARALLELISM"] = "false"

In [48]:
!nvidia-smi

Wed Dec 11 19:50:04 2024       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 535.216.01             Driver Version: 535.216.01   CUDA Version: 12.2     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  NVIDIA A100 80GB PCIe          On  | 00000001:00:00.0 Off |                    0 |
| N/A   34C    P0              74W / 300W |    513MiB / 81920MiB |      0%      Default |
|                                         |                      |             Disabled |
+-----------------------------------------+----------------------+----------------------+
|   1  NVIDIA A100 80GB PCIe          On  | 00000002:00:0

In [50]:
import os
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"

In [57]:
from accelerate import Accelerator
accelerator = Accelerator()

In [62]:
# if last_checkpoint:
#     print(f"Resuming training from {last_checkpoint}...")
#     trainer.train(resume_from_checkpoint=last_checkpoint)
# else:
#     print("No checkpoint found; starting fresh training...")
trainer.train()

RuntimeError: Expected all tensors to be on the same device, but found at least two devices, cuda:3 and cuda:1!

In [None]:
# Check if checkpoint exists
checkpoint_path = "experiments/checkpoint-9855"
if os.path.exists(checkpoint_path):
    print("Checkpoint found. Loading checkpoint...")
    trainer.train(resume_from_checkpoint=checkpoint_path)

In [None]:
SAVE_MODEL_DIR = "CUSTOM-MODEL"
# Save Model
trainer.save_model(SAVE_MODEL_DIR)

# Explicitly ensure config is saved
model.config.save_pretrained(SAVE_MODEL_DIR)

# %%
tokenizer.save_pretrained(NEW_MODEL)

# %%
print(type(model))  # Should be the model class (e.g., AutoModelForSequenceClassification)
print(type(trainer))
print(hasattr(model, "train"))  # This should return True



In [None]:
model.push_to_hub(NEW_MODEL, tokenizer=tokenizer, max_shard_size="5GB")

# %%
tokenizer.push_to_hub(NEW_MODEL)

In [None]:
dataset = load_dataset(
    "json",
    data_files={"train": "train.json", "validation": "val.json"},
)
dataset

In [None]:
NEW_MODEL = "CUSTOM-MODEL"
tokenizer = AutoTokenizer.from_pretrained(NEW_MODEL)
print("tokenizer loaded")
print("tokenizer length:",len(tokenizer))

In [None]:
quantization_config = BitsAndBytesConfig(
    load_in_4bit=True, bnb_4bit_quant_type="nf4", bnb_4bit_compute_dtype=torch.bfloat16
)

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, use_fast=True)

model = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME, quantization_config=quantization_config, device_map="auto"
)


# %%
len(tokenizer)


In [1]:
from transformers import AutoConfig
SAVED_MODEL = "/home/azureuser/cloudfiles/code/Llama-3-8B-Instruct-SQUAD-RAG"
CHECKPOINT_9855 = "/home/azureuser/cloudfiles/code/experiments/checkpoint-9855"
CHECKPOINT_7884 = "/home/azureuser/cloudfiles/code/experiments/checkpoint-7884"

  from .autonotebook import tqdm as notebook_tqdm


In [None]:
config_saved_model = AutoConfig.from_pretrained(CHECKPOINT_7884)
print(f"Vocabulary size in the saved model: {config_saved_model.vocab_size}")

# %%
len(tokenizer)

# %%
# Resize the model embeddings to match the tokenizer length
model.resize_token_embeddings(len(tokenizer))

# %%
config_checkpoint_model = AutoConfig.from_pretrained(CHECKPOINT_9855)
print(f"Vocabulary size in the checkpoint model: {config_checkpoint_model.vocab_size}")

# %%
model.resize_token_embeddings(len(tokenizer), pad_to_multiple_of=8)

# %%
print(new_vocab_size)
print("Model Name:",MODEL_NAME)

# %%
pipe = pipeline(
    task="text-generation",
    model=model,
    tokenizer=tokenizer,
    max_new_tokens=128,
    return_full_text=False,
)


In [None]:

def create_test_prompt(data_row):
    prompt = dedent(
        f"""
    {data_row["question"]}

    Information:

    ```
    {data_row["context"]}
    ```
    """
    )
    messages = [
        {
            "role": "system",
            "content": "Use only the information to answer the question",
        },
        {"role": "user", "content": prompt},
    ]
    return tokenizer.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )

# %%
row = test_dataset["test"][0]
prompt = create_test_prompt(row)
print(prompt)

# %%
def format_example(row: dict):
    prompt = dedent(
        f"""
    {row["question"]}

    Information:

    ```
    {row["context"]}
    ```
    """
    )
    messages = [
        {
            "role": "system",
            "content": "Use only the information to answer the question",
        },
        {"role": "user", "content": prompt},
        {"role": "assistant", "content": row["answers"]},
    ]
    return tokenizer.apply_chat_template(messages, tokenize=False)

df_test["text"] = df_test.apply(format_example, axis=1)

def count_tokens(row: Dict) -> int:
    return len(
        tokenizer(
            row["text"],
            add_special_tokens=True,
            return_attention_mask=False,
        )["input_ids"]
    )
df_test["token_count"] = df_test.apply(count_tokens, axis=1)

# %%
%%time
outputs = pipe(prompt)
print("outputs:",outputs)
response = f"""
answer:     {row["answers"]}
prediction: {outputs[0]["generated_text"]}
"""
print(response)

# %%
%%time
outputs = pipe(prompt)
response = f"""
answer:     {row["answer"]}
prediction: {outputs[0]["generated_text"]}
"""
print(response)

# %%
from accelerate import Accelerator
import torch

# Function to process data using accelerate
def process_data_on_gpus(data):
    # Initialize the accelerator to handle multi-GPU setup
    accelerator = Accelerator()
    data = accelerator.gather(data)
    prompts = [create_test_prompt(row) for row in data]
    return prompts

# Use the accelerator to run across multiple GPUs
prompts = process_data_on_gpus(test_dataset["test"])

# Print the first 10 processed prompts (as an example)
for prompt in prompts[:10]:
    print(prompt)


# %%
row = test_dataset["test"][1]
prompt = create_test_prompt(row)
print(prompt)

# %%
%pip install evaluate

# %%
len(prompts)

# %%
%pip install sacrebleu


# %%
%pip install nltk

# %%
import nltk
from nltk.translate.bleu_score import sentence_bleu

# %%
from accelerate import Accelerator
import time
import evaluate
from tqdm import tqdm  # Importing tqdm for progress bar


# Example function to process each prompt and calculate F1 score
def generate_responses_and_calculate_f1(prompts, test_dataset, pipe):
    accelerator = Accelerator()  # Initialize accelerator for multi-GPU handling
    
    responses = []
    true_answers = []
    predicted_answers = []

    # Using tqdm to add a progress bar to the loop
    for i, prompt in tqdm(enumerate(prompts[:5]), total=100, desc="Processing Prompts"):
        # Generate output using the model pipeline
        outputs = pipe(prompt)

        # Get the generated text (prediction) from the model output
        prediction = outputs[0]["generated_text"]

        # Get the ground truth answer
        answer = test_dataset["test"][i]["answers"]
        # Store the true answer and predicted answer
        true_answers.append(answer)
        predicted_answers.append(prediction)

    print(true_answers)
    print(predicted_answers)

# Run the function and calculate F1 score
generate_responses_and_calculate_f1(prompts, test_dataset, pipe)


# %%
predicted_answers = ['Denver Broncos', 'Carolina Panthers', 'Santa Clara, California', 'Denver Broncos', 'gold']

true_answers = ['Denver Broncos', 'Carolina Panthers', "Levi's Stadium", 'Denver Broncos', 'gold']

# %%
# Compute BLEU score
bleu_score = sentence_bleu([true_answers],predicted_answers)
print("BLEU Score:", bleu_score)

# %%
# Print the first 5 responses and the F1 score
for response in responses[:5]:
    print(response)


# %%
from accelerate import Accelerator
import time

# Assuming 'pipe' is your pipeline and 'test_dataset' contains the rows with answers.
# Example function to process each prompt with the model pipeline and generate response
def generate_responses(prompts, test_dataset, pipe):
    accelerator = Accelerator()  # Initialize accelerator for GPU distribution
    
    responses = []

    # Process each prompt in parallel using Accelerator
    for i, prompt in enumerate(prompts):
        # Simulate a model inference call using pipe(prompt)
        # Assuming that 'pipe' processes the prompt and returns outputs
        outputs = pipe(prompt)

        # Format the response with the answer and the generated prediction
        response = f"""
        answer:     {test_dataset["test"][i]["answers"]}
        prediction: {outputs[0]["generated_text"]}
        """
        
        responses.append(response)

    return responses

# Example of gathering all prompts and running them through the pipeline
prompts = [create_test_prompt(row) for row in test_dataset["test"]]  # Create prompts for each row

# Use %time to measure the execution time of the function
start_time = time.time()

# Run the model inference across all prompts and generate responses
responses = generate_responses(prompts, test_dataset, pipe)

# Print the first 5 responses as an example
for response in responses[:5]:
    print(response)

# Measure and print the execution time
end_time = time.time()
print(f"Processing time: {end_time - start_time:.2f} seconds")


# %%
%%time
outputs = pipe(prompt)
response = f"""
answer:     {row["answers"]}
prediction: {outputs[0]["generated_text"]}
"""
print(response)

# %%
row = dataset["test"][2]
prompt = create_test_prompt(row)
print(prompt)

# %%
%%time
outputs = pipe(prompt)
response = f"""
answer:     {row["answer"]}
prediction: {outputs[0]["generated_text"]}
"""
print(response)

# %%
predictions = []
for row in tqdm(dataset["test"]):
    outputs = pipe(create_test_prompt(row))
    predictions.append(outputs[0]["generated_text"])

# %%
predictions_df.head()

# %%
predictions_df["trained_prediction"] = predictions

# %%
predictions_df.head()

# %%
predictions_df.to_csv("predictions.csv", index=None)

# %%
sample = predictions_df.sample(n=20)
sample.head()

# %%
for i, row in sample.head(n=10).reset_index().iterrows():
    print(f"{Fore.DARK_VIOLET_1A}{Back.WHITE}Example {i + 1}{Style.reset}")
    response = f"""
{Fore.BLUE}answer:{Style.reset} {row['answer']}

{Fore.GREEN}trained:{Style.reset} {row['trained_prediction']}

{Fore.DARK_ORANGE}untrained:{Style.reset} {row['untrained_prediction']}
"""
    print(response)


