**Installation of necessary modules and libraries**

Create a text file named `requirements.txt`, upload it to the notebook's working directory, and paste the following pinned packages into it:

accelerate == 0.31.0

bitsandbytes == 0.43.1

transformers == 4.42.3

In [None]:
!pip install -r requirements.txt

In [None]:
import pandas as pd
import torch
from transformers import (AutoTokenizer,
                          AutoModelForCausalLM,
                          BitsAndBytesConfig,
                          pipeline)
from sklearn.metrics import f1_score
!pip install rouge-score
from rouge_score import rouge_scorer
# Load the dataset from the JSON file in the working directory
DATA_PATH = "test.json"
data = pd.read_json(DATA_PATH)

**Request your own access to the gated LLM models from Hugging Face or from the Meta website**

In [None]:
# Hugging Face access token for the gated model.
# Read it from the HF_TOKEN environment variable when set; otherwise ask for it
# with a hidden prompt so the secret is never stored in the notebook itself.
import os
from getpass import getpass

HF_TOKEN = os.environ.get("HF_TOKEN") or getpass("Paste your Hugging Face token: ")

In [None]:
# Select the model you wish to work on (Hugging Face Hub repository id).
# This value is passed to both the tokenizer and the model loader below,
# together with HF_TOKEN for access to the gated repository.
model_name = "meta-llama/Meta-Llama-3-8B"

In [None]:
# Quantization parameters: 4-bit loading with the "nf4" quant type,
# double quantization enabled, and bfloat16 as the compute dtype.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.bfloat16,
)

In [None]:
# Load the tokenizer that matches the selected checkpoint (token needed for the gated repo)
tokenizer = AutoTokenizer.from_pretrained(model_name, token=HF_TOKEN)

# Reuse the end-of-sequence token as the padding token
tokenizer.pad_token = tokenizer.eos_token

In [None]:
# Load the causal language model using the quantization settings in `bnb_config`;
# device placement is delegated to device_map="auto".
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    token=HF_TOKEN,
    quantization_config=bnb_config,
    device_map="auto",
)

In [None]:
# Generation settings forwarded to the pipeline; check the Hugging Face pipeline
# documentation to learn what each parameter does.
generation_settings = {
    "max_new_tokens": 256,
    "temperature": 0.01,
    "top_k": 1,
    "top_p": 0.95,
    "repetition_penalty": 1.15,
}

# Text-generation pipeline built on the already-loaded model and tokenizer.
# NOTE(review): later cells split the output on "ASSISTANT:", so they appear to
# expect the prompt to be included in `generated_text` - confirm before changing.
generator = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    **generation_settings,
)

In [None]:
# Build a prompt from the first row of the dataset and run it through the generator
test1 = data.iloc[0]
first_question, first_context = test1['question'], test1['context']

prompt_template = f'''SYSTEM: You are a helpful, respectful and honest assistant. Always give factually coherent answers. Keep your answers to be brief within 3 sentences.

USER: {first_question}

CONTEXT: {first_context}

ASSISTANT:
'''
response = generator(prompt_template)

In [None]:
# Keep only the first line the model wrote after the "ASSISTANT:" marker;
# when the marker is absent, fall back to the whole (stripped) output.
generated_text = response[0]["generated_text"]
assistant_marker = "ASSISTANT:"
if assistant_marker not in generated_text:
    assistant_response = generated_text.strip()
else:
    after_marker = generated_text.split(assistant_marker)[1]
    first_line = after_marker.strip().split("\n")[0]
    assistant_response = first_line.strip()

print(assistant_response)

In [None]:
# Build a prompt from the row at position 5 (with an extra instruction about
# mathematical correctness) and run it through the generator
test2 = data.iloc[5]
second_question, second_context = test2['question'], test2['context']

prompt_template = f'''SYSTEM: You are a helpful, respectful and honest assistant. Interpret the context given carefully.  Always give factually coherent answers. Always give an answer.
Try to be mathematically correct.
USER: {second_question}

CONTEXT: {second_context}

ASSISTANT:
'''
response = generator(prompt_template)

In [None]:
# Take the text after "ASSISTANT:", cut it at the next "USER:" turn and at the
# first "##", and flatten newlines to spaces. Without the marker, the whole
# output is used (stripped, newlines flattened).
generated_text = response[0]["generated_text"]
if "ASSISTANT:" not in generated_text:
    assistant_response = generated_text.strip().replace('\n', ' ')
else:
    after_marker = generated_text.split("ASSISTANT:")[1]
    before_next_turn = after_marker.split("USER:")[0].strip().replace('\n', ' ').strip()
    before_heading = before_next_turn.split("##")[0]
    assistant_response = before_heading.strip().replace('\n', ' ')

assistant_response

In [None]:
# Work on an explicit copy of the first 30 rows: `head()` returns a slice of
# `data`, and adding columns to a slice triggers pandas' SettingWithCopyWarning.
data_sample = data.head(30).copy()


def _build_prompt(question, context):
    """Return the SYSTEM/USER/CONTEXT/ASSISTANT prompt for one example."""
    return f'''SYSTEM: You are a helpful, respectful and honest assistant. Interpret the context given carefully.  Always give factually coherent answers. Always give an answer.

USER: {question}

CONTEXT: {context}

ASSISTANT:
'''


def _extract_assistant_response(generated_text):
    """Return the assistant's answer from the pipeline's generated text.

    Takes the text after the first "ASSISTANT:" marker (up to the next such
    marker), cuts it at the first "USER:", "##" or "SYSTEM:" it contains, then
    strips it and replaces newlines with spaces. If the marker is missing, the
    whole text is stripped and flattened instead.
    """
    if "ASSISTANT:" not in generated_text:
        return generated_text.strip().replace('\n', ' ')
    answer = generated_text.split("ASSISTANT:")[1]
    # The stop markers contain no whitespace, so cutting first and cleaning
    # whitespace once at the end gives the same text as cleaning between cuts.
    for stop_marker in ("USER:", "##", "SYSTEM:"):
        answer = answer.split(stop_marker)[0]
    return answer.strip().replace('\n', ' ')


# Generate one response per sampled row, in row order
model_responses = []
for index, row in data_sample.iterrows():
    prompt_template = _build_prompt(row['question'], row['context'])
    response = generator(prompt_template)
    generated_text = response[0]["generated_text"]
    assistant_response = _extract_assistant_response(generated_text)
    model_responses.append(assistant_response)

data_sample['model_response'] = model_responses

In [None]:
# Score every model response against its reference answers (ROUGE-1, ROUGE-2, ROUGE-L)
rouge_scorer_instance = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)

# Each entry of 'answers' is assumed to be a list of strings; it is joined
# into one reference string before scoring.
rouge_scores = [
    rouge_scorer_instance.score(' '.join(reference_answers), prediction)
    for reference_answers, prediction in zip(data_sample['answers'], data_sample['model_response'])
]

data_sample['rouge_score'] = rouge_scores


In [None]:
# Mean F-measure of each ROUGE variant over all scored examples
avg_rouge1, avg_rouge2, avg_rougeL = (
    sum(entry[variant].fmeasure for entry in rouge_scores) / len(rouge_scores)
    for variant in ('rouge1', 'rouge2', 'rougeL')
)

for label, average in (('ROUGE-1', avg_rouge1), ('ROUGE-2', avg_rouge2), ('ROUGE-L', avg_rougeL)):
    print(f'Average {label} F1 Score: {average}')

In [None]:
# Score every model response against its context (ROUGE-1, ROUGE-2, ROUGE-L)
rouge_scorer_instance = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)
rouge_scores = []
for context_value, model_answer in zip(data_sample['context'], data_sample['model_response']):
    # A plain-string context must be scored as-is: ' '.join() on a str inserts
    # a space between every character, which would destroy the token overlap.
    # Lists of strings are still joined into one reference string.
    if isinstance(context_value, str):
        reference_text = context_value
    else:
        reference_text = ' '.join(context_value)
    scores = rouge_scorer_instance.score(reference_text, model_answer)
    rouge_scores.append(scores)

# NOTE(review): this reuses the 'rouge_score' column name, so any scores
# stored there earlier are replaced by these context-based ones.
data_sample['rouge_score'] = rouge_scores


In [None]:
def _average_fmeasure(variant):
    """Return the mean F-measure of one ROUGE variant across `rouge_scores`."""
    return sum(score[variant].fmeasure for score in rouge_scores) / len(rouge_scores)


# Average scores for the most recently computed `rouge_scores`
avg_rouge1 = _average_fmeasure('rouge1')
avg_rouge2 = _average_fmeasure('rouge2')
avg_rougeL = _average_fmeasure('rougeL')

print(f'Average ROUGE-1 F1 Score: {avg_rouge1}')
print(f'Average ROUGE-2 F1 Score: {avg_rouge2}')
print(f'Average ROUGE-L F1 Score: {avg_rougeL}')

In [None]:
# Print each sampled example with the model's response next to the true answers
for position, example in data_sample.iterrows():
    report_lines = [
        f"Question: {position+1}:",
        f"Question: {example['question']}",
        f"Context: {example['context']}",
        f"Model Response: {example['model_response']}",
        f"True Answers: {example['answers']}",
    ]
    print("\n".join(report_lines))

