In [None]:
# 1. INSTALL REQUIRED LIBRARIES
!pip install transformers accelerate bert-score pandas

# 2. IMPORT LIBRARIES
import torch
import pandas as pd
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
from bert_score import score

# 3. LOAD DATASET (dataset.csv)
dataset_path = "dataset.csv"  # Make sure you've uploaded dataset.csv to Colab
df = pd.read_csv(dataset_path)

# Fail early, with a readable message, if the CSV lacks the two columns used below.
required_columns = ["Question", "Answer"]
missing_columns = [col for col in required_columns if col not in df.columns]
if missing_columns:
    raise KeyError(f"{dataset_path} is missing required column(s): {missing_columns}")

# pandas reads empty cells as NaN (a float), which is not usable as prompt or
# reference text, so rows lacking either field are dropped (and reported).
qa_df = df.dropna(subset=required_columns)
if len(qa_df) < len(df):
    print(f"Dropped {len(df) - len(qa_df)} row(s) with a missing Question or Answer.")

# Parallel lists: questions[i] is answered by ground_truth_answers[i].
questions = qa_df["Question"].astype(str).tolist()
ground_truth_answers = qa_df["Answer"].astype(str).tolist()

# 4. CHECK FOR GPU
# Device index handed to the pipelines: 0 when CUDA is available, otherwise -1.
device = 0 if torch.cuda.is_available() else -1



In [None]:
# 5. LOAD MODEL 1: GPT-Neo-125M
#    - EleutherAI/gpt-neo-125M (~125M params, MIT License, no token needed)
print("Loading GPT-Neo-125M...")
model_1_name = "EleutherAI/gpt-neo-125M"
gpt_neo_125m_tokenizer = AutoTokenizer.from_pretrained(model_1_name)

# Load the model without `device_map`: placement is done by the pipeline through
# its `device` argument. Loading with an accelerate `device_map` AND passing
# `device` to the pipeline are mutually exclusive in transformers (the pipeline
# can raise a ValueError asking to discard `device`), which would only show up
# when a GPU is present.
gpt_neo_125m_model = AutoModelForCausalLM.from_pretrained(model_1_name)

gpt_neo_125m_pipeline = pipeline(
    "text-generation",
    model=gpt_neo_125m_model,
    tokenizer=gpt_neo_125m_tokenizer,
    device=device,
    max_length=128,
    temperature=0.7,
    do_sample=True,
    # Generation falls back to the EOS token for padding anyway and logs a notice
    # on every call; passing it explicitly keeps the behavior and drops the noise.
    pad_token_id=gpt_neo_125m_tokenizer.eos_token_id,
)

Loading GPT-Neo-125M...


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/727 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.11M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/357 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.01k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/526M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/119 [00:00<?, ?B/s]

Device set to use cpu


In [None]:

# 6. LOAD MODEL 2: BLOOM-560m
#    - bigscience/bloom-560m (~560M params, open license, no token needed)
print("Loading BLOOM-560m...")
model_2_name = "bigscience/bloom-560m"
bloom_560m_tokenizer = AutoTokenizer.from_pretrained(model_2_name)

# Load the model without `device_map`: placement is done by the pipeline through
# its `device` argument. Loading with an accelerate `device_map` AND passing
# `device` to the pipeline are mutually exclusive in transformers (the pipeline
# can raise a ValueError asking to discard `device`), which would only show up
# when a GPU is present.
bloom_560m_model = AutoModelForCausalLM.from_pretrained(model_2_name)

bloom_560m_pipeline = pipeline(
    "text-generation",
    model=bloom_560m_model,
    tokenizer=bloom_560m_tokenizer,
    device=device,
    max_length=128,
    temperature=0.7,
    do_sample=True,
)

Loading BLOOM-560m...


tokenizer_config.json:   0%|          | 0.00/222 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/14.5M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/85.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/693 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.12G [00:00<?, ?B/s]

Device set to use cpu


In [None]:

# 7. HELPER FUNCTION TO GENERATE ANSWERS
def generate_answer(pipe, question):
    """
    Ask a text-generation pipeline one question using a minimal Q&A prompt.

    The models used here are not instruction-tuned, so the question is framed as
    a "Question: ... / Answer:" prompt and the model's continuation is taken as
    the answer.

    Args:
        pipe: Callable such that pipe(prompt) returns a list whose first item is
            a dict with a "generated_text" entry.
        question: Question text to insert into the prompt.

    Returns:
        The generated text with the prompt removed and surrounding whitespace
        stripped. If the output neither starts with the prompt nor contains
        "Answer:", it is returned unchanged.
    """
    prompt = f"Question: {question}\nAnswer:"
    response = pipe(prompt)[0]["generated_text"]

    # Usual case: the output echoes the prompt. Cut off exactly the prompt so a
    # question that itself contains "Answer:" cannot shift the split point.
    if response.startswith(prompt):
        return response[len(prompt):].strip()

    # Fallback for outputs that do not start with the exact prompt:
    # keep only the text after the first "Answer:" marker, if there is one.
    if "Answer:" in response:
        response = response.split("Answer:", 1)[-1].strip()
    return response

# 8. GENERATE ANSWERS FROM BOTH MODELS
# Both pipelines sample (do_sample=True), so the answers -- and the scores
# computed from them -- change from run to run unless the RNG is seeded.
# Re-seeding before each model keeps a model's answers independent of whether
# the other model was run first.
GENERATION_SEED = 42

print("\nGenerating answers with GPT-Neo-125M...")
torch.manual_seed(GENERATION_SEED)
gpt_neo_125m_answers = [generate_answer(gpt_neo_125m_pipeline, q) for q in questions]

print("\nGenerating answers with BLOOM-560m...")
torch.manual_seed(GENERATION_SEED)
bloom_560m_answers = [generate_answer(bloom_560m_pipeline, q) for q in questions]


Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.



Generating answers with GPT-Neo-125M...


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strat


Generating answers with BLOOM-560m...


In [None]:
#gpt_neo_125m_answers
#bloom_560m_answers

In [None]:

# 9. EVALUATE WITH BERTScore
print("\nEvaluating answers using BERTScore...")

# Score both models' answers in ONE call: every score() call builds its own
# scoring model, so calling it once per model loads that model twice.
# Each candidate is scored only against the reference at the same position,
# so concatenating the two runs does not change the per-answer scores.
split_at = len(gpt_neo_125m_answers)
all_precision, all_recall, all_f1 = score(
    gpt_neo_125m_answers + bloom_560m_answers,
    ground_truth_answers + ground_truth_answers,
    lang="en",
)

# Keep the original (precision, recall, f1) layout of the per-model results.
scores_gpt_neo_125m = (all_precision[:split_at], all_recall[:split_at], all_f1[:split_at])
scores_bloom_560m = (all_precision[split_at:], all_recall[split_at:], all_f1[split_at:])

# Report the mean of each metric over all questions, per model.
for model_label, (precision, recall, f1) in (
    ("GPT-Neo-125M", scores_gpt_neo_125m),
    ("BLOOM-560m", scores_bloom_560m),
):
    print(f"\n{model_label} BERTScore:")
    print(f"  Precision: {precision.mean():.4f}")
    print(f"  Recall:    {recall.mean():.4f}")
    print(f"  F1:        {f1.mean():.4f}")


Evaluating answers using BERTScore...


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/482 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.42G [00:00<?, ?B/s]

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



GPT-Neo-125M BERTScore:
  Precision: 0.8214
  Recall:    0.8454
  F1:        0.8332

BLOOM-560m BERTScore:
  Precision: 0.8284
  Recall:    0.8560
  F1:        0.8419


In [None]:
import pandas as pd
from IPython.display import display, HTML

# Create a DataFrame with all necessary columns:
# one row per question, with the reference answer next to each model's answer.
comparison_df = pd.DataFrame({
    "Question": questions,
    "Ground Truth": ground_truth_answers,
    "GPT-Neo-125M Answer": gpt_neo_125m_answers,
    "BLOOM-560m Answer": bloom_560m_answers
})

print("\nSide-by-Side Comparison of Generated Answers (DataFrame):")
# Lift pandas' column-width limit for this display only; with the default limit
# every long answer is cut off with "...", which defeats a side-by-side reading.
with pd.option_context("display.max_colwidth", None):
    display(comparison_df)  # Renders a table in Jupyter/Colab


Side-by-Side Comparison of Generated Answers (DataFrame):


Unnamed: 0,Question,Ground Truth,GPT-Neo-125M Answer,BLOOM-560m Answer
0,What are the primary benefits of solar energy?,"Solar energy is renewable, reduces electricity...",The primary benefit is that you get a lot of s...,"For the majority of homeowners, a solar system..."
1,How does regular exercise impact mental health?,"Regular exercise releases endorphins, reducing...",Regular-level physical activity (PA) is one of...,Regular exercise improves blood glucose levels...
2,What are the challenges of urbanization?,"Urbanization leads to overcrowding, increased ...",It is true that urbanization will result in a ...,The urbanization of our country has been made ...
3,Can you explain the process of photosynthesis?,"Photosynthesis converts light energy, water, a...",The photosynthetic process is the process of c...,Photosynthesis is an integral part of life. In...
4,What are the effects of climate change on pola...,Climate change in polar regions leads to melti...,Climate change can have a significant impact o...,"First, it follows that the regional climate ch..."
5,How does digital marketing differ from traditi...,Digital marketing utilizes online platforms an...,The main difference between print and digital ...,Digital marketing is a form of marketing that ...
6,What role do proteins play in the human body?,Proteins are crucial for building and repairin...,The human body has a lot of proteins that play...,We have seen that proteins play important role...
7,What steps can individuals take to reduce thei...,Reducing carbon footprints involves adopting r...,"The more you consume, the less you're contribu...",We need to make all our decisions in the same ...
8,How is artificial intelligence transforming th...,AI in healthcare enhances diagnostic precision...,Artificial intelligence is making healthcare m...,Artificial intelligence creates more data than...
9,What are the effects of deforestation on the e...,"Deforestation leads to biodiversity loss, disr...",The effect does not depend on the actual fores...,"The deforestation can cause many problems, suc..."
