In [1]:
# Install required libraries
!pip install transformers datasets evaluate spacy rouge_score sacrebleu
!pip install datasets
!python -m spacy download en_core_web_sm
!pip install evaluate

Collecting datasets
  Downloading datasets-3.3.2-py3-none-any.whl.metadata (19 kB)
Collecting evaluate
  Downloading evaluate-0.4.3-py3-none-any.whl.metadata (9.2 kB)
Collecting rouge_score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting sacrebleu
  Downloading sacrebleu-2.5.1-py3-none-any.whl.metadata (51 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m51.8/51.8 kB[0m [31m4.0 MB/s[0m eta [36m0:00:00[0m
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Collecting portalocker (from sacrebleu)
  Downloading portalocker-3.1.1-py3-none-any.whl.metadata (8.6 kB)
Collecting colorama (from sacrebleu)

In [2]:
from google.colab import drive
from transformers import AutoTokenizer, AutoModelForQuestionAnswering, pipeline
from evaluate import load
import spacy
from datasets import load_dataset

In [3]:
# Mount Google Drive
drive.mount("/content/drive")

# Paths to datasets
train_file_path = '/content/drive/MyDrive/Projet_LLM_UPVD/train.csv'
validation_file_path = '/content/drive/MyDrive/Projet_LLM_UPVD/validation.csv'
model_save_path = '/content/drive/MyDrive/fine_tuned_model'

# Load SpaCy model for preprocessing
nlp = spacy.load("en_core_web_sm")

def preprocess_text(text):
    doc = nlp(text)
    return " ".join([token.lemma_ for token in doc if not token.is_stop and not token.is_punct])

# Load and preprocess datasets
def load_and_preprocess_dataset(train_path, val_path):
    train_dataset = load_dataset('csv', data_files={'train': train_path})['train']
    val_dataset = load_dataset('csv', data_files={'validation': val_path})['validation']
    train_dataset = train_dataset.map(lambda x: {"context_chunks": preprocess_text(x["context_chunks"])})
    val_dataset = val_dataset.map(lambda x: {"context_chunks": preprocess_text(x["context_chunks"])})
    return train_dataset, val_dataset

train_dataset, val_dataset = load_and_preprocess_dataset(train_file_path, validation_file_path)


Mounted at /content/drive


Generating train split: 0 examples [00:00, ? examples/s]

Generating validation split: 0 examples [00:00, ? examples/s]

Map:   0%|          | 0/1348 [00:00<?, ? examples/s]

Map:   0%|          | 0/158 [00:00<?, ? examples/s]

In [4]:
# Load the lightweight model for Q&A
model_name = "distilbert-base-uncased-distilled-squad"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForQuestionAnswering.from_pretrained(model_name)

# Create QA pipeline
qa_pipeline = pipeline("question-answering", model=model, tokenizer=tokenizer)

# Generate baseline output for text continuation
input_text = "COVID-19, caused by the SARS-CoV-2 virus, emerged in late 2019 in Wuhan, China, and quickly became a global pandemic. The virus is primarily transmitted through respiratory droplets..."
print("\nBaseline Text Continuation:")
generated_output = qa_pipeline(question="What is COVID-19?", context=input_text)["answer"]
print(f"Generated Text: {generated_output}")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/451 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/265M [00:00<?, ?B/s]

Device set to use cuda:0



Baseline Text Continuation:
Generated Text: caused by the SARS-CoV-2 virus


In [5]:

# Define questions and context for evaluation
questions = [
    "What is the main cause of HIV-1 infection in children?",
    "What is the role of C-C Motif Chemokine Ligand 3 Like 1 (CCL3L1) in mother to child transmission of HIV-1?",
    "What is DC-GENR and where is it expressed?"
]
context = """
HIV-1 infection in children is primarily caused by mother-to-child transmission. CCL3L1 competes for CCR5 binding,
reducing the risk of HIV-1 infection. DC-GENR is expressed on dendritic cells and plays a key role in HIV recognition.
"""

In [6]:

# Generate answers with the baseline model
def generate_baseline_answers(questions, context):
    results = {}
    for question in questions:
        answer = qa_pipeline(question=question, context=context)["answer"]
        results[question] = answer
    return results


In [7]:
# Few-Shot Evaluation for the specific questions
def few_shot_evaluation(questions, context, n_shots):
    results = {}
    for i in range(n_shots):
        question = questions[i]
        answer = qa_pipeline(question=question, context=context)["answer"]
        results[question] = answer
    return results



In [8]:
# Generate and display baseline answers
print("\nBaseline Model Answers:")
baseline_answers = generate_baseline_answers(questions, context)
for q, ans in baseline_answers.items():
    print(f"Q: {q}\nA: {ans}\n")



Baseline Model Answers:
Q: What is the main cause of HIV-1 infection in children?
A: mother-to-child transmission

Q: What is the role of C-C Motif Chemokine Ligand 3 Like 1 (CCL3L1) in mother to child transmission of HIV-1?
A: plays a key role in HIV recognition

Q: What is DC-GENR and where is it expressed?
A: on dendritic cells



In [9]:

# Few-Shot Evaluation
def few_shot_evaluation(dataset, n_shots):
    results = {}
    for i in range(n_shots):
        question = dataset[i]["question"]
        context = dataset[i]["context_chunks"]
        answer = qa_pipeline(question=question, context=context)["answer"]
        results[question] = answer
    return results

print("\nFew-Shot Answers:")
for n_shots in range(1, 6):  # From 1 to 5 shots
    few_shot_results = few_shot_evaluation(train_dataset, n_shots)
    print(f"\nFew-Shot Evaluation ({n_shots} shots):")
    for q, ans in few_shot_results.items():
        print(f"Q: {q}\nAnswer: {ans}\n")



Few-Shot Answers:

Few-Shot Evaluation (1 shots):
Q: What is the main cause of HIV-1 infection in children?
Answer: background mother child transmission MTCT


Few-Shot Evaluation (2 shots):
Q: What is the main cause of HIV-1 infection in children?
Answer: background mother child transmission MTCT

Q: What plays the crucial role in the Mother to Child Transmission of HIV-1 and what increases the risk
Answer: DC SIGNR



You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset



Few-Shot Evaluation (3 shots):
Q: What is the main cause of HIV-1 infection in children?
Answer: background mother child transmission MTCT

Q: What plays the crucial role in the Mother to Child Transmission of HIV-1 and what increases the risk
Answer: DC SIGNR

Q: How many children were infected by HIV-1 in 2008-2009, worldwide?
Answer: 90


Few-Shot Evaluation (4 shots):
Q: What is the main cause of HIV-1 infection in children?
Answer: background mother child transmission MTCT

Q: What plays the crucial role in the Mother to Child Transmission of HIV-1 and what increases the risk
Answer: DC SIGNR

Q: How many children were infected by HIV-1 in 2008-2009, worldwide?
Answer: 90

Q: What is the role of C-C Motif Chemokine Ligand 3 Like 1 (CCL3L1) in mother to child transmission of HIV-1?
Answer: quantitative


Few-Shot Evaluation (5 shots):
Q: What is the main cause of HIV-1 infection in children?
Answer: background mother child transmission MTCT

Q: What plays the crucial role in the M

In [10]:
import torch
from torch.utils.data import DataLoader

# Preprocess function for tokenization
def preprocess_function(examples):
    tokenized = tokenizer(
        examples["question"], examples["context_chunks"],
        truncation=True, padding="max_length", max_length=512
    )
    start_positions = []
    end_positions = []

    for i in range(len(examples["context_chunks"])):
        # Ensure `answer` key exists and fallback to empty string
        answer = examples.get("answer", [""])[i]
        start_idx = examples["context_chunks"][i].find(answer)

        if start_idx == -1:  # If answer is not found in the context
            start_positions.append(0)
            end_positions.append(0)
            continue

        end_idx = start_idx + len(answer)
        tokenized_start = tokenizer(
            examples["context_chunks"][i], truncation=True, max_length=512
        ).char_to_token(start_idx)
        tokenized_end = tokenizer(
            examples["context_chunks"][i], truncation=True, max_length=512
        ).char_to_token(end_idx - 1)

        # Handle cases where char_to_token returns None
        start_positions.append(tokenized_start if tokenized_start is not None else 0)
        end_positions.append(tokenized_end if tokenized_end is not None else 0)

    tokenized["start_positions"] = start_positions
    tokenized["end_positions"] = end_positions
    return tokenized

# Tokenize datasets
train_dataset = train_dataset.map(preprocess_function, batched=True)
val_dataset = val_dataset.map(preprocess_function, batched=True)

# Create DataLoader
train_loader = DataLoader(train_dataset, batch_size=8, shuffle=True)

# Optimizer
optimizer = torch.optim.AdamW(model.parameters(), lr=3e-5)

# Fine-Tuning Loop
model.train()
for epoch in range(2):  # Two epochs
    for batch in train_loader:
        inputs = tokenizer(
            batch["question"], batch["context_chunks"],
            truncation=True, padding="max_length", return_tensors="pt"
        ).to(model.device)

        # Validate batch size consistency
        if len(batch["start_positions"]) != inputs["input_ids"].shape[0]:
            print(f"Skipping batch due to mismatched label sizes. Inputs: {inputs['input_ids'].shape[0]}, Labels: {len(batch['start_positions'])}")
            continue

        labels = {
            "start_positions": torch.tensor(batch["start_positions"]).to(model.device),
            "end_positions": torch.tensor(batch["end_positions"]).to(model.device),
        }

        # Forward pass
        outputs = model(**inputs, start_positions=labels["start_positions"], end_positions=labels["end_positions"])
        loss = outputs.loss

        # Backpropagation
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()

# Save the Fine-Tuned Model
model.save_pretrained(model_save_path)
tokenizer.save_pretrained(model_save_path)
print(f"Fine-tuned model saved to {model_save_path}")


Map:   0%|          | 0/1348 [00:00<?, ? examples/s]

Map:   0%|          | 0/158 [00:00<?, ? examples/s]

  "start_positions": torch.tensor(batch["start_positions"]).to(model.device),
  "end_positions": torch.tensor(batch["end_positions"]).to(model.device),


Fine-tuned model saved to /content/drive/MyDrive/fine_tuned_model


In [11]:
# Reload the fine-tuned model
model = AutoModelForQuestionAnswering.from_pretrained(model_save_path).to("cuda" if torch.cuda.is_available() else "cpu")
tokenizer = AutoTokenizer.from_pretrained(model_save_path)
qa_pipeline = pipeline("question-answering", model=model, tokenizer=tokenizer)

# Define questions and context for QA
questions = [
    "What is the main cause of HIV-1 infection in children?",
    "What is the role of C-C Motif Chemokine Ligand 3 Like 1 (CCL3L1) in mother to child transmission of HIV-1?",
    "What is DC-GENR and where is it expressed?"
]
context = """
HIV-1 infection in children is primarily caused by mother-to-child transmission. CCL3L1 competes for CCR5 binding,
reducing the risk of HIV-1 infection. DC-GENR is expressed on dendritic cells and plays a key role in HIV recognition.
"""

# Generate answers
def generate_answers(questions, context):
    return {q: qa_pipeline(question=q, context=context)["answer"] for q in questions}

answers = generate_answers(questions, context)

# Display answers
print("\nFine-Tuned Model Answers:")
for q, a in answers.items():
    print(f"Q: {q}\nA: {a}\n")


Device set to use cuda:0



Fine-Tuned Model Answers:
Q: What is the main cause of HIV-1 infection in children?
A: mother-to-child transmission

Q: What is the role of C-C Motif Chemokine Ligand 3 Like 1 (CCL3L1) in mother to child transmission of HIV-1?
A: HIV recognition.

Q: What is DC-GENR and where is it expressed?
A: dendritic cells



In [12]:
!pip install evaluate



In [13]:
!pip install bert_score

Collecting bert_score
  Downloading bert_score-0.3.13-py3-none-any.whl.metadata (15 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch>=1.0.0->bert_score)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch>=1.0.0->bert_score)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch>=1.0.0->bert_score)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch>=1.0.0->bert_score)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch>=1.0.0->bert_score)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.2.1.3 (from torch>=1.0.

In [14]:
import pandas as pd
import torch
from transformers import AutoModelForQuestionAnswering, AutoTokenizer, pipeline
from evaluate import load
from sentence_transformers import SentenceTransformer, util

# Mount Google Drive
from google.colab import drive
drive.mount("/content/drive")

# Define paths and models
validation_csv_path = '/content/drive/MyDrive/Projet_LLM_UPVD/validation.csv'  # Update with your dataset path
baseline_model_name = "distilbert-base-uncased-distilled-squad"  # Baseline model
fine_tuned_model_path = '/content/drive/MyDrive/fine_tuned_model'  # Fine-tuned model
semantic_model = SentenceTransformer("all-MiniLM-L6-v2")

# Load data
df = pd.read_csv(validation_csv_path)
questions = df['question'].tolist()
contexts = [" ".join(eval(context)) for context in df['context_chunks']]  # Convert context_chunks into a single string
references = df['answer'].tolist()

# Validate loaded data
print("Questions:", questions[:5])
print("Contexts:", contexts[:5])
print("References:", references[:5])

# Initialize device and models
device = "cuda" if torch.cuda.is_available() else "cpu"
baseline_model = AutoModelForQuestionAnswering.from_pretrained(baseline_model_name).to(device)
fine_tuned_model = AutoModelForQuestionAnswering.from_pretrained(fine_tuned_model_path).to(device)
tokenizer = AutoTokenizer.from_pretrained(baseline_model_name)

# Set up QA pipelines
baseline_pipeline = pipeline("question-answering", model=baseline_model, tokenizer=tokenizer, device=0 if device == "cuda" else -1)
fine_tuned_pipeline = pipeline("question-answering", model=fine_tuned_model, tokenizer=tokenizer, device=0 if device == "cuda" else -1)

# Generate predictions
baseline_predictions = [baseline_pipeline(question=q, context=c)['answer'] for q, c in zip(questions, contexts)]
fine_tuned_predictions = [fine_tuned_pipeline(question=q, context=c)['answer'] for q, c in zip(questions, contexts)]

# Load evaluation metric
bertscore_metric = load("bertscore")

# Format references properly
formatted_references = [" ".join(ref.split()) for ref in references]

# Compute BERTScore
bertscore_baseline = bertscore_metric.compute(predictions=baseline_predictions, references=formatted_references, lang="en")
bertscore_fine_tuned = bertscore_metric.compute(predictions=fine_tuned_predictions, references=formatted_references, lang="en")

# Ensure BERTScore does not exceed 1
bertscore_baseline["f1"] = [min(score, 1.0) for score in bertscore_baseline["f1"]]
bertscore_fine_tuned["f1"] = [min(score, 1.0) for score in bertscore_fine_tuned["f1"]]

# Display metrics comparison
print("\nMetrics Comparison:")
print(f"BERTScore (Baseline): {bertscore_baseline['f1']}")
print(f"BERTScore (Fine-Tuned): {bertscore_fine_tuned['f1']}")

import numpy as np

# Compute the average BERTScore
avg_bertscore_baseline = np.mean(bertscore_baseline["f1"])
avg_bertscore_fine_tuned = np.mean(bertscore_fine_tuned["f1"])

# Display the average scores
print(f"\nBERTScore (Baseline - Average): {avg_bertscore_baseline:.4f}")
print(f"BERTScore (Fine-Tuned - Average): {avg_bertscore_fine_tuned:.4f}")



Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.7k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

1_Pooling%2Fconfig.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Questions: ['What enzymes have been reported to be linked with severity of infection and various pathological conditions caused by microorganisms?', 'At what temperatures was the assay completed?', 'What criteria sets the guideline for drug-like properties?', 'What could be novel candidates as potent inhibitors of papain like cysteine proteases in resistant microorganisms?', 'How long is the SAIBK gene?']
Contexts: ["Targeting papain family cysteine proteases is one of the novel strategies in the development of chemotherapy for a number of diseases. Novel cysteine protease inhibitors derived from 1-pyridylimidazo 1,5-a pyridine representing pharmacologically important class of compounds are being reported here for the first time. The derivatives were initially designed and screened in silico by molecular docking studies against papain to explore the possible mode of action. The molecular interaction between the compounds and cysteine protease papain was found to be very similar to the 

Device set to use cuda:0
Device set to use cuda:0


Downloading builder script:   0%|          | 0.00/7.95k [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/482 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.42G [00:00<?, ?B/s]

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



Metrics Comparison:
BERTScore (Baseline): [0.9999999403953552, 0.8609034419059753, 0.7759363055229187, 0.7720378637313843, 0.8374530673027039, 1.0, 0.8094877600669861, 0.8619675040245056, 0.8474422693252563, 0.9060952067375183, 0.9766698479652405, 1.0, 0.8277749419212341, 0.7695192098617554, 0.7930305600166321, 0.9431020021438599, 0.9192315936088562, 0.8202833533287048, 0.8642914891242981, 0.8948193788528442, 0.8735920190811157, 0.8306127190589905, 1.0, 0.8240856528282166, 0.946638822555542, 0.8072283864021301, 0.8358676433563232, 0.8117049932479858, 0.9023283123970032, 0.8346841931343079, 1.0, 0.83904629945755, 0.8494483828544617, 0.8696756958961487, 0.7882095575332642, 0.8628208041191101, 0.8659210205078125, 0.952071487903595, 0.8127949833869934, 0.8192077279090881, 0.8667006492614746, 0.8102971315383911, 0.766778826713562, 0.8596206903457642, 0.8084956407546997, 0.7722041010856628, 0.8362985849380493, 0.8099246025085449, 0.9728811979293823, 0.8095301985740662, 0.8243696689605713, 0