In [None]:
# Install required libraries
!pip install transformers datasets evaluate spacy rouge_score sacrebleu
!pip install datasets
!python -m spacy download en_core_web_sm
!pip install evaluate

In [2]:
from google.colab import drive
from transformers import AutoTokenizer, AutoModelForQuestionAnswering, pipeline
from evaluate import load
import spacy
from datasets import load_dataset

In [None]:
# Mount Google Drive
# The dataset and model paths below all live under this mount point.
drive.mount("/content/drive")

# Paths to datasets
# The two CSV files are read by load_and_preprocess_dataset further down.
train_file_path = '/content/drive/MyDrive/Projet_LLM_UPVD/train.csv'
validation_file_path = '/content/drive/MyDrive/Projet_LLM_UPVD/validation.csv'
# Directory the fine-tuning cell saves the model and tokenizer into.
model_save_path = '/content/drive/MyDrive/fine_tuned_model'

# Load SpaCy model for preprocessing
# `nlp` is the module-level pipeline that preprocess_text calls.
nlp = spacy.load("en_core_web_sm")

def preprocess_text(text):
    """Lemmatize ``text`` and drop stop words and punctuation.

    Args:
        text: Raw string to clean. ``None``, other non-string values and the
            empty string yield ``""`` without calling spaCy.

    Returns:
        The lemmas of the kept tokens joined by single spaces.
    """
    # Guard against missing values so one bad row does not abort Dataset.map.
    if not isinstance(text, str) or not text:
        return ""
    doc = nlp(text)
    return " ".join(
        token.lemma_ for token in doc if not (token.is_stop or token.is_punct)
    )

# Read both CSV splits and clean their context column
def load_and_preprocess_dataset(train_path, val_path):
    """Load the train and validation CSVs and clean ``context_chunks``.

    Each file is loaded with the ``csv`` dataset builder, then every row's
    ``context_chunks`` value is replaced by ``preprocess_text`` of that value.

    Returns:
        Tuple ``(train_dataset, val_dataset)``.
    """
    def clean_context(row):
        # Only this column is rewritten; Dataset.map keeps the other columns.
        return {"context_chunks": preprocess_text(row["context_chunks"])}

    train_split = load_dataset('csv', data_files={'train': train_path})['train']
    val_split = load_dataset('csv', data_files={'validation': val_path})['validation']
    cleaned_train = train_split.map(clean_context)
    cleaned_val = val_split.map(clean_context)
    return cleaned_train, cleaned_val

train_dataset, val_dataset = load_and_preprocess_dataset(train_file_path, validation_file_path)


In [None]:
# Baseline checkpoint used for question answering before any fine-tuning
model_name = "distilbert-base-uncased-distilled-squad"
model = AutoModelForQuestionAnswering.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Wrap model and tokenizer in a question-answering pipeline
qa_pipeline = pipeline(task="question-answering", model=model, tokenizer=tokenizer)

# Quick sanity check: ask one question about a short passage.
# The pipeline returns a dict; only its "answer" field is printed.
input_text = "COVID-19, caused by the SARS-CoV-2 virus, emerged in late 2019 in Wuhan, China, and quickly became a global pandemic. The virus is primarily transmitted through respiratory droplets..."
print("\nBaseline Text Continuation:")
baseline_result = qa_pipeline(question="What is COVID-19?", context=input_text)
generated_output = baseline_result["answer"]
print(f"Generated Text: {generated_output}")

In [5]:

# Define questions and context for evaluation
# `questions` is the list of sample questions passed to the answer-generation
# helpers below; every question is answered against the same `context`.
questions = [
    "What is the main cause of HIV-1 infection in children?",
    "What is the role of C-C Motif Chemokine Ligand 3 Like 1 (CCL3L1) in mother to child transmission of HIV-1?",
    "What is DC-GENR and where is it expressed?"
]
# Single passage handed to the QA pipeline as the context for each question.
context = """
HIV-1 infection in children is primarily caused by mother-to-child transmission. CCL3L1 competes for CCR5 binding,
reducing the risk of HIV-1 infection. DC-GENR is expressed on dendritic cells and plays a key role in HIV recognition.
"""

In [6]:

# Run the QA pipeline over a list of questions
def generate_baseline_answers(questions, context):
    """Answer every question against the same ``context`` with ``qa_pipeline``.

    Returns:
        Dict mapping each question to the ``"answer"`` field of the pipeline
        output. A question that appears more than once keeps its first
        position and its last answer.
    """
    return {
        query: qa_pipeline(question=query, context=context)["answer"]
        for query in questions
    }


In [7]:
# Few-Shot Evaluation for the specific questions
def few_shot_evaluation(questions, context, n_shots):
    """Answer the first ``n_shots`` questions against ``context``.

    NOTE: a later cell defines another ``few_shot_evaluation`` that takes
    ``(dataset, n_shots)``; once that cell has run, it replaces this function.

    Args:
        questions: Sequence of question strings.
        context: Passage passed to ``qa_pipeline`` for every question.
        n_shots: Number of leading questions to evaluate. A value larger than
            ``len(questions)`` evaluates all of them; a value <= 0 evaluates none.

    Returns:
        Dict mapping each evaluated question to the pipeline's ``"answer"``.
    """
    # Slicing clamps to the available questions instead of raising IndexError;
    # max(..., 0) keeps a negative count meaning "none", as range() did.
    selected = questions[:max(n_shots, 0)]
    return {
        question: qa_pipeline(question=question, context=context)["answer"]
        for question in selected
    }



In [None]:
# Run the baseline pipeline on the sample questions and print each Q/A pair
print("\nBaseline Model Answers:")
baseline_answers = generate_baseline_answers(questions, context)
for question_text in baseline_answers:
    answer_text = baseline_answers[question_text]
    print(f"Q: {question_text}\nA: {answer_text}\n")


In [None]:

# Few-Shot Evaluation
def few_shot_evaluation(dataset, n_shots):
    """Answer the first ``n_shots`` rows of ``dataset`` with ``qa_pipeline``.

    Args:
        dataset: Integer-indexable collection supporting ``len()`` whose rows
            expose ``"question"`` and ``"context_chunks"``.
        n_shots: Number of leading rows to evaluate. A value larger than
            ``len(dataset)`` evaluates every row; a value <= 0 evaluates none.

    Returns:
        Dict mapping each evaluated question to the pipeline's ``"answer"``.
        Rows sharing the same question text overwrite one another.
    """
    results = {}
    # Clamp so that asking for more rows than exist does not raise IndexError.
    for i in range(min(n_shots, len(dataset))):
        row = dataset[i]
        prediction = qa_pipeline(question=row["question"], context=row["context_chunks"])
        results[row["question"]] = prediction["answer"]
    return results

print("\nFew-Shot Answers:")
# Repeat the evaluation with the first 1, 2, ... 5 training rows
for n_shots in (1, 2, 3, 4, 5):
    few_shot_results = few_shot_evaluation(train_dataset, n_shots)
    print(f"\nFew-Shot Evaluation ({n_shots} shots):")
    for q in few_shot_results:
        ans = few_shot_results[q]
        print(f"Q: {q}\nAnswer: {ans}\n")


In [None]:
import torch
from torch.utils.data import DataLoader

# Preprocess function for tokenization
def preprocess_function(examples):
    """Tokenize question/context pairs and attach answer-span token labels.

    Args:
        examples: Batch mapping with ``"question"`` and ``"context_chunks"``
            lists and, optionally, an ``"answer"`` list of the same length.

    Returns:
        The tokenizer output for the batch with ``"start_positions"`` and
        ``"end_positions"`` added (one token index per row). A row is labelled
        ``(0, 0)`` when it has no usable answer, when the answer text does not
        occur in the context, or when the span cannot be mapped onto tokens of
        the (possibly truncated) encoding.
    """
    contexts = examples["context_chunks"]
    tokenized = tokenizer(
        examples["question"], contexts,
        truncation=True, padding="max_length", max_length=512
    )

    # A missing "answer" column is treated as "no answer" for every row.
    answers = examples.get("answer")
    if answers is None:
        answers = [None] * len(contexts)

    start_positions = []
    end_positions = []
    for i, context in enumerate(contexts):
        answer = answers[i]
        token_start = token_end = None

        if isinstance(answer, str) and answer:
            char_start = context.find(answer)
            if char_start != -1:
                char_end = char_start + len(answer) - 1  # last character, inclusive
                # Look the span up in the question+context encoding built above
                # (sequence_index=1 selects the context half of the pair), so the
                # labels index into the same input_ids the model is trained on.
                token_start = tokenized.char_to_token(i, char_start, sequence_index=1)
                token_end = tokenized.char_to_token(i, char_end, sequence_index=1)

        # Keep only complete spans; a half-mapped span would be an invalid label.
        if token_start is None or token_end is None:
            token_start = token_end = 0
        start_positions.append(token_start)
        end_positions.append(token_end)

    tokenized["start_positions"] = start_positions
    tokenized["end_positions"] = end_positions
    return tokenized

# Tokenize datasets
train_dataset = train_dataset.map(preprocess_function, batched=True)
val_dataset = val_dataset.map(preprocess_function, batched=True)

# Create DataLoader
train_loader = DataLoader(train_dataset, batch_size=8, shuffle=True)

# Train on the GPU when one is available; otherwise the model stays on the CPU.
train_device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(train_device)

# Optimizer
optimizer = torch.optim.AdamW(model.parameters(), lr=3e-5)

# Fine-Tuning Loop
model.train()
for epoch in range(2):  # Two epochs
    epoch_loss = 0.0
    step_count = 0
    for batch in train_loader:
        # Re-tokenize the raw text with the same settings as preprocess_function
        # (explicit max_length=512) so the stored span labels line up with these inputs.
        inputs = tokenizer(
            batch["question"], batch["context_chunks"],
            truncation=True, padding="max_length", max_length=512, return_tensors="pt"
        ).to(train_device)

        # Validate batch size consistency
        if len(batch["start_positions"]) != inputs["input_ids"].shape[0]:
            print(f"Skipping batch due to mismatched label sizes. Inputs: {inputs['input_ids'].shape[0]}, Labels: {len(batch['start_positions'])}")
            continue

        # as_tensor accepts both plain lists and already-collated tensors
        # without the copy-construct warning torch.tensor() raises for tensors.
        labels = {
            "start_positions": torch.as_tensor(batch["start_positions"]).to(train_device),
            "end_positions": torch.as_tensor(batch["end_positions"]).to(train_device),
        }

        # Forward pass
        outputs = model(**inputs, start_positions=labels["start_positions"], end_positions=labels["end_positions"])
        loss = outputs.loss

        # Backpropagation
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()

        epoch_loss += loss.item()
        step_count += 1

    # Report the mean training loss so progress is visible per epoch
    if step_count:
        print(f"Epoch {epoch + 1}: mean training loss = {epoch_loss / step_count:.4f}")

# Save the Fine-Tuned Model
model.save_pretrained(model_save_path)
tokenizer.save_pretrained(model_save_path)
print(f"Fine-tuned model saved to {model_save_path}")


In [None]:
# Reload the fine-tuned model
device = "cuda" if torch.cuda.is_available() else "cpu"
model = AutoModelForQuestionAnswering.from_pretrained(model_save_path).to(device)
tokenizer = AutoTokenizer.from_pretrained(model_save_path)
# Pass the device explicitly so the pipeline places its inputs on the same
# device as the model (same convention as the evaluation cell further down).
qa_pipeline = pipeline(
    "question-answering", model=model, tokenizer=tokenizer,
    device=0 if device == "cuda" else -1,
)

# Define questions and context for QA
# Same three questions and passage as the baseline cell earlier in the notebook,
# redefined here so the fine-tuned answers can be compared on identical input.
questions = [
    "What is the main cause of HIV-1 infection in children?",
    "What is the role of C-C Motif Chemokine Ligand 3 Like 1 (CCL3L1) in mother to child transmission of HIV-1?",
    "What is DC-GENR and where is it expressed?"
]
# Passage handed to the QA pipeline as the context for each question.
context = """
HIV-1 infection in children is primarily caused by mother-to-child transmission. CCL3L1 competes for CCR5 binding,
reducing the risk of HIV-1 infection. DC-GENR is expressed on dendritic cells and plays a key role in HIV recognition.
"""

# Answer a list of questions with the current qa_pipeline
def generate_answers(questions, context):
    """Map each question to the ``"answer"`` that ``qa_pipeline`` extracts from ``context``."""
    answers_by_question = {}
    for query in questions:
        prediction = qa_pipeline(question=query, context=context)
        answers_by_question[query] = prediction["answer"]
    return answers_by_question

answers = generate_answers(questions, context)

# Print each question next to the answer returned for it
print("\nFine-Tuned Model Answers:")
for question_text, answer_text in answers.items():
    print(f"Q: {question_text}\nA: {answer_text}\n")


In [None]:
# `evaluate` is also installed by the first cell of the notebook.
# NOTE(review): presumably repeated so the evaluation section can run on its own - confirm.
!pip install evaluate

In [None]:
# NOTE(review): presumably required by the "bertscore" metric that the next cell loads - confirm.
!pip install bert_score

In [None]:
import pandas as pd
import torch
from transformers import AutoModelForQuestionAnswering, AutoTokenizer, pipeline
from evaluate import load
from sentence_transformers import SentenceTransformer, util

# Mount Google Drive
# The validation CSV and the fine-tuned model directory below live under this mount.
from google.colab import drive
drive.mount("/content/drive")

# Define paths and models
validation_csv_path = '/content/drive/MyDrive/Projet_LLM_UPVD/validation.csv'  # Update with your dataset path
baseline_model_name = "distilbert-base-uncased-distilled-squad"  # Baseline model
fine_tuned_model_path = '/content/drive/MyDrive/fine_tuned_model'  # Fine-tuned model
# NOTE(review): semantic_model is not referenced by any other visible cell - confirm it is still needed.
semantic_model = SentenceTransformer("all-MiniLM-L6-v2")

# Load data
import ast  # kept local: this evaluation cell carries its own imports

df = pd.read_csv(validation_csv_path)
questions = df['question'].tolist()


def _join_context_chunks(raw):
    """Turn one ``context_chunks`` cell into a single context string.

    The cell is expected to hold the text of a Python list literal of strings.
    It is parsed with ``ast.literal_eval``, which accepts literals only, so
    nothing stored in the CSV can be executed as code (unlike ``eval``).

    Raises:
        ValueError / SyntaxError: if the cell is not a valid Python literal.
    """
    chunks = ast.literal_eval(raw)
    if isinstance(chunks, str):
        # A lone string literal is already the full context.
        return chunks
    return " ".join(chunks)


# Convert context_chunks into a single string per row
contexts = [_join_context_chunks(context) for context in df['context_chunks']]
references = df['answer'].tolist()

# Validate loaded data
print("Questions:", questions[:5])
print("Contexts:", contexts[:5])
print("References:", references[:5])

# Choose the compute device once and load both checkpoints onto it
device = "cuda" if torch.cuda.is_available() else "cpu"
pipeline_device = 0 if device == "cuda" else -1  # device index format passed to pipeline()
baseline_model = AutoModelForQuestionAnswering.from_pretrained(baseline_model_name).to(device)
fine_tuned_model = AutoModelForQuestionAnswering.from_pretrained(fine_tuned_model_path).to(device)
tokenizer = AutoTokenizer.from_pretrained(baseline_model_name)

# One QA pipeline per model; both share the tokenizer loaded above
baseline_pipeline = pipeline("question-answering", model=baseline_model, tokenizer=tokenizer, device=pipeline_device)
fine_tuned_pipeline = pipeline("question-answering", model=fine_tuned_model, tokenizer=tokenizer, device=pipeline_device)


def _answer_all(qa):
    """Return the ``'answer'`` of ``qa`` for every (question, context) pair, in order."""
    return [qa(question=q, context=c)['answer'] for q, c in zip(questions, contexts)]


# Predictions of each model over the validation questions
baseline_predictions = _answer_all(baseline_pipeline)
fine_tuned_predictions = _answer_all(fine_tuned_pipeline)

import numpy as np

# BERTScore metric object
bertscore_metric = load("bertscore")

# Collapse runs of whitespace inside every reference answer
formatted_references = [" ".join(reference.split()) for reference in references]


def _bertscore(predictions):
    """Score ``predictions`` against ``formatted_references`` with BERTScore (English)."""
    return bertscore_metric.compute(predictions=predictions, references=formatted_references, lang="en")


# Baseline is scored first, then the fine-tuned model
bertscore_baseline = _bertscore(baseline_predictions)
bertscore_fine_tuned = _bertscore(fine_tuned_predictions)

# Cap every F1 value at 1.0
for score_table in (bertscore_baseline, bertscore_fine_tuned):
    score_table["f1"] = [min(score, 1.0) for score in score_table["f1"]]

# Per-example F1 lists for both models
print("\nMetrics Comparison:")
print(f"BERTScore (Baseline): {bertscore_baseline['f1']}")
print(f"BERTScore (Fine-Tuned): {bertscore_fine_tuned['f1']}")

# Mean F1 over all validation examples
avg_bertscore_baseline = np.mean(bertscore_baseline["f1"])
avg_bertscore_fine_tuned = np.mean(bertscore_fine_tuned["f1"])

print(f"\nBERTScore (Baseline - Average): {avg_bertscore_baseline:.4f}")
print(f"BERTScore (Fine-Tuned - Average): {avg_bertscore_fine_tuned:.4f}")

