# Summary 

This project is a sample chatbot project that provide answers to Computer Science domain questions 

# Checking CUDA availability

In [4]:
import torch 
print("CUDA Available:", torch.cuda.is_available())
print("CUDA Version:", torch.version.cuda)
print("Device Name:", torch.cuda.get_device_name(0) if torch.cuda.is_available() else "CPU")
print("Torch Version:", torch.__version__)

CUDA Available: False
CUDA Version: None
Device Name: CPU
Torch Version: 2.5.1+cpu


# Installing Necessary Libraries

In [6]:
# For NLP libraries
!pip install spacy
!pip install nltk
!pip install transformers
!pip install torch

# For Streamlit (optional for frontend)
!pip install streamlit

!pip install evaluate
!pip install ipywidgets
!pip install pyrebase4
!pip install --upgrade streamlit




# Importing Libraries

In [8]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import spacy
import nltk
from transformers import pipeline
import torch
import streamlit
import os
import transformers
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
from torch.utils.data import Dataset, random_split
from torch.utils.data import DataLoader
from transformers import AutoModelForSeq2SeqLM
import torch.optim as optim
from evaluate import load
import glob
from torch.cuda.amp import autocast, GradScaler
from datasets import Dataset
from datasets import load_dataset
import evaluate 
import random
from tqdm import tqdm
from transformers import DataCollatorForSeq2Seq
import json


# Viewing Dataframe and convering to hugging face dataset

In [10]:
# Load dataset
file_path = "C:/Users/DELL/Desktop/CS QnA/final_dataset.csv"  # Update the path if needed
df = pd.read_csv(file_path)
df.sample(10)

Unnamed: 0,question,answer
7240,How do you rename a directory using the mv com...,Rename a directory with `mv oldfolder newfolde...
10078,What are the operations of DSU?,"DSU primarily supports two operations: 'find',..."
14972,What are Features of Bitmap Indexing?,They are space efficient for columns with low ...
3578,What is a choropleth map?,A choropleth map uses shade shading or styles ...
294,What platforms are supported by Elastic Beanst...,Elastic Beanstalk supports various platforms f...
10008,What is a doubly linked list?,A doubly linked list is a type of linked list ...
17267,How to implement the Stack data structure in J...,The Stack data structure is a linear data stru...
16346,How can AI contribute to the creation of perso...,"AI can analyze sensory preferences, recommend ..."
11187,What is segmentation in an operating system?,Segmentation divides a program’s memory into l...
7427,How do you use a for loop to iterate through a...,"""Create an array or list, then use `for` follo..."


In [11]:
df.shape

(20076, 2)

In [12]:
# Convert DataFrame to Hugging Face Dataset
dataset = Dataset.from_pandas(df)

# Checking Tokens

In [14]:
import statistics

tokenizer = AutoTokenizer.from_pretrained("facebook/bart-base")

def add_token_length(example):
    # Convert each row's question/answer to string, tokenize, and store lengths
    example["question_tokens"] = len(tokenizer.tokenize(str(example["question"])))
    example["answer_tokens"] = len(tokenizer.tokenize(str(example["answer"])))
    return example

# Use map() to add the new columns to your dataset
dataset = dataset.map(add_token_length)

# Now compute min and max over the entire dataset
q_min = min(dataset["question_tokens"])
q_max = max(dataset["question_tokens"])
a_min = min(dataset["answer_tokens"])
a_max = max(dataset["answer_tokens"])
q_avg = statistics.mean(dataset["question_tokens"])
a_avg = statistics.mean(dataset["answer_tokens"])


print(f"Question tokens: min={q_min}, max={q_max}, avg={q_avg}")
print(f"Answer tokens:   min={a_min}, max={a_max}, avg={a_avg}")

Map:   0%|          | 0/20076 [00:00<?, ? examples/s]

Question tokens: min=1, max=92, avg=11.61516238294481
Answer tokens:   min=1, max=4293, avg=65.34085475194262


# Tokenization

In [16]:
def tokenize_batch(batch):
    # Tokenize questions in batch
    tokenized_questions = tokenizer(
        batch["question"],
        padding="max_length",
        truncation=True,
        max_length=100
    )
    
    # Tokenize answers in batch; here truncation is disabled for answers
    tokenized_answers = tokenizer(
        batch["answer"],
        padding="max_length",  # Padding will apply for sequences shorter than max_length
        truncation=True,      # Answers will not be truncated even if longer than max_length
        max_length=512         # Only used for padding; full sequence is kept if longer
    )
    
    return {
        "question_input_ids": tokenized_questions["input_ids"],
        "question_attention_mask": tokenized_questions["attention_mask"],
        "answer_input_ids": tokenized_answers["input_ids"],
        "answer_attention_mask": tokenized_answers["attention_mask"],
    }

# Apply tokenization to the dataset in batches
tokenized_dataset = dataset.map(tokenize_batch, batched=True)

# Inspect the first example to verify tokenization
print(tokenized_dataset[0])


Map:   0%|          | 0/20076 [00:00<?, ? examples/s]

{'question': 'What are the prerequisites to learn Microsoft Azure?', 'answer': 'To learn Microsoft Azure there is no need of any extra knowledge any one can easily learn the Azure if they have the below listed set of skills. Basic understanding of Azure concept.Understanding of Cloud ConceptsUnderstanding if basic infrastructure management, database management, and software development.', 'question_tokens': 10, 'answer_tokens': 52, 'question_input_ids': [0, 2264, 32, 5, 1198, 44877, 7, 1532, 3709, 25959, 116, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], 'question_attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 

# Train test validation split

In [18]:
# First, split out a fixed test set of 20 random examples.
split_dataset = tokenized_dataset.train_test_split(test_size=20, seed=42)
test_dataset = split_dataset["test"]
remaining_dataset = split_dataset["train"]

# Now, split the remaining dataset into 90% training and 10% validation.
train_valid_split = remaining_dataset.train_test_split(test_size=0.1, seed=42)
train_dataset = train_valid_split["train"]
validation_dataset = train_valid_split["test"]

# Print out the sizes for verification:
print(f"Train size: {len(train_dataset)}")
print(f"Validation size: {len(validation_dataset)}")
print(f"Test size: {len(test_dataset)}")

Train size: 18050
Validation size: 2006
Test size: 20


# Loading Model 

In [20]:
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print("Using device:", device)

# Load DistilBART model
model = AutoModelForSeq2SeqLM.from_pretrained("facebook/bart-base").to(device)

Using device: cpu


model.safetensors:   0%|          | 0.00/558M [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


In [21]:
def format_for_trainer(example):
    return {
        "input_ids": example["question_input_ids"],
        "attention_mask": example["question_attention_mask"],
        "labels": example["answer_input_ids"],
        "question": example["question"],    # Keep original text for evaluation
        "answer": example["answer"]
    }

# Apply mapping without removing columns completely
train_dataset = train_dataset.map(format_for_trainer)
validation_dataset = validation_dataset.map(format_for_trainer)
test_dataset = test_dataset.map(format_for_trainer)


Map:   0%|          | 0/18050 [00:00<?, ? examples/s]

Map:   0%|          | 0/2006 [00:00<?, ? examples/s]

Map:   0%|          | 0/20 [00:00<?, ? examples/s]

# Defining Dataloader

In [23]:
# Create the data collator using your tokenizer and model
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

# Define a custom collate function to remove the raw text fields
def custom_collate_fn(features):
    # Remove the raw text keys ("question" and "answer")
    cleaned_features = [
        {k: v for k, v in feature.items() if k not in ["question", "answer"]}
        for feature in features
    ]
    return data_collator(cleaned_features)

# Define DataLoader with the custom collate_fn to convert lists to tensors
train_loader = DataLoader(train_dataset, batch_size=8, shuffle=True, collate_fn=custom_collate_fn)
val_loader = DataLoader(validation_dataset, batch_size=8, shuffle=False, collate_fn=custom_collate_fn)
test_loader = DataLoader(test_dataset, batch_size=8, shuffle=False, collate_fn=custom_collate_fn)


# Evaluating model before training 

In [25]:
# For reproducibility
random.seed(42)

# Load the ROUGE metric using the Hugging Face evaluate library.
rouge = evaluate.load("rouge")

# Randomly select 5 examples from your test dataset.
sample_indices = random.sample(range(len(test_dataset)), 5)

for idx in sample_indices:
    example = test_dataset[idx]
    
    # Extract the question and reference answer
    question_text = example["question"]
    reference_answer = example["answer"]
    
    print("Question:", question_text)
    print("Reference Answer:", reference_answer)
    
    # Tokenize the question with a max length of 100 tokens
    inputs = tokenizer(
        question_text,
        return_tensors="pt",
        padding="max_length",
        truncation=True,
        max_length=100  # Input question length set to 100 tokens
    ).to(device)
    
    # Generate an answer with a maximum of 512 tokens
    generated_ids = model.generate(
        **inputs,
        max_length=512,   # Generated answer can be up to 512 tokens
        num_beams=5,      # Using beam search for improved output quality
        early_stopping=True
    )
    
    # Decode the generated tokens into text
    generated_answer = tokenizer.decode(generated_ids[0], skip_special_tokens=True)
    print("Generated Answer:", generated_answer)
    
    # Compute the ROUGE score between the generated answer and the reference answer.
    result = rouge.compute(predictions=[generated_answer], references=[reference_answer])
    print("ROUGE Score:", result)
    print("-" * 50)

Downloading builder script:   0%|          | 0.00/6.27k [00:00<?, ?B/s]

Question: Can you explain the concept of encapsulation in object-oriented programming?
Reference Answer: Encapsulation is the concept of bundling data and functions (methods) together in a single unit known as a class in object-oriented programming. It allows for better organization and reusability of code by encapsulating data and functionality within a single entity.
Generated Answer: Can you explain the concept of encapsulation in object-oriented programming?
ROUGE Score: {'rouge1': 0.3076923076923077, 'rouge2': 0.2, 'rougeL': 0.2692307692307692, 'rougeLsum': 0.2692307692307692}
--------------------------------------------------
Question: What shape will the product matrix C have if matrix A is of shape m x n and matrix B is of shape n x p?
Reference Answer: The product matrix C will be of shape m x p.
Generated Answer: What shape will the product matrix C have if matrix A is of shape m x n and matrix B is ofshape n x p?
ROUGE Score: {'rouge1': 0.5555555555555556, 'rouge2': 0.411764

# Training the Model to determine Epoch 

In [27]:
import glob  # Used to find checkpoint files

# Set training parameters
num_epochs = 30
learning_rate = 5e-5

# Create the directory to store checkpoints and metrics if it doesn't exist
checkpoint_dir = "CS QnA"
os.makedirs(checkpoint_dir, exist_ok=True)
best_model_path = os.path.join(checkpoint_dir, "best_model.pt")
metrics_path = os.path.join(checkpoint_dir, "best_metrics.json")

# Initialize optimizer
optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)

# Load the ROUGE metric (we'll use ROUGE-L as our key metric here)
rouge_metric = evaluate.load("rouge")

# ----- Find the latest checkpoint -----
checkpoint_files = glob.glob(os.path.join(checkpoint_dir, "checkpoint_epoch_*.pt"))
latest_epoch = 0  # Start from the first epoch if no checkpoint exists

if checkpoint_files:
    # Sort checkpoints by epoch number
    checkpoint_files.sort(key=lambda x: int(x.split("_epoch_")[-1].split(".pt")[0]))
    latest_checkpoint = checkpoint_files[-1]  # Get the most recent checkpoint
    latest_epoch = int(latest_checkpoint.split("_epoch_")[-1].split(".pt")[0])  # Extract the epoch number
    model.load_state_dict(torch.load(latest_checkpoint, weights_only=True))
    print(f"Loaded checkpoint from {latest_checkpoint}. Resuming training from epoch {latest_epoch + 1}.")
else:
    print("No previous checkpoints found. Starting training from scratch.")

# --- Load best metrics if they exist ---
if os.path.exists(metrics_path):
    with open(metrics_path, "r") as f:
        best_metrics = json.load(f)
    best_rouge = best_metrics.get("best_rouge", 0.0)
    best_epoch = best_metrics.get("best_epoch", 0)
    print(f"Loaded best metrics: best_rouge = {best_rouge}, best_epoch = {best_epoch}")
else:
    best_rouge = 0.0
    best_epoch = 0

# ----- Start Training Loop -----
for epoch in range(latest_epoch + 1, num_epochs + 1):  # Start from next epoch
    print(f"\n===== Epoch {epoch}/{num_epochs} =====")
    model.train()
    total_train_loss = 0.0

    # ----- Training Phase -----
    for batch in tqdm(train_loader, desc="Training"):
        optimizer.zero_grad()

        # Move batch tensors to GPU
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        labels = batch["labels"].to(device)

        # Forward pass (model returns loss because we supply labels)
        outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        loss.backward()
        optimizer.step()

        total_train_loss += loss.item()

    avg_train_loss = total_train_loss / len(train_loader)
    print(f"Average Training Loss: {avg_train_loss:.4f}")

    # ----- Save Checkpoint for This Epoch -----
    checkpoint_path = os.path.join(checkpoint_dir, f"checkpoint_epoch_{epoch}.pt")
    torch.save(model.state_dict(), checkpoint_path)
    print(f"Checkpoint saved at: {checkpoint_path}")

    # ----- Evaluation Phase -----
    model.eval()
    rouge_scores = []

    # Run validation on all samples in the validation dataset
    for sample in validation_dataset:
        # Assuming the original text is preserved in these keys
        question_text = sample["question"]
        reference_answer = sample["answer"]

        # Tokenize question (max 100 tokens)
        inputs = tokenizer(
            question_text,
            return_tensors="pt",
            padding="max_length",
            truncation=True,
            max_length=100
        )
        inputs = {key: value.to(device) for key, value in inputs.items()}

        # Generate answer with max_length=512, beam search for better quality
        generated_ids = model.generate(
            **inputs,
            max_length=512,
            num_beams=5,
            early_stopping=True
        )

        generated_answer = tokenizer.decode(generated_ids[0], skip_special_tokens=True)

        # Compute ROUGE score for this sample (using ROUGE-L)
        result = rouge_metric.compute(predictions=[generated_answer], references=[reference_answer])
        sample_rouge = result["rougeL"]  # You can choose another metric or average multiple
        rouge_scores.append(sample_rouge)

    avg_rouge = np.mean(rouge_scores)
    print(f"Epoch {epoch} Average ROUGE-L: {avg_rouge:.4f}")

    # ----- Log the ROUGE Score -----
    with open(os.path.join(checkpoint_dir, "rouge_log.txt"), "a") as f:
        f.write(f"Epoch {epoch}: Average ROUGE-L = {avg_rouge:.4f}\n")

    # ----- Check for Improvement -----
    if avg_rouge > best_rouge:
        best_rouge = avg_rouge
        best_epoch = epoch
        torch.save(model.state_dict(), best_model_path)
        print("New best model found and saved!")

    # Save the updated best metrics so they can be loaded next time  
    with open(metrics_path, "w") as f:
        json.dump({"best_rouge": best_rouge, "best_epoch": best_epoch}, f)

print(f"\nTraining complete. Best model from epoch {best_epoch} with ROUGE-L: {best_rouge:.4f}")


RuntimeError: Attempting to deserialize object on a CUDA device but torch.cuda.is_available() is False. If you are running on a CPU-only machine, please use torch.load with map_location=torch.device('cpu') to map your storages to the CPU.

# Evaluating Random 5 Test Dataset 

In [28]:
# Set checkpoint directory
checkpoint_dir = "CS QnA"
best_model_path = os.path.join(checkpoint_dir, "best_model.pt")

# Check if best_model.pt exists
if os.path.exists(best_model_path):
    model.load_state_dict(torch.load(best_model_path, weights_only=True))  # Load the best-performing model
    print(f"✅ Loaded best model from {best_model_path}")
else:
    print("⚠️ No best model found! Make sure training has completed and best_model.pt exists.")
    exit()

# Set model to evaluation mode
model.eval()

# Load ROUGE metric
rouge = evaluate.load("rouge")

# Randomly select 5 examples from test dataset
random.seed(42)  # Ensure reproducibility
sample_indices = random.sample(range(len(test_dataset)), 5)

for idx in sample_indices:
    example = test_dataset[idx]
    
    # Extract question and reference answer
    question_text = example["question"]
    reference_answer = example["answer"]
    
    print("\n🔹 **Question:**", question_text)
    print("📝 **Reference Answer:**", reference_answer)
    
    # Tokenize question
    inputs = tokenizer(
        question_text,
        return_tensors="pt",
        padding="max_length",
        truncation=True,
        max_length=100
    ).to(device)

    # Generate an answer from the best model
    generated_ids = model.generate(
        **inputs,
        max_length=512,
        num_beams=5,
        early_stopping=True
    )

    # Decode generated answer
    generated_answer = tokenizer.decode(generated_ids[0], skip_special_tokens=True)
    
    print("🤖 **Generated Answer:**", generated_answer)
    
    # Compute ROUGE score
    result = rouge.compute(predictions=[generated_answer], references=[reference_answer])
    print("📊 **ROUGE Score:**", result)
    print("-" * 50)


RuntimeError: Attempting to deserialize object on a CUDA device but torch.cuda.is_available() is False. If you are running on a CPU-only machine, please use torch.load with map_location=torch.device('cpu') to map your storages to the CPU.

In [None]:
# Set checkpoint directory
checkpoint_dir = "CS QnA"
best_model_path = os.path.join(checkpoint_dir, "best_model.pt")

# Check if best_model.pt exists
if os.path.exists(best_model_path):
    model.load_state_dict(torch.load(best_model_path, weights_only=True))  # Load the best-performing model
    print(f"✅ Loaded best model from {best_model_path}")
else:
    print("⚠️ No best model found! Make sure training has completed and best_model.pt exists.")
    exit()

# Set model to evaluation mode
model.eval()

# Load ROUGE metric
rouge = evaluate.load("rouge")

# Initialize lists for storing predictions and references
all_predictions = []
all_references = []

# Loop over every example in the test dataset
for example in test_dataset:
    # Extract question and reference answer
    question_text = example["question"]
    reference_answer = example["answer"]
    
    # Tokenize question
    inputs = tokenizer(
        question_text,
        return_tensors="pt",
        padding="max_length",
        truncation=True,
        max_length=100
    ).to(device)

    # Generate an answer using the model
    generated_ids = model.generate(
        **inputs,
        max_length=512,
        num_beams=5,
        early_stopping=True
    )

    # Decode the generated answer
    generated_answer = tokenizer.decode(generated_ids[0], skip_special_tokens=True)
    
    # Append the generated answer and reference to their respective lists
    all_predictions.append(generated_answer)
    all_references.append(reference_answer)

# Compute the average ROUGE score for the entire test dataset
avg_result = rouge.compute(predictions=all_predictions, references=all_references)
print("📊 **Average ROUGE Score:**", avg_result)

# Self-testing for Performance

In [30]:
import os
import torch

# Set checkpoint directory and best model path
checkpoint_dir = "CS QnA"
best_model_path = os.path.join(checkpoint_dir, "best_model.pt")

# Check if best_model.pt exists and load it
if os.path.exists(best_model_path):
    model.load_state_dict(torch.load(best_model_path, weights_only=True))  # Load best-performing model
    print(f"✅ Loaded best model from {best_model_path}")
else:
    print("⚠️ No best model found! Make sure training has completed and best_model.pt exists.")
    exit()

# Set model to evaluation mode
model.eval()

# Interactive question input loop
while True:
    user_question = input("\n💬 Enter your question (or type 'exit' to quit): ").strip()
    
    if user_question.lower() == "exit":
        print("👋 Exiting...")
        break
    
    # Tokenize the input question
    inputs = tokenizer(
        user_question,
        return_tensors="pt",
        padding="max_length",
        truncation=True,
        max_length=100
    ).to(device)

    # Generate an answer from the model
    generated_ids = model.generate(
        **inputs,
        max_length=512,
        num_beams=5,  # Using beam search for better quality
        early_stopping=True
    )

    # Decode the generated answer
    generated_answer = tokenizer.decode(generated_ids[0], skip_special_tokens=True)
    
    print("\n🤖 **Generated Answer:**", generated_answer)
    print("-" * 50)


RuntimeError: Attempting to deserialize object on a CUDA device but torch.cuda.is_available() is False. If you are running on a CPU-only machine, please use torch.load with map_location=torch.device('cpu') to map your storages to the CPU.

# Merging train and validation dataset for final training 

In [None]:
from datasets import concatenate_datasets

# Merge train and validation datasets
merged_train_dataset = concatenate_datasets([train_dataset, validation_dataset])

# Print out the new sizes for verification
print(f"Merged Train size: {len(merged_train_dataset)}")
print(f"Test size (unchanged): {len(test_dataset)}")


# Dataloader 

In [None]:
# Create the data collator using your tokenizer and model
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

# Define a custom collate function to remove the raw text fields ("question" and "answer")
def custom_collate_fn(features):
    cleaned_features = [
        {k: v for k, v in feature.items() if k not in ["question", "answer"]}
        for feature in features
    ]
    return data_collator(cleaned_features)

# Create DataLoaders using the merged training dataset and the test dataset
train_loader = DataLoader(merged_train_dataset, batch_size=8, shuffle=True, collate_fn=custom_collate_fn)
test_loader = DataLoader(test_dataset, batch_size=8, shuffle=False, collate_fn=custom_collate_fn)

# Final Training 

In [None]:
# Set training parameters
num_epochs = 21
learning_rate = 5e-5

# Create the directory to store checkpoints and the final model
checkpoint_dir = "final_model"
os.makedirs(checkpoint_dir, exist_ok=True)
final_model_name = "final_trained_model"

# Create or open a log file for training loss
loss_log_path = os.path.join(checkpoint_dir, "training_loss_log.txt")
with open(loss_log_path, "a") as log_file:
    log_file.write("Training Loss Log:\n")

# Initialize optimizer
optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)

# ----- Find the latest checkpoint if available -----
checkpoint_files = glob.glob(os.path.join(checkpoint_dir, f"{final_model_name}_epoch_*.pt"))
latest_epoch = 0  # Start from scratch if no checkpoint exists

if checkpoint_files:
    # Sort checkpoints by epoch number
    checkpoint_files.sort(key=lambda x: int(x.split("_epoch_")[-1].split(".pt")[0]))
    latest_checkpoint = checkpoint_files[-1]
    latest_epoch = int(latest_checkpoint.split("_epoch_")[-1].split(".pt")[0])
    model.load_state_dict(torch.load(latest_checkpoint))
    print(f"Loaded checkpoint from {latest_checkpoint}. Resuming training from epoch {latest_epoch + 1}.")
else:
    print("No previous checkpoints found. Starting training from scratch.")

# ----- Start Training Loop -----
for epoch in range(latest_epoch + 1, num_epochs + 1):  # Start from the next epoch
    print(f"\n===== Epoch {epoch}/{num_epochs} =====")
    model.train()
    total_train_loss = 0.0

    # ----- Training Phase -----
    for batch in tqdm(train_loader, desc="Training"):
        optimizer.zero_grad()

        # Move batch tensors to GPU
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        labels = batch["labels"].to(device)

        # Forward pass (model returns loss because we supply labels)
        outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        loss.backward()
        optimizer.step()

        total_train_loss += loss.item()

    avg_train_loss = total_train_loss / len(train_loader)
    print(f"Average Training Loss: {avg_train_loss:.4f}")

    # Log the training loss for this epoch in human-readable format
    with open(loss_log_path, "a") as log_file:
        log_file.write(f"Epoch {epoch}: Training loss = {avg_train_loss:.4f}\n")

    # ----- Save Checkpoint for This Epoch -----
    checkpoint_path = os.path.join(checkpoint_dir, f"{final_model_name}_epoch_{epoch}.pt")
    torch.save(model.state_dict(), checkpoint_path)
    print(f"Checkpoint saved at: {checkpoint_path}")

print(f"\nTraining complete after {num_epochs} epochs.")


In [None]:
import os
import glob
import torch

# Set checkpoint directory and final model name (should match training code)
checkpoint_dir = "final_model"
final_model_name = "final_trained_model"

# Construct checkpoint file pattern and search for checkpoints
checkpoint_pattern = os.path.join(checkpoint_dir, f"{final_model_name}_epoch_*.pt")
checkpoint_files = glob.glob(checkpoint_pattern)

if checkpoint_files:
    # Sort checkpoints by epoch number and select the latest checkpoint
    checkpoint_files.sort(key=lambda x: int(x.split("_epoch_")[-1].split(".pt")[0]))
    latest_checkpoint = checkpoint_files[-1]
    model.load_state_dict(torch.load(latest_checkpoint))
    print(f"✅ Loaded model from checkpoint: {latest_checkpoint}")
else:
    print("⚠️ No checkpoints found. Please ensure training has been completed.")
    exit()

# Set model to evaluation mode
model.eval()

# Interactive question input loop
while True:
    user_question = input("\n💬 Enter your question (or type 'exit' to quit): ").strip()
    
    if user_question.lower() == "exit":
        print("👋 Exiting...")
        break
    
    # Tokenize the input question
    inputs = tokenizer(
        user_question,
        return_tensors="pt",
        padding="max_length",
        truncation=True,
        max_length=100
    ).to(device)

    # Generate an answer from the model
    generated_ids = model.generate(
        **inputs,
        max_length=512,
        num_beams=5,      # Using beam search for better quality
        early_stopping=True
    )

    # Decode the generated answer
    generated_answer = tokenizer.decode(generated_ids[0], skip_special_tokens=True)
    
    print("\n🤖 **Generated Answer:**", generated_answer)
    print("-" * 50)


# Final Evaluation 

In [None]:
import os
import glob
import torch
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
import evaluate

############################################
# 1. Load your fine-tuned model & tokenizer
############################################

def load_model_and_tokenizer():
    model_name = "facebook/bart-base"
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)

    # Locate the latest checkpoint in your "final_model" folder
    checkpoint_dir = "final_model"
    model_prefix = "final_trained_model"
    checkpoint_pattern = os.path.join(checkpoint_dir, f"{model_prefix}_epoch_*.pt")
    checkpoint_files = glob.glob(checkpoint_pattern)

    if checkpoint_files:
        checkpoint_files.sort(key=lambda x: int(x.split("_epoch_")[-1].split(".pt")[0]))
        best_model_path = checkpoint_files[-1]
        model.load_state_dict(torch.load(best_model_path, map_location=device))
        print(f"Loaded fine-tuned model from {best_model_path}")
    else:
        raise ValueError("No trained model checkpoint found in 'final_model' directory!")

    model.eval()
    return model, tokenizer, device

model, tokenizer, device = load_model_and_tokenizer()

############################################
# 2. Define a function to generate answers
############################################

def generate_answer(question: str) -> str:
    inputs = tokenizer(
        question,
        return_tensors="pt",
        padding="max_length",
        truncation=True,
        max_length=100
    ).to(device)

    # Generate answer with min_length increased to 200 and max_length set to 512 tokens
    generated_ids = model.generate(
        **inputs,
        min_length=200,
        max_length=512,
        num_beams=5,
        early_stopping=True
    )
    return tokenizer.decode(generated_ids[0], skip_special_tokens=True)

############################################
# 3. Create 20 comprehensive computer science Q&A pairs
#    (using triple-quoted strings for the reference answers)
############################################

sample_data = [
    {
        "question": "What is a computer?",
        "answer": """A computer is an electronic device that processes data and performs calculations according to a set of instructions.
It consists of hardware components, such as the CPU, memory, and storage, and software that runs applications and manages system resources.
Computers are used in various fields to perform tasks ranging from simple calculations to complex simulations."""
    },
    {
        "question": "What is hardware?",
        "answer": """Hardware refers to the physical components of a computer system.
This includes devices like the processor, memory modules, storage drives, and input/output peripherals.
These tangible parts work together to enable the computer to function efficiently."""
    },
    {
        "question": "What is software?",
        "answer": """Software is a collection of programs, procedures, and routines that instruct a computer on how to perform tasks.
It includes operating systems, applications, and utilities.
Software development involves coding, testing, and maintaining these programs."""
    },
    {
        "question": "What is an operating system?",
        "answer": """An operating system (OS) is system software that manages computer hardware, software resources, and provides common services for computer programs.
It acts as an intermediary between applications and computer hardware.
Examples include Windows, macOS, and Linux."""
    },
    {
        "question": "What is programming?",
        "answer": """Programming is the process of designing, writing, testing, and maintaining code that instructs a computer to perform specific tasks.
It involves using programming languages to develop software applications and solve problems systematically."""
    },
    {
        "question": "What is software engineering?",
        "answer": """Software engineering is the disciplined application of engineering principles to design, develop, test, and maintain software.
It focuses on creating reliable, scalable, and efficient software solutions through systematic processes and methodologies.
Software engineers work on everything from application development to system architecture."""
    },
    {
        "question": "What is data science?",
        "answer": """Data science is an interdisciplinary field that uses scientific methods, processes, and algorithms to extract knowledge and insights from data.
It combines techniques from statistics, computer science, and domain expertise to analyze complex datasets.
Data science is widely applied in business, research, and technology."""
    },
    {
        "question": "What is machine learning?",
        "answer": """Machine learning is a subset of artificial intelligence that enables systems to learn from data and improve over time without explicit programming.
It uses statistical techniques to identify patterns and make predictions or decisions.
Applications include image recognition, NLP, and recommendation systems."""
    },
    {
        "question": "What is natural language processing (NLP)?",
        "answer": """Natural language processing (NLP) is a field of artificial intelligence that focuses on the interaction between computers and human language.
It involves enabling computers to understand, interpret, and generate human language.
NLP is used in applications like sentiment analysis, translation, and chatbots."""
    },
    {
        "question": "What is deep learning?",
        "answer": """Deep learning is an advanced branch of machine learning that uses neural networks with multiple layers to model complex patterns in data.
It has been particularly effective in image and speech recognition, NLP, and autonomous systems.
Deep learning typically requires large datasets and high computational power."""
    },
    {
        "question": "What is a CPU?",
        "answer": """The CPU (Central Processing Unit) is the primary component of a computer responsible for executing instructions and performing calculations.
It acts as the brain of the computer, coordinating tasks between hardware and software.
A faster CPU improves overall system performance."""
    },
    {
        "question": "What is RAM?",
        "answer": """RAM (Random Access Memory) is volatile memory used by a computer to temporarily store data that is actively used.
It allows the CPU quick access to information and supports multitasking.
The amount of RAM impacts the system's ability to run multiple applications simultaneously."""
    },
    {
        "question": "What is a hard drive?",
        "answer": """A hard drive is a storage device that uses magnetic storage to store and retrieve digital information.
It provides long-term storage and retains data even when the computer is powered off.
Modern hard drives include traditional HDDs and faster SSDs."""
    },
    {
        "question": "What is a network?",
        "answer": """A network is a collection of interconnected computers that share resources and communicate with each other.
Networks can range from small local area networks (LANs) to global wide area networks (WANs).
They are essential for data sharing and communication."""
    },
    {
        "question": "What is the internet?",
        "answer": """The internet is a global network of interconnected computers that communicate using standardized protocols.
It enables the sharing of information, communication, and access to a wide range of online services.
The internet has revolutionized how we access and distribute information."""
    },
    {
        "question": "What is a database?",
        "answer": """A database is an organized collection of data that is stored electronically and structured for efficient access.
It allows users to store, query, and manipulate data using structured query languages (SQL).
Databases are fundamental to various applications from business operations to research."""
    },
    {
        "question": "What is a web browser?",
        "answer": """A web browser is a software application that allows users to access and navigate the internet.
It retrieves, interprets, and displays web pages, making online content accessible.
Popular browsers include Chrome, Firefox, and Safari."""
    },
    {
        "question": "What is cloud computing?",
        "answer": """Cloud computing is the delivery of computing services such as storage, processing, and software over the internet.
It allows users to access and use these resources on demand without maintaining physical hardware.
Cloud computing offers scalability, flexibility, and cost efficiency."""
    },
    {
        "question": "What is cybersecurity?",
        "answer": """Cybersecurity is the practice of protecting computers, networks, and data from cyber attacks, unauthorized access, and damage.
It involves implementing technologies, processes, and policies to safeguard digital assets.
Cybersecurity is crucial for maintaining privacy and system integrity in a digital world."""
    },
    {
        "question": "What is an API?",
        "answer": """An API (Application Programming Interface) is a set of protocols and tools that enables different software applications to communicate with each other.
It defines methods and data formats for requesting and exchanging information between systems.
APIs are integral in web development, mobile applications, and cloud services."""
    },
]

############################################
# 4. Generate predictions & compute ROUGE
############################################

rouge = evaluate.load("rouge")

predictions = []
references = []

for sample in sample_data:
    question = sample["question"]
    gold_answer = sample["answer"]

    # Generate the model's answer (with max_length set to 512 tokens)
    model_answer = generate_answer(question)
    predictions.append(model_answer)
    references.append(gold_answer)

results = rouge.compute(predictions=predictions, references=references)

############################################
# 5. Print out the average ROUGE results
############################################

print("ROUGE Results (averaged over 20 samples):")
for metric_name, score_value in results.items():
    print(f"{metric_name}: {score_value:.4f}")


In [None]:
!streamlit run app.py