In [None]:
############################################
# Team : RAGrats      
# Team Members : Ali Asgar Padaria, Param Patel, Meet Zalavadiya
#                    
# Code Description : This file contains the code for the Baseline 2 Model - Specifically the generation part 
#                    This part follows the first part which generates relevant embeddings from the first part, 
#                    those embeddings are then used to generate answer explainations using a T5 LM model, 
#                    which are then passed to the pretrained RoBERTa classifier.
#                   
# NLP Concepts Usage: Tokenization, Embeddings, Language Modeling, Question Answering
#                       
# System : GCP Server L4 GPU
#############################################

In [None]:
# Import Necessary Libraries
from transformers import AutoTokenizer, AutoModel
import torch
import faiss
import numpy as np
from collections import defaultdict
from datasets import load_dataset
from tqdm import tqdm
from transformers import AutoModelForSeq2SeqLM, AutoModelForSequenceClassification
import json
from datasets import Dataset
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import DataLoader
from datasets import load_from_disk
import os
import matplotlib.pyplot as plt
from datasets import concatenate_datasets
from sentence_transformers import SentenceTransformer
from sklearn.metrics import accuracy_score


In [None]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
gen_tokenizer = AutoTokenizer.from_pretrained("t5-base")
gen_model = AutoModelForSeq2SeqLM.from_pretrained("t5-base").to(device) # NLP Concept: Language Modeling
gen_model.eval()

T5ForConditionalGeneration(
  (shared): Embedding(32128, 768)
  (encoder): T5Stack(
    (embed_tokens): Embedding(32128, 768)
    (block): ModuleList(
      (0): T5Block(
        (layer): ModuleList(
          (0): T5LayerSelfAttention(
            (SelfAttention): T5Attention(
              (q): Linear(in_features=768, out_features=768, bias=False)
              (k): Linear(in_features=768, out_features=768, bias=False)
              (v): Linear(in_features=768, out_features=768, bias=False)
              (o): Linear(in_features=768, out_features=768, bias=False)
              (relative_attention_bias): Embedding(32, 12)
            )
            (layer_norm): T5LayerNorm()
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (1): T5LayerFF(
            (DenseReluDense): T5DenseActDense(
              (wi): Linear(in_features=768, out_features=3072, bias=False)
              (wo): Linear(in_features=3072, out_features=768, bias=False)
              (dropout): Dro

In [3]:
def generate_answer(question, contexts, max_input_len=512, max_output_len=32):
    # Concatenate contexts and prepend the question
    input_text = "question: " + question + " context: " + " ".join(contexts)
    inputs = gen_tokenizer(
        input_text, return_tensors="pt", truncation=True,
        padding=True, max_length=max_input_len
    ).to(device)

    with torch.no_grad():
        with torch.amp.autocast(device_type=device.type, dtype=torch.float16):
            output = gen_model.generate(
                **inputs, max_length=max_output_len,
                num_beams=4, early_stopping=True
            )
    return gen_tokenizer.decode(output[0], skip_special_tokens=True)

In [6]:
# File paths
input_path = "/home/apadaria/NLP_Project/source/files/val_retrieved_pairs_base_2.json"
output_path = "/home/apadaria/NLP_Project/source/files/baseline2_generated_rag_answers.json"

# Load pre-retrieved question-context pairs
with open(input_path, "r", encoding="utf-8") as f:
    dataset = json.load(f)

generated_answers = []

for item in tqdm(dataset, desc="Generating RAG Answers"):
    question = item["question"]
    retrieved = item["retrieved_contexts"]  # Already retrieved contexts

    if not retrieved:
        continue

    # Generate answer using pre-retrieved contexts
    pred_answer = generate_answer(question, retrieved).strip()

    # Store question and generated answer
    generated_answers.append({
        "question": question,
        "generated_answer": pred_answer
    })

# Save generated answers
with open(output_path, "w", encoding="utf-8") as f:
    json.dump(generated_answers, f, indent=2, ensure_ascii=False)

print(f"Saved {len(generated_answers)} question-answer pairs to {output_path}")


Generating RAG Answers:   0%|          | 0/2000 [00:00<?, ?it/s]

Generating RAG Answers: 100%|██████████| 2000/2000 [13:25<00:00,  2.48it/s]

Saved 2000 question-answer pairs to /home/apadaria/NLP_Project/source/files/baseline2_generated_rag_answers.json





In [8]:
# load valset
val_dataset = load_from_disk("../files/val_dataset")

In [None]:
generated_texts = [item["generated_answer"] for item in generated_answers]
val_dataset = val_dataset.add_column("generated_answer", generated_texts) # store generated texts in the validation dataset


In [16]:
val_dataset

Dataset({
    features: ['pubid', 'question', 'context', 'long_answer', 'final_decision', 'generated_answer'],
    num_rows: 2000
})

In [None]:
tokenizer = AutoTokenizer.from_pretrained("roberta-base") # NLP Concept : Tokenization

# Define label mapping
label_map = {"yes": 1, "no": 0}

classifer_model = AutoModelForSequenceClassification.from_pretrained(
        "roberta-base",
        num_labels=1,
)
classifer_model.load_state_dict(torch.load("../files/roberta_classifier.pt", map_location=torch.device("cuda")))  # or "cuda"
classifer_model.eval()

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  classifer_model.load_state_dict(torch.load("../files/roberta_classifier.pt", map_location=torch.device("cuda")))  # or "cuda"


RobertaForSequenceClassification(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(50265, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0-11): 12 x RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
         

In [18]:
def preprocess_function(example):
    # Use only questions, no context
    
    final_prompt = f"{example['question']}\n{example['generated_answer']}"
    inputs = tokenizer(
        final_prompt,
        truncation=True,
        max_length=512
    )
    label_map = {"yes": 1, "no": 0} 
    inputs["labels"] = label_map[example["final_decision"]]
    return inputs

def train_collator(batch):
    input_ids = pad_sequence(
        [torch.tensor(x['input_ids']) for x in batch],
        batch_first=True,
        padding_value=tokenizer.pad_token_id
    )
    attention_mask = pad_sequence(
        [torch.tensor(x['attention_mask']) for x in batch],
        batch_first=True,
        padding_value=0
    )
    labels = torch.tensor([x['labels'] for x in batch])
    
    return {
        'input_ids': input_ids,
        'attention_mask': attention_mask,
        'labels': labels
    }

In [19]:
tokenized_dataset = val_dataset.map(
    preprocess_function,
    remove_columns=val_dataset.column_names  # Change to dataset.column_names
)

val_dataloader = DataLoader(
    tokenized_dataset,  # Change to tokenized_val_dataset
    batch_size=16,
    collate_fn=train_collator
)

Map:   0%|          | 0/2000 [00:00<?, ? examples/s]

In [20]:
def evaluate_model(model, val_loader, device="cuda"):
    model.eval()
    all_preds = []
    all_labels = []
    model.to(device)
    with torch.no_grad():
        for batch in tqdm(val_loader, desc="Evaluating"):
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device).float().unsqueeze(1)  # shape: [batch_size, 1]

            outputs = model(input_ids=input_ids, attention_mask=attention_mask)
            logits = outputs.logits  # shape: [batch_size, 1]

            probs = torch.sigmoid(logits)  # convert logits to probabilities
            preds = (probs > 0.5).long()   # thresholding

            all_preds.extend(preds.cpu().numpy().flatten())   # flatten for sklearn
            all_labels.extend(labels.cpu().numpy().flatten())

    return np.array(all_preds), np.array(all_labels)

In [21]:
all_preds, all_labels = evaluate_model(classifer_model, val_dataloader, device='cuda')

Evaluating: 100%|██████████| 125/125 [00:02<00:00, 41.73it/s]


In [24]:
accuracy = accuracy_score(all_labels, all_preds)
print(f"Accuracy: {accuracy * 100:.2f}%")

Accuracy: 69.55%
