In [None]:
############################################
# Team : RAGrats
# Team Members : Ali Asgar Padaria, Param Patel, Meet Zalavadiya
# 
# Code Description : This file contains the code for the Baseline 1 Model
#                    It implements a RAG system which tries QA (Boolean) on PubMedQA dataset via a Llama 3.2 - 1B model.
#                    It acts as a baseline for "Debating (smaller) Models" implemented in improvement_1.py file.
#                    For this file, we are directly using the retrieved documents that we compiled and saved while generating the vectorDB for baseline 1 and improvement 1 models.
#                    We do not retrieve the documents again from the vectorDB. [That code is available in vectorDB_generator.py file, which also stores the retrieved contexts mapping to the questions]
#
# NLP Concepts Usage: Tokenization, Embeddings (via retriever in vectorDB_generator.py), Language Modeling, Question Answering
#
# System : GCP Server L4 GPU
#############################################

In [None]:
# Import Necessary Libraries
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
from datasets import load_dataset, load_from_disk
from huggingface_hub import login
from tqdm import tqdm
from dotenv import load_dotenv
import os
load_dotenv()
token = os.getenv("HF_TOKEN") # Login to Hugging Face Hub for Access to LLama models

login(token=token)
import torch.nn.functional as F

from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import DataLoader
from sklearn.preprocessing import LabelEncoder
import numpy as np
from transformers import pipeline

from transformers import (
    RobertaTokenizer,
    AutoConfig,
    AutoModel,
    AutoModelForSequenceClassification
)
from sklearn.metrics import accuracy_score
import json


Note: Environment variable`HF_TOKEN` is set and is the current active token independently from the token you've just configured.


In [None]:
assert torch.cuda.is_available(), "GPU not available!" # check if GPU is available

In [None]:
# Loading Dataset Stuff

# The retreived pairs for the validation set have already been created in the vectorDB_generator.ipynb file in the same folder
validation_dataset = load_from_disk("../files/val_dataset")

with open("../files/val_retrieved_pairs_base_1.json", "r") as f:
    retreived_pairs = json.load(f)

# dataset # holds final dataset
context_map = {item["question"]: item["retrieved_contexts"] for item in retreived_pairs}

def add_retrieved_contexts(example):
    example["retrieved_contexts"] = context_map[example["question"]]
    return example

validation_dataset = validation_dataset.map(add_retrieved_contexts)

In [None]:
# Load LLama 1B model
model_3b = 'meta-llama/Llama-3.2-1B'

model_id = model_3b

tokenizer = AutoTokenizer.from_pretrained(model_id, padding_side="left")

# pipe = pipeline("text-generation", model=model, tokenizer=tokenizer, device=0, torch_dtype=torch.float16) 

In [None]:
# loading the model
model = AutoModelForCausalLM.from_pretrained( # NLP Concept: Language Modeling
    model_id,
    torch_dtype=torch.float16,
    device_map="auto",
)
tokenizer.pad_token = tokenizer.eos_token  # Required
model.config.pad_token_id = tokenizer.pad_token_id

print("Tokenizer and model loaded successfully.")

Tokenizer and model loaded successfully.


In [None]:
# Explaination Generation Model (LLama 1B)
def generate_prompts(dataset):
    prompts = []

    for data in dataset:
        question = data['question']
        contexts = data['retrieved_contexts']
        context = "\n".join([f"Context {i+1}: {c}" for i, c in enumerate(contexts)])
        prompt = (
            f"Contexts:\n{context}\n\n"
            "Based on the contexts above, answer the question below in yes, no, maybe.\n"
            "Provide a short explanation that justifies your answer using evidence from the context.\n"
            f"Question: {question}?\n"
            "Answer: "
        )
        prompts.append(prompt)

    
    return prompts

In [11]:
prompts = generate_prompts(validation_dataset.select(range(500)))

In [None]:
# Generate explanations using the model in batched format
def generate_batch(prompts, max_new_tokens=150):
    inputs = tokenizer(prompts, return_tensors="pt", padding=True, truncation=True, max_length=512).to(model.device)

    with torch.no_grad():
        with torch.amp.autocast(dtype=torch.float16, device_type="cuda"):
            outputs = model.generate(
                **inputs,
                max_new_tokens=max_new_tokens,
                do_sample=False,
                return_dict_in_generate=True,
                output_scores=False
            )

    decoded = tokenizer.batch_decode(outputs.sequences, skip_special_tokens=True)
    return decoded

def generate(prompts, batch_size=8):
    all_outputs = []
    for i in tqdm(range(0, len(prompts), batch_size)):
        batch = prompts[i:i+batch_size]
        outputs = generate_batch(batch)
        all_outputs.extend(outputs)
    return all_outputs

In [16]:
all_outputs = generate(prompts, batch_size=8)

Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
  2%|▏         | 1/63 [00:05<05:19,  5.16s/it]Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
  3%|▎         | 2/63 [00:09<04:37,  4.55s/it]Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
  5%|▍         | 3/63 [00:13<04:31,  4.52s/it]Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
  6%|▋         | 4/63 [00:17<04:14,  4.31s/it]Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
  8%|▊         | 5/63 [00:21<04:06,  4.25s/it]Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
 10%|▉         | 6/63 [00:25<03:59,  4.20s/it]Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
 11%|█         | 7/63 [00:30<03:59,  4.28s/it]Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
 13%|█▎        | 8/63 [00:34<03:52,  4.23s/it]Setting `pad_token_id` to `eos_token_id`:128001 

In [None]:
generated_texts = []

for prompt, output in zip(prompts, all_outputs):
    # Extract the generated answer from the output without the user prompt
    generated_texts.append(output.split("Answer:")[-1].strip())

In [None]:
# save generated texts (only for easy access so that I do not have to run the model again)
with open("../files/llama_3b_explanations.json", "w") as f:
    json.dump(generated_texts, f)

In [None]:
validation_dataset = validation_dataset.select(range(len(generated_texts)))
validation_dataset = validation_dataset.add_column("generated_answer", generated_texts) # add generated answer to the dataset


In [20]:
validation_dataset

Dataset({
    features: ['pubid', 'question', 'context', 'long_answer', 'final_decision', 'retrieved_contexts', 'generated_answer'],
    num_rows: 500
})

In [None]:
tokenizer = AutoTokenizer.from_pretrained("roberta-base") # NLP Concept: Tokenization

# Define label mapping
label_map = {"yes": 1, "no": 0}

classifer_model = AutoModelForSequenceClassification.from_pretrained(
        "roberta-base",
        num_labels=1,
)
classifer_model.load_state_dict(torch.load("../files/roberta_classifier.pt", map_location=torch.device("cuda")))  # load the model weights for roberta classifier
classifer_model.eval()

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  classifer_model.load_state_dict(torch.load("../files/roberta_classifier.pt", map_location=torch.device("cuda")))  # or "cuda"


RobertaForSequenceClassification(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(50265, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0-11): 12 x RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
         

In [None]:
def preprocess_function(example):
    # Use only questions, no context
    
    final_prompt = f"{example['question']}\n{example['generated_answer']}"
    inputs = tokenizer( # NLP Concept: Tokenization
        final_prompt,
        truncation=True,
        max_length=512
    )
    label_map = {"yes": 1, "no": 0} 
    inputs["labels"] = label_map[example["final_decision"]]
    return inputs

def train_collator(batch):
    input_ids = pad_sequence(
        [torch.tensor(x['input_ids']) for x in batch],
        batch_first=True,
        padding_value=tokenizer.pad_token_id
    )
    attention_mask = pad_sequence(
        [torch.tensor(x['attention_mask']) for x in batch],
        batch_first=True,
        padding_value=0
    )
    labels = torch.tensor([x['labels'] for x in batch])
    
    return {
        'input_ids': input_ids,
        'attention_mask': attention_mask,
        'labels': labels
    }

In [None]:
tokenized_dataset = validation_dataset.map(
    preprocess_function,
    remove_columns=validation_dataset.column_names  
)

val_dataloader = DataLoader(
    tokenized_dataset,  
    batch_size=16,
    collate_fn=train_collator
)

Map:   0%|          | 0/500 [00:00<?, ? examples/s]

In [None]:
def evaluate_model(model, val_loader, device="cuda"):
    model.eval()
    all_preds = []
    all_labels = []
    model.to(device)
    with torch.no_grad():
        for batch in tqdm(val_loader, desc="Evaluating"):
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device).float().unsqueeze(1)  # shape: [batch_size, 1]

            outputs = model(input_ids=input_ids, attention_mask=attention_mask)
            logits = outputs.logits  # shape: [batch_size, 1]

            probs = torch.sigmoid(logits)  # convert logits to probabilities
            preds = (probs > 0.5).long()   # thresholding  

            all_preds.extend(preds.cpu().numpy().flatten())   # flatten for sklearn
            all_labels.extend(labels.cpu().numpy().flatten())

    return np.array(all_preds), np.array(all_labels)

In [None]:
all_preds, all_labels = evaluate_model(classifer_model, val_dataloader, device='cuda') # NLP Concept: Language Modeling, Question Answering

Evaluating: 100%|██████████| 32/32 [00:07<00:00,  4.49it/s]


In [26]:
accuracy = accuracy_score(all_labels, all_preds)
print(f"Accuracy: {accuracy * 100:.2f}%")

Accuracy: 73.40%


In [None]:
# Accuracy of Baseline 1 using Llama 3 1B model: 73.40% 