In [None]:
############################################
# Team : RAGrats
# Team Members : Ali Asgar Padaria, Param Patel, Meet Zalavadiya
# 
# Code Description : This file contains the code for the Improvement 1 Model
#                    It implements the "Debating Model" concept. Two smaller models (Flan-t5-large & Qwen3-0.6B) than the basline 1 (llama 3 1B) are used as generators 
#                    for the explainaiton of the answer, these models debate till they reach a consensus or hit a roadblock of maximum debates iterations.
#
# NLP Concepts Usage: Tokenization, Embeddings (via retriever in vectorDB_generator.py), Language Modeling, Question Answering
#
# System : GCP Server L4 GPU
#############################################

In [None]:
# Import Necessary Libraries

import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
from datasets import load_dataset, load_from_disk
from huggingface_hub import login
from tqdm import tqdm
from dotenv import load_dotenv
import os
load_dotenv()
token = os.getenv("HF_TOKEN")

login(token=token)
import torch.nn.functional as F

from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import DataLoader
from sklearn.preprocessing import LabelEncoder
import numpy as np
from transformers import pipeline

from transformers import (
    RobertaTokenizer,
    AutoConfig,
    AutoModel,
    AutoModelForSequenceClassification,
    AutoModelForSeq2SeqLM
)

from sklearn.metrics import accuracy_score
import json

import faiss

Note: Environment variable`HF_TOKEN` is set and is the current active token independently from the token you've just configured.


In [None]:
assert torch.cuda.is_available(), "GPU not available!" # check GPU

In [3]:
# Loading Dataset Stuff

# The retreived pairs for the validation set have already been created in the vectorDB_generator.ipynb file in the same folder

validation_dataset = load_from_disk("../files/val_dataset")

with open("../files/val_retrieved_pairs_base_1.json", "r") as f:
    retreived_pairs = json.load(f)

# dataset # holds final dataset
context_map = {item["question"]: item["retrieved_contexts"] for item in retreived_pairs}

def add_retrieved_contexts(example):
    example["retrieved_contexts"] = context_map[example["question"]]
    return example

validation_dataset = validation_dataset.map(add_retrieved_contexts)

In [None]:
# Load the model,
# For the debating model architecture, we use the T5 model

model_id_flan = "google/flan-t5-large"

tokenizer_flan = AutoTokenizer.from_pretrained(model_id_flan)
model_flan = AutoModelForSeq2SeqLM.from_pretrained(
    model_id_flan,
    torch_dtype=torch.float16,
    device_map="auto"  # loads onto GPU automatically
)

pipe_flan = pipeline(
    "text2text-generation",
    model=model_flan,
    tokenizer=tokenizer_flan,
)

model_id_opt = "Qwen/Qwen3-0.6B"

tokenizer_opt = AutoTokenizer.from_pretrained(model_id_opt)
model_opt = AutoModelForCausalLM.from_pretrained( # NLP Concept : Language Modeling
    model_id_opt,
    device_map="auto",
)

pipe_opt = pipeline(
    "text-generation",
    model=model_opt,
    tokenizer=tokenizer_opt,
    pad_token_id=tokenizer_opt.eos_token_id 
)

Device set to use cuda:0
Device set to use cuda:0


In [12]:
# generate base prompts
def generate_prompts(dataset):
    prompts = []

    for data in dataset:
        question = data['question']
        contexts = data['retrieved_contexts']
        context = "\n".join([f"Context {i+1}: {c}" for i, c in enumerate(contexts)])
        prompt = (
            f"Contexts:\n{context}\n\n"
            "Based on the contexts above, answer the question below in yes or no.\n"
            "Provide a short explanation that justifies your answer using evidence from the context.\n"
            f"Question: {question}?\n"
            "Answer: "
        )
        prompts.append(prompt)

    
    return prompts

prompts = generate_prompts(validation_dataset)

In [None]:
# Initialize the Roberta model for classification

tokenizer = AutoTokenizer.from_pretrained("roberta-base") # NLP Concept : Tokenization and Embeddings

# Define label mapping
label_map = {"yes": 1, "no": 0}

classifer_model = AutoModelForSequenceClassification.from_pretrained(
        "roberta-base",
        num_labels=1,
)
classifer_model.load_state_dict(torch.load("../files/roberta_classifier.pt", map_location=torch.device("cuda")))  # or "cuda"
classifer_model.eval()

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  classifer_model.load_state_dict(torch.load("../files/roberta_classifier.pt", map_location=torch.device("cuda")))  # or "cuda"


RobertaForSequenceClassification(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(50265, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0-11): 12 x RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
         

In [None]:
def get_label(input_prompt):
    with torch.no_grad():
        # move the input to GPU
        with torch.amp.autocast(device_type="cuda", dtype=torch.float16):
            inputs = tokenizer(
                input_prompt,
                truncation=True,
                max_length=1024,
                return_tensors="pt"
            )
        # inputs = {k: torch.tensor(v).unsqueeze(0).to("cuda") for k, v in inputs.items()}
            outputs = classifer_model(**inputs)
            logits = outputs.logits
            probs = torch.sigmoid(logits)  # convert logits to probabilities
            pred = (probs > 0.5).long()

        return int(pred.cpu().squeeze().numpy())

def generate_answer(prompt, pipe):

    if pipe == pipe_flan:
        generated = pipe(prompt, max_length=150)
    else:
        generated = pipe(prompt, max_new_tokens=150, return_full_text=False, do_sample=True, temperature=0.7, top_p=0.9, top_k=50)
    answer = generated[0]['generated_text']
    return answer

In [None]:
def generate(dataset, start_id = 0, end_id =100,  max_iter = 3):
    all_preds = []
    all_labels = []
    iteration_counter = 0
    item_counter = 0
    flag_encountered = 0
    for data in tqdm(dataset.select(range(start_id, end_id))):
        item_counter += 1
        question = data['question']
        contexts = data['retrieved_contexts']
        context = "\n".join([f"Context {i+1}: {c}" for i, c in enumerate(contexts)])
        prompt = (
            f"Contexts:\n{context}\n\n"
            "Based on the contexts above, answer the question below in yes, no, maybe.\n"
            "Provide a short explanation that justifies your answer using evidence from the context.\n"
            f"Question: {question}?\n"
            "Answer: "
        )
        all_labels.append(label_map[data['final_decision']])
        flag = 0
        prompt_1 = prompt
        prompt_2 = prompt
        for _ in range(max_iter):
            iteration_counter += 1
            # First model's answer
            answer_1 = generate_answer(prompt_1, pipe_flan)
            # Second model's answer
            answer_2 = generate_answer(prompt_2, pipe_opt)
            

            # compare the RoBERTa labels of both models
            label_1 = get_label(f"{question}\n{answer_1}")
            label_2 = get_label(f"{question}\n{answer_2}")

            if label_1 == label_2:
                all_preds.append(label_1)
                flag = 1
                break
            
            ######
            # Debate Mechanism | when labels do not match
            prompt_1 += f"\n\nModel 2 said:\n{answer_2}\n\nDo you want to change your answer?"
            prompt_2 += f"\n\nModel 1 said:\n{answer_1}\n\nDo you want to change your answer?"
            ######
        if flag == 0:
            # default mode
            flag_encountered += 1
            final_label = get_label(f"{question}\n{answer_1}\n{answer_2}")
            all_preds.append(final_label)
    
    return all_preds, all_labels, iteration_counter, item_counter, flag_encountered



In [39]:
all_preds = []
all_labels = []
iteration_counter = 0
item_counter = 0
flag_encountered = 0

In [None]:
# with open(f"validation_results_.json", "w") as f:
#     json.dump({"check":"check"}, f)

In [40]:
for i in range(0,500, 100):
    start_id = i
    end_id = i + 100
    preds, labels, iteration_counter_, item_counter_, flag_encountered_ = generate(validation_dataset, start_id = start_id, end_id=end_id, max_iter=3)
    all_preds.extend(preds)
    all_labels.extend(labels) 
    iteration_counter += iteration_counter_
    item_counter += item_counter_
    flag_encountered += flag_encountered_

    # save all stuff in a json file
    with open(f"validation_results_.json", "w") as f:
        json.dump({
            "all_preds": all_preds,
            "all_labels": all_labels,
            "iteration_counter": iteration_counter,
            "item_counter": item_counter,
            "flag_encountered": flag_encountered
        }, f)


  0%|          | 0/100 [00:00<?, ?it/s]

100%|██████████| 100/100 [22:15<00:00, 13.35s/it]
100%|██████████| 100/100 [22:11<00:00, 13.31s/it]
100%|██████████| 100/100 [21:01<00:00, 12.62s/it]
100%|██████████| 100/100 [20:39<00:00, 12.39s/it]
100%|██████████| 100/100 [19:56<00:00, 11.96s/it]


In [42]:
print("Average Debate Iterations :", iteration_counter/500)

Average Debate Iterations : 1.55


Average Iterations per prompt were 1.55 which shows models did disagree on certain answers and they debated till they reached some sort of concensus

In [43]:
accuracy = accuracy_score(all_labels, all_preds)
print(f"Accuracy: {accuracy * 100:.2f}%")

Accuracy: 69.80%


Accuracy Achieved is 69.80%, which is around 3% lower than the original larger model. We believe it is because the smaller models even though they debate are not made for this task
If we can scale up both the baselines and improved models, in an example case using 8B model for baseline and using 1B or 3B Llama models as improved debating consensus might very well outperform the 8B model but we were not able to try that due to resourse and time constrains these large models have.