In [4]:
import json
import torch
import numpy as np
import pandas as pd
from tqdm import tqdm
import matplotlib.pyplot as plt
import seaborn as sns
from collections import Counter
from datasets import Dataset
from rouge_score import rouge_scorer
import textstat
import requests
import time
import os
import re
import tiktoken

In [None]:
# API credentials are read from the environment so that secrets never have to
# be typed into (and saved with) the notebook. Export GROQ_API_KEY / HF_TOKEN
# before starting the kernel, or set them through os.environ in a private cell.
GROQ_API_KEY = os.environ.get("GROQ_API_KEY", "")

HF_TOKEN = os.environ.get("HF_TOKEN", "")

# Every request is sent with "Bearer {GROQ_API_KEY}", so an empty key cannot work.
if not GROQ_API_KEY:
    print("Warning: GROQ_API_KEY is not set; API requests will be rejected.")

In [6]:
# Configuration
# Model identifier sent in the "model" field of every chat-completion request.
MODEL_NAME = "llama-3.1-8b-instant"

In [7]:
# Set device
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Using device: {device}")

# Mount Google Drive when running on Colab. Outside Colab the google.colab
# package does not exist, so mounting is skipped and the dataset folder is
# taken from BQA_DATA_DIR (or the default Drive path below).
try:
    from google.colab import drive
except ImportError:
    drive = None
else:
    drive.mount('/content/drive')

# Set paths (the BQA_DATA_DIR environment variable overrides the dataset folder)
base = os.environ.get('BQA_DATA_DIR', '/content/drive/MyDrive/BQA')
test_file_path = f'{base}/Test.json'
validation_file_path = f'{base}/Validation.json'

Using device: cpu
Mounted at /content/drive


In [9]:
# Load dataset function
def load_dataset(file_path):
    """Read the UTF-8 encoded JSON file at ``file_path`` and return the parsed object."""
    with open(file_path, mode='r', encoding='utf-8') as handle:
        return json.load(handle)

# Load datasets
# Both files are parsed completely into memory by load_dataset.
test_data = load_dataset(test_file_path)
validation_data = load_dataset(validation_file_path)

# NOTE: the figures printed here count the entries under the top-level 'data'
# key. extract_qa_pairs treats each entry as one context holding several
# questions, so the number of questions can be larger than these counts.
print(f"Test data loaded: {len(test_data['data'])} samples")
print(f"Validation data loaded: {len(validation_data['data'])} samples")

Test data loaded: 300 samples
Validation data loaded: 300 samples


In [10]:
# Extract contexts, questions, and answers from the data structure
def extract_qa_pairs(data):
    """Flatten the nested article/question structure into six parallel lists.

    Every question under ``data['data'][*]['qas']`` yields one entry in each
    returned list; the article's context is repeated for each of its questions.

    Returns:
        (contexts, questions, answers, question_types, answer_types,
        is_answerable). ``answers`` holds the first element of the question's
        ``answer_text`` or ``""`` when that field is empty; ``is_answerable``
        is True exactly when ``answer_text`` is non-empty. Missing
        ``question_type`` / ``answer_type`` fall back to ``'factoid'`` and
        ``'single span'``.
    """
    rows = []
    for article in data['data']:
        passage = article['context']
        for qa in article['qas']:
            question = qa['question_text']
            answer_info = qa['answers']
            answer_texts = answer_info['answer_text']
            has_answer = bool(answer_texts)
            rows.append((
                passage,
                question,
                answer_texts[0] if has_answer else "",
                qa.get('question_type', 'factoid'),
                answer_info.get('answer_type', 'single span'),
                has_answer,
            ))

    # Transpose the per-question rows into one list per field.
    contexts, questions, answers, question_types, answer_types, is_answerable = (
        [row[column] for row in rows] for column in range(6)
    )
    return contexts, questions, answers, question_types, answer_types, is_answerable

In [11]:
# Extract data
# Each call returns six parallel lists with one entry per question:
# contexts, questions, answers, question types, answer types, answerable flags.
test_contexts, test_questions, test_answers, test_question_types, test_answer_types, test_is_answerable = extract_qa_pairs(test_data)
val_contexts, val_questions, val_answers, val_question_types, val_answer_types, val_is_answerable = extract_qa_pairs(validation_data)

print(f"Extracted {len(test_questions)} test questions")
print(f"Extracted {len(val_questions)} validation questions")

Extracted 1493 test questions
Extracted 1484 validation questions


In [12]:
# Debug: Show sample data
print("\n=== SAMPLE DATA ===")
# Preview at most the first three extracted questions with their metadata.
preview_rows = zip(
    test_contexts[:3],
    test_questions[:3],
    test_answers[:3],
    test_answer_types[:3],
    test_is_answerable[:3],
)
for sample_no, (sample_ctx, sample_q, sample_a, sample_type, sample_ok) in enumerate(preview_rows, start=1):
    print(f"Sample {sample_no}:")
    print(f"Context: {sample_ctx[:100]}...")
    print(f"Question: {sample_q}")
    print(f"Answer: '{sample_a}'")
    print(f"Answer type: {sample_type}")
    print(f"Is answerable: {sample_ok}")
    print()


=== SAMPLE DATA ===
Sample 1:
Context: অক্ষয় তৃতীয়া হল চান্দ্র বৈশাখ মাসের শুক্লাতৃতীয়া অর্থাৎ শুক্লপক্ষের তৃতীয়া তিথি। হিন্দু ও জৈন ধর...
Question: অক্ষয় তৃতীয়া কী ?
Answer: 'চান্দ্র বৈশাখ মাসের শুক্লাতৃতীয়া অর্থাৎ শুক্লপক্ষের তৃতীয়া তিথি'
Answer type: ['single span', 'single span']
Is answerable: True

Sample 2:
Context: অক্ষয় তৃতীয়া হল চান্দ্র বৈশাখ মাসের শুক্লাতৃতীয়া অর্থাৎ শুক্লপক্ষের তৃতীয়া তিথি। হিন্দু ও জৈন ধর...
Question: এই শুভদিনে কে জন্ম নিয়েছিলেন ?
Answer: 'বিষ্ণুর ষষ্ঠ অবতার পরশুরাম'
Answer type: ['single span', 'single span']
Is answerable: True

Sample 3:
Context: অক্ষয় তৃতীয়া হল চান্দ্র বৈশাখ মাসের শুক্লাতৃতীয়া অর্থাৎ শুক্লপক্ষের তৃতীয়া তিথি। হিন্দু ও জৈন ধর...
Question: এই দিনে মহাভারত রচনা আরম্ভ করেন কারা ?
Answer: 'বেদব্যাস ও গণেশ'
Answer type: ['single span', 'single span']
Is answerable: True



In [13]:
# Token counting function
def count_tokens(text):
    """Return an integer estimate of the number of tokens in ``text``.

    The count uses tiktoken's ``cl100k_base`` encoding. If the tokenizer
    cannot be used for any reason, the estimate falls back to
    ``1.3 * number of whitespace-separated words``, rounded to the nearest
    integer so that the return type is an ``int`` on both paths.
    """
    try:
        encoding = tiktoken.get_encoding("cl100k_base")
        return len(encoding.encode(text))
    except Exception:
        # Catch Exception rather than using a bare except so Ctrl-C
        # (KeyboardInterrupt) still stops a long evaluation run.
        # Fallback: approximate token count
        return round(len(text.split()) * 1.3)

In [14]:
# Normalization function
def normalize_text(s):
    """Canonicalise ``s`` for answer comparison.

    Steps, in order: lower-case, delete every character listed in
    ``string.punctuation``, replace the standalone English words
    "a", "an" and "the" with a space, then collapse all whitespace runs to a
    single space and trim the ends.

    Only ASCII punctuation is removed; other marks (for example the Bengali
    danda "।") are kept as they are.
    """
    import string, re

    lowered = s.lower()
    without_punct = lowered.translate(str.maketrans("", "", string.punctuation))
    without_articles = re.sub(r"\b(a|an|the)\b", " ", without_punct, flags=re.UNICODE)
    return " ".join(without_articles.split())

In [15]:
# Evaluation metrics
def compute_exact_match(prediction, truth):
    """Return 1 if ``prediction`` matches ``truth`` after normalisation, else 0.

    Empty values (``""`` or ``None``) stand for "no answer". When both sides
    are empty the model abstained correctly and the score is 1; when only one
    side is empty the score is 0.
    """
    if not prediction or not truth:
        # Nothing to normalise: the pair matches only if both are empty.
        return int(not prediction and not truth)
    return int(normalize_text(prediction) == normalize_text(truth))

def compute_f1(prediction, truth):
    """Token-level F1 between ``prediction`` and ``truth`` as a float in [0, 1].

    Both strings are normalised with ``normalize_text`` and split on
    whitespace. Empty values (``""`` or ``None``) stand for "no answer": two
    empty values score 1.0 (correct abstention), exactly one empty value
    scores 0.0. The same rule applies when a string becomes empty after
    normalisation.
    """
    if not prediction or not truth:
        return float(not prediction and not truth)

    pred_tokens = normalize_text(prediction).split()
    truth_tokens = normalize_text(truth).split()

    if not pred_tokens or not truth_tokens:
        # At least one side normalised to nothing: match only if both did.
        return float(pred_tokens == truth_tokens)

    # Multiset intersection counts each shared token at most as often as it
    # occurs on both sides.
    overlap = sum((Counter(pred_tokens) & Counter(truth_tokens)).values())
    if overlap == 0:
        return 0.0

    precision = overlap / len(pred_tokens)
    recall = overlap / len(truth_tokens)
    return 2 * precision * recall / (precision + recall)

In [16]:
def get_updated_f1(prediction, truth):
    """F1 for multi-span answers written as ``;``-separated spans.

    Both strings are split on ``;`` (blank pieces are dropped) and every
    gold span is scored against every predicted span with ``compute_f1``.
    Gold and predicted spans are then paired one-to-one so that the total F1
    is maximal, which makes the result independent of the order in which the
    spans are listed. The paired scores are averaged over
    ``max(#gold, #predicted)``, so missing or surplus spans count as zero.

    Returns 0 when either input is empty or contains no non-blank span.
    """
    from scipy.optimize import linear_sum_assignment

    if not prediction or not truth:
        return 0

    splitted_prediction = [p.strip() for p in prediction.split(';') if p.strip()]
    splitted_truth = [t.strip() for t in truth.split(';') if t.strip()]

    if len(splitted_truth) == 0 or len(splitted_prediction) == 0:
        return 0

    scores = np.zeros([len(splitted_truth), len(splitted_prediction)])
    for gold_index, gold_item in enumerate(splitted_truth):
        for pred_index, pred_item in enumerate(splitted_prediction):
            scores[gold_index, pred_index] = compute_f1(pred_item, gold_item)

    # linear_sum_assignment minimises cost, so negate the scores to obtain the
    # pairing with the highest total F1.
    gold_rows, pred_cols = linear_sum_assignment(-scores)

    max_scores = np.zeros(max(len(splitted_truth), len(splitted_prediction)))
    for gold_index, pred_index in zip(gold_rows, pred_cols):
        max_scores[gold_index] = scores[gold_index, pred_index]

    return float(np.mean(max_scores))

In [None]:
# Improved API call with rate limiting and retries
def call_groq_api(prompt, max_tokens=256, max_retries=5):
    """Send ``prompt`` to the Groq chat-completions API and return the reply text.

    Args:
        prompt: Full user message to send.
        max_tokens: Upper bound on generated tokens.
        max_retries: Maximum number of request attempts.

    Returns:
        The cleaned answer string, or ``""`` when no usable reply was obtained
        (all attempts failed, or the request was rejected with a client error).

    Behaviour:
        * HTTP 429 waits for the server's ``retry-after`` value when it is
          numeric, otherwise ``2 ** attempt`` seconds, then retries.
        * Any other 4xx response is not retried, because repeating the same
          request (bad key, bad payload, unknown model) cannot succeed.
        * Other failures are retried with exponential backoff.
        * A leading "Answer:" / "উত্তর:" label and all double quotes are
          removed from the reply.
    """
    # Groq's OpenAI-compatible chat-completions endpoint.
    url = "https://api.groq.com/openai/v1/chat/completions"

    headers = {
        "Authorization": f"Bearer {GROQ_API_KEY}",
        "Content-Type": "application/json"
    }

    payload = {
        "model": MODEL_NAME,
        "messages": [{"role": "user", "content": prompt}],
        "temperature": 0.1,
        "max_tokens": max_tokens,
        "top_p": 0.9
    }

    for attempt in range(max_retries):
        backoff = 2 ** attempt  # Exponential backoff
        is_last_attempt = attempt == max_retries - 1
        try:
            response = requests.post(url, headers=headers, json=payload, timeout=30)

            if response.status_code == 429:
                # Rate limit exceeded - wait and retry
                if is_last_attempt:
                    print("Rate limit hit on the final attempt; giving up.")
                    return ""
                try:
                    wait_time = float(response.headers.get("retry-after"))
                except (TypeError, ValueError):
                    wait_time = backoff
                print(f"Rate limit hit. Waiting {wait_time} seconds...")
                time.sleep(wait_time)
                continue

            if 400 <= response.status_code < 500:
                print(f"Request rejected with HTTP {response.status_code}: {response.text[:200]}")
                return ""

            response.raise_for_status()
            result = response.json()
            answer = result['choices'][0]['message']['content'].strip()

            # Clean up the response: the prompt ends with "উত্তর:", so the
            # model may echo that label (or its English form) before the answer.
            answer = re.sub(r'^(?:Answer|উত্তর)\s*:\s*', '', answer)
            answer = answer.replace('"', '')
            return answer.strip()

        except Exception as e:
            # Best effort: a single failed request must not abort the whole
            # evaluation loop, so report it and retry or give up.
            print(f"Attempt {attempt + 1} failed: {e}")
            if is_last_attempt:
                return ""
            time.sleep(backoff)

    return ""

def get_llm_answer(context, question):
    """Build the few-shot Bangla prompt for one question and return the model's reply.

    The prompt consists of a fixed instruction with three worked examples
    followed by the given ``context`` and ``question``. The third example shows
    the phrase the model should produce when the context does not contain the
    answer. The finished prompt is sent through ``call_groq_api`` and its
    return value is passed back unchanged.
    """
    # Fixed instruction block and demonstrations (kept verbatim).
    few_shot_examples = """\
প্রসঙ্গ থেকে প্রশ্নের উত্তর দিন। উত্তর শুধুমাত্র প্রসঙ্গ থেকে নিতে হবে।

উদাহরণ:
প্রসঙ্গ: বাংলাদেশের রাজধানী ঢাকা। ঢাকা একটি বড় শহর।
প্রশ্ন: বাংলাদেশের রাজধানী কোথায়?
উত্তর: ঢাকা

প্রসঙ্গ: সূর্য পূর্বে উঠে পশ্চিমে অস্ত যায়।
প্রশ্ন: সূর্য কোথায় অস্ত যায়?
উত্তর: পশ্চিমে

প্রসঙ্গ: বাঘ জঙ্গলে বাস করে।
প্রশ্ন: হাতি কোথায় বাস করে?
উত্তর: উত্তর পাওয়া যায়নি

এখন উত্তর দিন:
"""

    prompt = f"{few_shot_examples}\nপ্রসঙ্গ: {context}\nপ্রশ্ন: {question}\nউত্তর: "

    # Prompts estimated above 1000 tokens get an extra two-second pause
    # before the request is sent.
    if count_tokens(prompt) > 1000:
        time.sleep(2)  # Longer delay for larger prompts

    return call_groq_api(prompt)

In [18]:
# Evaluation function with rate limiting
def evaluate_model(contexts, questions, answers, answer_types, sample_limit=5):
    """Query the LLM for the first ``sample_limit`` questions and score each reply.

    Args:
        contexts, questions, answers, answer_types: Parallel lists with one
            entry per question. An ``answer_types`` entry may be a single
            string or a list of strings.
        sample_limit: Number of leading samples to evaluate; ``None``
            evaluates every sample.

    Returns:
        Dict with the parallel lists ``predictions``, ``em_scores``,
        ``f1_scores`` and ``readability_scores``.
    """
    results = {
        'predictions': [],
        'em_scores': [],
        'f1_scores': [],
        'readability_scores': []
    }

    n_samples = len(contexts) if sample_limit is None else min(sample_limit, len(contexts))

    # Process each sample with careful rate limiting
    for i in tqdm(range(n_samples)):
        context = contexts[i]
        question = questions[i]
        true_answer = answers[i]

        # Get prediction from LLM
        prediction = get_llm_answer(context, question)

        # Handle unanswerable questions: the prompt's "no answer" phrase and
        # blank replies are both mapped to the empty string.
        if "উত্তর পাওয়া যায়নি" in prediction or not prediction.strip():
            prediction = ""

        # Calculate metrics
        em = compute_exact_match(prediction, true_answer)

        # The dataset stores answer_type as a list, so comparing the list with
        # a string would never match. The reference answer is the first entry
        # of answer_text, so the first answer type is used here.
        # NOTE(review): assumes answer_type is parallel to answer_text - confirm.
        answer_type = answer_types[i]
        if isinstance(answer_type, (list, tuple)):
            answer_type = answer_type[0] if answer_type else ''

        if answer_type == 'multiple spans':
            f1 = get_updated_f1(prediction, true_answer)
        else:
            f1 = compute_f1(prediction, true_answer)

        # Calculate readability score
        # NOTE(review): Flesch Reading Ease is defined for English text;
        # confirm that it is meaningful for Bangla predictions.
        readability = textstat.flesch_reading_ease(prediction) if prediction else 0

        # Store results
        results['predictions'].append(prediction)
        results['em_scores'].append(em)
        results['f1_scores'].append(f1)
        results['readability_scores'].append(readability)

        # Add strategic delay to avoid rate limits; no request follows the
        # last sample, so there is nothing to wait for after it.
        if i < n_samples - 1:
            time.sleep(3 if i % 2 == 0 else 1)

    return results

In [19]:
# Run evaluation on a smaller subset to avoid rate limits
# sample_limit=5 scores only the first five test questions. evaluate_model
# sleeps between requests, so the run time grows with this limit.
print("Starting evaluation on test set (small subset)...")
test_results = evaluate_model(test_contexts, test_questions, test_answers, test_answer_types, sample_limit=5)

Starting evaluation on test set (small subset)...


100%|██████████| 5/5 [00:29<00:00,  5.98s/it]


In [22]:
# Create detailed results dataframe
# Only the evaluated prefix of the test set has predictions, so the input
# columns are cut to the same length as the result lists.
n_evaluated = len(test_results['predictions'])
detailed_df = pd.DataFrame({
    'Context': test_contexts[:n_evaluated],
    'Question': test_questions[:n_evaluated],
    'True_Answer': test_answers[:n_evaluated],
    'Prediction': test_results['predictions'],
    'EM_Score': test_results['em_scores'],
    'F1_Score': test_results['f1_scores'],
    'Readability': test_results['readability_scores'],
})

In [23]:
# Save results
# One name for the output file keeps the written path and the message in sync.
results_csv_path = 'llama_banglarqa_test_results.csv'
detailed_df.to_csv(results_csv_path, index=False, encoding='utf-8')
print(f"\nDetailed results saved to '{results_csv_path}'")


Detailed results saved to 'llama_banglarqa_test_results.csv'


In [None]:
# Performance analysis
# Aggregate the per-question scores of the evaluated subset so this cell works
# on a fresh kernel. The averages are fractions in [0, 1]; the baseline in
# `banglat5_results` must be expressed on the same scale.
em_values = test_results['em_scores']
f1_values = test_results['f1_scores']
overall_em = float(np.mean(em_values)) if em_values else 0.0
overall_f1 = float(np.mean(f1_values)) if f1_values else 0.0
print(f"Overall EM: {overall_em:.4f} | Overall F1: {overall_f1:.4f} ({len(em_values)} samples)")

# The BanglaT5 baseline has to be supplied in an earlier cell as
# banglat5_results = {'EM': ..., 'F1': ...}; it is not invented here.
try:
    baseline = banglat5_results
except NameError:
    baseline = None

if overall_em <= 0 or overall_f1 <= 0:
    print("\n No valid scores obtained. Please check API configuration.")
elif not baseline or not baseline.get('EM') or not baseline.get('F1'):
    # A missing or zero baseline cannot be used as the divisor below.
    print("\n BanglaT5 baseline (banglat5_results) is missing or zero; skipping comparison.")
else:
    improvement_em = ((overall_em - baseline['EM']) / baseline['EM']) * 100
    improvement_f1 = ((overall_f1 - baseline['F1']) / baseline['F1']) * 100

    print(f"\n=== PERFORMANCE IMPROVEMENT ===")
    print(f"EM Score Improvement: {improvement_em:.2f}%")
    print(f"F1 Score Improvement: {improvement_f1:.2f}%")

    if improvement_em > 0 and improvement_f1 > 0:
        print("SUCCESS: Llama-3.1-8B outperforms BanglaT5!")
    else:
        print("Llama-3.1-8B needs further optimization")