In [None]:
# Login to Hugging Face
from huggingface_hub import login
# login()

In [None]:
# Install required packages
!pip install transformers torch accelerate sentencepiece pandas numpy datasets
!pip install huggingface_hub

# ***Libraries***

In [62]:
import getpass
import json
import os
import re
import time
import warnings
from collections import Counter

import numpy as np
import pandas as pd
import requests
import torch
from datasets import Dataset
from tqdm import tqdm
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
warnings.filterwarnings('ignore')

In [5]:
# Pick the compute device: CUDA when torch reports a GPU, otherwise the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
print(f"Using device: {device}")

# Make Google Drive reachable under /content/drive (only works inside Colab).
from google.colab import drive
drive.mount('/content/drive')

# Locations of the evaluation files inside the mounted Drive.
base = '/content/drive/MyDrive/BQA'
test_file_path = base + '/Test.json'
validation_file_path = base + '/Validation.json'


Using device: cpu
Mounted at /content/drive


# ***Load Dataset***

In [68]:
def load_dataset(file_path):
    """Parse the UTF-8 encoded JSON file at ``file_path`` and return its content."""
    with open(file_path, mode='r', encoding='utf-8') as source:
        return json.load(source)

# Parse both evaluation splits (test first, then validation).
test_data, validation_data = (
    load_dataset(split_path) for split_path in (test_file_path, validation_file_path)
)

# Report the top-level length of each parsed object.
for split_label, split_content in (("Test", test_data), ("Validation", validation_data)):
    print(f"{split_label} data loaded: {len(split_content)} samples")


Test data loaded: 1 samples
Validation data loaded: 1 samples


In [None]:
# API provider configuration (Groq and OpenRouter).
# Each entry holds the chat-completions endpoint, the provider-specific model
# ID, and the static headers sent with every request.

API_PROVIDERS = {
    "groq": {
        "url": "https://api.groq.com/openai/v1/chat/completions",
        "model": "llama-3.1-8b-instant",
        "headers": {"Content-Type": "application/json"}
    },
    "openrouter": {
        "url": "https://openrouter.ai/api/v1/chat/completions",
        # OpenRouter lists this model with the "-instruct" suffix; "-instant"
        # is Groq's own identifier and is not a valid OpenRouter model ID.
        "model": "meta-llama/llama-3.1-8b-instruct",
        "headers": {"Content-Type": "application/json"}
    }
}

# Keys are never hard-coded here. They default to environment variables when
# those are set and stay empty otherwise (a provider with an empty key is
# skipped by the request code).
API_KEYS = {
    "groq": os.environ.get("GROQ_API_KEY", ""),
    "openrouter": os.environ.get("OPENROUTER_API_KEY", "")
}

# Provider tried first; updated at runtime to the last provider that answered.
current_provider = "groq"

# ***Model Response Functions***

In [72]:
def get_llama_response(prompt, max_retries=5):
    """Get a response from Llama 3.1, rotating across the configured providers.

    Providers that have an API key are tried in turn, starting with
    ``current_provider``. A provider is retried after rate limiting or a
    transient error, so ``max_retries`` is honoured even when only one
    provider has a key.

    Args:
        prompt: User message sent after the fixed system prompt.
        max_retries: Maximum number of HTTP requests made for this prompt,
            counted across all providers.

    Returns:
        The stripped text of the first choice's message. When no provider has
        a key, or every attempt fails, the Bangla "cannot be answered" string
        is returned instead; callers cannot tell it apart from a real answer.

    Side effects:
        Sets the module-level ``current_provider`` to the provider that
        answered, prints progress messages, and sleeps between failed attempts.
    """
    global current_provider

    fallback_answer = "উত্তর দেওয়া সম্ভব নয়"

    # Last successful provider first, then the rest in declaration order.
    ordered = [current_provider] + [p for p in API_PROVIDERS if p != current_provider]
    providers = []
    for name in ordered:
        key = API_KEYS.get(name)
        # Skip unknown providers and empty / placeholder keys.
        if name in API_PROVIDERS and key and key != f"your_{name}_key_here":
            providers.append(name)

    if not providers:
        print("No API key provided for any provider.")

    messages = [
        {
            "role": "system",
            "content": """You are a expert Bangla question answering assistant. Answer questions based ONLY on the given context.
                    Rules:
                    1. Answer in concise Bangla
                    2. If the question cannot be answered from context, say "উত্তর দেওয়া সম্ভব নয়"
                    3. For list questions, provide all items separated by commas
                    4. For yes/no questions, answer only "হ্যাঁ" or "না\""""
        },
        {
            "role": "user",
            "content": prompt
        }
    ]

    position = 0  # index into `providers` of the provider used for this attempt
    for attempt in range(max_retries):
        if not providers:
            break
        position %= len(providers)
        provider = providers[position]
        config = API_PROVIDERS[provider]
        is_last_attempt = attempt == max_retries - 1

        headers = {
            "Authorization": f"Bearer {API_KEYS[provider]}",
            **config["headers"]
        }
        if provider == "openrouter":
            headers["HTTP-Referer"] = "https://colab.research.google.com"
            headers["X-Title"] = "BanglaRQA Evaluation"

        payload = {
            "model": config["model"],
            "messages": messages,
            "temperature": 0.1,
            "max_tokens": 150,
            "top_p": 0.9
        }

        try:
            response = requests.post(config["url"], headers=headers, json=payload, timeout=45)
            status = response.status_code

            if status == 429:
                # Rate limited: back off exponentially, then move on to the next
                # provider (which is the same one when only one has a key).
                wait_time = min(30, 2 ** attempt)
                print(f"Rate limited on {provider}. Waiting {wait_time}s before the next attempt...")
                position += 1
                if not is_last_attempt:
                    time.sleep(wait_time)
                continue

            if 400 <= status < 500 and status != 408:
                # Bad key, bad model name, malformed request: repeating the same
                # request cannot succeed, so drop this provider for this prompt.
                print(f"Error with {provider}: HTTP {status} (not retryable)")
                providers.pop(position)
                continue

            response.raise_for_status()
            content = response.json()['choices'][0]['message']['content']
            if not isinstance(content, str):
                raise ValueError("response has no text content")

            # Successful response: keep using this provider for later prompts.
            current_provider = provider
            return content.strip()

        except requests.exceptions.RequestException as e:
            print(f"Error with {provider}: {e}")
            position += 1
            if not is_last_attempt:
                time.sleep(min(60, 5 * (attempt + 1)))
        except (KeyError, IndexError, TypeError, ValueError) as e:
            # The body did not have the expected choices/message/content shape.
            print(f"Unexpected response from {provider}: {e}")
            position += 1

    print("All providers failed. Using fallback empty response.")
    return fallback_answer

# ***Generating and Scoring Answers***

In [None]:
def generate_answer(context, question):
    """Ask the model to answer ``question`` from ``context`` and clean the reply.

    Args:
        context: Passage text inserted into the prompt.
        question: Question text inserted into the prompt.

    Returns:
        The model's answer with a leading "Answer:"-style label and one
        trailing danda/period removed. If anything raises while querying or
        cleaning, the Bangla "cannot be answered" string is returned.
    """
    prompt = f"""প্রসঙ্গ: {context}

প্রশ্ন: {question}

উপরের প্রসঙ্গটি পড়ুন এবং প্রশ্নের উত্তর দিন। উত্তরটি সংক্ষিপ্ত এবং সঠিক হতে হবে। যদি প্রসঙ্গ উত্তর না থাকে, "উত্তর দেওয়া সম্ভব নয়" লিখুন।"""

    try:
        answer = get_llama_response(prompt)

        # Strip a leading answer label only when it is followed by a colon.
        # "উত্তর" also means "north" and "Ans" starts ordinary names, so an
        # optional separator would cut the first word off real answers such as
        # "উত্তর কোরিয়া" or "Ansari".
        answer = re.sub(r'^\s*(?:উত্তর|answer|ans)\s*:\s*', '', answer, flags=re.IGNORECASE)
        answer = re.sub(r'[।\.]\s*$', '', answer)  # Remove trailing punctuation
        answer = answer.strip()

        return answer

    except Exception as e:
        # Best effort: one failing item must not abort a whole evaluation run.
        print(f"Error generating answer: {e}")
        return "উত্তর দেওয়া সম্ভব নয়"

def normalize_text(text):
    """Normalize Bangla text for comparison.

    Surrounding whitespace is removed, internal whitespace runs collapse to a
    single space, and the trailing run of dandas, periods and whitespace is
    dropped. ``None`` becomes the empty string.

    Args:
        text: String to normalize, or ``None``.

    Returns:
        The normalized string; applying the function again returns the same
        value.
    """
    if text is None:
        return ""
    text = re.sub(r'\s+', ' ', text.strip())
    # Remove the whole trailing run, whitespace included: removing a single
    # mark would turn "ঢাকা ।" into "ঢাকা " (trailing space), which then fails
    # to match "ঢাকা".
    return re.sub(r'[\s।.]+$', '', text)

def calculate_em(predicted, ground_truth):
    """Exact Match score (1 or 0) on normalized text.

    With a non-empty ground truth the score is 1 only when both normalized
    strings are equal. With an empty ground truth (unanswerable question) the
    score is 1 when the prediction is blank or contains "সম্ভব নয়".
    """
    pred_norm = normalize_text(predicted)
    truth_norm = normalize_text(ground_truth)

    if truth_norm:
        return int(pred_norm == truth_norm)

    # Unanswerable question: credit a refusal or an empty prediction.
    declined = "সম্ভব নয়" in pred_norm
    blank = not pred_norm.strip()
    return 1 if (declined or blank) else 0

def calculate_f1(predicted, ground_truth):
    """Token-level F1 between prediction and ground truth on normalized text.

    Tokens are runs of word/Bangla characters, or single punctuation marks.
    Overlap is counted as a multiset, so repeated tokens are matched one for
    one and an exact match always scores 1.0.

    Args:
        predicted: Model answer (string or ``None``).
        ground_truth: Reference answer; empty or ``None`` means unanswerable.

    Returns:
        A float in [0.0, 1.0]. For an empty ground truth the score is 1.0 when
        the prediction is blank or contains "সম্ভব নয়", otherwise 0.0.
    """
    predicted = normalize_text(predicted)
    ground_truth = normalize_text(ground_truth)

    # Unanswerable question: credit a refusal or an empty prediction only.
    if not ground_truth:
        return 1.0 if ("সম্ভব নয়" in predicted or not predicted.strip()) else 0.0
    if not predicted:
        return 0.0

    # Bangla-aware tokenization: the explicit \u0980-\u09FF range keeps
    # combining vowel signs attached to their word.
    token_pattern = r'[\w\u0980-\u09FF]+|[^\w\s]'
    pred_tokens = re.findall(token_pattern, predicted)
    truth_tokens = re.findall(token_pattern, ground_truth)

    if not pred_tokens or not truth_tokens:
        return 0.0

    # Multiset intersection: each token counts as often as it occurs in both
    # sides. A set intersection would count a repeated token once while the
    # denominators count every occurrence, pushing exact matches below 1.0.
    overlap = sum((Counter(pred_tokens) & Counter(truth_tokens)).values())
    if overlap == 0:
        return 0.0

    precision = overlap / len(pred_tokens)
    recall = overlap / len(truth_tokens)
    return 2 * (precision * recall) / (precision + recall)

# **Evaluation Functions**

In [81]:
def evaluate_dataset(dataset, sample_size=None, request_delay=2):
    """Generate and score an answer for every item of ``dataset``.

    Args:
        dataset: Either an iterable of items, or a dict that holds the item
            list under one of the keys 'data', 'paragraphs', 'questions',
            'items', 'samples' (checked in that order) or, failing that, under
            the first key whose value is a list.
        sample_size: When truthy and the items are a list, only the first
            ``sample_size`` items are evaluated.
        request_delay: Seconds to wait between two consecutive requests, to
            stay under provider rate limits. No wait happens before the first
            item or after the last one.

    Returns:
        A list with one dict per item: truncated context, question, ground
        truth, predicted answer, question/answer type, answerability, EM, F1.

    Raises:
        ValueError: If ``dataset`` is a dict without any list value.
    """
    results = []

    # Locate the list of items inside a dict-shaped dataset.
    if isinstance(dataset, dict):
        preferred_keys = ['data', 'paragraphs', 'questions', 'items', 'samples']
        candidates = [dataset.get(key) for key in preferred_keys] + list(dataset.values())
        actual_data = next((value for value in candidates if isinstance(value, list)), None)
        if actual_data is None:
            raise ValueError("Could not find list data in dataset dictionary")
    else:
        actual_data = dataset

    # Apply sample size to the actual list data
    if sample_size and isinstance(actual_data, list):
        actual_data = actual_data[:sample_size]

    # NOTE(review): items are read as flat dicts with 'context', 'question',
    # 'answer', 'question_type', 'answer_type' and 'is_answerable' keys. Items
    # that keep these fields under other names or nested in a sub-list would be
    # evaluated with empty strings without any error -- confirm against the
    # dataset files.
    for index, item in enumerate(tqdm(actual_data, desc="Evaluating")):
        # Pace the requests; nothing to wait for before the very first one.
        if index and request_delay:
            time.sleep(request_delay)

        if isinstance(item, dict):
            is_answerable = item.get('is_answerable', True)
            context = item.get('context', '')
            question = item.get('question', '')
            # Unanswerable items are scored against an empty reference.
            ground_truth = item.get('answer', '') if is_answerable else ''
            question_type = item.get('question_type', 'unknown')
            answer_type = item.get('answer_type', 'unknown')
        else:
            # Non-dict items are still evaluated, with empty inputs.
            context = ''
            question = ''
            ground_truth = ''
            question_type = 'unknown'
            answer_type = 'unknown'
            is_answerable = True

        # Generate answer
        predicted_answer = generate_answer(context, question)

        # Calculate scores
        em = calculate_em(predicted_answer, ground_truth)
        f1 = calculate_f1(predicted_answer, ground_truth)

        results.append({
            # Only a 100-character preview of the context is stored.
            'context': context[:100] + "..." if len(context) > 100 else context,
            'question': question,
            'ground_truth': ground_truth,
            'predicted_answer': predicted_answer,
            'question_type': question_type,
            'answer_type': answer_type,
            'is_answerable': is_answerable,
            'em': em,
            'f1': f1
        })

    return results

# **Results Functions**

In [84]:
def _print_breakdown(df, column, values, label=str):
    """Print EM/F1 for each of ``values`` in ``column`` that has at least one row."""
    for value in values:
        subset = df[df[column] == value]
        if len(subset) == 0:
            # The mean of an empty group is NaN; skip instead of printing "nan%".
            continue
        em_score = subset['em'].mean() * 100
        f1_score = subset['f1'].mean() * 100
        print(f"{label(value)} (n={len(subset)}): EM={em_score:.2f}%, F1={f1_score:.2f}%")


def analyze_results(results):
    """Print overall and per-group EM/F1 and return the results as a DataFrame.

    Args:
        results: List of per-item dicts with at least the keys 'em', 'f1',
            'question_type', 'answer_type' and 'is_answerable'.

    Returns:
        ``pandas.DataFrame`` built from ``results``. For empty ``results`` a
        notice is printed and the empty frame is returned.
    """
    df = pd.DataFrame(results)

    if df.empty:
        # No rows means no 'em'/'f1' columns to average.
        print("No results to analyze.")
        return df

    # Overall scores
    overall_em = df['em'].mean() * 100
    overall_f1 = df['f1'].mean() * 100

    print(f"Overall EM: {overall_em:.2f}%")
    print(f"Overall F1: {overall_f1:.2f}%")
    print()

    # By question type (only these four labels are reported)
    print("Performance by Question Type:")
    _print_breakdown(df, 'question_type', ['factoid', 'causal', 'confirmation', 'list'])
    print()

    # By answer type (only these three labels are reported)
    print("Performance by Answer Type:")
    _print_breakdown(df, 'answer_type', ['single span', 'multiple spans', 'yes/no'])
    print()

    # Answerable vs Unanswerable
    print("Performance by Answerability:")
    _print_breakdown(
        df,
        'is_answerable',
        [True, False],
        label=lambda answerable: "Answerable" if answerable else "Unanswerable",
    )

    return df

In [None]:
# Set your API keys before running.
# getpass hides what is typed, so a key is not echoed into the notebook output
# (and saved with it). Surrounding whitespace from copy-paste is removed because
# it would corrupt the Authorization header. Pressing enter keeps the current value.
API_KEYS["groq"] = getpass.getpass("Enter your Groq API key: ").strip() or API_KEYS["groq"]
API_KEYS["openrouter"] = getpass.getpass("Enter your OpenRouter API key (or press enter to skip): ").strip() or API_KEYS["openrouter"]

# Test the API connection
print("Testing API connection...")
test_response = get_llama_response("হ্যালো, এটি একটি পরীক্ষা।")
print(f"API test response: {test_response}")


In [None]:
# Show how the loaded validation object is organised: its type, its top-level
# keys, and for every list-valued key the size and the shape of its first item.
print("Dataset type:", type(validation_data))
if isinstance(validation_data, dict):
    print("Dictionary keys:", validation_data.keys())
    for field in validation_data:
        entries = validation_data[field]
        if not isinstance(entries, list):
            continue
        print(f"List '{field}' has {len(entries)} items")
        if len(entries) == 0:
            continue
        first_entry = entries[0]
        print("First item type:", type(first_entry))
        if isinstance(first_entry, dict):
            print("First item keys:", first_entry.keys())

# **Evaluations for Answerable/Unanswerable Questions**

In [None]:
# Score a small prefix of each split so the run stays within provider rate limits.
VALIDATION_SAMPLE_SIZE = 10  # start small
TEST_SAMPLE_SIZE = 15

print("\nEvaluating on Validation set...")
validation_results = evaluate_dataset(validation_data, sample_size=VALIDATION_SAMPLE_SIZE)
validation_df = analyze_results(validation_results)

print("\nEvaluating on Test set...")
test_results = evaluate_dataset(test_data, sample_size=TEST_SAMPLE_SIZE)
test_df = analyze_results(test_results)

In [88]:
# Persist evaluation output
def save_results(results, filename):
    """Write ``results`` as indented, non-ASCII-preserving JSON to ``<base>/<filename>``."""
    target_path = f'{base}/{filename}'
    with open(target_path, mode='w', encoding='utf-8') as out_file:
        json.dump(results, out_file, indent=2, ensure_ascii=False)

# Write both result sets (validation first, then test) into the Drive folder.
for split_results, output_name in (
    (validation_results, 'llama31_validation_results.json'),
    (test_results, 'llama31_test_results.json'),
):
    save_results(split_results, output_name)

print("\nEvaluation completed! Results saved to Google Drive.")


Evaluation completed! Results saved to Google Drive.
