In [None]:
!pip install --upgrade datasets

import json
import math
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.colors as mcolors
import ast
import nltk
import re

from collections import defaultdict
from nltk.corpus import stopwords
from huggingface_hub import login
from datasets import Dataset, DatasetDict, load_dataset
from tqdm import tqdm
from itertools import combinations

# Fetch the NLTK 'stopwords' corpus, then keep the English list as a set so
# the membership tests in the helpers below are cheap.
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))


In [None]:
# Load the dataset by its Hub identifier and materialise its 'rawcases' entry
# (presumably a split - confirm) as a pandas DataFrame for row-wise scoring.
dataset = load_dataset("Ramitha/alqa-slsc-newsqa-40-ig")
df = pd.DataFrame(dataset['rawcases'])

In [None]:
# Share of the ranked words kept by top_k_words, which returns
# int(n * token_cut_off_ratio) + 1 entries out of n candidate words.
token_cut_off_ratio = 0.33

In [None]:
def top_k_words(token_probs, original_text, cut_off_ratio=None, stopword_set=None):
    """Rank the words of ``original_text`` by summed token score and keep the top share.

    Sub-word tokens are merged back into words: a token that begins with a
    word-start marker (U+2581 or U+0120) opens a new word, any other token
    extends the current one. Only the ASCII letters and digits of each token
    are kept, and the scores of repeated words are added together.

    Args:
        token_probs: Iterable of ``(token, score)`` pairs, or the string repr
            of such a list (parsed with ``ast.literal_eval``). Entries that
            are not 2-item lists/tuples are skipped.
        original_text: Text whose lower-cased regex words act as a whitelist
            for the reconstructed words.
        cut_off_ratio: Fraction of the ranked words to keep. ``None`` (the
            default) falls back to the module-level ``token_cut_off_ratio``.
        stopword_set: Words to exclude from the ranking. ``None`` (the
            default) falls back to the module-level ``stop_words``.

    Returns:
        List of ``(word, score)`` tuples sorted by descending score, holding
        at most ``int(n * cut_off_ratio) + 1`` entries, where ``n`` is the
        number of words that survived the whitelist and stop-word filters.
    """
    if isinstance(token_probs, str):
        token_probs = ast.literal_eval(token_probs)
    if cut_off_ratio is None:
        cut_off_ratio = token_cut_off_ratio
    if stopword_set is None:
        stopword_set = stop_words

    # U+2581 and U+0120 are the two word-start prefixes the original checked for.
    word_start_markers = ('\u2581', '\u0120')

    word_probs = defaultdict(float)
    current_word = ''
    current_sum = 0.0
    for pair in token_probs:
        if not isinstance(pair, (list, tuple)) or len(pair) != 2:
            continue
        token, prob = pair
        cleaned = re.sub(r'[^a-zA-Z0-9]', '', token)
        if token.startswith(word_start_markers):
            # A marker always closes the running word, even when the token
            # itself carries no letters/digits (e.g. a bare marker or a
            # marker plus punctuation). Skipping such tokens before this
            # check would glue the next sub-word onto the previous word.
            if current_word:
                word_probs[current_word.lower()] += current_sum
            current_word = cleaned
            # Scores of tokens without any letters/digits are not counted.
            current_sum = prob if cleaned else 0.0
        elif cleaned:
            current_word += cleaned
            current_sum += prob
    if current_word:
        word_probs[current_word.lower()] += current_sum

    text_words = set(w.lower() for w in re.findall(r'\w+', original_text))
    filtered_probs = {
        w: p for w, p in word_probs.items()
        if w in text_words and w not in stopword_set
    }
    sorted_words = sorted(filtered_probs.items(), key=lambda x: x[1], reverse=True)
    return sorted_words[:(int(len(sorted_words) * cut_off_ratio) + 1)]

def intersection_score(*lists):
    """Return |intersection| / |union| of the words found in the given lists.

    Each argument is a sequence of words or of tuples whose first element is
    the word. Empty (falsy) arguments are ignored; if fewer than two
    arguments remain, the score is 0.0.
    """
    word_sets = []
    for entries in lists:
        if not entries:
            continue
        word_sets.append(
            {item[0] if isinstance(item, tuple) else item for item in entries}
        )

    if len(word_sets) < 2:
        return 0.0

    shared = set.intersection(*word_sets)
    combined = set.union(*word_sets)
    if not combined:
        return 0.0
    return len(shared) / len(combined)

def unique_word_count(original_text, stopword_set=None):
    """Count the distinct lower-cased regex words in ``original_text``.

    The previous body built a stop-word set and a filtered list on every
    call, then discarded the filtered list, so every word was counted. That
    effective behaviour is kept as the default; filtering is now an explicit
    opt-in.

    Args:
        original_text: Text to analyse.
        stopword_set: Optional collection of words to leave out of the count.
            ``None`` (the default) counts every word.

    Returns:
        Number of distinct words.
    """
    unique_words = set(re.findall(r"\w+", original_text.lower()))
    if stopword_set is not None:
        unique_words -= set(stopword_set)
    return len(unique_words)

RAW RESULTS

In [None]:
# Columns holding each model's (token, score) attributions.
model_cols = ['question_raw_ig_tokens_llama',
              'question_raw_ig_tokens_falcon',
              'question_raw_ig_tokens_gemma',
              'question_raw_ig_tokens_mistral']

# For every row, score how much the models' top-ranked words overlap: once
# across all models and once per model pair, for both text fields.
for idx, row in tqdm(df.iterrows(), total=len(df)):
    for field in ('question', 'answer'):
        # NOTE(review): the answer scores also read the
        # `question_raw_ig_tokens_*` columns, only whitelisted by the answer
        # text - confirm this is intended.
        # Rank each model's words once per field and reuse the result for the
        # all-model score and every pairwise score.
        top_words = {col: top_k_words(row[col], row[field]) for col in model_cols}
        df.at[idx, f'{field}_iaa_all'] = intersection_score(
            *(top_words[col] for col in model_cols)
        )
        for col1, col2 in combinations(model_cols, 2):
            col_name = f'{field}_iaa_{col1.split("_")[-1]}_{col2.split("_")[-1]}'
            df.at[idx, col_name] = intersection_score(top_words[col1], top_words[col2])
    df.at[idx, 'question_unique_words'] = unique_word_count(row['question'])
    df.at[idx, 'answer_unique_words'] = unique_word_count(row['answer'])

100%|██████████| 120/120 [00:01<00:00, 104.86it/s]


In [None]:
# Question side: average every `question_iaa_*` score within each source
# dataset and round to four decimals for display.
average_cols = [name for name in df.columns if name.startswith('question_iaa_')]
dataset_averages = df.groupby('dataset')[average_cols].mean().round(4).reset_index()
dataset_averages

Unnamed: 0,dataset,question_iaa_all,question_iaa_llama_falcon,question_iaa_llama_gemma,question_iaa_llama_mistral,question_iaa_falcon_gemma,question_iaa_falcon_mistral,question_iaa_gemma_mistral
0,alqa,0.0956,0.2596,0.2754,0.5442,0.406,0.3274,0.3203
1,newsqa,0.2396,0.5467,0.4687,0.5758,0.5146,0.5258,0.5133
2,slsc,0.0911,0.3164,0.3658,0.4452,0.2847,0.334,0.2625


In [None]:
# Answer side: average every `answer_iaa_*` score within each source dataset
# and round to four decimals for display.
average_cols = [name for name in df.columns if name.startswith('answer_iaa_')]
dataset_averages = df.groupby('dataset')[average_cols].mean().round(4).reset_index()
dataset_averages

Unnamed: 0,dataset,answer_iaa_all,answer_iaa_llama_falcon,answer_iaa_llama_gemma,answer_iaa_llama_mistral,answer_iaa_falcon_gemma,answer_iaa_falcon_mistral,answer_iaa_gemma_mistral
0,alqa,0.1403,0.2899,0.3454,0.5744,0.4439,0.3802,0.3973
1,newsqa,0.2562,0.5667,0.4708,0.5217,0.5375,0.4633,0.5217
2,slsc,0.0879,0.3197,0.3896,0.4393,0.2492,0.2923,0.3596
