In [1]:
import json
import pandas as pd 
import numpy as np
import os
import re
from sklearn.metrics import mutual_info_score
from scipy.stats import entropy
from scipy.stats import gaussian_kde
import matplotlib.pyplot as plt
from matplotlib import rc

# Global Matplotlib styling applied to every figure produced by this notebook.
# NOTE(review): with usetex=True Matplotlib typesets all text through LaTeX,
# so a working LaTeX installation is presumably required on the machine that
# runs the notebook -- confirm before sharing.
rc('text', usetex=True)
rc('font', family='serif', size=12)  # serif family; 12 is the base font size for all text

def normalize_density(density):
    """Rescale ``density`` so that its entries add up to 1.

    Parameters
    ----------
    density : array-like
        Unnormalised weights, e.g. KDE values sampled on a grid.

    Returns
    -------
    The element-wise quotient ``density / sum(density)``.
    """
    total_mass = np.sum(density)
    return density / total_mass
# Kullback-Leibler divergence between two sampled densities
def kl_divergence(p_density, q_density):
    """Return the KL divergence ``D(p || q)`` of two densities sampled on the same grid.

    Both inputs are first rescaled to sum to 1 and then handed to
    ``scipy.stats.entropy(p, q)``, which evaluates the relative entropy.

    Parameters
    ----------
    p_density : array-like
        Values of the distribution being evaluated.
    q_density : array-like
        Values of the reference distribution, on the same grid as ``p_density``.
    """
    p_normalized = normalize_density(p_density)
    q_normalized = normalize_density(q_density)
    return entropy(p_normalized, q_normalized)

def count_elements(item):
    """Return how many elements ``item`` holds.

    A ``list`` counts as its length (so an empty list counts as 0); every
    other value, including ``None`` and strings, counts as a single element.
    """
    return len(item) if isinstance(item, list) else 1

In [5]:
# --- Experiment configuration ------------------------------------------------
# Models whose generated dialogues are compared against the reference corpus.
GENERATION_MODEL_LIST = [
    "DeepSeek V3",
    "QWEN 72B",
    "LLAMA 70B",
    "LLAMA 8B",
    "GPT 4o 2025",
]
DATASET_NAME = "ICNALE"
TOPIC = "Quantifiers Numerals"
ENG_LANGUAGE = 'English'

# Language key used for the generated data -> display name.
language_mapping = dict(
    cantonese='Cantonese',
    thai='Thai',
    japanese='Japanese',
    korean='Korean',
    malay='Malay',
    mandarin='Mandarin',
    english='English',
    urdu='Urdu',
)

# Language code used for the reference (real) data -> display name.
real_language_mapping = dict(
    HKG='Cantonese',
    THA='Thai',
    JPN='Japanese',
    KOR='Korean',
    MYS='Malay',
    CHN='Mandarin',
    ENS='English',
    PAK='Urdu',
)

# Reverse lookups: display name -> key / code.
revert_lan_mapping = dict(zip(language_mapping.values(), language_mapping.keys()))
revert_lan_real_mapping = dict(zip(real_language_mapping.values(), real_language_mapping.keys()))

total_language_list = [display_name for display_name in real_language_mapping.values()]
real_language_list = [code for code in real_language_mapping]

# One row per language; per model, one column for each of the two gap metrics.
res_table = pd.DataFrame(
    columns=pd.MultiIndex.from_product(
        [GENERATION_MODEL_LIST, ['L2_generated_gap', 'Mono_eng_gap']]
    ),
    index=total_language_list,
)

# Upper cap (in annotated tokens per file) and resolution of the grid on which
# the KDEs are compared.
# NOTE(review): both values are inherited from the original analysis; the
# rationale for capping at 15 is not recorded here -- confirm it suits the data.
MAX_GRID_TOKEN_COUNT = 15
GRID_POINTS = 1000

# Folder-name suffix of every annotated feature -> human-readable topic name.
topic_slug_mapping = {
    'quantifiers_numerals': "Quantifiers Numerals",
    'tense_agreement': "Tense Agreement",
    'reference_word': "Reference Word",
    'numbers_agreement': "Numbers Agreement",
    'speech_acts': "Speech Acts",
    'subject_verb_agreement': "Subject Verb Agreement",
    'modal_verbs_expressions': "Modal Verbs Expressions",
    'noun_verb_collocation': "Noun Verb Collocation",
}

# Captures <language> from a generated-data folder path ending in "<language>_dialog".
pattern = r"[\\/](?P<language>[^\\/]+)_dialog$"


def parse_filename(filename):
    """Split a reference-corpus file name into ``(language, number, chapter)``.

    Returns ``(None, None, None)`` when the name does not match the expected
    ``SD_<language>_<digits>_..._<number>_<chapter>`` layout.
    """
    match = re.match(r"SD_(\w+)_\d+_.*_(\d+)_([\w+]+)", filename)
    if match is None:
        return None, None, None
    return match.groups()


def _first_level_folders(directory):
    """Return the names of the immediate sub-folders of ``directory`` ([] if it is missing)."""
    return next(os.walk(directory), (directory, [], []))[1]


def _load_generated_annotations(feature_dir):
    """Load every ``*.json`` file found in the ``<language>_dialog`` sub-folders of ``feature_dir``.

    Each frame is tagged with ``source_file`` (file name without extension) and
    ``language`` (taken from the folder name). Unreadable files are reported and
    skipped. Raises ``FileNotFoundError`` when nothing could be loaded.
    """
    frames = []
    walker = os.walk(feature_dir)
    next(walker, None)  # first entry is feature_dir itself; the data sit in its sub-folders
    for root, _, files in walker:
        match = re.search(pattern, root)
        if match is None:
            # Previously this crashed with AttributeError on any unexpected folder.
            print(f"Skipping {root}: not a '<language>_dialog' folder")
            continue
        language = match.group('language')
        for json_file in files:
            if not json_file.endswith('.json'):
                continue
            file_path = os.path.join(root, json_file)
            try:
                data = pd.read_json(file_path)
            except Exception as e:  # best effort: one bad file must not abort the run
                print(f"Error reading {file_path}: {e}")
                continue
            data['source_file'] = os.path.splitext(json_file)[0]
            data['language'] = language
            frames.append(data)
    if not frames:
        raise FileNotFoundError(f"No readable JSON annotations found under {feature_dir}")
    # Concatenate once instead of growing a DataFrame file by file (quadratic).
    return pd.concat(frames, ignore_index=True)


def _load_real_annotations(feature_dir):
    """Load every ``*.json`` file below ``feature_dir`` of the reference corpus.

    Files are read as JSON Lines first and as regular JSON as a fallback. Each
    frame is tagged with ``source_file`` plus the ``language``, ``number`` and
    ``chapter`` parsed from the file name. Raises ``FileNotFoundError`` when
    nothing could be loaded.
    """
    frames = []
    for root, _, files in os.walk(feature_dir):
        for json_file in files:
            if not json_file.endswith('.json'):
                continue
            file_path = os.path.join(root, json_file)
            try:
                try:
                    data = pd.read_json(file_path, lines=True)
                except ValueError:
                    # Not JSON Lines; retry as a single JSON document.
                    data = pd.read_json(file_path, lines=False)
            except ValueError as e:
                print(f"Error reading {file_path}: {e}")
                continue
            filename = os.path.splitext(json_file)[0]
            language, number, chapter = parse_filename(filename)
            data['source_file'] = filename
            data['language'] = language
            data['number'] = number
            data['chapter'] = chapter
            frames.append(data)
    if not frames:
        raise FileNotFoundError(f"No readable JSON annotations found under {feature_dir}")
    return pd.concat(frames, ignore_index=True)


def _fit_kde(samples, label):
    """Fit a Gaussian KDE to ``samples``; return ``None`` (with a message) when that is impossible."""
    if len(samples) < 2:
        print(f"Cannot fit a KDE for {label}: only {len(samples)} sample(s)")
        return None
    try:
        return gaussian_kde(samples)
    except (ValueError, np.linalg.LinAlgError) as e:
        # e.g. all samples identical -> singular covariance matrix
        print(f"Cannot fit a KDE for {label}: {e}")
        return None


def _density_gap(density, reference_values, grid):
    """KL divergence of ``density`` (evaluated on ``grid``) from ``reference_values``; NaN if no density."""
    if density is None:
        return np.nan
    return kl_divergence(density(grid), reference_values)


# --- Reference corpus: identical for every model, so it is loaded only once ---
real_path_mapping = {
    f'{DATASET_NAME}_generation_{slug}': topic for slug, topic in topic_slug_mapping.items()
}
revert_real_mapping = {v: k for k, v in real_path_mapping.items()}

real_path = f'../annotations/{DATASET_NAME}_output'
real_folder_list = _first_level_folders(real_path)

real_feature = revert_real_mapping[TOPIC]
real_path = f'../annotations/{DATASET_NAME}_output/{real_feature}'
real_all_data = _load_real_annotations(real_path)
real_all_data['type'] = TOPIC
real_all_data['token_num'] = real_all_data['annotation_tokens'].apply(count_elements)

# Total number of annotated tokens per (language, file).
real_counts = real_all_data.groupby(['language', 'source_file'])['token_num'].sum().reset_index()
real_lang_ens_counts = real_counts[real_counts['language'] == revert_lan_real_mapping[ENG_LANGUAGE]]['token_num']
real_eng_density = _fit_kde(real_lang_ens_counts, f"real {ENG_LANGUAGE}")

for MODEL_NAME in GENERATION_MODEL_LIST:
    print(f"Processing {MODEL_NAME} for {TOPIC}...")

    path_mapping = {
        f'{MODEL_NAME}_generation_{slug}': topic for slug, topic in topic_slug_mapping.items()
    }
    revert_mapping = {v: k for k, v in path_mapping.items()}

    # Put data under /annotations/{MODEL_NAME}_output
    path = f'../annotations/{MODEL_NAME}_output'
    folder_list = _first_level_folders(path)

    os.makedirs(f"../result/{MODEL_NAME}_output", exist_ok=True)

    feature = revert_mapping[TOPIC]
    path = f'../annotations/{MODEL_NAME}_output/{feature}'
    all_data = _load_generated_annotations(path)
    all_data['type'] = TOPIC
    all_data['token_num'] = all_data['annotation_tokens'].apply(count_elements)

    counts = all_data.groupby(['language', 'source_file'])['token_num'].sum().reset_index()

    # The model's monolingual-English distribution does not depend on the target language.
    eng_lang_counts = counts[counts['language'] == revert_lan_mapping[ENG_LANGUAGE]]['token_num']
    mono_eng_density = _fit_kde(eng_lang_counts, f"{MODEL_NAME} {ENG_LANGUAGE}")

    for target_language in total_language_list:
        languages = [revert_lan_mapping[target_language], revert_lan_mapping[ENG_LANGUAGE]]
        real_languages = [revert_lan_real_mapping[target_language]]

        lang_counts = counts[counts['language'] == revert_lan_mapping[target_language]]['token_num']
        real_lang_counts = real_counts[real_counts['language'] == revert_lan_real_mapping[target_language]]['token_num']

        # Every gap is measured against the real L2 distribution; without it nothing can be computed.
        real_l2_density = _fit_kde(real_lang_counts, f"real {target_language}")
        if real_l2_density is None:
            print(f"Skipping {target_language} for {MODEL_NAME}: no reference density")
            continue
        generated_l2_density = _fit_kde(lang_counts, f"{MODEL_NAME} {target_language}")

        # Evaluation grid spans the real L2 counts, capped at MAX_GRID_TOKEN_COUNT.
        x_vals = np.linspace(
            min(real_lang_counts),
            min(max(real_lang_counts), MAX_GRID_TOKEN_COUNT),
            GRID_POINTS,
        )
        real_l2_values = real_l2_density(x_vals)

        l2_generated_gap = _density_gap(generated_l2_density, real_l2_values, x_vals)
        mono_eng_generated_gap = _density_gap(mono_eng_density, real_l2_values, x_vals)
        # Real native-English vs. real L2 baseline; it is computed from the
        # reference corpus only, even though the printed label starts with "LLM".
        real_eng_gap = _density_gap(real_eng_density, real_l2_values, x_vals)

        print(f"Topic: {TOPIC} with L2: {target_language}")
        print(f"LLM L2 {target_language} Gap: {l2_generated_gap}")
        print(f"LLM Mono ENG Gap: {mono_eng_generated_gap}")
        print(f"LLM Real ENG Gap: {real_eng_gap}")
        print("*" * 50)

        res_table.loc[target_language, (MODEL_NAME, 'L2_generated_gap')] = l2_generated_gap
        res_table.loc[target_language, (MODEL_NAME, 'Mono_eng_gap')] = mono_eng_generated_gap


# Persist the per-language gap table for this topic.
result_dir = f"../result/{TOPIC}_output"
# exist_ok=True makes the call idempotent and avoids the check-then-create race.
os.makedirs(result_dir, exist_ok=True)

res_table.to_csv(f'{result_dir}/distance_results.csv')
    

Processing DeepSeek V3 for Quantifiers Numerals...
Topic: Quantifiers Numerals with L2: Cantonese
LLM L2 Cantonese Gap: 0.10156414948041916
LLM Mono ENG Gap: 0.05072817216410815
LLM Real ENG Gap: 0.008050188427011495
**************************************************
Topic: Quantifiers Numerals with L2: Thai
LLM L2 Thai Gap: 0.05713823787854394
LLM Mono ENG Gap: 0.03728506658384205
LLM Real ENG Gap: 0.07650307440973682
**************************************************
Topic: Quantifiers Numerals with L2: Japanese
LLM L2 Japanese Gap: 0.059819339717793664
LLM Mono ENG Gap: 0.18053397664293647
LLM Real ENG Gap: 0.27511535679976346
**************************************************
Topic: Quantifiers Numerals with L2: Korean
LLM L2 Korean Gap: 0.02061250997899289
LLM Mono ENG Gap: 0.07031028602722217
LLM Real ENG Gap: 0.09730278397610012
**************************************************
Topic: Quantifiers Numerals with L2: Malay
LLM L2 Malay Gap: 0.07806870065592465
LLM Mono ENG Gap: 0.