In [None]:
import csv
import os
from collections import Counter, defaultdict

import pandas as pd
from datasets import Dataset
from dotenv import load_dotenv
from openai import OpenAI

# Goal of this notebook: check the accuracy of the corpus language tags by
# comparing them with the language classification given by GPT-4.1-mini.

# --- OpenAI client setup ---
# The API token is read from the local "key.env" file (variable OPENAI_TOKEN)
# so that it never has to be hardcoded in the notebook.
load_dotenv("key.env")
api_key = os.environ.get("OPENAI_TOKEN")
client = OpenAI(api_key=api_key)

# --- Input corpus ---
# This parquet file is generated with create_corpus.ipynb.
test_corpus = Dataset.from_parquet("test_randomsplit.parquet")

In [None]:
# Tally how many conversations carry each language tag.
language_counts = Counter(test_corpus['language'])

# Keep the 20 most frequent tags (most frequent first) together with their counts.
ranked_languages = language_counts.most_common(20)
top_20_languages = [tag for tag, _ in ranked_languages]

# Report the ranking, numbered from 1.
print("Top 20 most common languages:")
for rank, (tag, n_conversations) in enumerate(ranked_languages, start=1):
    print(f"{rank}. {tag} ({n_conversations} conversations)")

# Sampling parameters: how many conversations to draw per language, and the
# seed that makes the draw reproducible.
SAMPLES_PER_LANGUAGE = 100
SAMPLING_SEED = 42

#convert to dataframe
df = test_corpus.to_pandas()

#keep only the conversations tagged with one of the 20 most common languages
df_top_20 = df[df['language'].isin(top_20_languages)]

#extract up to SAMPLES_PER_LANGUAGE random conversations per language.
# The groups are sampled explicitly and concatenated instead of going through
# groupby(...).apply(...): apply() operating on the grouping column is
# deprecated in recent pandas, and once the grouping column is excluded the
# 'language' column would be missing from the result. Iterating the groups
# keeps every column, and yields the same rows in the same (sorted-by-language)
# order because each group is still sampled with the same seed.
sampled_groups = [
    group.sample(n=min(len(group), SAMPLES_PER_LANGUAGE), random_state=SAMPLING_SEED)
    for _, group in df_top_20.groupby('language')
]
# pd.concat raises on an empty list, so fall back to an empty frame that
# still has the right columns.
df_balanced = pd.concat(sampled_groups) if sampled_groups else df_top_20.iloc[0:0]

#convert to dataset (the original row index is dropped so it is not stored as a column)
balanced_dataset = Dataset.from_pandas(df_balanced.reset_index(drop=True))

#save the conversations
balanced_dataset.to_parquet("balanced_top20_100each.parquet")


In [None]:
#load dataset with 100 conversations of each of the top 20 languages
dataset = Dataset.from_parquet("balanced_top20_100each.parquet")

#prepare csv files to save results
output_file = "gpt_language_identification_all.csv" #save the results of the inference
metrics_file = "language_accuracy_metrics.csv" #metric of accuracy for the 20 languages

#counters for each language
counts = defaultdict(int) #successfully classified examples per language
corrects = defaultdict(int) #conversations whose language tag agrees with the classification of GPT-4.1-mini
api_errors = defaultdict(int) #failed API calls per language (kept out of the accuracy figures)


def _normalize_language(name):
    """Return a language name in a canonical form for comparison.

    Surrounding whitespace and punctuation/quoting characters are removed and
    the result is lower-cased, so that e.g. "Spanish." matches "spanish".
    """
    return name.strip().strip(".,;:!?\"'`*").strip().lower()


with open(output_file, mode='w', newline='', encoding='utf-8') as f_out:
    writer = csv.writer(f_out)
    writer.writerow(["language", "conversation", "answer_gpt", "correct"]) #fields for the csv "gpt_language_identification_all.csv"

    #iterate the dataset
    for example in dataset:
        gold_language = example["language"]
        if gold_language == "unknown": #one of the 20 most common tags is "unknown": we skip that one
            continue
        # NOTE(review): `conversation` is interpolated into the prompt as-is; if this
        # column holds a list of turns rather than plain text, the prompt will contain
        # its Python repr - confirm against create_corpus.ipynb.
        conversation = example["conversation"]

        #API call
        try:
            resp = client.chat.completions.create(
                model="gpt-4.1-mini-2025-04-14",
                messages=[
                    {"role": "system", "content": "You are a helpful assistant that identifies the language of a conversation."},
                    {"role": "user", "content": f"What language is this conversation written in? Answer with only one word, and write the name of the language in English (e.g., French, Vietnamese, Spanish):\n\n{conversation}"}

                ],
                temperature=0
            )
            # message.content may be None; treat that as an empty answer instead of
            # letting .strip() raise and be misreported as an API error.
            answer = (resp.choices[0].message.content or "").strip()
            call_failed = False
        except Exception as e:
            # Best effort: one failed request must not abort the whole run.
            answer = f"ERROR: {e}"
            call_failed = True

        if call_failed:
            # A failed request is not a disagreement between the tag and the model:
            # log it in the results file but keep it out of the accuracy counters.
            api_errors[gold_language] += 1
            writer.writerow([gold_language, conversation, answer, False])
            continue

        #compare tags (ignore case and surrounding punctuation/whitespace)
        is_correct = (_normalize_language(answer) == _normalize_language(gold_language)) #bool

        #update counters
        counts[gold_language] += 1
        corrects[gold_language] += int(is_correct)

        #write the result of the API call
        writer.writerow([gold_language, conversation, answer, is_correct])

#make failed calls visible, since they are excluded from the metrics
if api_errors:
    print(f"API calls that failed and were excluded from the accuracy counts: {dict(api_errors)}")

def _metrics_row(language):
    """Build the metrics CSV row for one language: [language, total, correct, accuracy]."""
    n_total = counts[language]
    n_correct = corrects[language]
    # Guard against division by zero; accuracy is rendered as a percentage with 2 decimals.
    accuracy = 0.0 if n_total <= 0 else n_correct / n_total
    return [language, n_total, n_correct, format(accuracy, ".2%")]


# Compute the per-language accuracy and store it, one row per language in
# alphabetical order, in the metrics CSV.
with open(metrics_file, mode='w', newline='', encoding='utf-8') as f_met:
    writer = csv.writer(f_met)
    writer.writerow(["language", "total", "correct", "accuracy"])
    writer.writerows(_metrics_row(language) for language in sorted(counts))