In [2]:
from collections import defaultdict
import os
import random
from tqdm import tqdm
import pandas as pd
import pickle
import requests


import datasets
from datasets import Dataset
import numpy as np
import torch
from transformers import (
pipeline,
AutoModelForCausalLM,
AutoModelForSeq2SeqLM,
AutoTokenizer,
)
from transformers.pipelines.pt_utils import KeyDataset


random.seed(42)

In [2]:
def download_file(url, filename):
    if not os.path.exists(filename):
        print(f"Downloading {filename}...")
        r = requests.get(url)
        with open(filename, "wb") as f:
            f.write(r.content)
    else:
        print(f"{filename} already exists")


# Tatoeba URLs for parallel corpora (replace if needed)
files_to_download = {
"Sentence pairs in German-English - 2024-09-11.tsv": "https://downloads.tatoeba.org/exports/per_language/deu-eng.tsv",
"Sentence pairs in Spanish-English - 2024-09-11.tsv": "https://downloads.tatoeba.org/exports/per_language/spa-eng.tsv",
"Sentence pairs in Turkish-English - 2024-09-11.tsv": "https://downloads.tatoeba.org/exports/per_language/tur-eng.tsv",
"Sentence pairs in French-English - 2024-09-11.tsv": "https://downloads.tatoeba.org/exports/per_language/fra-eng.tsv",
"Sentence pairs in Italian-English - 2024-09-11.tsv": "https://downloads.tatoeba.org/exports/per_language/ita-eng.tsv",
"Sentence pairs in Maltese-English - 2024-09-11.tsv": "https://downloads.tatoeba.org/exports/per_language/mlt-eng.tsv",
"Sentence pairs in Mandarin Chinese-English - 2024-09-11.tsv": "https://downloads.tatoeba.org/exports/per_language/cmn-eng.tsv",
"Sentence pairs in Arabic-English - 2024-09-11.tsv": "https://downloads.tatoeba.org/exports/per_language/ara-eng.tsv",
"Sentence pairs in Korean-English - 2024-09-11.tsv": "https://downloads.tatoeba.org/exports/per_language/kor-eng.tsv",
"Sentence pairs in Catalan-English - 2024-09-11.tsv": "https://downloads.tatoeba.org/exports/per_language/cat-eng.tsv",
"Sentence pairs in Czech-English - 2024-09-13.tsv": "https://downloads.tatoeba.org/exports/per_language/ces-eng.tsv",
"Sentence pairs in Hindi-English - 2024-09-13.tsv": "https://downloads.tatoeba.org/exports/per_language/hin-eng.tsv",
"Sentence pairs in Indonesian-English - 2024-09-13.tsv": "https://downloads.tatoeba.org/exports/per_language/ind-eng.tsv",
"Sentence pairs in Japanese-English - 2024-09-13.tsv": "https://downloads.tatoeba.org/exports/per_language/jpn-eng.tsv",
"Sentence pairs in Dutch-English - 2024-09-13.tsv": "https://downloads.tatoeba.org/exports/per_language/nld-eng.tsv",
"Sentence pairs in Polish-English - 2024-09-13.tsv": "https://downloads.tatoeba.org/exports/per_language/pol-eng.tsv",
"Sentence pairs in Portuguese-English - 2024-09-13.tsv": "https://downloads.tatoeba.org/exports/per_language/por-eng.tsv",
"Sentence pairs in Russian-English - 2024-09-13.tsv": "https://downloads.tatoeba.org/exports/per_language/rus-eng.tsv",
"Sentence pairs in Swedish-English - 2024-09-13.tsv": "https://downloads.tatoeba.org/exports/per_language/swe-eng.tsv",
}


for fname, url in files_to_download.items():
    download_file(url, fname)

Sentence pairs in German-English - 2024-09-11.tsv already exists
Sentence pairs in Spanish-English - 2024-09-11.tsv already exists
Sentence pairs in Turkish-English - 2024-09-11.tsv already exists
Sentence pairs in French-English - 2024-09-11.tsv already exists
Sentence pairs in Italian-English - 2024-09-11.tsv already exists
Sentence pairs in Maltese-English - 2024-09-11.tsv already exists
Sentence pairs in Mandarin Chinese-English - 2024-09-11.tsv already exists
Sentence pairs in Arabic-English - 2024-09-11.tsv already exists
Sentence pairs in Korean-English - 2024-09-11.tsv already exists
Sentence pairs in Catalan-English - 2024-09-11.tsv already exists
Sentence pairs in Czech-English - 2024-09-13.tsv already exists
Sentence pairs in Hindi-English - 2024-09-13.tsv already exists
Sentence pairs in Indonesian-English - 2024-09-13.tsv already exists
Sentence pairs in Japanese-English - 2024-09-13.tsv already exists
Sentence pairs in Dutch-English - 2024-09-13.tsv already exists
Sentenc

In [None]:
from huggingface_hub import login
HF_TOKEN = ""
login(HF_TOKEN)

In [4]:
model_name = "meta-llama/Meta-Llama-3.1-8B-Instruct"

tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    device_map="auto",
    torch_dtype=torch.bfloat16
)

text_generator = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    device_map="auto",
    torch_dtype=torch.bfloat16,
)

print("Model loaded successfully!")

`torch_dtype` is deprecated! Use `dtype` instead!


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

`torch_dtype` is deprecated! Use `dtype` instead!
Device set to use cuda:0


Model loaded successfully!


In [7]:
import pandas as pd
import pickle
import random
from tqdm.notebook import tqdm
import os

print("\n--- Processing downloaded files into df_tatoeba_lang_check.pkl (Final Robust Method) ---")
random.seed(42)

try:
    file_configs = {
        'de': ("Sentence pairs in German-English - 2025-09-30.tsv", ["de_id", "de_text", "en_id", "en_text"]),
        'fr': ("Sentence pairs in French-English - 2025-09-30.tsv", ["fr_id", "fr_text", "en_id", "en_text"]),
        'it': ("Sentence pairs in Italian-English - 2025-09-30.tsv", ["it_id", "it_text", "en_id", "en_text"]),
    
        'zh': ("Sentence pairs in Mandarin Chinese-English - 2025-09-30.tsv", ["zh_id", "zh_text", "en_id", "en_text"]),
        'ar': ("Sentence pairs in Arabic-English - 2025-09-30.tsv", ["ar_id", "ar_text", "en_id", "en_text"]),
        
        'ca': ("Sentence pairs in Catalan-English - 2025-09-30.tsv", ["ca_id", "ca_text", "en_id", "en_text"]),
        'cs': ("Sentence pairs in Czech-English - 2025-09-30.tsv", ["cs_id", "cs_text", "en_id", "en_text"]),
        'hi': ("Sentence pairs in Hindi-English - 2025-09-30.tsv", ["hi_id", "hi_text", "en_id", "en_text"]),
        
        'nl': ("Sentence pairs in Dutch-English - 2025-09-30.tsv", ["nl_id", "nl_text", "en_id", "en_text"]),
    }

    # Step 1: Collect all unique English sentences
    all_eng_sents = set()
    for lang, (file, names) in tqdm(file_configs.items(), desc="Scanning English sentences"):
        df = pd.read_csv(file, sep='\\t', header=None, names=names, on_bad_lines='warn', engine='python')
        eng_sents = df['en_text'].dropna().unique()
        all_eng_sents.update(eng_sents)
    
    # --- THIS IS THE FIX ---
    # Determine the number of sentences to sample. Use all if less than 1000.
    num_to_sample = min(len(all_eng_sents), 1000)
    print(f"Found {len(all_eng_sents)} unique English sentences. Sampling {num_to_sample}.")
    
    master_eng_list = random.sample(list(all_eng_sents), num_to_sample)
    final_df = pd.DataFrame({'en_text': master_eng_list})
    # --- END OF FIX ---

    # Step 2: Map translations to the final DataFrame
    for lang, (file, names) in tqdm(file_configs.items(), desc="Mapping translations"):
        lang_col_name = f"{lang}_text"
        df = pd.read_csv(file, sep='\\t', header=None, names=names, on_bad_lines='warn', engine='python')
        mapping = df.drop_duplicates(subset=['en_text']).set_index('en_text')[lang_col_name].to_dict()
        final_df[lang_col_name] = final_df['en_text'].map(mapping)

    with open("df_tatoeba_lang_check.pkl", "wb") as outfile:
        pickle.dump(final_df, outfile)
    print("\nSuccessfully created df_tatoeba_lang_check.pkl")
    print(f"Final DataFrame shape: {final_df.shape}")
    print("Column Info:")
    final_df.info()

except FileNotFoundError as e:
    print(f"Error: {e}. One of the downloaded files could not be found.")
except Exception as e:
    print(f"An unexpected error occurred: {e}")


--- Processing downloaded files into df_tatoeba_lang_check.pkl (Final Robust Method) ---


Scanning English sentences:   0%|          | 0/9 [00:00<?, ?it/s]

Found 866713 unique English sentences. Sampling 1000.


Mapping translations:   0%|          | 0/9 [00:00<?, ?it/s]


Successfully created df_tatoeba_lang_check.pkl
Final DataFrame shape: (1000, 10)
Column Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 10 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   en_text  1000 non-null   object
 1   de_text  479 non-null    object
 2   fr_text  391 non-null    object
 3   it_text  367 non-null    object
 4   zh_text  72 non-null     object
 5   ar_text  44 non-null     object
 6   ca_text  5 non-null      object
 7   cs_text  74 non-null     object
 8   hi_text  15 non-null     object
 9   nl_text  162 non-null    object
dtypes: object(10)
memory usage: 78.3+ KB


In [8]:
!pip install fasttext



In [7]:
import requests
import os

words_url = "https://gist.githubusercontent.com/wchargin/8927565/raw/d9783627c731268fb2935a731a618aa8e95cf465/words"
words_filename = "words"

if not os.path.exists(words_filename):
    print(f"Downloading {words_filename}...")
    try:
        r = requests.get(words_url)
        r.raise_for_status()
        with open(words_filename, 'w') as f:
            f.write(r.text)
        print("Successfully downloaded the 'words' file.")
    except requests.exceptions.RequestException as e:
        print(f"Error downloading 'words' file: {e}")
else:
    print("'words' file already exists. Skipping download.")

'words' file already exists. Skipping download.


In [9]:
import os
import pickle
import sys
import numpy as np
import pandas as pd
import torch
from datasets import Dataset
from tqdm.notebook import tqdm
from torch.nn import CrossEntropyLoss
from transformers import (
    pipeline,
    AutoModelForCausalLM,
    AutoModelForSeq2SeqLM,
    AutoTokenizer,
    BitsAndBytesConfig
)
from transformers.pipelines.pt_utils import KeyDataset

# Make sure the notebook can find the compute_metrics script
sys.path.append(os.path.abspath('.'))
from compute_metrics import compute_all_metrics

def compute_diversity(prompt, generation, tokenizer):
    # adapted from https://github.com/BatsResearch/cross-lingual-detox/blob/main/xg/eval/metric_diversity.py
    if generation == "":
        return 0, 0, 0
    dist1, dist2, dist3 = [], [], []
    input_unigrams, input_bigrams, input_trigrams = set(), set(), set()
    o = tokenizer(prompt)["input_ids"]
    ow = [tokenizer.decode(x, skip_special_tokens=True) for x in o]
    ow = [x for x in ow if x]
    input_unigrams.update(ow)
    for i in range(len(ow) - 1):
        input_bigrams.add(ow[i] + "_" + ow[i + 1])
    for i in range(len(ow) - 2):
        input_trigrams.add(ow[i] + "_" + ow[i + 1] + "_" + ow[i + 2])

    total_tokens = 0
    unigrams, bigrams, trigrams = set(), set(), set()
    o = tokenizer(generation)["input_ids"]
    ow = [tokenizer.decode(x, skip_special_tokens=True) for x in o]
    ow = [x for x in ow if x]

    unigrams.update(ow)
    for i in range(len(ow) - 1):
        bigrams.add(ow[i] + "_" + ow[i + 1])
    for i in range(len(ow) - 2):
        trigrams.add(ow[i] + "_" + ow[i + 1] + "_" + ow[i + 2])

    total_tokens += len(ow)
    dist1 = len(unigrams - input_unigrams) / total_tokens if total_tokens > 0 else 0
    dist2 = len(bigrams - input_bigrams) / total_tokens if total_tokens > 0 else 0
    dist3 = len(trigrams - input_trigrams) / total_tokens if total_tokens > 0 else 0
    return dist1, dist2, dist3

In [13]:
import torch
import os
import pandas as pd
from datasets import Dataset
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
from tqdm.notebook import tqdm
import pickle

model_path="sft_panda"

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model_name_for_files = model_path.split('/')[-1]

# Create output directories
os.makedirs(f"lang_confusion", exist_ok=True)
os.makedirs(f"diversity", exist_ok=True)
os.makedirs(f"perplexity", exist_ok=True)

# --- 1. Load Model and Tokenizer ---
print("Loading model and tokenizer...")

try:
    bnb_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_use_double_quant=True,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_compute_dtype=torch.bfloat16,
    )

    absolute_model_path = os.path.abspath(model_path)
    model = AutoModelForCausalLM.from_pretrained(
        absolute_model_path,
        quantization_config=bnb_config,
        device_map="auto",
        torch_dtype=torch.bfloat16,
        local_files_only=os.path.isdir(absolute_model_path)
    )

    tokenizer = AutoTokenizer.from_pretrained("meta-llama/Meta-Llama-3.1-8B-Instruct", use_fast=False)

    if not tokenizer.pad_token_id:
        tokenizer.pad_token_id = tokenizer.eos_token_id

    print("Model and tokenizer loaded successfully.")

    # --- 2. Generate Completions (Manual Loop) ---
    dataset = pd.read_pickle("df_tatoeba_lang_check.pkl")
    languages = ["fr", "it", "zh", "ar", "ca", "cs", "hi", "nl"]

    for language in languages:
        completion_file = f"lang_confusion/{model_name_for_files}_{language}_completions.pkl"
        if os.path.isfile(completion_file):
            print(f"\nCompletions for {language} already exist. Skipping generation.")
            continue

        print(f"\nGenerating completions for language: {language}")
        lang_responses = []
        lang_data = dataset.dropna(subset=[f"{language}_text"])
        
        prompts = lang_data[f"{language}_text"].tolist()
        if language == "en":
            prompts = prompts[:1000]

     
        for prompt in tqdm(prompts, desc=f"Generating for {language}"):
            try:
                # Manually tokenize the prompt and move to the correct device
                inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
                
                # Generate output tokens
                outputs = model.generate(
                    **inputs,
                    max_new_tokens=100,
                    do_sample=False,
                    num_beams=1,
                )
                
                # Decode the generated tokens, skipping the prompt part
                generated_text = tokenizer.decode(outputs[0, inputs['input_ids'].shape[1]:], skip_special_tokens=True)
                lang_responses.append(generated_text)
                
            except Exception as e:
                print(f"Error on a single prompt: {e}. Appending empty string.")
                lang_responses.append("") # Append an empty string if one prompt fails

        with open(completion_file, "wb") as outfile:
            pickle.dump(lang_responses, outfile)
        print(f"Saved {len(lang_responses)} completions for {language}.")

except Exception as e:
    print(f"\n--- An error occurred during the setup or generation process ---")
    print(f"Error Type: {type(e).__name__}")
    print(f"Error Message: {e}")

Loading model and tokenizer...


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

Model and tokenizer loaded successfully.

Generating completions for language: fr


Generating for fr:   0%|          | 0/391 [00:00<?, ?it/s]

Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for

Saved 391 completions for fr.

Generating completions for language: it


Generating for it:   0%|          | 0/367 [00:00<?, ?it/s]

Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


KeyboardInterrupt: 

In [8]:
!pip install sentencepiece



In [14]:
# --- 3. Calculate Metrics ---
print("Calculating evaluation metrics...")

# Load perplexity model (used for fluency)
perpl_model = AutoModelForSeq2SeqLM.from_pretrained(
    "google/mt5-xl",
    torch_dtype=torch.bfloat16,
    device_map="auto",
)
perpl_model.eval()
perpl_tokenizer = AutoTokenizer.from_pretrained("google/mt5-xl", legacy=False, use_fast=False)

Calculating evaluation metrics...


In [15]:
os.environ['TORCH_LOAD_VULNERABLE_FILES'] = '1'

In [18]:
# Load existing results if any
model_name="sft_panda"
# Load existing results files
languages=["fr","de"]
results_file = f"lang_confusion/{model_name_for_files}_results.pkl"
diversity_results_file = f"diversity/{model_name_for_files}_results.pkl"
perplexity_results_file = f"perplexity/{model_name_for_files}_results.pkl"

results = pickle.load(open(results_file, "rb")) if os.path.isfile(results_file) else {}
diversity_results = pickle.load(open(diversity_results_file, "rb")) if os.path.isfile(diversity_results_file) else {}
perplexity_results = pickle.load(open(perplexity_results_file, "rb")) if os.path.isfile(perplexity_results_file) else {}

for language in languages:
    print(f"--- Processing metrics for {language} ---")
    
    with open(f"lang_confusion/{model_name_for_files}_{language}_completions.pkl", "rb") as infile:
        lang_responses = pickle.load(infile)

    # --- THIS IS THE CRITICAL FIX ---
    # If the completions list is empty, skip this language to prevent a crash.
    if not lang_responses:
        print(f"⚠️ WARNING: No completions found for language '{language}'. Skipping metrics calculation.")
        continue
    # --- END OF FIX ---

    lang_data = dataset.dropna(subset=[f"{language}_text"])
    lang_data = Dataset.from_pandas(lang_data)
    if language == "en":
        lang_data = lang_data.select(range(1000))

    # Step A: Language Confusion
    print("Calculating language confusion...")
    responses_for_metric = [{"source": "tatoeba", "language": language, "completion": r} for r in lang_responses]
    lang_results = compute_all_metrics(responses_for_metric)
    results[("tatoeba", language)] = lang_results[("tatoeba", language)]
    with open(results_file, "wb") as outfile:
        pickle.dump(results, outfile)
    print(f"Language Consistency (Acc): {results[('tatoeba', language)]['acc']:.2f}")

       # Step B: Perplexity
    if language not in perplexity_results:
        print("Calculating perplexity...")
        ppls, selected_ppls = [], []
        for i in tqdm(range(len(lang_data)), desc="Perplexity"):
            prompt = lang_data[f"{language}_text"][i]
            generated_text = lang_responses[i]
            full_text = prompt + generated_text
            
            full_input_ids = perpl_tokenizer.encode(full_text, return_tensors="pt").to(device)
            prompt_input_ids = perpl_tokenizer.encode(prompt, return_tensors="pt").to(device)
            
            with torch.no_grad():
                full_loss = perpl_model(full_input_ids, labels=full_input_ids)[0] * (full_input_ids.shape[1] - 1)
                prompt_loss = perpl_model(prompt_input_ids, labels=prompt_input_ids)[0] * (prompt_input_ids.shape[1] - 1) if prompt_input_ids.shape[1] > 0 else 0

            gen_len = full_input_ids.shape[1] - prompt_input_ids.shape[1]
            loss = (full_loss - prompt_loss) / gen_len if gen_len > 0 else 0
            ppl = np.exp(loss.item()) if loss > 0 else 0.0

            ppls.append(ppl)
            if results[("tatoeba", language)]["per_compl_acc"][i] == 1:
                selected_ppls.append(ppl)
        
        perplexity_results[language] = {"all": ppls, "mean": np.mean(ppls), "selected": selected_ppls}
        with open(perplexity_results_file, "wb") as outfile:
            pickle.dump(perplexity_results, outfile)
    print(f"Perplexity (Mean): {perplexity_results.get(language, {'mean': 0})['mean']:.2f}")

    # Step C: Diversity
    if language not in diversity_results:
        print("Calculating diversity...")
        all_dist1, selected1 = [], []
        for i, response in enumerate(lang_responses):
            dist1, _, _ = compute_diversity(lang_data[f"{language}_text"][i], response, tokenizer)
            all_dist1.append(dist1)
            if results[("tatoeba", language)]["per_compl_acc"][i] == 1:
                selected1.append(dist1)
        
        diversity_results[language] = {"unigrams": np.mean(all_dist1), "selected": selected1}
        with open(diversity_results_file, "wb") as outfile:
            pickle.dump(diversity_results, outfile)
    print(f"Diversity (Unigrams): {diversity_results.get(language, {'unigrams': 0})['unigrams']:.2f}")

print("\nEvaluation complete!")

--- Processing metrics for fr ---
Calculating language confusion...
Language Consistency (Acc): 0.64
Calculating perplexity...


Perplexity:   0%|          | 0/391 [00:00<?, ?it/s]

Perplexity (Mean): 4909335.49
Calculating diversity...
Diversity (Unigrams): 0.12
--- Processing metrics for de ---
Calculating language confusion...
Language Consistency (Acc): 0.69
Calculating perplexity...


Perplexity:   0%|          | 0/479 [00:00<?, ?it/s]

Perplexity (Mean): 809.00
Calculating diversity...
Diversity (Unigrams): 0.17

Evaluation complete!
