In [None]:
# Install the notebook's dependencies. The explicit %pip magic installs into the
# environment of the running kernel and does not depend on automagic being enabled.
%pip install transformers accelerate openai

In [None]:
# Upgrade the Hugging Face packages used for dataset loading. The explicit %pip magic
# installs into the environment of the running kernel.
%pip install -U huggingface_hub fsspec datasets

In [None]:
import huggingface_hub
# Log in to the Hugging Face Hub. You will be prompted to enter your token;
# see the README for how to acquire a Hugging Face token.
huggingface_hub.login()

In [None]:
import os

# Request online mode. This has to happen BEFORE `datasets` is imported: the library
# reads its offline flag from the environment when it is first imported, so assigning
# the variable afterwards has no effect.
os.environ["HF_DATASETS_OFFLINE"] = "0"

import pandas as pd
from datasets import load_dataset

# FLORES-101 devtest split: English sentences and the original Zulu translations.
orig_eng = load_dataset("gsarti/flores_101", name="eng", split="devtest")
orig_zul = load_dataset("gsarti/flores_101", name="zul", split="devtest")

# FLORES+ devtest split: English sentences and the corrected Zulu translations.
corr_eng = load_dataset("openlanguagedata/flores_plus", name="eng_Latn", split="devtest")
corr_zul = load_dataset("openlanguagedata/flores_plus", name="zul_Latn", split="devtest")

# Sanity check: rows are paired purely by position below, so all four splits
# must contain the same number of sentences.
assert len(orig_eng) == len(corr_eng) == len(orig_zul) == len(corr_zul)

# One record per sentence: English source, original Zulu, corrected Zulu.
pairs = [
    {
        "source": eng_row["text"],
        "original": zul_old["sentence"],
        "corrected": zul_new["text"],
    }
    for eng_row, zul_old, zul_new in zip(corr_eng, orig_zul, corr_zul)
]

df = pd.DataFrame(pairs)

# Preview
print(df.head())

# Optional: Save for later
df.to_csv("zulu_translation_pairs.csv", index=False)

In [None]:
from IPython.display import display

# To inspect only a subset, draw a sample first and use it in place of `df` below:
# sample_df = df.sample(n=10, random_state=42)

# Side-by-side table of the English source and both Zulu versions, with readable headers.
comparison = df[["source", "original", "corrected"]].rename(
    columns={
        "source": "English",
        "original": "Original Zulu",
        "corrected": "Corrected (FLORES+)",
    }
)

# Flag the rows whose original and corrected translations are exactly identical.
comparison["Match?"] = comparison["Original Zulu"].eq(comparison["Corrected (FLORES+)"])

display(comparison)

In [None]:
# Rows where the original translation is already identical to the corrected one.
correct = comparison.loc[comparison["Match?"]]
print(f"\n✅ Perfect matches: {correct.shape[0]} / {comparison.shape[0]}")
display(correct)

In [None]:
# Rows where the corrected translation differs from the original one.
mistakes = comparison.loc[~comparison["Match?"]]
print(f"\n⚠️ Mismatches: {mistakes.shape[0]} / {comparison.shape[0]}")
display(mistakes)

In [None]:
import os
from getpass import getpass

import openai

# Never paste a real key into the notebook (it would be saved and shared with the file).
# The key is read from the OPENAI_API_KEY environment variable; if that is not set,
# it is requested through a hidden interactive prompt instead.
client = openai.OpenAI(
    api_key=os.environ.get("OPENAI_API_KEY") or getpass("OpenAI API key: ")
)

def gpt_error_classifier(row):
    """Ask GPT-4o to label and explain the difference between two Zulu translations.

    Args:
        row: A mapping (for example one row of the ``comparison`` DataFrame) that
            provides the keys ``'English'``, ``'Original Zulu'`` and
            ``'Corrected (FLORES+)'``.

    Returns:
        pandas.Series: Two values, ``[error_type, explanation]``.

        * ``["Unclear", <raw reply>]`` when the reply has no ``Error Type:`` label.
        * ``["Error", <exception text>]`` when the request or the parsing raises, so
          one failing row does not abort a whole ``DataFrame.apply`` run.
    """
    prompt = f"""You are a Zulu translation expert. Compare the original FLORES and corrected FLORES Zulu translations. Identify the type of error and explain it.

English: {row['English']}
Original Zulu: {row['Original Zulu']}
Corrected Zulu: {row['Corrected (FLORES+)']}

Give a short answer:
Error Type: <TYPE>
Explanation: <why the original was wrong>
"""

    try:
        response = client.chat.completions.create(
            model="gpt-4o",
            messages=[
                {"role": "system", "content": "You are a Zulu linguistic error analyst."},
                {"role": "user", "content": prompt}
            ],
            temperature=0.3,
        )
        output = response.choices[0].message.content.strip()

        if "Error Type:" not in output:
            return pd.Series(["Unclear", output])

        # Keep only what follows the label, so any preamble the model adds before
        # "Error Type:" does not end up inside the error type.
        _, _, labelled = output.partition("Error Type:")
        # partition() splits on the first "Explanation:" only, so an explanation that
        # itself contains that word is kept whole.
        raw_type, _, raw_explanation = labelled.partition("Explanation:")

        # Replies are often formatted as Markdown (**Error Type:** ...). Strip the stray
        # asterisks so the same category is not counted under several different labels.
        error_type, explanation = (
            part.strip().strip("*").strip() for part in (raw_type, raw_explanation)
        )
        return pd.Series([error_type, explanation])

    except Exception as e:
        # Deliberate best effort: record the failure in the row instead of raising.
        return pd.Series(["Error", str(e)])

In [None]:
# The classifier sends one API request per row; the two values it returns for each
# row are stored as two new columns.
llm_labels = comparison.apply(gpt_error_classifier, axis=1)
comparison[["LLM Error Type", "LLM Explanation"]] = llm_labels

In [None]:
# Review table: source sentence, both translations, and the LLM's label and explanation.
comparison.loc[
    :,
    ["English", "Original Zulu", "Corrected (FLORES+)", "LLM Error Type", "LLM Explanation"],
]

In [None]:
# Frequency of each raw label returned by the LLM, most common first.
error_type_counts = (
    comparison["LLM Error Type"]
    .value_counts()
    .rename_axis("Error Type")
    .reset_index(name="Count")
)
display(error_type_counts)

In [None]:
# Maps each raw LLM error label to a coarse category.
#
# Every label appears exactly once. Earlier revisions listed some labels twice with
# different categories; Python keeps only the last value for a repeated dict key, so
# those labels are listed here under the category that was actually in effect:
#   "Lexical and Structural"   -> "Structural"
#   "Punctuation"              -> "Spelling"
#   "Word Choice and Omission" -> "Omission"
group_mapping = {
    "No Error": "Correct",
    "None": "Correct",
    "Correct": "Correct",

    # Lexical / Word Choice
    "Word Choice": "Lexical",
    "Lexical Choice": "Lexical",
    "Word Choice and Clarity": "Lexical",
    "Lexical and Grammatical Error": "Lexical",
    "Word Choice and Spelling": "Lexical",
    "Word Choice and Verb Form": "Lexical",
    "Word Choice/Error in Terminology": "Lexical",
    "Lexical and Semantic Errors": "Lexical",
    "Lexical Choice and Clarity": "Lexical",
    "Lexical Error": "Lexical",
    "Word Choice/Error in Verb Form": "Lexical",
    "Word Choice and Grammar": "Lexical",
    "Spelling and Word Choice": "Lexical",
    "Word Choice and Agreement": "Lexical",
    "Word Choice and Structure": "Lexical",

    # Structural
    "Translation and Terminology Errors": "Structural",
    "Lexical and Structural Error": "Structural",
    "Lexical and Structural": "Structural",
    "Lexical and Grammatical Errors": "Structural",
    "Word Order": "Structural",

    # Grammar/Agreement
    "Agreement Error": "Grammar",
    "Subject Agreement Error": "Grammar",
    "Agreement and Word Choice": "Grammar",
    "Verb Tense and Agreement": "Grammar",
    "Verb Tense Consistency": "Grammar",
    "Verb Usage": "Grammar",
    "Possessive Agreement Error": "Grammar",
    "Grammar and Clarity": "Grammar",

    # Spelling/Orthographic
    "Spelling Error": "Spelling",
    "Typographical Error": "Spelling",
    "Orthographic Error": "Spelling",
    "Punctuation Error": "Spelling",
    "Quotation Mark Usage": "Spelling",
    "Punctuation": "Spelling",

    # Omission
    "Omission": "Omission",
    "Omission Error": "Omission",
    "Word Choice and Omission": "Omission",

    # Other types
    "Mistranslation": "Mistranslation",
    "Repetition Error": "Mistranslation",
    "Translation Error": "Mistranslation",
    "Unit of Measurement Error": "Other",
}

# Collapse the raw LLM labels into coarse categories; any label that has no entry
# in group_mapping is counted as "Other".
comparison["Grouped Error Type"] = (
    comparison["LLM Error Type"]
    .map(group_mapping)
    .fillna("Other")
)

# Number of rows per coarse category, most common first.
error_summary = (
    comparison["Grouped Error Type"]
    .value_counts()
    .rename_axis("Grouped Error Type")
    .reset_index(name="Count")
)

error_summary

# Part 2 - Sepedi

In [None]:
from datasets import load_dataset
import pandas as pd

In [None]:
import os
# Request online mode for the Hugging Face datasets library.
# NOTE(review): `datasets` is imported in an earlier cell; confirm that this
# setting still takes effect when it is assigned after the import.
os.environ["HF_DATASETS_OFFLINE"] = "0"

# FLORES-101 devtest split: English sentences and the original Sepedi translations.
orig_eng = load_dataset("gsarti/flores_101", name="eng", split="devtest")
orig_nso = load_dataset("gsarti/flores_101", name="nso", split="devtest")

# FLORES+ devtest split: English sentences and the corrected Sepedi translations.
corr_eng = load_dataset("openlanguagedata/flores_plus", name="eng_Latn", split="devtest")
corr_nso = load_dataset("openlanguagedata/flores_plus", name="nso_Latn", split="devtest")

# Sanity check: rows are paired purely by position below, so all four splits
# must contain the same number of sentences.
assert len(orig_eng) == len(corr_eng) == len(orig_nso) == len(corr_nso)

# One record per sentence: English source, original Sepedi, corrected Sepedi.
pairs = [
    {
        "source": eng_row["text"],
        "original": nso_old["sentence"],
        "corrected": nso_new["text"],
    }
    for eng_row, nso_old, nso_new in zip(corr_eng, orig_nso, corr_nso)
]

df = pd.DataFrame(pairs)

# Preview
print(df.head())

# Optional: Save for later
df.to_csv("sepedi_translation_pairs.csv", index=False)

In [None]:
# Print every sentence triple: English source, original Sepedi, corrected Sepedi.
for pair in df.itertuples(index=False):
    print(f"\n🔤 English: {pair.source}")
    print(f"❌ Original: {pair.original}")
    print(f"✅ Corrected (FLORES+): {pair.corrected}")

In [None]:
from IPython.display import display

# To inspect only a subset, draw a sample first and use it in place of `df` below:
# sample_df = df.sample(n=10, random_state=42)

# Side-by-side table of the English source and both Sepedi versions, with readable headers.
comparison = df[["source", "original", "corrected"]].rename(
    columns={
        "source": "English",
        "original": "Original Sepedi",
        "corrected": "Corrected (FLORES+)",
    }
)

# Flag the rows whose original and corrected translations are exactly identical.
comparison["Match?"] = comparison["Original Sepedi"].eq(comparison["Corrected (FLORES+)"])

display(comparison)

In [None]:
# Rows where the original translation is already identical to the corrected one.
correct = comparison.loc[comparison["Match?"]]
print(f"\n✅ Perfect matches: {correct.shape[0]} / {comparison.shape[0]}")
display(correct)

In [None]:
# Rows where the corrected translation differs from the original one.
mistakes = comparison.loc[~comparison["Match?"]]
print(f"\n⚠️ Mismatches: {mistakes.shape[0]} / {comparison.shape[0]}")
display(mistakes)

In [None]:
import os
from getpass import getpass

import openai

# Never paste a real key into the notebook (it would be saved and shared with the file).
# The key is read from the OPENAI_API_KEY environment variable; if that is not set,
# it is requested through a hidden interactive prompt instead.
client = openai.OpenAI(
    api_key=os.environ.get("OPENAI_API_KEY") or getpass("OpenAI API key: ")
)

def gpt_error_classifier(row):
    """Ask GPT-4o to label and explain the difference between two Sepedi translations.

    Args:
        row: A mapping (for example one row of the ``comparison`` DataFrame) that
            provides the keys ``'English'``, ``'Original Sepedi'`` and
            ``'Corrected (FLORES+)'``.

    Returns:
        pandas.Series: Two values, ``[error_type, explanation]``.

        * ``["Unclear", <raw reply>]`` when the reply has no ``Error Type:`` label.
        * ``["Error", <exception text>]`` when the request or the parsing raises, so
          one failing row does not abort a whole ``DataFrame.apply`` run.
    """
    prompt = f"""You are a Northern Sotho (Sepedi) translation expert. Compare the original FLORES and corrected FLORES Sepedi translations. Identify the type of error and explain it.

English: {row['English']}
Original Sepedi: {row['Original Sepedi']}
Corrected Sepedi: {row['Corrected (FLORES+)']}

Give a short answer:
Error Type: <TYPE>
Explanation: <why the original was wrong>
"""

    try:
        response = client.chat.completions.create(
            model="gpt-4o",
            messages=[
                {"role": "system", "content": "You are a Sepedi linguistic error analyst."},
                {"role": "user", "content": prompt}
            ],
            temperature=0.3,
        )
        output = response.choices[0].message.content.strip()

        if "Error Type:" not in output:
            return pd.Series(["Unclear", output])

        # Keep only what follows the label, so any preamble the model adds before
        # "Error Type:" does not end up inside the error type.
        _, _, labelled = output.partition("Error Type:")
        # partition() splits on the first "Explanation:" only, so an explanation that
        # itself contains that word is kept whole.
        raw_type, _, raw_explanation = labelled.partition("Explanation:")

        # Replies are often formatted as Markdown (**Error Type:** ...). Strip the stray
        # asterisks so the same category is not counted under several different labels.
        error_type, explanation = (
            part.strip().strip("*").strip() for part in (raw_type, raw_explanation)
        )
        return pd.Series([error_type, explanation])

    except Exception as e:
        # Deliberate best effort: record the failure in the row instead of raising.
        return pd.Series(["Error", str(e)])

In [None]:
# The classifier sends one API request per row; the two values it returns for each
# row are stored as two new columns.
llm_labels = comparison.apply(gpt_error_classifier, axis=1)
comparison[["LLM Error Type", "LLM Explanation"]] = llm_labels

In [None]:
# Review table: source sentence, both translations, and the LLM's label and explanation.
comparison.loc[
    :,
    ["English", "Original Sepedi", "Corrected (FLORES+)", "LLM Error Type", "LLM Explanation"],
]

In [None]:
# Frequency of each raw label returned by the LLM, most common first.
error_type_counts = comparison["LLM Error Type"].value_counts().reset_index()
error_type_counts.columns = ["Error Type", "Count"]
# Rich display instead of print(), matching the equivalent cell of the Zulu analysis;
# print() renders a DataFrame as plain text and truncates long labels.
display(error_type_counts)

In [None]:
# Maps each raw LLM error label to a coarse category.
#
# Every label appears exactly once. Earlier revisions listed some labels twice with
# different categories; Python keeps only the last value for a repeated dict key, so
# those labels are listed here under the category that was actually in effect:
#   "Lexical and Structural"   -> "Structural"
#   "Punctuation"              -> "Spelling"
#   "Word Choice and Omission" -> "Omission"
group_mapping = {
    "No Error": "Correct",
    "None": "Correct",
    "Correct": "Correct",

    # Lexical / Word Choice
    "Word Choice": "Lexical",
    "Lexical Choice": "Lexical",
    "Word Choice and Clarity": "Lexical",
    "Lexical and Grammatical Error": "Lexical",
    "Word Choice and Spelling": "Lexical",
    "Word Choice and Verb Form": "Lexical",
    "Word Choice/Error in Terminology": "Lexical",
    "Lexical and Semantic Errors": "Lexical",
    "Lexical Choice and Clarity": "Lexical",
    "Lexical Error": "Lexical",
    "Word Choice/Error in Verb Form": "Lexical",
    "Word Choice and Grammar": "Lexical",
    "Spelling and Word Choice": "Lexical",
    "Word Choice and Agreement": "Lexical",
    "Word Choice and Structure": "Lexical",

    # Structural
    "Translation and Terminology Errors": "Structural",
    "Lexical and Structural Error": "Structural",
    "Lexical and Structural": "Structural",
    "Lexical and Grammatical Errors": "Structural",
    "Word Order": "Structural",

    # Grammar/Agreement
    "Agreement Error": "Grammar",
    "Subject Agreement Error": "Grammar",
    "Agreement and Word Choice": "Grammar",
    "Verb Tense and Agreement": "Grammar",
    "Verb Tense Consistency": "Grammar",
    "Verb Usage": "Grammar",
    "Possessive Agreement Error": "Grammar",
    "Grammar and Clarity": "Grammar",

    # Spelling/Orthographic
    "Spelling Error": "Spelling",
    "Typographical Error": "Spelling",
    "Orthographic Error": "Spelling",
    "Punctuation Error": "Spelling",
    "Quotation Mark Usage": "Spelling",
    "Punctuation": "Spelling",

    # Omission
    "Omission": "Omission",
    "Omission Error": "Omission",
    "Word Choice and Omission": "Omission",

    # Other types
    "Mistranslation": "Mistranslation",
    "Repetition Error": "Mistranslation",
    "Translation Error": "Mistranslation",
    "Unit of Measurement Error": "Other",
}

# Collapse the raw LLM labels into coarse categories; any label that has no entry
# in group_mapping is counted as "Other".
comparison["Grouped Error Type"] = (
    comparison["LLM Error Type"]
    .map(group_mapping)
    .fillna("Other")
)

# Number of rows per coarse category, most common first.
error_summary = (
    comparison["Grouped Error Type"]
    .value_counts()
    .rename_axis("Grouped Error Type")
    .reset_index(name="Count")
)

error_summary