<a href="https://colab.research.google.com/github/NoraHK3/DataSciProject/blob/main/translate.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [5]:
# Overrides file name set by this cell. NOTE: the translator cell below
# reassigns OVERRIDES_JSON, falling back to "overrides.json" when the
# expanded file does not exist on disk.
OVERRIDES_JSON = "overrides_expanded.json"


In [6]:
# ============================================
# Translator (Deep) with Expanded Overrides + Cache Purge + Debug
# ============================================

!pip install -q pandas deep-translator

import os, re, json, pandas as pd
from deep_translator import GoogleTranslator

# Input dataset, translated output, and the persisted raw -> English cache.
INPUT_CSV  = "SaudiFoodFile.csv"
OUTPUT_CSV = "SaudiFoodFile_english_FIXED.csv"
CACHE_CSV  = "translation_cache.csv"

# Prefer expanded overrides if present
OVR_EXP   = "overrides_expanded.json"
OVR_BASE  = "overrides.json"
# Decided once, when this cell runs. OVR_BASE is selected whenever OVR_EXP is
# missing, whether or not OVR_BASE itself exists (the loader checks again).
OVERRIDES_JSON = OVR_EXP if os.path.exists(OVR_EXP) else OVR_BASE

HANDLE_CLASSIFICATIONS = True  # split 'classifications' by '|'
# NOTE(review): with None, the recorded run also translated 'image_file' and
# 'scrape_date'; consider listing only the real text columns here.
TRANSLATE_COLS = None          # None -> all object columns

# ---------- helpers ----------
# Combining marks (harakat, Quranic annotation signs) stripped before matching.
AR_DIAC = re.compile(r"[\u0610-\u061A\u064B-\u065F\u06D6-\u06ED]")


def norm_ar(s: str) -> str:
    """Fold an Arabic string into a canonical spelling for key matching.

    Removes every character matched by ``AR_DIAC``, drops tatweel (U+0640),
    and collapses alef, ya and waw variants onto their bare letters.
    Non-Arabic characters pass through unchanged.
    """
    # (variant, replacement) pairs, applied in this exact order.
    folds = (
        ("\u0640", ""),
        ("أ", "ا"),
        ("إ", "ا"),
        ("آ", "ا"),
        ("ى", "ي"),
        ("ئ", "ي"),
        ("ؤ", "و"),
        ("ٱ", "ا"),
    )
    folded = AR_DIAC.sub("", s)
    for variant, replacement in folds:
        folded = folded.replace(variant, replacement)
    return folded

def key_norm(x: str) -> str:
    """Return the lookup key for *x*: stringified, trimmed, lowercased, Arabic-folded."""
    lowered = str(x).strip().lower()
    return norm_ar(lowered)

# Whole-string corrections for known bad machine translations.
# Keys are lowercase; a value replaces the entire translated string.
POST_FIX = {
    "black lemon": "black lime",
    "nail": "cloves",
    "cardamon": "cardamom",
    "yougurt": "yogurt",
    "youghurt": "yogurt",
}


def apply_postfix(en: str) -> str:
    """Return the corrected form of a translated string.

    The input is stringified and trimmed. If its lowercase form is a key of
    ``POST_FIX`` the mapped correction is returned; otherwise the trimmed
    string is returned with its original casing.
    """
    trimmed = str(en).strip()
    try:
        return POST_FIX[trimmed.lower()]
    except KeyError:
        return trimmed

# ---------- load data ----------
# CSV: read as UTF-8 first; on a decode error retry as cp1256.
try:
    df = pd.read_csv(INPUT_CSV, encoding="utf-8")
except UnicodeDecodeError:
    df = pd.read_csv(INPUT_CSV, encoding="cp1256")

# Overrides: start with none, then load the JSON mapping if the file exists.
OV = {}
if os.path.exists(OVERRIDES_JSON):
    with open(OVERRIDES_JSON, "r", encoding="utf-8") as ov_file:
        OV = json.load(ov_file)

# Normalized override view (for Arabic variant matching): only keys that
# contain at least one character in the Arabic block are indexed, under
# their normalized form. Later keys win when two normalize to the same form.
OV_NORM = {}
for ov_key, ov_value in OV.items():
    if re.search(r"[\u0600-\u06FF]", ov_key):
        OV_NORM[key_norm(ov_key)] = ov_value

print(f"🔧 Using overrides file: {OVERRIDES_JSON}")
print(f"   Loaded overrides: {len(OV)} (normalized Arabic keys: {len(OV_NORM)})")
# show a few samples for sanity (the slice already limits this to 8 entries)
for k, v in list(OV.items())[:8]:
    print(f"   • {k}  ->  {v}")

# Cache (load then purge entries that now have overrides)
if os.path.exists(CACHE_CSV):
    # Read every cell back as the exact string that was written:
    #  - keep_default_na=False stops pandas from turning cached text such as
    #    "NA", "null" or an empty translation into float NaN;
    #  - dtype=str stops numeric-looking text from being parsed as numbers.
    # Without these, cache keys/values can come back as non-string objects
    # and NaN can leak into the translated output.
    cache_df = pd.read_csv(CACHE_CSV, dtype=str, keep_default_na=False)
    CACHE = dict(cache_df.values)  # {raw: english}
else:
    CACHE = {}

def override_lookup(text: str):
    """Return the override for *text*, or ``None`` when there is none.

    An exact key in ``OV`` takes precedence; otherwise the normalized key
    (see ``key_norm``) is looked up in ``OV_NORM``.
    """
    try:
        return OV[text]
    except KeyError:
        pass
    return OV_NORM.get(key_norm(text))

# Purge cache entries that should now be overridden: collect the stale keys
# first, then drop them, so the dict is never mutated while being scanned.
to_del = [raw for raw in list(CACHE.keys()) if override_lookup(raw)]
for raw in to_del:
    CACHE.pop(raw, None)
purged = len(to_del)
print(f"🧹 Purged {purged} cache entries that now have overrides")

# Shared translator instance: source language auto-detected, target English.
translator = GoogleTranslator(source="auto", target="en")

# Inputs whose machine translation raised during this run. Kept separate from
# CACHE so a failure is never written to the cache file (which would make it
# permanent), while still avoiding repeated requests for the same string.
_FAILED_TRANSLATIONS = set()


def translate_text(text: str) -> str:
    """Translate one cell value to English.

    Resolution order:
      1. missing or blank input is returned unchanged;
      2. an override (exact or normalized match) wins;
      3. a previously cached translation is reused;
      4. otherwise ``translator`` is called, the result is passed through
         ``apply_postfix`` and stored in ``CACHE``.

    If the translator raises, the stripped original text is returned and a
    warning is printed. The failure is NOT stored in ``CACHE``, so a later
    run retries it instead of treating the untranslated text as the answer.
    """
    if pd.isna(text) or str(text).strip() == "":
        return text
    s = str(text).strip()

    # 1) override wins (exact or normalized)
    ov = override_lookup(s)
    if ov:
        return ov

    # 2) cache
    if s in CACHE:
        return CACHE[s]

    # Already failed once in this run: do not hit the service again.
    if s in _FAILED_TRANSLATIONS:
        return s

    # 3) machine translation (only the remote call is guarded)
    try:
        en = translator.translate(s) or s
    except Exception as exc:  # best-effort: keep the original text on error
        _FAILED_TRANSLATIONS.add(s)
        print(f"⚠️  Translation failed for {s!r} (left untranslated, not cached): {exc}")
        return s

    en = apply_postfix(en)
    CACHE[s] = en
    return en

def translate_classifications_cell(cell: str) -> str:
    """Translate a '|'-separated classifications cell.

    The cell is stringified and split on '|'. Each non-empty, trimmed part is
    translated with ``translate_text`` (which consults the overrides first),
    lowercased, and the results are re-joined with " | ".
    """
    labels = (piece.strip() for piece in str(cell).split("|"))
    return " | ".join(
        str(translate_text(label)).lower() for label in labels if label
    )

# ---------- choose columns ----------
# Text columns have dtype "object" on older pandas but can also carry the
# dedicated string dtype (NOTE(review): believed to be the default for text
# in pandas >= 3 — confirm against the migration guide). A plain
# `dtype == "object"` check would skip those, so accept both.
obj_cols = [
    c for c in df.columns
    if pd.api.types.is_object_dtype(df[c]) or pd.api.types.is_string_dtype(df[c])
]
# An explicit TRANSLATE_COLS list is honoured as-is, minus names not in the frame.
cols = obj_cols if TRANSLATE_COLS is None else [c for c in TRANSLATE_COLS if c in df.columns]
print(f"📝 Translating columns: {cols}")

# ---------- translate ----------
for col in cols:
    print(f"➡️  Translating: {col}")
    if HANDLE_CLASSIFICATIONS and col.lower() == "classifications":
        # Leave missing cells untouched. Casting the column to str first would
        # turn NaN into the literal text "nan", which would then be sent to
        # the translator and written out as if it were a real classification.
        # translate_classifications_cell stringifies its argument itself, so
        # no up-front cast is needed for non-missing values.
        df[col] = df[col].apply(
            lambda cell: cell if pd.isna(cell) else translate_classifications_cell(cell)
        )
    else:
        df[col] = df[col].apply(translate_text)

# ---------- save ----------
# Translated dataset.
df.to_csv(OUTPUT_CSV, index=False, encoding="utf-8")

# Persist the translation cache as two columns: raw text and its English form.
cache_rows = list(CACHE.items())
cache_out = pd.DataFrame(cache_rows, columns=["raw","english"])
cache_out.to_csv(CACHE_CSV, index=False)

for summary_line in (
    f"✅ Done: {OUTPUT_CSV}",
    f"💾 Cache: {CACHE_CSV}",
    f"✍️ Overrides file in use: {OVERRIDES_JSON}",
):
    print(summary_line)


🔧 Using overrides file: overrides.json
   Loaded overrides: 0 (normalized Arabic keys: 0)
🧹 Purged 0 cache entries that now have overrides
📝 Translating columns: ['dish_name', 'classifications', 'image_file', 'scrape_date']
➡️  Translating: dish_name
➡️  Translating: classifications
➡️  Translating: image_file
➡️  Translating: scrape_date
✅ Done: SaudiFoodFile_english_FIXED.csv
💾 Cache: translation_cache.csv
✍️ Overrides file in use: overrides.json
