In [None]:
import os
import glob
import pandas as pd
import numpy as np
import re
from tqdm import tqdm
from joblib import Parallel, delayed
from langdetect import detect
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
import torch

# NOTE(review): hard-coded, machine-specific working directory. The relative
# "data/OpenAlex/..." paths used further down are resolved against it, so this
# line has to be adapted before running on another machine.
os.chdir("/Users/simon/Documents/repo/cities-learning")

# -----------------------
# 1. Combine feather files
# -----------------------
def combine_files(directory, pattern):
    """Read every feather file matching ``pattern`` in ``directory`` into one frame.

    Args:
        directory: Folder that is searched (not recursively).
        pattern: Glob pattern for the file names, e.g. ``"part_*.feather"``.

    Returns:
        One DataFrame holding the rows of all matching files, with a fresh
        0..n-1 index.

    Raises:
        ValueError: If no file matches ``pattern`` inside ``directory``.
    """
    # glob yields paths in arbitrary order; sorting (lexicographically) makes the
    # row order of the combined frame reproducible between runs and machines.
    files = sorted(glob.glob(os.path.join(directory, pattern)))
    print(f"Found {len(files)} files.")
    if not files:
        # pd.concat([]) would also raise ValueError, but without naming the
        # directory or pattern that produced the empty match.
        raise ValueError(f"No files matching {pattern!r} found in {directory!r}.")
    dfs = [pd.read_feather(f) for f in files]
    return pd.concat(dfs, ignore_index=True)

# -----------------------
# 2. Clean dataset
# -----------------------
def clean_data(df):
    """Drop duplicate works, spreadsheet entries and non-article record types.

    Args:
        df: DataFrame with at least the columns ``id``, ``title`` and ``type``.

    Returns:
        A filtered DataFrame in which
        * only the first row per ``id`` is kept,
        * rows whose title contains the literal text ".xlsx" are removed,
        * rows whose ``type`` is one of the excluded record types are removed.
        Progress counts are printed along the way.
    """
    print(f"initial number of observations: {len(df)}")
    df = df.drop_duplicates(subset="id")
    print(f"observations after dropping exact duplicates: {len(df)}")

    # regex=False matches the literal ".xlsx"; interpreted as a regular
    # expression the dot matches any character, which would also discard
    # titles such as "The axlsx library". Missing titles are kept (na=False).
    df = df[~df["title"].str.contains(".xlsx", na=False, regex=False)]
    exclude_types = ["dataset", "erratum", "retraction", "peer-review", "reference-entry"]
    df = df[~df["type"].isin(exclude_types)]

    print(f"observations after dropping the following types {exclude_types}: {len(df)}")
    return df

# -----------------------
# 3. Sentence splitting
# -----------------------
def split_sentences(text):
    """Split ``text`` into sentences at whitespace following '.', '!' or '?'.

    Leading and trailing whitespace is removed first. The punctuation mark
    stays attached to the sentence it ends; an empty input yields ``[""]``.
    """
    trimmed = text.strip()
    boundary = re.compile(r'(?<=[.!?])\s+')
    return boundary.split(trimmed)

# -----------------------
# 4. Language detection
# -----------------------
def detect_language_safe(text):
    """Detect the language of ``text``, falling back to "unknown" on failure.

    Args:
        text: The string handed to ``langdetect.detect``.

    Returns:
        The language code returned by ``detect``, or the string "unknown"
        when detection raises.
    """
    try:
        return detect(text)
    except Exception:
        # Best effort by design: any detection failure maps to "unknown".
        # ``Exception`` (instead of a bare ``except``) lets KeyboardInterrupt
        # and SystemExit propagate so long runs can still be interrupted.
        return "unknown"

# -----------------------
# 5. Load NLLB translator
# -----------------------
# Hugging Face checkpoint identifier used for the tokenizer and the model below.
nllb_model_name = "facebook/nllb-200-distilled-600M"
# Loaded at module level, i.e. once when this cell/module is executed; the
# translation function further down reads these two globals.
# NOTE(review): presumably downloads the checkpoint on first use — confirm cache setup.
nllb_tokenizer = AutoTokenizer.from_pretrained(nllb_model_name)
nllb_model = AutoModelForSeq2SeqLM.from_pretrained(nllb_model_name)
nllb_model = nllb_model.to("cpu")  # Ensure model runs on CPU to avoid GPU memory issues

# Mapping from langdetect codes to NLLB language codes.
# Lookups use the lower-cased langdetect result; "unknown" (detection failed)
# maps to English so such sentences are passed through untranslated.
# langdetect reports Hebrew as "he"; the legacy "iw" key is kept for
# backward compatibility.
langdetect_to_nllb = {
    "af": "afr_Latn", "am": "amh_Ethi", "ar": "arb_Arab", "az": "azj_Latn", "be": "bel_Cyrl", "bg": "bul_Cyrl",
    "bn": "ben_Beng", "bs": "bos_Latn", "ca": "cat_Latn", "ceb": "ceb_Latn", "cs": "ces_Latn", "cy": "cym_Latn",
    "da": "dan_Latn", "de": "deu_Latn", "el": "ell_Grek", "en": "eng_Latn", "es": "spa_Latn", "et": "est_Latn",
    "eu": "eus_Latn", "fa": "pes_Arab", "fi": "fin_Latn", "fr": "fra_Latn", "gl": "glg_Latn", "gu": "guj_Gujr",
    "ha": "hau_Latn", "he": "heb_Hebr", "hi": "hin_Deva", "hr": "hrv_Latn", "ht": "hat_Latn", "hu": "hun_Latn",
    "hy": "hye_Armn",
    "id": "ind_Latn", "ig": "ibo_Latn", "is": "isl_Latn", "it": "ita_Latn", "iw": "heb_Hebr", "ja": "jpn_Jpan",
    "jv": "jav_Latn", "ka": "kat_Geor", "kk": "kaz_Cyrl", "km": "khm_Khmr", "kn": "kan_Knda", "ko": "kor_Hang",
    "lo": "lao_Laoo", "lt": "lit_Latn", "lv": "lvs_Latn", "mg": "plt_Latn", "mi": "mri_Latn", "mk": "mkd_Cyrl",
    "ml": "mal_Mlym", "mn": "khk_Cyrl", "mr": "mar_Deva", "ms": "zsm_Latn", "mt": "mlt_Latn", "my": "mya_Mymr",
    "ne": "npi_Deva", "nl": "nld_Latn", "no": "nob_Latn", "pa": "pan_Guru", "pl": "pol_Latn", "ps": "pbt_Arab",
    "pt": "por_Latn", "ro": "ron_Latn", "ru": "rus_Cyrl", "si": "sin_Sinh", "sk": "slk_Latn", "sl": "slv_Latn",
    "so": "som_Latn", "sq": "als_Latn", "sr": "srp_Cyrl", "su": "sun_Latn", "sv": "swe_Latn", "sw": "swh_Latn",
    "ta": "tam_Taml", "te": "tel_Telu", "th": "tha_Thai", "tl": "tgl_Latn", "tr": "tur_Latn", "uk": "ukr_Cyrl",
    "ur": "urd_Arab", "uz": "uzn_Latn", "vi": "vie_Latn", "xh": "xho_Latn", "yi": "ydd_Hebr", "zh-cn": "zho_Hans",
    "zh-tw": "zho_Hant", "zu": "zul_Latn", "unknown": "eng_Latn"
}

def translate_sentence_nllb(text, src_lang="eng_Latn", tgt_lang="eng_Latn"):
    """Translate ``text`` from ``src_lang`` into ``tgt_lang`` with the NLLB model.

    Args:
        text: The sentence to translate; input is truncated to 512 tokens.
        src_lang: NLLB code of the language ``text`` is written in
            (for example "fra_Latn").
        tgt_lang: NLLB code of the language to translate into.

    Returns:
        The translated string. If anything fails, the error is printed and
        ``text`` is returned unchanged (best-effort behaviour).
    """
    try:
        # Tell the tokenizer which language the input is in; otherwise every
        # input is encoded with the tokenizer's default source language and
        # the ``src_lang`` argument has no effect.
        nllb_tokenizer.src_lang = src_lang
        inputs = nllb_tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=512)

        # Language codes are ordinary tokens in the vocabulary, so this lookup
        # does not depend on the ``lang_code_to_id`` attribute.
        forced_bos_token_id = nllb_tokenizer.convert_tokens_to_ids(tgt_lang)
        if forced_bos_token_id == nllb_tokenizer.unk_token_id:
            # Keep the old "unknown code -> error -> return input" behaviour
            # instead of silently generating with the <unk> token.
            raise ValueError(f"Unknown NLLB language code: {tgt_lang}")

        translated = nllb_model.generate(**inputs, forced_bos_token_id=forced_bos_token_id)
        return nllb_tokenizer.batch_decode(translated, skip_special_tokens=True)[0]
    except Exception as e:
        print(f"Translation error: {e}")
        return text

# -----------------------
# 6. Row processing logic
# -----------------------
def process_row(i, clim_deb_it, variable):
    """Detect the languages of one text cell and translate non-English sentences.

    Args:
        i: Positional (0-based) row index into ``clim_deb_it``.
        clim_deb_it: DataFrame that contains the column ``variable``.
        variable: Name of the text column to process.

    Returns:
        A dict with three string entries:
        * ``"translated"``: the sentences re-joined with single spaces, where
          every sentence mapped to a non-English NLLB code was sent through
          the translator,
        * ``"lang"``: the distinct detected language codes, sorted and
          comma-separated,
        * ``"filt"``: ``"translated"`` if any sentence was detected as
          something other than ``"en"``, else ``"text was already in english"``.
          This reflects detection only, not whether a translation succeeded.
        All three entries are empty strings when the cell is missing or when
        processing the row raised (the error is printed).
    """
    result = {"translated": "", "lang": "", "filt": ""}

    try:
        # Column-first access reads a single cell instead of materialising
        # the whole row as a Series first.
        text = clim_deb_it[variable].iloc[i]
        if not pd.isna(text):
            text_sentences = split_sentences(text)
            text_languages = [detect_language_safe(s) for s in text_sentences]

            translated_sentences = []
            for s, lang in zip(text_sentences, text_languages):
                # Codes without a mapping are treated as English, i.e. left as is.
                lang_code = langdetect_to_nllb.get(lang.lower(), "eng_Latn")
                if lang_code != "eng_Latn":
                    translated_sentences.append(translate_sentence_nllb(s, src_lang=lang_code))
                else:
                    translated_sentences.append(s)

            result["translated"] = " ".join(translated_sentences)
            # sorted(): set iteration order for strings differs between
            # processes, which made this column non-reproducible.
            result["lang"] = ", ".join(sorted(set(text_languages)))
            result["filt"] = "translated" if any(lang != "en" for lang in text_languages) else "text was already in english"
    except Exception as e:
        print(f"Error at row {i}: {e}")

    return result

# -----------------------
# 7. Apply translation in parallel (limited to 5 cores)
# -----------------------
def translate_dataframe(df, variable="abstract", n_jobs=5):
    """Translate one text column of ``df`` row by row using joblib workers.

    Args:
        df: DataFrame to process. It is modified in place: three columns are
            added (or overwritten).
        variable: Name of the text column to translate.
        n_jobs: Number of parallel joblib jobs.

    Returns:
        The same ``df`` object with the added columns
        ``"<variable>_en"`` (translated text),
        ``"<variable>_languages"`` (detected language codes) and
        ``"<variable>_language_filtering"`` (translated / already English).
    """
    # Every delayed call carries its arguments to the worker, so hand over only
    # the column that is actually read instead of the whole frame. If the column
    # is absent, pass the full frame so process_row reports the per-row error
    # exactly as it did before.
    payload = df[[variable]] if variable in df.columns else df

    tqdm_bar = tqdm(range(len(df)), desc="Processing rows")
    results = Parallel(n_jobs=n_jobs)(
        delayed(process_row)(i, payload, variable) for i in tqdm_bar
    )

    df[f"{variable}_en"] = [r["translated"] for r in results]
    df[f"{variable}_languages"] = [r["lang"] for r in results]
    df[f"{variable}_language_filtering"] = [r["filt"] for r in results]

    return df

# -----------------------
# 8. Split and write
# -----------------------
def split_dataframe(df, chunk_size):
    """Cut ``df`` into consecutive pieces of at most ``chunk_size`` rows.

    Each piece is an independent copy that keeps the original index labels;
    the last piece may be shorter. An empty frame yields an empty list.
    """
    pieces = []
    for start in range(0, len(df), chunk_size):
        stop = start + chunk_size
        pieces.append(df.iloc[start:stop].copy())
    return pieces

def write_chunks(dfs, base_filename, output_dir):
    """Write each frame in ``dfs`` to its own feather file.

    ``output_dir`` is created if necessary. Files are named
    ``<base_filename>_<k>.feather`` with ``k`` counting from 1, and every
    frame is written with its index reset.
    """
    os.makedirs(output_dir, exist_ok=True)
    for number, frame in enumerate(dfs, start=1):
        target = f"{output_dir}/{base_filename}_{number}.feather"
        frame.reset_index(drop=True).to_feather(target)

# -----------------------
# 9. Full pipeline
# -----------------------
def run_translation_pipeline(
    input_dir="data/OpenAlex/02_NA_added",
    output_dir="data/OpenAlex/03_translated",
    pattern="city_works_df_NA_abstr_added_*.feather",
    base_filename="city_works_df_translated",
    var="abstract",
    chunk_size=100,
    n_jobs=2,
    max_chunks=1,
):
    """Combine, clean, translate and save the input feather files.

    Args:
        input_dir: Directory containing the input feather files.
        output_dir: Directory the translated chunks are written to.
        pattern: Glob pattern selecting the input files inside ``input_dir``.
        base_filename: Prefix for the output files
            (``<base_filename>_<k>.feather``).
        var: Name of the text column to translate.
        chunk_size: Number of rows per translated/written chunk.
        n_jobs: Number of parallel jobs used per chunk.
        max_chunks: Translate at most this many chunks. The default of 1
            keeps the previous behaviour of stopping after the first chunk;
            pass ``None`` to process the whole dataset.

    Raises:
        ValueError: If ``max_chunks`` is negative.
    """
    if max_chunks is not None and max_chunks < 0:
        raise ValueError("max_chunks must be None or a non-negative integer")

    print("Combining input files...")
    data = combine_files(input_dir, pattern)

    print("Cleaning data...")
    data = clean_data(data)

    print("Splitting data...")
    chunks = split_dataframe(data, chunk_size=chunk_size)
    total = len(chunks)
    # The progress message keeps reporting the total number of chunks even
    # when only the first ``max_chunks`` of them are processed.
    selected = chunks if max_chunks is None else chunks[:max_chunks]

    results = []
    for i, chunk in enumerate(selected):
        print(f"Translating chunk {i+1}/{total}")
        results.append(translate_dataframe(chunk, variable=var, n_jobs=n_jobs))

    print("Saving results...")
    write_chunks(results, base_filename, output_dir)
    print("Translation complete.")

# -----------------------
# 10. Run the script
# -----------------------
# Run the whole pipeline with its default arguments when this file is executed
# as a script.
# NOTE(review): this looks like a notebook export; if the guard is also true in
# that environment, executing the cell starts the pipeline as well — confirm.
if __name__ == "__main__":
    run_translation_pipeline()


In [27]:


# Ad-hoc check: load one input file and run the row-level processing on the
# abstract of its first row, then print the resulting dict.
df = pd.read_feather("data/OpenAlex/02_NA_added/city_works_df_NA_abstr_added_1.feather")
result = process_row(0, df, "abstract")
print(result)


{'translated': 'The most important anthropogenic influences on climate are the emission of greenhouse gases1 and changes in land use, such as urbanization and agriculture2. But it has been difficult to separate these two influences because both tend to increase the daily mean surface temperature3,4. The impact of urbanization has been estimated by comparing observations in cities with those in surrounding rural areas, but the results differ significantly depending on whether population data5 or satellite measurements of night light6-8 are used to classify urban and rural areas7,8. Here we use the difference between trends in observed surface temperatures in the continental United States and the corresponding trends in a reconstruction of surface temperatures determined from a reanalysis of global weather over the past 50 years, which is insensitive to surface observations, to estimate the impact of land-use changes on surface warming. Our results suggest that half of the observed decre