In [3]:
pip install pandas googletrans==4.0.0-rc1 deep-translator tqdm



In [5]:
pip install pandas tqdm sklearn deep-translator google-cloud-translate==3.11.2 sentence-transformers

Collecting sklearn
  Downloading sklearn-0.0.post12.tar.gz (2.6 kB)
  [1;31merror[0m: [1msubprocess-exited-with-error[0m
  
  [31m×[0m [32mpython setup.py egg_info[0m did not run successfully.
  [31m│[0m exit code: [1;36m1[0m
  [31m╰─>[0m See above for output.
  
  [1;35mnote[0m: This error originates from a subprocess, and is likely not a problem with pip.
  Preparing metadata (setup.py) ... [?25l[?25herror
[1;31merror[0m: [1mmetadata-generation-failed[0m

[31m×[0m Encountered error while generating package metadata.
[31m╰─>[0m See above for output.

[1;35mnote[0m: This is an issue with the package mentioned above, not pip.
[1;36mhint[0m: See above for details.


In [6]:
import pandas as pd
import re
import uuid
import hashlib
from pathlib import Path
from tqdm import tqdm
import time
import math
from sklearn.model_selection import train_test_split

# Choose translator backend: "deep_translator" ---
TRANSLATOR_BACKEND = "deep_translator"

SMALL_PATH = "mentalhealth.csv"   # FAQ: Question_ID, Questions, Answers
LARGE_PATH = "train.csv"   # Dialogue: Context, Response
OUT_DIR = Path("output")
OUT_DIR.mkdir(exist_ok=True)
MERGED_EN_PATH = OUT_DIR / "merged_en.csv"
MERGED_MULTILANG_PATH = OUT_DIR / "merged_multilang.csv"
MERGED_JSONL_PATH = OUT_DIR / "merged_multilang.jsonl"

# Translation settings
TARGET_LANGS = ["hi", "te"]  # Hindi, Telugu
BATCH_SIZE = 16              # batch size for translation calls (tuned per backend)
SLEEP_BETWEEN_BATCHES = 0.5  # seconds to reduce rate-limit risk
MAX_RETRIES = 5

# Helper utilities
def clean_text(text):
    if pd.isna(text):
        return ""
    s = str(text)
    s = s.replace("\u00A0", " ")
    s = s.replace("Â", "").replace("â€™", "'").replace("â€œ", '"').replace("â€", '"')
    s = re.sub(r'\s+', ' ', s).strip()
    return s

def make_id(prefix="u"):
    return f"{prefix}_{uuid.uuid4().hex[:12]}"

def stable_fingerprint(s, maxlen=2000):
    if s is None:
        s = ""
    s = clean_text(s).lower()
    s = s[:maxlen]
    return hashlib.md5(s.encode("utf8")).hexdigest()


# 1) Load & normalize datasets
def load_small(path):
    # Try TSV/CSV auto-detection
    p = Path(path)
    if p.suffix in [".tsv", ".txt"]:
        df = pd.read_csv(path, sep="\t", dtype=str, keep_default_na=False)
    else:
        try:
            df = pd.read_csv(path, dtype=str, keep_default_na=False)
        except:
            df = pd.read_csv(path, sep="\t", dtype=str, keep_default_na=False)

    # detect columns
    qid_col = next((c for c in df.columns if c.lower().startswith("question_id")), None)
    q_col   = next((c for c in df.columns if "question" in c.lower() and "id" not in c.lower()), None)
    a_col   = next((c for c in df.columns if "answer" in c.lower()), None)

    if not (q_col and a_col):
        raise ValueError(f"Couldn't detect question/answer columns in {path}. Columns: {list(df.columns)}")

    rows = []
    for _, r in df.iterrows():
        qid = r[qid_col] if qid_col and qid_col in r.index else None
        qtxt = clean_text(r[q_col])
        atxt = clean_text(r[a_col])
        rows.append({
            "id": f"faq_{qid}" if qid and str(qid).strip() else make_id("faq"),
            "source": p.name,
            "type": "faq",
            "prompt": qtxt,
            "response": atxt,
            "lang": "en",
            "orig_id": qid
        })
    return pd.DataFrame(rows)

def load_large(path):
    p = Path(path)
    # robust CSV reading even if responses have newlines; user should ensure correct quoting in CSV
    df = pd.read_csv(path, dtype=str, keep_default_na=False)
    ctx_col = next((c for c in df.columns if "context" in c.lower()), None)
    resp_col = next((c for c in df.columns if "response" in c.lower() or "reply" in c.lower()), None)
    if not (ctx_col and resp_col):
        raise ValueError(f"Couldn't detect Context/Response columns in {path}. Columns: {list(df.columns)}")
    rows = []
    for _, r in df.iterrows():
        ctx = clean_text(r[ctx_col])
        resp = clean_text(r[resp_col])
        fid = stable_fingerprint(ctx + "||" + resp)
        rows.append({
            "id": f"dlg_{fid}",
            "source": p.name,
            "type": "dialogue",
            "prompt": ctx,
            "response": resp,
            "lang": "en",
            "orig_id": None
        })
    return pd.DataFrame(rows)

print("Loading datasets...")
df_small = load_small(SMALL_PATH)
df_large = load_large(LARGE_PATH)
merged = pd.concat([df_small, df_large], ignore_index=True, sort=False)
print("Initial merged rows:", len(merged))

# Deduplicate (exact prompt+response fingerprint)
merged["fp"] = merged.apply(lambda r: stable_fingerprint((r["prompt"] or "") + "||" + (r["response"] or "")), axis=1)
before = len(merged)
merged = merged.drop_duplicates(subset=["fp"]).reset_index(drop=True)
after = len(merged)
print(f"Deduped: {before} -> {after}")

# Drop empty prompts/responses
merged = merged[~((merged["prompt"].str.strip()=="") | (merged["response"].str.strip()==""))].reset_index(drop=True)
print("After dropping empty rows:", len(merged))

# Ensure id exists
def ensure_id(row):
    if row["id"] and str(row["id"]).strip():
        return row["id"]
    return f"u_{row['fp'][:12]}"
merged["id"] = merged.apply(ensure_id, axis=1)

# Keep columns order
merged = merged[["id","source","type","prompt","response","lang","orig_id","fp"]]

# Save merged English
merged.to_csv(MERGED_EN_PATH, index=False)
print("Saved merged English to", MERGED_EN_PATH)


# 2) Translation helpers (two backends)
if TRANSLATOR_BACKEND == "google_cloud":
    # Google Cloud translate v3
    from google.cloud import translate_v3 as translate
    client = translate.TranslationServiceClient()
    GCP_PROJECT = "YOUR-GCP-PROJECT-ID"
    LOCATION = "global"
    parent = f"projects/{GCP_PROJECT}/locations/{LOCATION}"

    def gc_translate_batch(texts, target_lang):
        # Google can take a list of contents
        for attempt in range(MAX_RETRIES):
            try:
                response = client.translate_text(
                    request={
                        "parent": parent,
                        "contents": texts,
                        "mime_type": "text/plain",
                        "target_language_code": target_lang
                    }
                )
                return [r.translated_text for r in response.translations]
            except Exception as e:
                wait = 2**attempt
                print(f"GC translate error: {e}. retrying in {wait}s")
                time.sleep(wait)
        # fallback: return originals
        return texts

    def translate_batch(texts, target_lang):
        # split into chunks that are not too large
        out = []
        for i in range(0, len(texts), BATCH_SIZE):
            batch = texts[i:i+BATCH_SIZE]
            out.extend(gc_translate_batch(batch, target_lang))
            time.sleep(SLEEP_BETWEEN_BATCHES)
        return out

elif TRANSLATOR_BACKEND == "deep_translator":
    from deep_translator import GoogleTranslator as DeepGoogle
    # deep-translator will call translate.google.com under the hood
    def deep_translate_one(text, target_lang):
        for attempt in range(MAX_RETRIES):
            try:
                return DeepGoogle(source='auto', target=target_lang).translate(text)
            except Exception as e:
                wait = 2**attempt
                # sometimes transient; backoff
                time.sleep(wait)
        return text

    def translate_batch(texts, target_lang):
        out = []
        for i in tqdm(range(0, len(texts), 1), desc=f"Translating to {target_lang}"):
            t = texts[i]
            out.append(deep_translate_one(t, target_lang))
            # small sleep to reduce chance of rate limiting
            if i % 10 == 0:
                time.sleep(0.2)
        return out
else:
    raise ValueError("Unsupported TRANSLATOR_BACKEND: choose 'google_cloud' or 'deep_translator'")


# 3) Translate merged dataset into target langs
def batch_translate_df(df, target_langs):
    # Returns a dataframe with original rows plus translations (rows repeated with same id, lang changed)
    rows = []
    n = len(df)
    for lang in target_langs:
        print(f"Translating to {lang} ...")
        prompts = df["prompt"].tolist()
        responses = df["response"].tolist()
        # translate in two passes (prompt and response)
        t_prompts = translate_batch(prompts, lang)
        t_responses = translate_batch(responses, lang)
        # cleaned translations
        t_prompts = [clean_text(t) for t in t_prompts]
        t_responses = [clean_text(t) for t in t_responses]

        for i in range(n):
            base = df.iloc[i]
            rows.append({
                "id": base["id"],
                "source": base["source"],
                "type": base["type"],
                "prompt": t_prompts[i],
                "response": t_responses[i],
                "lang": lang,
                "orig_id": base["orig_id"],
                "fp": stable_fingerprint(t_prompts[i] + "||" + t_responses[i])
            })
        # brief rest to reduce risk
        time.sleep(SLEEP_BETWEEN_BATCHES)

    # Also include original English rows
    en_rows = df.to_dict(orient="records")
    # convert en_rows to same schema naming expected
    en_formatted = []
    for r in en_rows:
        en_formatted.append({
            "id": r["id"],
            "source": r["source"],
            "type": r["type"],
            "prompt": r["prompt"],
            "response": r["response"],
            "lang": "en",
            "orig_id": r["orig_id"],
            "fp": r["fp"]
        })

    all_rows = en_formatted + rows
    out_df = pd.DataFrame(all_rows)
    return out_df

print("Translating merged dataset into target languages:", TARGET_LANGS)
multilang = batch_translate_df(merged, TARGET_LANGS)
print("Multilang rows:", len(multilang))
multilang.to_csv(MERGED_MULTILANG_PATH, index=False)
print("Saved multilingual merged file:", MERGED_MULTILANG_PATH)

# Save JSONL for fine-tuning (prompt/response pairs grouped by language)
import json
with open(MERGED_JSONL_PATH, "w", encoding="utf-8") as fh:
    for _, r in multilang.iterrows():
        # For model fine-tuning JSONL style: {"prompt":"...","completion":"..."}
        obj = {"id": r["id"], "lang": r["lang"], "prompt": r["prompt"], "response": r["response"]}
        fh.write(json.dumps(obj, ensure_ascii=False) + "\n")
print("Saved JSONL to", MERGED_JSONL_PATH)

print("All finished. Files in", OUT_DIR)


Loading datasets...
Initial merged rows: 3609
Deduped: 3609 -> 2125
After dropping empty rows: 2121
Saved merged English to output/merged_en.csv
Translating merged dataset into target languages: ['hi', 'te']
Translating to hi ...


Translating to hi: 100%|██████████| 2121/2121 [12:57<00:00,  2.73it/s]
Translating to hi: 100%|██████████| 2121/2121 [39:04<00:00,  1.11s/it]


Translating to te ...


Translating to te: 100%|██████████| 2121/2121 [20:39<00:00,  1.71it/s]
Translating to te: 100%|██████████| 2121/2121 [48:27<00:00,  1.37s/it]


Multilang rows: 6363
Saved multilingual merged file: output/merged_multilang.csv
Saved JSONL to output/merged_multilang.jsonl
All finished. Files in output
