In [None]:
# Translate each sentence into Punjabi, back-translate it into English, and compare the original
# (ground-truth) English sentence with the back-translated one using cosine similarity.
# Only translations whose similarity is at least the 0.85 threshold are kept.
# Done for the training data.

import pandas as pd
from deep_translator import GoogleTranslator
from sentence_transformers import SentenceTransformer, util

# ─── 1) CONFIGURATION ────────────────────────────────────────────────────────
INPUT_CSV_PATH      = "is_test.csv"           # Your English CSV
OUTPUT_CSV_PATH     = "is_test_punjabi.csv"   # Output Punjabi CSV
SIMILARITY_THRESHOLD = 0.85                   # Minimum cosine similarity (original vs. back-translation) to keep a row

# Initialize Deep-Translator’s GoogleTranslator and SBERT
#   - source='en', target='pa'  means English → Punjabi
#   - source='pa', target='en'  means Punjabi → English
eng_to_pa = GoogleTranslator(source='en', target='pa')
pa_to_eng = GoogleTranslator(source='pa', target='en')

# SBERT model for computing cosine similarity
sbert = SentenceTransformer('paraphrase-multilingual-MiniLM-L12-v2')

# ─── 2) READ INPUT CSV ───────────────────────────────────────────────────────
df = pd.read_csv(INPUT_CSV_PATH, dtype=str)  # Read all columns as string
text_columns = list(df.columns)  # Snapshot: the "<col>_pa" columns added below must not be re-processed

# ─── 3) TRANSLATE, BACK-TRANSLATE & FILTER ─────────────────────────────────
for col in text_columns:
    # Missing cells become "" so every entry is a string; computed once and reused for scoring
    originals = df[col].fillna("").tolist()
    punjabi_texts = []
    back_translated_texts = []
    failed_calls = 0  # Translator calls that raised; reported once per column instead of being hidden

    # a) Translate each sentence to Punjabi, then back to English
    for sentence in originals:
        # Nothing to translate: record blanks without spending two network round-trips
        if not sentence.strip():
            punjabi_texts.append("")
            back_translated_texts.append("")
            continue

        # English → Punjabi ("or ''" guards against the translator handing back None)
        try:
            pu_translation = eng_to_pa.translate(sentence) or ""
        except Exception:
            pu_translation = ""
            failed_calls += 1
        punjabi_texts.append(pu_translation)

        # Punjabi → English (back-translation); skipped when the forward step produced nothing
        back_eng = ""
        if pu_translation:
            try:
                # A None here would otherwise reach sbert.encode() below
                back_eng = pa_to_eng.translate(pu_translation) or ""
            except Exception:
                failed_calls += 1
        back_translated_texts.append(back_eng)

    if failed_calls:
        print(f"WARNING: column '{col}': {failed_calls} translation call(s) failed; affected rows are left blank")

    # b) Compute SBERT embeddings & cosine similarities
    # If all original or back-translated texts are empty, skip similarity computation
    if any(originals) and any(back_translated_texts):
        embeddings_orig = sbert.encode(originals, convert_to_tensor=True)
        embeddings_back = sbert.encode(back_translated_texts, convert_to_tensor=True)
        # cos_sim returns the full pairwise matrix; its diagonal holds the row-aligned similarities
        cosine_scores = util.cos_sim(embeddings_orig, embeddings_back).diag().cpu().numpy()
    else:
        cosine_scores = [0.0] * len(originals)

    # c) Filter: keep Punjabi translation if similarity ≥ threshold, else blank
    filtered_punjabi = [
        pu if score >= SIMILARITY_THRESHOLD else ""
        for pu, score in zip(punjabi_texts, cosine_scores)
    ]

    # d) Add a new column named "<original_column>_pa"
    df[f"{col}_pa"] = filtered_punjabi

# ─── 4) SAVE OUTPUT CSV ──────────────────────────────────────────────────────
df.to_csv(OUTPUT_CSV_PATH, index=False)
print(f"✅ Punjabi CSV saved as: {OUTPUT_CSV_PATH}")


  from .autonotebook import tqdm as notebook_tqdm


✅ Punjabi CSV saved as: is_test_punjabi.csv


In [None]:
import os
import logging
import random
import time
import numpy as np
import pandas as pd
from deep_translator import GoogleTranslator
from sentence_transformers import SentenceTransformer, util, losses, InputExample
from torch.utils.data import DataLoader
from tqdm import tqdm

def setup_logging():
    """Configure the root logger to emit timestamped INFO-level messages.

    ``force=True`` replaces any handlers already attached to the root logger.
    Without it ``logging.basicConfig`` silently does nothing once the root
    logger has a handler (for example after an earlier module-level
    ``logging.info(...)`` call or a previous run in the same kernel), and the
    level/format requested here would never take effect.
    """
    logging.basicConfig(
        level=logging.INFO,
        format="%(asctime)s - %(levelname)s - %(message)s",
        datefmt="%Y-%m-%d %H:%M:%S",
        force=True,
    )


def compute_thresholds(scores: np.ndarray):
    """Derive candidate similarity cut-offs from the strictly positive scores.

    Args:
        scores: Similarity scores as a NumPy array or any array-like
            (a plain list works too). Entries that are not > 0 are ignored
            when computing the percentiles.

    Returns:
        ``{}`` when no score is > 0; otherwise a dict with keys
        ``'T15'``, ``'T25'``, ``'T50'`` (15th/25th/50th percentiles) and
        ``'Tmin'`` (``T25 - 1.5 * (T75 - T25)``, i.e. 1.5 inter-quartile
        ranges below the 25th percentile).
    """
    # asarray makes the boolean mask below work for lists as well as ndarrays
    scores = np.asarray(scores)
    valid = scores[scores > 0]
    if valid.size == 0:
        return {}
    # One percentile call for all four cut points
    T15, T25, T50, T75 = np.percentile(valid, [15, 25, 50, 75])
    Tmin = T25 - 1.5 * (T75 - T25)
    return {'T15': T15, 'T25': T25, 'T50': T50, 'Tmin': Tmin}


def select_threshold(th: dict, choice: str):
    """Return the threshold stored under ``choice`` in ``th``.

    Raises:
        ValueError: if ``choice`` is not a key of ``th``.
    """
    if choice not in th:
        raise ValueError(f"Unknown threshold choice {choice}")
    return th[choice]


def translate_with_retry(translator, text, desc, max_retries=None, delay=1):
    """
    Retry translation until success (non-empty) or until max_retries if specified.

    Args:
        translator: Object exposing ``translate(text)``.
        text: Text to translate. ``None`` or blank/whitespace-only text is
            returned as ``""`` immediately without calling the translator:
            an empty result counts as a failure below, so retrying a blank
            input could never succeed.
        desc: Short label used as a prefix in log messages.
        max_retries: Maximum number of failed attempts before giving up.
            ``None`` (default) retries indefinitely, so a persistently failing
            input never returns. ``0`` gives up after the first failure.
        delay: Seconds to sleep between attempts.

    Returns:
        The non-empty translation, or ``""`` for blank input or once
        ``max_retries`` failed attempts have been made.
    """
    if text is None or not str(text).strip():
        return ""

    attempts = 0
    while True:
        try:
            result = translator.translate(text)
            if result:
                return result
            # Route empty/None results through the same retry path as exceptions
            raise ValueError("Empty translation")
        except Exception as e:
            attempts += 1
            logging.warning(f"[{desc}] translation attempt #{attempts} failed: {e}")
            # "is not None" (rather than truthiness) so that max_retries=0 is honoured
            if max_retries is not None and attempts >= max_retries:
                logging.error(f"[{desc}] reached max retries={max_retries}. Returning empty string.")
                return ""
            time.sleep(delay)


def translate_and_filter(
    input_csv: str,
    output_csv: str,
    threshold_choice: str = 'T25',
    sample_below: int = 50,
    model_name: str = 'paraphrase-multilingual-MiniLM-L12-v2',
    max_retries=None,
    seed=None,
):
    """Back-translate every column of a CSV via Punjabi and keep the reliable translations.

    For each column, every sentence is translated English → Punjabi → English.
    The original and the back-translated English are embedded with SBERT and
    compared by cosine similarity. A per-column threshold is derived from the
    score distribution (see ``compute_thresholds``); Punjabi translations at or
    above it are written to a new ``"<column>_pa"`` column, the rest are
    blanked, except for up to ``sample_below`` randomly chosen below-threshold
    rows which are kept as well.

    Args:
        input_csv: Path of the CSV to read (all columns are read as strings).
        output_csv: Path of the CSV to write (original columns plus ``_pa`` columns).
        threshold_choice: Key of the threshold to apply; must be one of the
            keys produced by ``compute_thresholds``.
        sample_below: Maximum number of below-threshold translations to keep per column.
        model_name: SentenceTransformer model used for the similarity scores.
        max_retries: Forwarded to ``translate_with_retry``; ``None`` keeps the
            original retry-until-success behaviour.
        seed: Optional seed for the below-threshold sampling. ``None`` uses the
            global ``random`` state, as before.

    Raises:
        ValueError: if ``threshold_choice`` is not a known threshold name
            (raised before any translation work starts).
    """
    setup_logging()

    # Fail fast: validate the choice against the keys compute_thresholds really
    # produces instead of discovering a typo only after the translation loop.
    select_threshold(compute_thresholds(np.ones(1)), threshold_choice)

    logging.info(f"Loading data from {input_csv}")
    df = pd.read_csv(input_csv, dtype=str)
    # Snapshot of the input columns; the "_pa" columns added below are not re-processed
    columns = df.columns.tolist()

    logging.info(f"Initializing translators and SBERT model: {model_name}")
    eng_to_pa = GoogleTranslator(source='en', target='pa')
    pa_to_eng = GoogleTranslator(source='pa', target='en')
    sbert = SentenceTransformer(model_name)
    # Private generator only when a seed is requested; otherwise the module-level one
    rng = random.Random(seed) if seed is not None else random

    for col in columns:
        logging.info(f"Processing column: '{col}' ({len(df)} sentences)")
        originals = df[col].fillna("").tolist()
        punjabi_texts = []
        back_translated = []

        logging.info("Translating to Punjabi and back-translating to English with retries...")
        for sent in tqdm(originals, desc=f"Translating '{col}'", unit="sent"):
            # Blank cells have nothing to translate. translate_with_retry treats an
            # empty result as a failure, so with unlimited retries a blank input
            # could retry indefinitely; skip it here.
            if not sent.strip():
                punjabi_texts.append("")
                back_translated.append("")
                continue

            pu = translate_with_retry(eng_to_pa, sent, desc="en->pa", max_retries=max_retries)
            punjabi_texts.append(pu)

            # No forward translation (retries exhausted) means nothing to back-translate
            back = (
                translate_with_retry(pa_to_eng, pu, desc="pa->en", max_retries=max_retries)
                if pu else ""
            )
            back_translated.append(back)

        logging.info("Computing SBERT embeddings and cosine similarities...")
        # Rows lacking either side get score 0 so they are excluded from the
        # percentile computation (compute_thresholds only considers scores > 0)
        # instead of being scored on the embeddings of empty strings.
        scorable = np.array(
            [bool(o.strip()) and bool(b.strip()) for o, b in zip(originals, back_translated)],
            dtype=bool,
        )
        if scorable.any():
            emb_o = sbert.encode(originals, convert_to_tensor=True)
            emb_b = sbert.encode(back_translated, convert_to_tensor=True)
            # Diagonal of the pairwise matrix = similarity of each row with its own back-translation
            sims = util.cos_sim(emb_o, emb_b).diag().cpu().numpy()
            scores = np.where(scorable, sims, 0.0)
        else:
            scores = np.zeros(len(originals))

        ths = compute_thresholds(scores)
        if not ths:
            # No positive score at all (e.g. an entirely blank column): there is no
            # distribution to derive a threshold from, so keep nothing for this column.
            logging.warning(f"No usable similarity scores for column '{col}'; leaving '{col}_pa' blank")
            df[f"{col}_pa"] = [""] * len(originals)
            continue
        thr = select_threshold(ths, threshold_choice)
        logging.info(f"Thresholds: {ths}; Selected ({threshold_choice}) = {thr:.3f}")

        # Only rows that actually have a translation are worth sampling
        below_idxs = [
            i for i, (pu, sc) in enumerate(zip(punjabi_texts, scores))
            if pu and sc < thr
        ]
        sample_n = min(len(below_idxs), sample_below)
        # A set gives O(1) membership tests in the filter below
        sampled_below = set(rng.sample(below_idxs, sample_n)) if sample_n > 0 else set()
        logging.info(f"Sampling {len(sampled_below)} below-threshold sentences (out of {len(below_idxs)})")

        df[f"{col}_pa"] = [
            pu if (sc >= thr or idx in sampled_below) else ""
            for idx, (pu, sc) in enumerate(zip(punjabi_texts, scores))
        ]

    df.to_csv(output_csv, index=False)
    logging.info(f"Filtered translations saved to {output_csv}")


def fine_tune_sbert(
    parallel_csv: str,
    en_col: str,
    pa_col: str,
    pretrained_model: str = 'paraphrase-multilingual-MiniLM-L12-v2',
    output_path: str = 'fine-tuned-en-pa-sbert',
    batch_size: int = 16,
    epochs: int = 3
):
    """Fine-tune an SBERT model on English–Punjabi sentence pairs.

    Reads ``parallel_csv``, builds one ``InputExample`` per row in which both
    ``en_col`` and ``pa_col`` are non-blank, and trains ``pretrained_model``
    with ``MultipleNegativesRankingLoss``, saving the result to ``output_path``.

    Args:
        parallel_csv: Path of the CSV holding the parallel corpus.
        en_col: Name of the column with the English sentences.
        pa_col: Name of the column with the Punjabi sentences.
        pretrained_model: SentenceTransformer model to start from.
        output_path: Directory passed to ``model.fit`` as ``output_path``.
        batch_size: Training batch size.
        epochs: Number of training epochs.

    Returns:
        None. Logs an error and returns early when a column is missing or no
        valid pair is found.
    """
    # Ensure INFO messages are visible even when this is the only stage run
    setup_logging()

    logging.info(f"Loading parallel corpus from {parallel_csv}")
    df = pd.read_csv(parallel_csv, dtype=str)

    missing_cols = [c for c in (en_col, pa_col) if c not in df.columns]
    if missing_cols:
        logging.error(f"Column(s) {missing_cols} not found in {parallel_csv}; aborting fine-tuning.")
        return

    # Missing cells are read as NaN (a float), on which .strip() would raise;
    # normalise them to "" before stripping.
    en_texts = df[en_col].fillna("").str.strip()
    pa_texts = df[pa_col].fillna("").str.strip()

    examples = []
    for idx, en, pa in zip(df.index, en_texts, pa_texts):
        if en and pa:
            examples.append(InputExample(texts=[en, pa]))
        else:
            logging.debug(f"Skipping invalid row {idx}: en='{en}', pa='{pa}'")

    if not examples:
        logging.error("No valid parallel examples found; aborting fine-tuning.")
        return

    logging.info(f"Initializing SBERT model for fine-tuning: {pretrained_model}")
    model = SentenceTransformer(pretrained_model)
    loader = DataLoader(examples, shuffle=True, batch_size=batch_size)
    train_loss = losses.MultipleNegativesRankingLoss(model)

    # Warm-up is counted in optimizer steps (batches), not examples:
    # 10% of len(loader) * epochs. Using len(examples) would overshoot by a
    # factor of about batch_size.
    warmup_steps = int(len(loader) * epochs * 0.1)

    logging.info(f"Starting fine-tuning: epochs={epochs}, batch_size={batch_size}")
    model.fit(
        train_objectives=[(loader, train_loss)],
        epochs=epochs,
        warmup_steps=warmup_steps,
        output_path=output_path,
        show_progress_bar=True
    )
    logging.info(f"Model fine-tuned and saved to {output_path}")


if __name__ == '__main__':
    import argparse

    # Configure logging BEFORE the first log call. A logging.info() issued on an
    # unconfigured root logger installs a default WARNING-level handler, after
    # which a plain basicConfig() is a no-op and INFO messages are dropped.
    setup_logging()

    parser = argparse.ArgumentParser(
        description='Translate/filter (with retries) and fine-tune SBERT on English–Punjabi data')
    parser.add_argument('--mode', choices=['translate', 'finetune', 'both'], default='both')
    parser.add_argument('--input_csv', type=str, default='is_test.csv')
    parser.add_argument('--output_csv', type=str, default='is_test_punjabi.csv')
    parser.add_argument('--parallel_csv', type=str, default='en_pa_parallel.csv')
    parser.add_argument('--en_col', type=str, default='english')
    parser.add_argument('--pa_col', type=str, default='punjabi')
    # choices= rejects a mistyped threshold immediately rather than after the translation run
    parser.add_argument('--threshold', type=str, default='T25',
                        choices=['T15', 'T25', 'T50', 'Tmin'],
                        help='one of T15,T25,T50,Tmin')
    parser.add_argument('--sample_below', type=int, default=50, help='how many low-sim samples to keep')
    parser.add_argument('--epochs', type=int, default=3)
    parser.add_argument('--batch_size', type=int, default=16)
    # parse_known_args tolerates extra arguments that are not ours
    # (presumably those injected by the notebook kernel launcher)
    args, unknown = parser.parse_known_args()

    logging.info(f"Script started in mode={args.mode}")

    if args.mode in ('translate', 'both'):
        translate_and_filter(
            args.input_csv,
            args.output_csv,
            threshold_choice=args.threshold,
            sample_below=args.sample_below
        )

    if args.mode in ('finetune', 'both'):
        fine_tune_sbert(
            args.parallel_csv,
            args.en_col,
            args.pa_col,
            pretrained_model='paraphrase-multilingual-MiniLM-L12-v2',
            output_path='fine-tuned-en-pa-sbert',
            batch_size=args.batch_size,
            epochs=args.epochs
        )
    logging.info("Script finished.")


2025-06-12 20:32:07 - INFO - Script started in mode=both
2025-06-12 20:32:07 - INFO - Loading data from is_test.csv
2025-06-12 20:32:07 - INFO - Initializing translators and SBERT model: paraphrase-multilingual-MiniLM-L12-v2
2025-06-12 20:32:07 - INFO - Use pytorch device_name: cpu
2025-06-12 20:32:07 - INFO - Load pretrained SentenceTransformer: paraphrase-multilingual-MiniLM-L12-v2
2025-06-12 20:32:12 - INFO - Processing column: '1' (4500 sentences)
2025-06-12 20:32:12 - INFO - Translating to Punjabi and back-translating to English with retries...
Translating '1':  38%|███▊      | 1718/4500 [2:04:05<1:59:36,  2.58s/sent]

In [None]:
# The same procedure applied to the test set (initially 1000 rows, cut down to 500 rows)


import pandas as pd
from deep_translator import GoogleTranslator
from sentence_transformers import SentenceTransformer, util

# ─── 1) CONFIGURATION ────────────────────────────────────────────────────────
INPUT_CSV_PATH      = "200_sampled.csv"           # Your English CSV
OUTPUT_CSV_PATH     = "200_sampled_output.csv"   # Output Punjabi CSV
SIMILARITY_THRESHOLD = 0.85                      # Minimum cosine similarity (original vs. back-translation) to keep a row

# Initialize Deep-Translator’s GoogleTranslator and SBERT
#   - source='en', target='pa'  means English → Punjabi
#   - source='pa', target='en'  means Punjabi → English
eng_to_pa = GoogleTranslator(source='en', target='pa')
pa_to_eng = GoogleTranslator(source='pa', target='en')

# SBERT model for computing cosine similarity
sbert = SentenceTransformer('paraphrase-multilingual-MiniLM-L12-v2')

# ─── 2) READ INPUT CSV ───────────────────────────────────────────────────────
df = pd.read_csv(INPUT_CSV_PATH, dtype=str)  # Read all columns as string
text_columns = list(df.columns)  # Snapshot: the "<col>_pa" columns added below must not be re-processed

# ─── 3) TRANSLATE, BACK-TRANSLATE & FILTER ─────────────────────────────────
for col in text_columns:
    # Missing cells become "" so every entry is a string; computed once and reused for scoring
    originals = df[col].fillna("").tolist()
    punjabi_texts = []
    back_translated_texts = []
    failed_calls = 0  # Translator calls that raised; reported once per column instead of being hidden

    # a) Translate each sentence to Punjabi, then back to English
    for sentence in originals:
        # Nothing to translate: record blanks without spending two network round-trips
        if not sentence.strip():
            punjabi_texts.append("")
            back_translated_texts.append("")
            continue

        # English → Punjabi ("or ''" guards against the translator handing back None)
        try:
            pu_translation = eng_to_pa.translate(sentence) or ""
        except Exception:
            pu_translation = ""
            failed_calls += 1
        punjabi_texts.append(pu_translation)

        # Punjabi → English (back-translation); skipped when the forward step produced nothing
        back_eng = ""
        if pu_translation:
            try:
                # A None here would otherwise reach sbert.encode() below
                back_eng = pa_to_eng.translate(pu_translation) or ""
            except Exception:
                failed_calls += 1
        back_translated_texts.append(back_eng)

    if failed_calls:
        print(f"WARNING: column '{col}': {failed_calls} translation call(s) failed; affected rows are left blank")

    # b) Compute SBERT embeddings & cosine similarities
    # If all original or back-translated texts are empty, skip similarity computation
    if any(originals) and any(back_translated_texts):
        embeddings_orig = sbert.encode(originals, convert_to_tensor=True)
        embeddings_back = sbert.encode(back_translated_texts, convert_to_tensor=True)
        # cos_sim returns the full pairwise matrix; its diagonal holds the row-aligned similarities
        cosine_scores = util.cos_sim(embeddings_orig, embeddings_back).diag().cpu().numpy()
    else:
        cosine_scores = [0.0] * len(originals)

    # c) Filter: keep Punjabi translation if similarity ≥ threshold, else blank
    filtered_punjabi = [
        pu if score >= SIMILARITY_THRESHOLD else ""
        for pu, score in zip(punjabi_texts, cosine_scores)
    ]

    # d) Add a new column named "<original_column>_pa"
    df[f"{col}_pa"] = filtered_punjabi

# ─── 4) SAVE OUTPUT CSV ──────────────────────────────────────────────────────
df.to_csv(OUTPUT_CSV_PATH, index=False)
print(f"✅ Punjabi CSV saved as: {OUTPUT_CSV_PATH}")


  from .autonotebook import tqdm as notebook_tqdm


✅ Punjabi CSV saved as: 200_sampled_output.csv


In [None]:

#removing those rows having unfilled cells
import numpy as np
# Translation columns are the ones carrying the "_pa" suffix
pa_cols = [name for name in df.columns if name.endswith("_pa")]

# Turn blank strings into NaN so dropna() can detect them, then discard
# every row in which at least one translation column is missing
df = df.replace("", np.nan).dropna(subset=pa_cols, how="any")

# Write the fully translated rows to disk
df.to_excel("sampled.xlsx", index=False)


In [None]:
# Randomly pick 200 rows from the filtered spreadsheet (sampled.xlsx)


import random

input_path = "sampled.xlsx"
output_path = "final_sampled.xlsx"

samples = 200  # Desired number of rows in the final sample

df = pd.read_excel(input_path)

# DataFrame.sample raises ValueError when asked for more rows than exist
# (sampling without replacement), so cap the request at the available row
# count and say so instead of crashing.
n_to_draw = min(samples, len(df))
if n_to_draw < samples:
    print(f"Only {len(df)} rows available in {input_path}; sampling {n_to_draw} instead of {samples}")

# Fixed random_state keeps the selection reproducible across runs
sampled_df = df.sample(n_to_draw, random_state=42)

sampled_df.to_excel(output_path, index=False)
print("done")

done
