In [12]:
import pandas as pd

# Configuration
INPUT_PATH = "/Users/vighneshms/Downloads/Email_classifier/models/combined_emails_with_natural_pii.csv"
OUTPUT_PATH = "/Users/vighneshms/Downloads/Email_classifier/models/emails_with_subjects.csv"
MISSING_SUBJECT_TOKEN = "[NO_SUBJECT]"

def extract_subject(email):
    """Extract subject with placeholder for missing subjects"""
    if not isinstance(email, str):
        return MISSING_SUBJECT_TOKEN
    
    # Case-insensitive subject detection
    email_lower = email.lower()
    if "subject:" in email_lower[:100]:  # Only check first 100 chars
        subject_marker_index = email_lower.find("subject:") + len("subject:")
        subject = email[subject_marker_index:].split('\n', 1)[0].strip()
        return subject if subject else MISSING_SUBJECT_TOKEN
    
    return MISSING_SUBJECT_TOKEN

# Read original data
print("Reading source data...")
df = pd.read_csv(INPUT_PATH)

# Create new dataframe with required columns
print("Processing emails...")
new_df = pd.DataFrame({
    'subject': df['email'].apply(extract_subject),
    'email': df['email'],  # Keep original complete email
    'type': df['type']
})

# Verification
total_emails = len(new_df)
missing_subjects = sum(new_df['subject'] == MISSING_SUBJECT_TOKEN)
print(f"\nProcessing complete:")
print(f"Total emails: {total_emails}")
print(f"Emails with '{MISSING_SUBJECT_TOKEN}': {missing_subjects} ({missing_subjects/total_emails:.1%})")

# Save to new CSV
new_df.to_csv(OUTPUT_PATH, index=False)
print(f"\nSaved to: {OUTPUT_PATH}")
print("Final columns:", new_df.columns.tolist())

# Show sample
print("\nSample data:")
print(new_df.head(3))

Reading source data...
Processing emails...

Processing complete:
Total emails: 24000
Emails with '[NO_SUBJECT]': 1929 (8.0%)

Saved to: /Users/vighneshms/Downloads/Email_classifier/models/emails_with_subjects.csv
Final columns: ['subject', 'email', 'type']

Sample data:
                                             subject  \
0  Unvorhergesehener Absturz der Datenanalyse-Pla...   
1                           Customer Support Inquiry   
2                      Data Analytics for Investment   

                                               email      type  
0  Subject: Unvorhergesehener Absturz der Datenan...  Incident  
1  Subject: Customer Support Inquiry\n\nSeeking i...   Request  
2  Subject: Data Analytics for Investment\n\nI am...   Request  


In [10]:
# Add column to flag emails where subject wasn't extracted
df['subject_extracted'] = df['subject'] != ""

# Save verification report
verification_path = output_path.replace(".csv", "_verification.csv")
df.to_csv(verification_path, index=False)
print(f"\nFull verification data saved to: {verification_path}")


Full verification data saved to: /Users/vighneshms/Downloads/Email_classifier/models/emails_subject_body_type2_verification.csv


In [7]:
pip install langdetect

Defaulting to user installation because normal site-packages is not writeable
Collecting langdetect
  Downloading langdetect-1.0.9.tar.gz (981 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m981.5/981.5 kB[0m [31m19.0 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25ldone
Building wheels for collected packages: langdetect
  Building wheel for langdetect (setup.py) ... [?25ldone
[?25h  Created wheel for langdetect: filename=langdetect-1.0.9-py3-none-any.whl size=993242 sha256=81b0ae2790832e08c08cabebb44e8167306fd12eed2d312191215a83dea46a4e
  Stored in directory: /Users/vighneshms/Library/Caches/pip/wheels/d1/c1/d9/7e068de779d863bc8f8fc9467d85e25cfe47fa5051fff1a1bb
Successfully built langdetect
Installing collected packages: langdetect
Successfully installed langdetect-1.0.9
Note: you may need to restart the kernel to use updated packages.


In [8]:
from langdetect import detect, LangDetectException
import pandas as pd

def detect_language(text):
    try:
        return detect(text) if pd.notna(text) and text.strip() else 'unknown'
    except LangDetectException:
        return 'unknown'

# Detect language for each email
df['language'] = df['body'].apply(detect_language)

# Check distribution
print(df['language'].value_counts())

language
en    15321
de     6897
es      810
fr      484
pt      475
nl       12
it        1
Name: count, dtype: int64


In [1]:
pip install torch transformers pandas tqdm deep-translator langdetect

Defaulting to user installation because normal site-packages is not writeable
Note: you may need to restart the kernel to use updated packages.


In [2]:
 pip install sacremoses

Defaulting to user installation because normal site-packages is not writeable
Collecting sacremoses
  Downloading sacremoses-0.1.1-py3-none-any.whl.metadata (8.3 kB)
Downloading sacremoses-0.1.1-py3-none-any.whl (897 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m897.5/897.5 kB[0m [31m10.9 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: sacremoses
Successfully installed sacremoses-0.1.1
Note: you may need to restart the kernel to use updated packages.


In [2]:
!python3 -m pip install --upgrade 'optree>=0.13.0'

Defaulting to user installation because normal site-packages is not writeable
Collecting optree>=0.13.0
  Downloading optree-0.15.0-cp39-cp39-macosx_11_0_arm64.whl.metadata (48 kB)
Downloading optree-0.15.0-cp39-cp39-macosx_11_0_arm64.whl (329 kB)
Installing collected packages: optree
  Attempting uninstall: optree
    Found existing installation: optree 0.12.1
    Uninstalling optree-0.12.1:
      Successfully uninstalled optree-0.12.1
Successfully installed optree-0.15.0


In [3]:
!pip install --upgrade jupyter ipywidgets
!jupyter nbextension enable --py widgetsnbextension

Defaulting to user installation because normal site-packages is not writeable
Collecting jupyter
  Downloading jupyter-1.1.1-py2.py3-none-any.whl.metadata (2.0 kB)
Collecting ipywidgets
  Downloading ipywidgets-8.1.6-py3-none-any.whl.metadata (2.4 kB)
Collecting notebook (from jupyter)
  Downloading notebook-7.4.0-py3-none-any.whl.metadata (10 kB)
Collecting jupyter-console (from jupyter)
  Downloading jupyter_console-6.6.3-py3-none-any.whl.metadata (5.8 kB)
Collecting nbconvert (from jupyter)
  Downloading nbconvert-7.16.6-py3-none-any.whl.metadata (8.5 kB)
Collecting jupyterlab (from jupyter)
  Downloading jupyterlab-4.4.1-py3-none-any.whl.metadata (16 kB)
Collecting widgetsnbextension~=4.0.14 (from ipywidgets)
  Downloading widgetsnbextension-4.0.14-py3-none-any.whl.metadata (1.6 kB)
Collecting jupyterlab_widgets~=3.0.14 (from ipywidgets)
  Downloading jupyterlab_widgets-3.0.14-py3-none-any.whl.metadata (4.1 kB)
Collecting async-lru>=1.0.0 (from jupyterlab->jupyter)
  Downloading as

In [23]:
import pandas as pd
from deep_translator import GoogleTranslator
from langdetect import detect, LangDetectException
from tqdm import tqdm
import torch
from threading import Lock
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
import os

# Paths
INPUT_PATH = "/Users/vighneshms/Downloads/Email_classifier/models/emails_with_subjects.csv"
OUTPUT_PATH = "/Users/vighneshms/Downloads/Email_classifier/models/emails_translated2.csv"
MISSING_SUBJECT_TOKEN = "[NO_SUBJECT]"
DEVICE = torch.device("mps") if torch.backends.mps.is_available() else torch.device("cpu")

# Counters
counters = {'total': 0, 'translated': 0, 'skipped': 0, 'failed': 0, 'english': 0}
counter_lock = Lock()
last_display = ""

def update_counter(key, value=1):
    global last_display
    with counter_lock:
        counters[key] += value
        display_str = (f"Total: {counters['total']} | Translated: {counters['translated']} | "
                       f"English: {counters['english']} | Skipped: {counters['skipped']} | Failed: {counters['failed']}")
        if display_str != last_display:
            print("\r" + " " * len(last_display), end="", flush=True)
            print("\r" + display_str, end="", flush=True)
            last_display = display_str

# Load model
print("⚡ Loading translation model...")
tokenizer = AutoTokenizer.from_pretrained("Helsinki-NLP/opus-mt-mul-en")
model = AutoModelForSeq2SeqLM.from_pretrained("Helsinki-NLP/opus-mt-mul-en").to(DEVICE)
model.eval()

def mps_translate(text, max_length=512):
    if not isinstance(text, str) or not text.strip() or text == MISSING_SUBJECT_TOKEN:
        return text
    try:
        inputs = tokenizer(text, return_tensors="pt", truncation=True, max_length=max_length).to(DEVICE)
        outputs = model.generate(**inputs)
        return tokenizer.decode(outputs[0], skip_special_tokens=True)
    except Exception as e:
        print(f"\nModel translation error: {str(e)}")
        return text

def hybrid_translate(text):
    update_counter('total')
    if not isinstance(text, str) or not text.strip() or text == MISSING_SUBJECT_TOKEN:
        update_counter('skipped')
        return text
    try:
        if detect(text) == 'en':
            update_counter('english')
            return text
    except LangDetectException:
        pass
    translated = mps_translate(text)
    if translated != text:
        update_counter('translated')
        return translated
    try:
        translated = GoogleTranslator(source='auto', target='en').translate(text)
        if translated:
            update_counter('translated')
            return translated
    except Exception as e:
        update_counter('failed')
        print(f"\nGoogle Translate failed: {str(e)}")
    return text

# Load original data
print("📂 Loading data...")
df = pd.read_csv(INPUT_PATH)

# Resume support
translated_rows = 0  # default

if os.path.exists(OUTPUT_PATH):
    print("⏳ Resuming from saved progress...")
    translated_df = pd.read_csv(OUTPUT_PATH)
    
    # 👇 Sanity check: make sure it's a DataFrame
    if isinstance(translated_df, pd.DataFrame):
        translated_rows = len(translated_df)
    else:
        print("⚠️ Warning: Failed to read translated CSV properly.")
        translated_rows = 0  # fallback
else:
    translated_df = pd.DataFrame(columns=['subject', 'subject_en', 'email', 'email_en', 'type'])

# Process and save row-by-row
print("\n🚀 Translating...")
with open(OUTPUT_PATH, 'a', encoding='utf-8', newline='') as f:
    for idx, row in tqdm(df.iterrows(), total=int(len(df)), initial=int(translated_rows)):
        subj, email, typ = row['subject'], row['email'], row.get('type', "")
        subj_en = hybrid_translate(subj)
        email_en = hybrid_translate(email)
        result_row = pd.DataFrame([[subj, subj_en, email, email_en, typ]],
                                  columns=['subject', 'subject_en', 'email', 'email_en', 'type'])
        result_row.to_csv(f, header=(idx == translated_rows and translated_rows == 0), index=False)

print("\n✅ Translation complete!")
print("Final counts:")
for k, v in counters.items():
    print(f"{k.title()}: {v}")


⚡ Loading translation model...
📂 Loading data...
⏳ Resuming from saved progress...


EmptyDataError: No columns to parse from file

In [26]:
import pandas as pd
from deep_translator import GoogleTranslator
from langdetect import detect, LangDetectException
from tqdm import tqdm
import torch
from threading import Lock
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
import os
import signal
import sys

# Paths
INPUT_PATH = "/Users/vighneshms/Downloads/Email_classifier/models/emails_with_subjects.csv"
OUTPUT_PATH = "/Users/vighneshms/Downloads/Email_classifier/models/emails_translated2.csv"
CHECKPOINT_PATH = OUTPUT_PATH + ".checkpoint"
MISSING_SUBJECT_TOKEN = "[NO_SUBJECT]"
DEVICE = torch.device("mps") if torch.backends.mps.is_available() else torch.device("cpu")

# Counters
counters = {'total': 0, 'translated': 0, 'skipped': 0, 'failed': 0, 'english': 0}
counter_lock = Lock()
last_display = ""
interrupted = False

def update_counter(key, value=1):
    global last_display
    with counter_lock:
        counters[key] += value
        display_str = (f"Total: {counters['total']} | Translated: {counters['translated']} | "
                       f"English: {counters['english']} | Skipped: {counters['skipped']} | Failed: {counters['failed']}")
        if display_str != last_display:
            print("\r" + " " * len(last_display), end="", flush=True)
            print("\r" + display_str, end="", flush=True)
            last_display = display_str

# Signal handling for interrupts
def handle_interrupt(signal_received, frame):
    global interrupted
    print("\n🛑 Interrupt received. Saving progress...")
    interrupted = True
    with open(CHECKPOINT_PATH, "w") as cp:
        cp.write(str(counters['total']))
    sys.exit(0)

signal.signal(signal.SIGINT, handle_interrupt)

# Load model
print("⚡ Loading translation model...")
tokenizer = AutoTokenizer.from_pretrained("Helsinki-NLP/opus-mt-mul-en")
model = AutoModelForSeq2SeqLM.from_pretrained("Helsinki-NLP/opus-mt-mul-en").to(DEVICE)
model.eval()

def mps_translate(text, max_length=512):
    if not isinstance(text, str) or not text.strip() or text == MISSING_SUBJECT_TOKEN:
        return text
    try:
        inputs = tokenizer(text, return_tensors="pt", truncation=True, max_length=max_length).to(DEVICE)
        outputs = model.generate(**inputs)
        return tokenizer.decode(outputs[0], skip_special_tokens=True)
    except Exception as e:
        print(f"\nModel translation error: {str(e)}")
        return text

def hybrid_translate(text):
    update_counter('total')
    if not isinstance(text, str) or not text.strip() or text == MISSING_SUBJECT_TOKEN:
        update_counter('skipped')
        return text
    try:
        if detect(text) == 'en':
            update_counter('english')
            return text
    except LangDetectException:
        pass
    translated = mps_translate(text)
    if translated != text:
        update_counter('translated')
        return translated
    try:
        translated = GoogleTranslator(source='auto', target='en').translate(text)
        if translated:
            update_counter('translated')
            return translated
    except Exception as e:
        update_counter('failed')
        print(f"\nGoogle Translate failed: {str(e)}")
    return text

# Load original data
print("📂 Loading data...")
df = pd.read_csv(INPUT_PATH)

# Resume support
translated_rows = 0
if os.path.exists(OUTPUT_PATH):
    print("⏳ Resuming from saved progress...")
    translated_df = pd.read_csv(OUTPUT_PATH)
    if isinstance(translated_df, pd.DataFrame):
        translated_rows = len(translated_df)
if os.path.exists(CHECKPOINT_PATH):
    with open(CHECKPOINT_PATH, "r") as cp:
        try:
            translated_rows = int(cp.read().strip())
        except:
            translated_rows = 0

df = df.iloc[translated_rows:]

# Process and save row-by-row
print("\n🚀 Translating...")
with open(OUTPUT_PATH, 'a', encoding='utf-8', newline='') as f:
    for idx, row in tqdm(df.iterrows(), total=len(df), initial=translated_rows):
        subj, email, typ = row['subject'], row['email'], row.get('type', "")
        subj_en = hybrid_translate(subj)
        email_en = hybrid_translate(email)
        result_row = pd.DataFrame([[subj, subj_en, email, email_en, typ]],
                                  columns=['subject', 'subject_en', 'email', 'email_en', 'type'])
        result_row.to_csv(f, header=(idx == 0 and translated_rows == 0), index=False)

        # Update checkpoint
        with open(CHECKPOINT_PATH, "w") as cp:
            cp.write(str(translated_rows + idx + 1))

print("\n✅ Translation complete!")
print("Final counts:")
for k, v in counters.items():
    print(f"{k.title()}: {v}")

# Clean up checkpoint
if os.path.exists(CHECKPOINT_PATH):
    os.remove(CHECKPOINT_PATH)


⚡ Loading translation model...
📂 Loading data...
⏳ Resuming from saved progress...


EmptyDataError: No columns to parse from file

In [27]:
import pandas as pd
from deep_translator import GoogleTranslator
from langdetect import detect, LangDetectException
from tqdm import tqdm
import torch
from threading import Lock
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
import os

# Paths
INPUT_PATH = "/Users/vighneshms/Downloads/Email_classifier/models/emails_with_subjects.csv"
OUTPUT_PATH = "/Users/vighneshms/Downloads/Email_classifier/models/emails_translated2.csv"
MISSING_SUBJECT_TOKEN = "[NO_SUBJECT]"
DEVICE = torch.device("mps") if torch.backends.mps.is_available() else torch.device("cpu")

# Counters
counters = {'total': 0, 'translated': 0, 'skipped': 0, 'failed': 0, 'english': 0}
counter_lock = Lock()
last_display = ""

def update_counter(key, value=1):
    global last_display
    with counter_lock:
        counters[key] += value
        display_str = (f"Total: {counters['total']} | Translated: {counters['translated']} | "
                       f"English: {counters['english']} | Skipped: {counters['skipped']} | Failed: {counters['failed']}")
        if display_str != last_display:
            print("\r" + " " * len(last_display), end="", flush=True)
            print("\r" + display_str, end="", flush=True)
            last_display = display_str

# Load model
print("⚡ Loading translation model...")
tokenizer = AutoTokenizer.from_pretrained("Helsinki-NLP/opus-mt-mul-en")
model = AutoModelForSeq2SeqLM.from_pretrained("Helsinki-NLP/opus-mt-mul-en").to(DEVICE)
model.eval()

def mps_translate(text, max_length=512):
    if not isinstance(text, str) or not text.strip() or text == MISSING_SUBJECT_TOKEN:
        return text
    try:
        inputs = tokenizer(text, return_tensors="pt", truncation=True, max_length=max_length).to(DEVICE)
        outputs = model.generate(**inputs)
        return tokenizer.decode(outputs[0], skip_special_tokens=True)
    except Exception as e:
        print(f"\nModel translation error: {str(e)}")
        return text

def hybrid_translate(text):
    update_counter('total')
    if not isinstance(text, str) or not text.strip() or text == MISSING_SUBJECT_TOKEN:
        update_counter('skipped')
        return text
    try:
        if detect(text) == 'en':
            update_counter('english')
            return text
    except LangDetectException:
        pass
    translated = mps_translate(text)
    if translated != text:
        update_counter('translated')
        return translated
    try:
        translated = GoogleTranslator(source='auto', target='en').translate(text)
        if translated:
            update_counter('translated')
            return translated
    except Exception as e:
        update_counter('failed')
        print(f"\nGoogle Translate failed: {str(e)}")
    return text

# Load original data
print("📂 Loading data...")
df = pd.read_csv(INPUT_PATH)

# Resume support
translated_rows = 0  # default

if os.path.exists(OUTPUT_PATH):
    print("⏳ Resuming from saved progress...")
    translated_df = pd.read_csv(OUTPUT_PATH)
    
    # 👇 Sanity check: make sure it's a DataFrame
    if isinstance(translated_df, pd.DataFrame):
        translated_rows = len(translated_df)
    else:
        print("⚠️ Warning: Failed to read translated CSV properly.")
        translated_rows = 0  # fallback
else:
    translated_df = pd.DataFrame(columns=['subject', 'subject_en', 'email', 'email_en', 'type'])

# Process and save row-by-row
print("\n🚀 Translating...")
with open(OUTPUT_PATH, 'a', encoding='utf-8', newline='') as f:
    for idx, row in tqdm(df.iterrows(), total=int(len(df)), initial=int(translated_rows)):
        subj, email, typ = row['subject'], row['email'], row.get('type', "")
        subj_en = hybrid_translate(subj)
        email_en = hybrid_translate(email)
        result_row = pd.DataFrame([[subj, subj_en, email, email_en, typ]],
                                  columns=['subject', 'subject_en', 'email', 'email_en', 'type'])
        result_row.to_csv(f, header=(idx == translated_rows and translated_rows == 0), index=False)

print("\n✅ Translation complete!")
print("Final counts:")
for k, v in counters.items():
    print(f"{k.title()}: {v}")

⚡ Loading translation model...
📂 Loading data...
⏳ Resuming from saved progress...


EmptyDataError: No columns to parse from file

In [34]:
import pandas as pd
from deep_translator import GoogleTranslator
from langdetect import detect, LangDetectException
from tqdm.auto import tqdm
import torch
from threading import Lock
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

# Configuration
INPUT_PATH = "/Users/vighneshms/Downloads/Email_classifier/models/foreign_language_emails.csv"
SUBJECT_OUTPUT_PATH = "/Users/vighneshms/Downloads/Email_classifier/models/subjects_translated.csv"
EMAIL_OUTPUT_PATH = "/Users/vighneshms/Downloads/Email_classifier/models/emails_translated.csv"
MISSING_SUBJECT_TOKEN = "[NO_SUBJECT]"
DEVICE = torch.device("cpu") #if torch.backends.mps.is_available() else torch.device("cpu")

# Initialize model
print("⚡ Loading translation model...")
tokenizer = AutoTokenizer.from_pretrained("Helsinki-NLP/opus-mt-mul-en")
model = AutoModelForSeq2SeqLM.from_pretrained("Helsinki-NLP/opus-mt-mul-en").to(DEVICE)

# Counters
counters = {'total': 0, 'translated': 0, 'skipped': 0, 'failed': 0, 'english': 0}
counter_lock = Lock()

def update_counter(key, value=1):
    with counter_lock:
        counters[key] += value
        print(f"\rTotal: {counters['total']} | "
              f"Translated: {counters['translated']} | "
              f"English: {counters['english']} | "
              f"Skipped: {counters['skipped']} | "
              f"Failed: {counters['failed']}", end="", flush=True)

def mps_translate(text, max_length=512):
    if not isinstance(text, str) or not text.strip() or text == MISSING_SUBJECT_TOKEN:
        return text
    try:
        inputs = tokenizer(text, return_tensors="pt", truncation=True, max_length=max_length).to(DEVICE)
        outputs = model.generate(**inputs)
        return tokenizer.decode(outputs[0], skip_special_tokens=True)
    except Exception as e:
        print(f"\nModel translation error: {str(e)}")
        return text

def translate_text(text):
    update_counter('total')
    if not isinstance(text, str) or not text.strip() or text == MISSING_SUBJECT_TOKEN:
        update_counter('skipped')
        return text
    try:
        if detect(text) == 'en':
            update_counter('english')
            return text
    except LangDetectException:
        pass
    translated = mps_translate(text)
    if translated != text:
        update_counter('translated')
        return translated
    try:
        translated = GoogleTranslator(source='auto', target='en').translate(text)
        if translated:
            update_counter('translated')
            return translated
    except Exception as e:
        update_counter('failed')
        print(f"\nGoogle Translate failed: {str(e)}")
    return text

# Read data
print("\n📂 Loading data...")
df = pd.read_csv(INPUT_PATH)

# Translate subject
print("\n🔤 Translating subject column...")
counters = dict.fromkeys(counters, 0)
tqdm.pandas(desc="Translating subjects")
df['subject_en'] = df['subject'].progress_apply(translate_text)
df[['subject', 'subject_en', 'type']].to_csv(SUBJECT_OUTPUT_PATH, index=False)
print(f"\n✅ Subject translation complete. Saved to {SUBJECT_OUTPUT_PATH}")

# Translate email
print("\n📧 Translating email column...")
counters = dict.fromkeys(counters, 0)
tqdm.pandas(desc="Translating emails")
df['email_en'] = df['email'].progress_apply(translate_text)
df[['email', 'email_en', 'type']].to_csv(EMAIL_OUTPUT_PATH, index=False)
print(f"\n✅ Email translation complete. Saved to {EMAIL_OUTPUT_PATH}")


⚡ Loading translation model...

📂 Loading data...

🔤 Translating subject column...


Translating subjects:   0%|          | 0/8701 [00:00<?, ?it/s]

Total: 8701 | Translated: 7826 | English: 176 | Skipped: 699 | Failed: 0
✅ Subject translation complete. Saved to /Users/vighneshms/Downloads/Email_classifier/models/subjects_translated.csv

📧 Translating email column...


Translating emails:   0%|          | 0/8701 [00:00<?, ?it/s]

Total: 159 | Translated: 158 | English: 0 | Skipped: 0 | Failed: 0

KeyboardInterrupt: 

In [36]:
import pandas as pd
from deep_translator import GoogleTranslator
from langdetect import detect, LangDetectException
from tqdm.auto import tqdm
import torch
from threading import Lock
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

# Configuration
INPUT_PATH = "/Users/vighneshms/Downloads/Email_classifier/models/foreign_language_emails.csv"
OUTPUT_PATH = "/Users/vighneshms/Downloads/Email_classifier/models/emails_translated.csv"
MISSING_SUBJECT_TOKEN = "[NO_SUBJECT]"
DEVICE = torch.device("cpu") #if torch.backends.mps.is_available() else torch.device("cpu")

# Load translation model
print("⚡ Loading translation model...")
tokenizer = AutoTokenizer.from_pretrained("Helsinki-NLP/opus-mt-mul-en")
model = AutoModelForSeq2SeqLM.from_pretrained("Helsinki-NLP/opus-mt-mul-en").to(DEVICE)

# Counters
counters = {'total': 0, 'translated': 0, 'skipped': 0, 'failed': 0, 'english': 0}
counter_lock = Lock()

def update_counter(key, value=1):
    with counter_lock:
        counters[key] += value
        print(f"\rTotal: {counters['total']} | "
              f"Translated: {counters['translated']} | "
              f"English: {counters['english']} | "
              f"Skipped: {counters['skipped']} | "
              f"Failed: {counters['failed']}", end="", flush=True)

def mps_translate(text, max_length=512):
    if not isinstance(text, str) or not text.strip() or text == MISSING_SUBJECT_TOKEN:
        return text
    try:
        inputs = tokenizer(text, return_tensors="pt", truncation=True, max_length=max_length).to(DEVICE)
        outputs = model.generate(**inputs)
        return tokenizer.decode(outputs[0], skip_special_tokens=True)
    except Exception as e:
        print(f"\nModel translation error: {str(e)}")
        return text

def translate_text(text):
    update_counter('total')
    if not isinstance(text, str) or not text.strip() or text == MISSING_SUBJECT_TOKEN:
        update_counter('skipped')
        return text
    try:
        if detect(text) == 'en':
            update_counter('english')
            return text
    except LangDetectException:
        pass
    translated = mps_translate(text)
    if translated != text:
        update_counter('translated')
        return translated
    try:
        translated = GoogleTranslator(source='auto', target='en').translate(text)
        if translated:
            update_counter('translated')
            return translated
    except Exception as e:
        update_counter('failed')
        print(f"\nGoogle Translate failed: {str(e)}")
    return text

# Load data with translated subjects
print("\n📂 Loading data...")
df = pd.read_csv(INPUT_PATH)

# Translate email
print("\n📧 Translating email column...")
tqdm.pandas(desc="Translating emails")
df['email_en'] = df['email'].progress_apply(translate_text)

# Save final output
output_cols = ['subject', 'subject_en', 'email', 'email_en', 'type']
df.to_csv(OUTPUT_PATH, index=False)

print(f"\n✅ Email translation complete. Saved to {OUTPUT_PATH}")
print("Final counts:")
for k, v in counters.items():
    print(f"{k.title()}: {v}")


⚡ Loading translation model...

📂 Loading data...

📧 Translating email column...


Translating emails:   0%|          | 0/8701 [00:00<?, ?it/s]

Total: 735 | Translated: 734 | English: 0 | Skipped: 0 | Failed: 0

KeyboardInterrupt: 

In [30]:
import pandas as pd
from langdetect import detect, LangDetectException
from tqdm import tqdm

# Set seed for consistent language detection (important for reproducibility)
from langdetect import DetectorFactory
DetectorFactory.seed = 0

def detect_language(text):
    """Enhanced language detection with error handling"""
    try:
        if pd.isna(text) or not text.strip():
            return 'unknown'
        lang = detect(text)
        return lang if lang != 'en' else 'en'  # Explicit English marking
    except LangDetectException:
        return 'unknown'

# Load your dataset
input_path = "/Users/vighneshms/Downloads/Email_classifier/models/emails_with_subjects.csv"
df = pd.read_csv(input_path)

# Detect languages with progress bar
print("Detecting languages...")
tqdm.pandas(desc="Processing emails")
df['language'] = df['email'].progress_apply(detect_language)

# Filter non-English emails (change 'en' to include other languages you consider foreign)
foreign_langs = df[df['language'] != 'en']

# Save foreign language emails to new CSV
output_path = "/Users/vighneshms/Downloads/Email_classifier/models/foreign_language_emails.csv"
foreign_langs.to_csv(output_path, index=False)

# Print summary
print("\nLanguage Distribution:")
print(df['language'].value_counts())
print(f"\nSaved {len(foreign_langs)} foreign language emails to: {output_path}")
print("Sample foreign language emails:")
print(foreign_langs[['email', 'language']].head(3))

Detecting languages...


Processing emails: 100%|██████████| 24000/24000 [00:57<00:00, 416.67it/s]



Language Distribution:
language
en    15299
de     6922
es      812
fr      484
pt      474
nl        8
it        1
Name: count, dtype: int64

Saved 8701 foreign language emails to: /Users/vighneshms/Downloads/Email_classifier/models/foreign_language_emails.csv
Sample foreign language emails:
                                               email language
0  Subject: Unvorhergesehener Absturz der Datenan...       de
3  Subject: Krankenhaus-Dienstleistung-Problem\n\...       de
6  Subject: Ratung für Sicherung medizinischer Da...       de


In [None]:
import pandas as pd
from deep_translator import GoogleTranslator
from langdetect import detect
from tqdm.auto import tqdm
import torch
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
from concurrent.futures import ThreadPoolExecutor, as_completed
import numpy as np
import os
import signal
import sys
from functools import partial

# Configuration
INPUT_PATH = "/Users/vighneshms/Downloads/Email_classifier/models/foreign_language_emails.csv"
OUTPUT_PATH = "/Users/vighneshms/Downloads/Email_classifier/models/emails_translated.csv"
PROGRESS_PATH = OUTPUT_PATH.replace(".csv", "_progress.csv")
MISSING_SUBJECT_TOKEN = "[NO_SUBJECT]"
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "mps" if torch.backends.mps.is_available() else "cpu")
BATCH_SIZE = 64 if "cuda" in str(DEVICE) else 32  # Larger batches for GPU
MAX_WORKERS = 8 if "cuda" in str(DEVICE) else 4   # More workers for GPU
TRANSLATION_TIMEOUT = 30  # seconds

# Track if we need to save progress on interrupt
interrupted = False

def handle_interrupt(signum, frame):
    global interrupted
    interrupted = True
    print("\n⚠️  Interrupt received. Saving progress before exiting...")
    sys.exit(1)

signal.signal(signal.SIGINT, handle_interrupt)

# Load model with optimized settings
print("⚡ Loading translation model with optimized settings...")
tokenizer = AutoTokenizer.from_pretrained(
    "Helsinki-NLP/opus-mt-mul-en",
    device_map="auto",
    torch_dtype=torch.float16 if DEVICE.type == "cuda" else torch.float32
)
model = AutoModelForSeq2SeqLM.from_pretrained(
    "Helsinki-NLP/opus-mt-mul-en",
    device_map="auto",
    torch_dtype=torch.float16 if DEVICE.type == "cuda" else torch.float32
).eval()

@torch.inference_mode()
def batch_translate(texts):
    """Optimized batch translation with GPU memory management"""
    try:
        inputs = tokenizer(
            texts,
            return_tensors="pt",
            padding=True,
            truncation=True,
            max_length=512
        ).to(DEVICE)
        
        outputs = model.generate(
            **inputs,
            max_length=512,
            num_beams=4,  # Good balance between speed and quality
            early_stopping=True
        )
        return tokenizer.batch_decode(outputs, skip_special_tokens=True)
    except Exception as e:
        print(f"\n⚠️ Batch translation error: {str(e)}")
        return None

def is_english(text):
    """Fast language detection with caching"""
    if not isinstance(text, str) or not text.strip():
        return True
    try:
        return detect(text) == 'en'
    except:
        return False

def translate_with_fallback(text):
    """Hybrid translation with timeout"""
    if not text or text == MISSING_SUBJECT_TOKEN or is_english(text):
        return text
    
    try:
        # First try Google Translate (faster for short texts)
        translated = GoogleTranslator(
            source='auto',
            target='en',
            timeout=TRANSLATION_TIMEOUT
        ).translate(text)
        if translated:
            return translated
    except:
        pass
    
    # Fallback to model translation
    try:
        result = batch_translate([text])
        return result[0] if result else text
    except:
        return text

def process_email(email_data):
    """Process single email with all optimizations"""
    email, row_id = email_data
    if not isinstance(email, str) or not email.strip():
        return row_id, email
    
    return row_id, translate_with_fallback(email)

def main():
    global interrupted
    
    print("\n📂 Loading data...")
    df = pd.read_csv(INPUT_PATH)
    
    # Initialize progress tracking
    if os.path.exists(PROGRESS_PATH):
        progress_df = pd.read_csv(PROGRESS_PATH)
        translated_emails = progress_df.set_index('id')['email_en'].to_dict()
    else:
        translated_emails = {}
        progress_df = pd.DataFrame(columns=['id', 'email_en'])
    
    # Prepare email batches
    emails_to_process = [
        (row['email'], idx)
        for idx, row in df.iterrows()
        if idx not in translated_emails
    ]
    
    print(f"\n🌍 Translating {len(emails_to_process)} emails...")
    
    # Process emails in parallel with progress tracking
    with ThreadPoolExecutor(max_workers=MAX_WORKERS) as executor:
        futures = {
            executor.submit(process_email, email_data): email_data[1]
            for email_data in emails_to_process
        }
        
        try:
            for future in tqdm(
                as_completed(futures),
                total=len(futures),
                desc="Translating emails",
                unit="email"
            ):
                row_id, translated = future.result()
                translated_emails[row_id] = translated
                
                # Periodically save progress
                if len(translated_emails) % 100 == 0:
                    pd.DataFrame({
                        'id': list(translated_emails.keys()),
                        'email_en': list(translated_emails.values())
                    }).to_csv(PROGRESS_PATH, index=False)
                
                if interrupted:
                    break
                    
        except Exception as e:
            print(f"\n⚠️ Error during processing: {str(e)}")
            interrupted = True
    
    # Apply translations to dataframe
    df['email_en'] = df.index.map(lambda x: translated_emails.get(x, df.loc[x, 'email']))
    
    # Save final output
    output_cols = ['email', 'email_en', 'type']
    df[output_cols].to_csv(OUTPUT_PATH, index=False)
    
    # Clean up progress file if completed
    if not interrupted and os.path.exists(PROGRESS_PATH):
        os.remove(PROGRESS_PATH)
    
    print(f"\n✅ {'Completed' if not interrupted else 'Partially completed'}!")
    print(f"Output saved to: {OUTPUT_PATH}")

if __name__ == "__main__":
    try:
        main()
    except Exception as e:
        print(f"\n❌ Critical error: {str(e)}")
        if os.path.exists(PROGRESS_PATH):
            print(f"Progress saved to: {PROGRESS_PATH}")
        sys.exit(1)