In [None]:
import sys
import os

# Make modules under the project's 'src' folder importable from this notebook.
# The location is resolved relative to the current working directory:
# one level up ('..') and then into 'src'.
relative_src = os.path.join('..', 'src')
module_path = os.path.abspath(relative_src)
# Register the folder only once so re-running the cell does not add duplicates.
if sys.path.count(module_path) == 0:
    sys.path.append(module_path)

In [None]:


import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)



In [1]:
# Use Kaggle's pre-installed packages - no custom installation needed
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
import warnings
warnings.filterwarnings('ignore')

In [2]:
# Standard ML imports
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, classification_report, precision_recall_fscore_support, confusion_matrix

In [3]:

# Probe the runtime for optional deep-learning libraries.
# `device` ends up as a torch.device when PyTorch is importable, otherwise as
# the plain string "cpu".
try:
    import torch
    print(f"PyTorch version: {torch.__version__}")
    has_gpu = torch.cuda.is_available()
    device = torch.device('cuda') if has_gpu else torch.device('cpu')
    print(f"Using device: {device}")
    if has_gpu:
        print(f"GPU: {torch.cuda.get_device_name(0)}")
except ImportError:
    print("PyTorch not available - using CPU fallback")
    device = "cpu"

# TRANSFORMERS_AVAILABLE records whether the Hugging Face classes below could
# be imported.
try:
    from transformers import (
        DistilBertTokenizer,
        DistilBertForSequenceClassification,
        TrainingArguments,
        Trainer,
    )
except ImportError:
    print("Transformers not available - will use alternative approach")
    TRANSFORMERS_AVAILABLE = False
else:
    print("Transformers library loaded successfully")
    TRANSFORMERS_AVAILABLE = True
    


PyTorch version: 2.6.0+cu124
Using device: cuda
GPU: Tesla T4


2025-09-25 12:48:24.931143: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1758804505.250653      36 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1758804505.339990      36 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


Transformers library loaded successfully


In [None]:
# ============================================================================
# STEP 1: LOAD DATASET
# ============================================================================

import pandas as pd

# Path of the cleaned complaints file, taken from the project's config module
# (presumably located in the 'src' folder added to sys.path in the first cell).
from config import CLEANED_DATA_FILE

# Load the data using the imported path.
# NOTE(review): the recorded output below reports 1,113,992 rows, which is more
# than a single Excel worksheet can hold (1,048,576) - confirm that
# CLEANED_DATA_FILE really is an Excel workbook and not a CSV.
df_cleaned = pd.read_excel(CLEANED_DATA_FILE)

# Sanity-check what was loaded. These lines previously referenced `df`, which
# does not exist yet at this point on a fresh kernel.
print("Shape:", df_cleaned.shape)
print(df_cleaned.head())


Shape: (1113992, 2)
                                               label  \
0                        Checking or savings account   
1                                    Debt collection   
2  Credit reporting, credit repair services, or o...   
3  Credit reporting, credit repair services, or o...   
4                                           Mortgage   

                                      complaint_text  
0  Hi, I have been banking with Wells Fargo for o...  
1  XXXX is attempting to collect funds for Valuat...  
2  Today I called to get my balance and reset my ...  
3  The Federal Trade Commission Bureau of Consume...  
4  We applied for a home loan using agent XXXX XX...  


In [5]:
# ============================================================================
# STEP 2: PREPARE DATA
# ============================================================================

# Start from the frame loaded in STEP 1. The previous `pd.read_csv(DATASET_PATH)`
# relied on a DATASET_PATH variable that is not defined anywhere in this
# notebook, so the cell failed on a fresh kernel. Working on a copy keeps
# `df_cleaned` untouched, which makes this cell safe to re-run.
df = df_cleaned.copy()
print(f"Dataset loaded: {df.shape}")
print(f"Columns: {df.columns.tolist()}")

# Basic data cleaning: make sure the two columns used downstream exist,
# renaming look-alike columns when the exact names are missing.
if 'complaint_text' not in df.columns or 'label' not in df.columns:
    print("Column names found:")
    for i, col in enumerate(df.columns):
        print(f"  {i}: {col}")

    # Try to auto-detect text and label columns by keyword; when several
    # columns match, the last matching column wins.
    text_col = None
    label_col = None

    for col in df.columns:
        if 'text' in col.lower() or 'complaint' in col.lower() or 'narrative' in col.lower():
            text_col = col
        elif 'label' in col.lower() or 'category' in col.lower() or 'class' in col.lower():
            label_col = col

    if text_col and label_col:
        df = df.rename(columns={text_col: 'complaint_text', label_col: 'label'})
        print(f"Auto-detected: text='{text_col}', label='{label_col}'")
    else:
        raise ValueError("Could not find complaint_text and label columns")

# Remove rows where either the text or the label is missing
df = df.dropna(subset=['complaint_text', 'label']).reset_index(drop=True)
print(f"After removing nulls: {df.shape}")

# Show label distribution (ten most frequent labels)
print("\nLabel distribution:")
label_counts = df['label'].value_counts()
print(label_counts.head(10))

# Remove very small classes: labels with fewer than `min_samples` rows
min_samples = 100
small_classes = label_counts[label_counts < min_samples].index.tolist()
if small_classes:
    print(f"Removing {len(small_classes)} small classes")
    df = df[~df['label'].isin(small_classes)].reset_index(drop=True)

print(f"Final dataset: {df.shape}")

Dataset loaded: (1113992, 2)
Columns: ['label', 'complaint_text']
After removing nulls: (1113992, 2)

Label distribution:
label
Credit reporting, credit repair services, or other personal consumer reports    515503
Debt collection                                                                 192045
Mortgage                                                                         97783
Credit card or prepaid card                                                      81866
Checking or savings account                                                      54264
Student loan                                                                     32713
Credit reporting                                                                 31588
Money transfer, virtual currency, or money service                               26578
Vehicle loan or lease                                                            19886
Credit card                                                                      18838
Na

In [6]:
# ============================================================================
# STEP 3: SAMPLE TOP 5 CLASSES, 5k SAMPLES EACH
# ============================================================================

import pandas as pd
import numpy as np

def sample_top_n_classes(df, top_n=5, samples_per_class=10000, random_state=42):
    """
    Build a class-balanced sample from the most frequent labels.

    The `top_n` most common values of df['label'] are selected and exactly
    `samples_per_class` rows are drawn for each of them. A class holding fewer
    rows than requested is drawn with replacement. The per-class draws are
    concatenated, shuffled and re-indexed from 0; progress is printed.
    """
    np.random.seed(random_state)

    # Most frequent labels, in descending order of row count
    top_classes = df['label'].value_counts().index[:top_n].tolist()
    print(f"Top {top_n} classes: {top_classes}")

    per_class = []
    for label in top_classes:
        rows = df.loc[df['label'] == label]
        available = len(rows)
        # Oversample (with replacement) only when the class is too small
        drawn = rows.sample(
            n=samples_per_class,
            replace=available < samples_per_class,
            random_state=random_state,
        )
        per_class.append(drawn)
        print(f"  {label}: {available} -> {samples_per_class}")

    # Stack the per-class draws, then shuffle the rows and renumber them
    combined = pd.concat(per_class)
    sample_df = combined.sample(frac=1, random_state=random_state).reset_index(drop=True)
    print(f"Sampled dataset created: {len(sample_df)} rows")
    return sample_df

# Create the sampled dataset: the 5 most frequent classes with 5,000 rows each
# (this overrides the function's default of 10,000 rows per class).
sample_df = sample_top_n_classes(df, top_n=5, samples_per_class=5000)


Top 5 classes: ['Credit reporting, credit repair services, or other personal consumer reports', 'Debt collection', 'Mortgage', 'Credit card or prepaid card', 'Checking or savings account']
  Credit reporting, credit repair services, or other personal consumer reports: 515503 -> 5000
  Debt collection: 192045 -> 5000
  Mortgage: 97783 -> 5000
  Credit card or prepaid card: 81866 -> 5000
  Checking or savings account: 54264 -> 5000
Sampled dataset created: 25000 rows


In [7]:
import torch
from transformers import MarianMTModel, MarianTokenizer


In [8]:
# ============================================================================
# STEP 4: LOAD HELSINKI NLP TRANSLATION MODEL (EN -> HI)
# ============================================================================

# Identifier of the pretrained English -> Hindi Marian checkpoint.
model_name = "Helsinki-NLP/opus-mt-en-hi"
print(f"Loading translation model: {model_name}")

# `tokenizer` and `model` are module-level names used by the translation
# functions defined in STEP 5.
tokenizer = MarianTokenizer.from_pretrained(model_name)
model = MarianMTModel.from_pretrained(model_name)
# Move the model to the device chosen in the environment-check cell. As the
# last expression of the cell, this also displays the module structure.
model.to(device)

Loading translation model: Helsinki-NLP/opus-mt-en-hi


tokenizer_config.json:   0%|          | 0.00/44.0 [00:00<?, ?B/s]

source.spm:   0%|          | 0.00/812k [00:00<?, ?B/s]

target.spm:   0%|          | 0.00/1.07M [00:00<?, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

config.json: 0.00B [00:00, ?B/s]

pytorch_model.bin:   0%|          | 0.00/306M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/293 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/306M [00:00<?, ?B/s]

MarianMTModel(
  (model): MarianModel(
    (shared): Embedding(61950, 512, padding_idx=61949)
    (encoder): MarianEncoder(
      (embed_tokens): Embedding(61950, 512, padding_idx=61949)
      (embed_positions): MarianSinusoidalPositionalEmbedding(512, 512)
      (layers): ModuleList(
        (0-5): 6 x MarianEncoderLayer(
          (self_attn): MarianAttention(
            (k_proj): Linear(in_features=512, out_features=512, bias=True)
            (v_proj): Linear(in_features=512, out_features=512, bias=True)
            (q_proj): Linear(in_features=512, out_features=512, bias=True)
            (out_proj): Linear(in_features=512, out_features=512, bias=True)
          )
          (self_attn_layer_norm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
          (activation_fn): SiLU()
          (fc1): Linear(in_features=512, out_features=2048, bias=True)
          (fc2): Linear(in_features=2048, out_features=512, bias=True)
          (final_layer_norm): LayerNorm((512,), eps=1e-05

In [9]:
# ============================================================================  
# STEP 5: TRANSLATION FUNCTIONS  
# ============================================================================  

def translate_text_to_hindi(text, max_length=512):
    """Translate a single English text to Hindi.

    Uses the module-level `tokenizer`, `model` and `device`.

    Args:
        text: Value to translate; it is converted with ``str()`` and stripped.
            Inputs longer than 400 characters are cut to 400 characters and
            suffixed with "...".
        max_length: Token limit applied to both tokenizer truncation and
            generation.

    Returns:
        The decoded Hindi text. Empty input yields the placeholder "शिकायत".
        If translation raises an ordinary exception, or decodes to nothing,
        the fallback "शिकायत: <first 50 characters>..." is returned instead.

    Note:
        Only ``Exception`` subclasses trigger the fallback. KeyboardInterrupt
        and SystemExit propagate, so a long-running translation loop can still
        be interrupted from the notebook.
    """
    text = str(text).strip()
    if not text:
        return "शिकायत"
    if len(text) > 400:
        text = text[:400] + "..."

    # Best-effort placeholder that keeps the row identifiable when translation fails
    fallback = f"शिकायत: {text[:50]}..."
    try:
        inputs = tokenizer(text, return_tensors="pt", truncation=True, max_length=max_length)
        inputs = {k: v.to(device) for k, v in inputs.items()}
        with torch.no_grad():
            outputs = model.generate(**inputs, max_length=max_length, num_beams=4, early_stopping=True)
        hindi_text = tokenizer.decode(outputs[0], skip_special_tokens=True).strip()
    except Exception:
        # Deliberately best-effort: one bad text must not abort a whole run
        return fallback
    # Treat an empty (or whitespace-only) decode as a failed translation
    return hindi_text or fallback

def translate_batch_to_hindi(texts, batch_size=16):
    """Translate a sequence of English texts to Hindi, batch by batch.

    Uses the module-level `tokenizer`, `model`, `device` and `tqdm`; `tqdm` is
    imported in a later cell of this notebook and must be in scope when this
    function is called.

    Args:
        texts: Sliceable sequence of inputs; each item is converted with
            ``str()`` and stripped. Items longer than 400 characters (after
            stripping, matching translate_text_to_hindi) are cut to 400
            characters and suffixed with "...".
        batch_size: Number of texts sent to the model per generate call.

    Returns:
        A list with one translation per input, in input order. When a whole
        batch fails with an ordinary exception, its items are retried one at a
        time through translate_text_to_hindi.

    Note:
        Only ``Exception`` subclasses trigger the per-text retry.
        KeyboardInterrupt propagates so the loop can be stopped from the
        notebook; translations collected so far are then discarded.
    """
    all_translations = []
    for start in tqdm(range(0, len(texts), batch_size), desc="Translating"):
        batch = texts[start:start + batch_size]
        try:
            clean_batch = []
            for item in batch:
                cleaned = str(item).strip()
                clean_batch.append(cleaned[:400] + "..." if len(cleaned) > 400 else cleaned)
            inputs = tokenizer(clean_batch, return_tensors="pt", padding=True, truncation=True, max_length=512)
            inputs = {k: v.to(device) for k, v in inputs.items()}
            with torch.no_grad():
                outputs = model.generate(**inputs, max_length=512, num_beams=4, early_stopping=True)
            batch_translations = [tokenizer.decode(o, skip_special_tokens=True).strip() for o in outputs]
        except Exception:
            # Batch-level failure: fall back to the slower per-text path so a
            # single problematic item does not cost the whole batch.
            batch_translations = [translate_text_to_hindi(item) for item in batch]
        all_translations.extend(batch_translations)
    return all_translations

def translate_sample_dataset(sample_df, batch_size=16):
    """Return a copy of `sample_df` extended with a Hindi translation column.

    All values of the 'complaint_text' column are passed to
    translate_batch_to_hindi and the results are stored, in row order, in a
    'complaint_text_hindi' column. The input frame itself is not modified.
    """
    n_rows = len(sample_df)
    print(f"Translating sample dataset ({n_rows:,} rows)...")

    translated = translate_batch_to_hindi(
        sample_df['complaint_text'].tolist(),
        batch_size=batch_size,
    )
    # assign() works on a copy, leaving the caller's frame unchanged
    result = sample_df.assign(complaint_text_hindi=translated)

    print("Translation completed!")
    return result

In [10]:
import pandas as pd
import torch
from transformers import MarianMTModel, MarianTokenizer
from tqdm.auto import tqdm
import warnings
warnings.filterwarnings('ignore')

In [11]:
import time

# Benchmark on a small sample (the first 10 texts) to estimate the full run.
# NOTE(review): 10 texts is less than one batch of 16, so this measures a
# single under-filled batch - treat the estimate as a rough figure.
sample_texts = sample_df['complaint_text'].tolist()[:10]
if not sample_texts:
    raise ValueError("sample_df has no rows to benchmark")

# perf_counter() is a monotonic high-resolution clock intended for measuring
# intervals; time.time() can jump if the system clock is adjusted.
start_time = time.perf_counter()
translate_batch_to_hindi(sample_texts, batch_size=16)  # result is discarded
end_time = time.perf_counter()

# Extrapolate linearly from the benchmark to the whole sampled dataset
time_per_text = (end_time - start_time) / len(sample_texts)
total_texts = len(sample_df)

estimated_total_time_sec = time_per_text * total_texts
estimated_total_time_min = estimated_total_time_sec / 60
estimated_total_time_hr = estimated_total_time_min / 60

print(f"Time per text: {time_per_text:.2f} sec")
print(f"Estimated total translation time: {estimated_total_time_min:.2f} min ({estimated_total_time_hr:.2f} hr)")



Translating:   0%|          | 0/1 [00:00<?, ?it/s]

Time per text: 0.34 sec
Estimated total translation time: 141.13 min (2.35 hr)


In [12]:
# ============================================================================
# STEP 6: TRANSLATE AND SAVE
# ============================================================================

# Translate every complaint in the sampled frame. This is the long-running
# step: the timing cell's recorded output estimated about 2.35 hours.
sample_df_with_hindi = translate_sample_dataset(sample_df, batch_size=16)
# Write the translated frame to a CSV at a relative path, without the index.
sample_df_with_hindi.to_csv("sample_top5_translated.csv", index=False)
print("Saved translated sample dataset to 'sample_top5_translated.csv'")

Translating sample dataset (25,000 rows)...


Translating:   0%|          | 0/1563 [00:00<?, ?it/s]

Translation completed!
Saved translated sample dataset to 'sample_top5_translated.csv'


In [13]:
# Show a link to the CSV written in STEP 6 as this cell's output.
from IPython.display import FileLink
FileLink("sample_top5_translated.csv")


In [None]:
# Emergency save of the translated data
import os
import signal
import pandas as pd

# Do not signal the kernel from inside this cell: sending SIGINT to
# os.getpid() targets the kernel process itself (not child processes) and
# raises KeyboardInterrupt here, before the save below can run. To stop a
# running loop, use the notebook's "Interrupt" control, then run this cell.

# Save whatever translated data currently exists
try:
    sample_df_with_hindi.to_csv("sample_top5_translated.csv", index=False)
except NameError:
    # The translation cell never finished, so there is no frame to write
    print("Nothing to save - 'sample_df_with_hindi' is not defined yet.")
except OSError as exc:
    print(f"Could not save 'sample_top5_translated.csv': {exc}")
else:
    # Report the real row count instead of a hardcoded figure
    print(f"Saved {len(sample_df_with_hindi):,} samples to 'sample_top5_translated.csv'")
