In [1]:
import pandas as pd
import numpy as np
import torch
from torch import nn
from torch.utils.data import DataLoader
from torch.optim import AdamW
from torch.cuda.amp import autocast, GradScaler
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation
from datasets import Dataset
from transformers import BertTokenizer, BertModel, get_linear_schedule_with_warmup
import re
import nltk
from nltk.corpus import stopwords
import time
import os
import gc

# Download stopwords if not present
nltk.download('stopwords')

# Setup Device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
if torch.cuda.is_available():
    torch.cuda.empty_cache()
    gc.collect()
print(f"--> Using device: {device}")

  from .autonotebook import tqdm as notebook_tqdm


--> Using device: cuda


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\d3lus\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
# 1. Load Data
print("--> Loading data...")
try:
    df_pd = pd.read_parquet('aita_train.parquet')
except FileNotFoundError:
    # Dummy data for testing if file missing
    print("    Warning: File not found. Generating Dummy Data.")
    df_pd = pd.DataFrame({
        'title': ['My neighbor is loud'] * 100 + ['I donated to charity'] * 100,
        'text': ['He plays drums all night.'] * 100 + ['It felt good to help.'] * 100,
        'verdict': ['yta'] * 100 + ['nta'] * 100
    })

# 2. Basic Preprocessing
df_pd['title'] = df_pd['title'].fillna('')
df_pd['text'] = df_pd['text'].fillna('')
df_pd['full_text_raw'] = df_pd['title'] + " " + df_pd['text']

# Map Labels
label_map = {'nta': 0, 'nah': 0, 'yta': 1, 'esh': 1}
df_pd['label'] = df_pd['verdict'].map(label_map)
df_pd = df_pd.dropna(subset=['label'])
df_pd['label'] = df_pd['label'].astype(int)

# 3. PROFESSOR FEEDBACK: UNDERSAMPLING
# Balance the dataset to ~18k rows total (9k each) to make training feasible
print("--> Balancing Data...")
df_yta = df_pd[df_pd['label'] == 1]
df_nta = df_pd[df_pd['label'] == 0]

# Undersample majority class
if len(df_nta) > len(df_yta):
    df_nta = df_nta.sample(n=len(df_yta), random_state=42)
else:
    df_yta = df_yta.sample(n=len(df_nta), random_state=42)

df_balanced = pd.concat([df_yta, df_nta]).sample(frac=1, random_state=42).reset_index(drop=True)
print(f"    Balanced Dataset Size: {len(df_balanced)}")

# 4. PROFESSOR FEEDBACK: STOP WORD REMOVAL (For LDA Only)
print("--> Creating Cleaned Text for LDA...")
stop_words = set(stopwords.words('english'))

def clean_text_for_lda(text):
    # Lowercase, remove non-alpha, remove stopwords
    text = str(text).lower()
    text = re.sub(r'[^a-z\s]', '', text)
    words = text.split()
    words = [w for w in words if w not in stop_words and len(w) > 2]
    return " ".join(words)

# Apply cleaning (This might take a few seconds)
df_balanced['lda_text'] = df_balanced['full_text_raw'].apply(clean_text_for_lda)
print("    Data preparation complete.")

--> Loading data...
--> Balancing Data...
    Balanced Dataset Size: 18434
--> Creating Cleaned Text for LDA...
    Data preparation complete.


In [3]:
# --- PROFESSOR FEEDBACK: LDA TOPIC MODELING ---
print("--> Running LDA Topic Modeling...")

# Parameters
N_TOPICS_PER_CLASS = 10 # You will get 10 features from YTA model + 10 from NTA model = 20 total extra features

# Split data for LDA training (We only train LDA on the training set portion to avoid leakage!)
train_df_temp, _ = train_test_split(df_balanced, test_size=0.15, stratify=df_balanced['label'], random_state=42)

# Separate documents by class
docs_nta = train_df_temp[train_df_temp['label'] == 0]['lda_text']
docs_yta = train_df_temp[train_df_temp['label'] == 1]['lda_text']

print(f"    Training LDA Model 1 on NTA data ({len(docs_nta)} docs)...")
# Vectorize
vectorizer_nta = CountVectorizer(max_features=2000, max_df=0.9, min_df=5)
X_nta = vectorizer_nta.fit_transform(docs_nta)
# Train LDA
lda_nta = LatentDirichletAllocation(n_components=N_TOPICS_PER_CLASS, random_state=42, n_jobs=-1)
lda_nta.fit(X_nta)

print(f"    Training LDA Model 2 on YTA data ({len(docs_yta)} docs)...")
# Vectorize
vectorizer_yta = CountVectorizer(max_features=2000, max_df=0.9, min_df=5)
X_yta = vectorizer_yta.fit_transform(docs_yta)
# Train LDA
lda_yta = LatentDirichletAllocation(n_components=N_TOPICS_PER_CLASS, random_state=42, n_jobs=-1)
lda_yta.fit(X_yta)

print("--> Generating Topic Features for entire dataset...")

# Helper function to get topic distribution
def get_lda_features_dual(text_series):
    # Stream 1: How much does this look like NTA topics?
    vec_nta = vectorizer_nta.transform(text_series)
    topics_nta = lda_nta.transform(vec_nta) # Shape (n_samples, 10)
    
    # Stream 2: How much does this look like YTA topics?
    vec_yta = vectorizer_yta.transform(text_series)
    topics_yta = lda_yta.transform(vec_yta) # Shape (n_samples, 10)
    
    # Combine: Shape (n_samples, 20)
    return np.hstack([topics_nta, topics_yta])

# Calculate features for the whole dataframe
lda_features_all = get_lda_features_dual(df_balanced['lda_text'])

# Add to dataframe (store as list or numpy array column is tricky, so we keep it separate index-aligned)
print(f"    LDA Features Shape: {lda_features_all.shape}")

--> Running LDA Topic Modeling...
    Training LDA Model 1 on NTA data (7834 docs)...
    Training LDA Model 2 on YTA data (7834 docs)...
--> Generating Topic Features for entire dataset...
    LDA Features Shape: (18434, 20)


In [4]:
# Create a temporary 'view' to inspect the LDA features alongside the text
df_inspection = df_balanced.copy()

# 1. Create column names for the 20 topics (10 from NTA model, 10 from YTA model)
topic_cols = [f"NTA_Topic_{i+1}" for i in range(10)] + [f"YTA_Topic_{i+1}" for i in range(10)]

# 2. Add the LDA probabilities as columns
df_lda_probs = pd.DataFrame(lda_features_all, columns=topic_cols)
df_inspection = pd.concat([df_inspection, df_lda_probs], axis=1)

# 3. View the text alongside its topic probabilities
print("--> Sample of Data with LDA Features:")
# We select just the text, the verdict, and the first few topic columns to display
cols_to_show = ['verdict', 'lda_text', 'NTA_Topic_1', 'NTA_Topic_2', 'YTA_Topic_1', 'YTA_Topic_2']
print(df_inspection[cols_to_show].head(5))

# Optional: Check if a specific row has high probability for a specific topic
print("\n--> Detailed view of the first row:")
print(df_inspection.iloc[0][topic_cols])

--> Sample of Data with LDA Features:
  verdict                                           lda_text  NTA_Topic_1  \
0     yta  aita dont say sounds fun sarcastically basical...     0.000532   
1     nta  wibta hid future income parents live fathermd ...     0.000417   
2     esh  aita handled work conflict year old trans man ...     0.105370   
3     nta  aita called stubborn wanting help chores boyfr...     0.106076   
4     yta  aita telling return cats recently adopted two ...     0.000538   

   NTA_Topic_2  YTA_Topic_1  YTA_Topic_2  
0     0.000532     0.000535     0.000535  
1     0.000417     0.000419     0.045794  
2     0.000495     0.131215     0.221067  
3     0.000585     0.105307     0.000599  
4     0.000538     0.000538     0.107960  

--> Detailed view of the first row:
NTA_Topic_1     0.000532
NTA_Topic_2     0.000532
NTA_Topic_3     0.771054
NTA_Topic_4     0.000532
NTA_Topic_5     0.000532
NTA_Topic_6     0.000532
NTA_Topic_7     0.000532
NTA_Topic_8     0.000532
NTA_

In [5]:
def print_top_words(model, vectorizer, n_top_words=10, prefix="Topic"):
    feature_names = vectorizer.get_feature_names_out()
    print(f"--> Top words for {prefix} Models:")
    
    for topic_idx, topic in enumerate(model.components_):
        # Sort the words by importance (highest weight first)
        top_indices = topic.argsort()[:-n_top_words - 1:-1]
        top_words = [feature_names[i] for i in top_indices]
        
        print(f"   {prefix} #{topic_idx+1}: {', '.join(top_words)}")
    print("-" * 50)

# Print NTA Topics
print_top_words(lda_nta, vectorizer_nta, prefix="NTA Topic")

# Print YTA Topics
print_top_words(lda_yta, vectorizer_yta, prefix="YTA Topic")

--> Top words for NTA Topic Models:
   NTA Topic #1: husband, wife, baby, home, time, aita, told, food, house, said
   NTA Topic #2: family, wedding, birthday, party, want, would, dont, christmas, day, aita
   NTA Topic #3: friend, said, friends, didnt, told, aita, got, would, asked, like
   NTA Topic #4: money, pay, would, job, work, get, rent, help, aita, move
   NTA Topic #5: work, time, like, dont, get, aita, feel, know, one, would
   NTA Topic #6: told, said, mom, family, like, dad, mother, aita, didnt, dont
   NTA Topic #7: kids, sister, parents, dont, want, family, daughter, children, school, would
   NTA Topic #8: mom, dad, brother, car, got, get, told, parents, house, back
   NTA Topic #9: room, dog, house, get, home, aita, back, one, door, dont
   NTA Topic #10: like, said, dont, aita, something, really, one, get, didnt, say
--------------------------------------------------
--> Top words for YTA Topic Models:
   YTA Topic #1: mom, sister, told, wife, dad, said, didnt, parent

In [6]:
# 1. Prepare Split
train_idx, test_idx = train_test_split(np.arange(len(df_balanced)), test_size=0.15, stratify=df_balanced['label'], random_state=42)
train_idx, val_idx = train_test_split(train_idx, test_size=0.15, stratify=df_balanced.iloc[train_idx]['label'], random_state=42)

# 2. Tokenizer
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

# 3. Custom Dataset Class
class HybridDataset(torch.utils.data.Dataset):
    def __init__(self, df, lda_features, indices, tokenizer, max_len=512):
        self.df = df.iloc[indices].reset_index(drop=True)
        self.lda_data = lda_features[indices] # Select corresponding LDA rows
        self.tokenizer = tokenizer
        self.max_len = max_len
        
    def __len__(self):
        return len(self.df)
    
    def __getitem__(self, item):
        row = self.df.iloc[item]
        text = row['full_text_raw']
        label = row['label']
        lda_vec = self.lda_data[item]
        
        encoding = self.tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=self.max_len,
            return_token_type_ids=False,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt',
        )
        
        return {
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'lda_features': torch.tensor(lda_vec, dtype=torch.float), # The new input
            'label': torch.tensor(label, dtype=torch.long)
        }

# Create Datasets
train_ds = HybridDataset(df_balanced, lda_features_all, train_idx, tokenizer)
val_ds = HybridDataset(df_balanced, lda_features_all, val_idx, tokenizer)

# Create Loaders (Keep Batch Size low for GPU safety)
BATCH_SIZE = 8
ACCUMULATION_STEPS = 2 

train_loader = DataLoader(train_ds, batch_size=BATCH_SIZE, shuffle=True)
val_loader = DataLoader(val_ds, batch_size=BATCH_SIZE)

# 4. PROFESSOR FEEDBACK: ARCHITECTURE EXPERIMENTATION
# Custom Hybrid Model
class BertWithLDA(nn.Module):
    def __init__(self, n_lda_features):
        super(BertWithLDA, self).__init__()
        # Load base BERT model (outputs 768 dim vector)
        self.bert = BertModel.from_pretrained("bert-base-uncased")
        self.drop = nn.Dropout(p=0.3)
        
        # Final Classifier: BERT (768) + LDA (20) -> 2 Classes
        combined_dim = 768 + n_lda_features
        self.out = nn.Linear(combined_dim, 2)
        
    def forward(self, input_ids, attention_mask, lda_features):
        # 1. Feed text to BERT
        bert_output = self.bert(
            input_ids=input_ids,
            attention_mask=attention_mask
        )
        # pooler_output is the embedding of the [CLS] token (representation of whole sentence)
        pooled_output = bert_output.pooler_output 
        output = self.drop(pooled_output)
        
        # 2. Concatenate BERT output with LDA probabilities
        # output shape: [batch, 768], lda_features shape: [batch, 20]
        combined_output = torch.cat((output, lda_features), dim=1)
        
        # 3. Classify
        return self.out(combined_output)

print("--> Model Architecture defined.")

--> Model Architecture defined.


In [7]:
# Setup
n_lda_cols = lda_features_all.shape[1] # Should be 20
model = BertWithLDA(n_lda_features=n_lda_cols)
model = model.to(device)

optimizer = AdamW(model.parameters(), lr=2e-5)
EPOCHS = 3
total_steps = len(train_loader) * EPOCHS
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=total_steps)
loss_fn = nn.CrossEntropyLoss()
scaler = GradScaler() # Mixed Precision

def train_epoch(model, data_loader, loss_fn, optimizer, device, scheduler, n_examples):
    model = model.train()
    losses = []
    correct_predictions = 0
    optimizer.zero_grad()
    
    for i, d in enumerate(data_loader):
        input_ids = d["input_ids"].to(device)
        attention_mask = d["attention_mask"].to(device)
        lda_features = d["lda_features"].to(device)
        targets = d["label"].to(device)

        with autocast():
            # Pass BOTH text and lda features
            outputs = model(
                input_ids=input_ids,
                attention_mask=attention_mask,
                lda_features=lda_features
            )
            loss = loss_fn(outputs, targets)
            loss = loss / ACCUMULATION_STEPS

        scaler.scale(loss).backward()
        losses.append(loss.item() * ACCUMULATION_STEPS)
        
        # Calculate Accuracy
        _, preds = torch.max(outputs, dim=1)
        correct_predictions += torch.sum(preds == targets)

        if (i + 1) % ACCUMULATION_STEPS == 0:
            scaler.step(optimizer)
            scaler.update()
            optimizer.zero_grad()
            scheduler.step()
            
        if i % 200 == 0:
            print(f"    Batch {i}/{len(data_loader)} loss: {loss.item() * ACCUMULATION_STEPS:.4f}")

    return correct_predictions.double() / n_examples, np.mean(losses)

def eval_model(model, data_loader, loss_fn, device, n_examples):
    model = model.eval()
    losses = []
    correct_predictions = 0

    with torch.no_grad():
        for d in data_loader:
            input_ids = d["input_ids"].to(device)
            attention_mask = d["attention_mask"].to(device)
            lda_features = d["lda_features"].to(device)
            targets = d["label"].to(device)

            with autocast():
                outputs = model(
                    input_ids=input_ids,
                    attention_mask=attention_mask,
                    lda_features=lda_features
                )
                loss = loss_fn(outputs, targets)

            losses.append(loss.item())
            _, preds = torch.max(outputs, dim=1)
            correct_predictions += torch.sum(preds == targets)

    return correct_predictions.double() / n_examples, np.mean(losses)

# --- TRAINING ---
print(f"--> Starting Hybrid Training ({EPOCHS} Epochs)...")
history = {'train_acc': [], 'train_loss': [], 'val_acc': [], 'val_loss': []}
total_start = time.time()

for epoch in range(EPOCHS):
    print(f'Epoch {epoch + 1}/{EPOCHS}')
    print('-' * 10)
    
    train_acc, train_loss = train_epoch(
        model, train_loader, loss_fn, optimizer, device, scheduler, len(train_ds)
    )
    print(f'  Train loss {train_loss:.4f} accuracy {train_acc:.4f}')

    val_acc, val_loss = eval_model(
        model, val_loader, loss_fn, device, len(val_ds)
    )
    print(f'  Val   loss {val_loss:.4f} accuracy {val_acc:.4f}')
    print()

    history['train_acc'].append(train_acc)
    history['train_loss'].append(train_loss)
    history['val_acc'].append(val_acc)
    history['val_loss'].append(val_loss)

print(f"--> Training Complete! Time: {(time.time() - total_start)/60:.2f} min")

# Save
output_dir = './saved_hybrid_bert'
if not os.path.exists(output_dir):
    os.makedirs(output_dir)
torch.save(model.state_dict(), os.path.join(output_dir, 'model_state.bin'))
print("Model saved.")

  scaler = GradScaler() # Mixed Precision
  with autocast():


--> Starting Hybrid Training (3 Epochs)...
Epoch 1/3
----------
    Batch 0/1665 loss: 0.6588
    Batch 200/1665 loss: 0.6919
    Batch 400/1665 loss: 0.5720
    Batch 600/1665 loss: 0.7745
    Batch 800/1665 loss: 0.7252
    Batch 1000/1665 loss: 0.5834
    Batch 1200/1665 loss: 0.7769
    Batch 1400/1665 loss: 0.5355
    Batch 1600/1665 loss: 0.4741
  Train loss 0.6696 accuracy 0.5886


  with autocast():


  Val   loss 0.6317 accuracy 0.6482

Epoch 2/3
----------
    Batch 0/1665 loss: 0.5981
    Batch 200/1665 loss: 0.7601
    Batch 400/1665 loss: 0.6822
    Batch 600/1665 loss: 0.3750
    Batch 800/1665 loss: 0.6796
    Batch 1000/1665 loss: 0.5126
    Batch 1200/1665 loss: 0.5642
    Batch 1400/1665 loss: 0.7215
    Batch 1600/1665 loss: 0.3960
  Train loss 0.6002 accuracy 0.6777
  Val   loss 0.6614 accuracy 0.6521

Epoch 3/3
----------
    Batch 0/1665 loss: 0.4534
    Batch 200/1665 loss: 0.1600
    Batch 400/1665 loss: 0.4600
    Batch 600/1665 loss: 0.4797
    Batch 800/1665 loss: 0.3385
    Batch 1000/1665 loss: 0.4392
    Batch 1200/1665 loss: 1.0580
    Batch 1400/1665 loss: 0.2474
    Batch 1600/1665 loss: 0.2603
  Train loss 0.4814 accuracy 0.7737
  Val   loss 0.6898 accuracy 0.6601

--> Training Complete! Time: 25.55 min
Model saved.


In [8]:
from collections import Counter
import torch
import numpy as np

def check_baseline_performance(dataloader, model_accuracy):
    print("--> Calculating Majority Class Baseline (PCC)...")
    
    # 1. Collect all actual labels from the validation set
    all_labels = []
    for batch in dataloader:
        targets = batch['label'] # Assuming your dict key is 'label'
        all_labels.extend(targets.cpu().numpy())
    
    # 2. Determine the majority class
    counts = Counter(all_labels)
    majority_class_label = max(counts, key=counts.get)
    majority_count = counts[majority_class_label]
    total_samples = len(all_labels)
    
    # 3. Calculate Baseline Accuracy (ZeroR)
    # This is the accuracy if we just guessed the most common label every time
    baseline_acc = majority_count / total_samples
    
    print(f"    Total Validation Samples: {total_samples}")
    print(f"    Class Distribution: {dict(counts)}")
    print(f"    Majority Class: {majority_class_label} (Count: {majority_count})")
    print("-" * 30)
    print(f"    Baseline Accuracy to Beat: {baseline_acc:.4f} ({baseline_acc*100:.2f}%)")
    print(f"    Your Model Accuracy:       {model_accuracy:.4f} ({model_accuracy*100:.2f}%)")
    print("-" * 30)
    
    # 4. Final Verdict
    if model_accuracy > baseline_acc:
        print("✅ SUCCESS: The model has beaten the majority class baseline.")
        print(f"   Improvement over baseline: +{(model_accuracy - baseline_acc)*100:.2f}%")
    else:
        print("❌ WARNING: The model did not beat the baseline.")
        print("   Consider training longer or adjusting hyperparameters.")

# Run the check
# 'val_loader' is your validation data loader
# 'val_acc' is the final accuracy variable from your training loop
check_baseline_performance(val_loader, val_acc)

--> Calculating Majority Class Baseline (PCC)...
    Total Validation Samples: 2351
    Class Distribution: {np.int64(0): 1176, np.int64(1): 1175}
    Majority Class: 0 (Count: 1176)
------------------------------
    Baseline Accuracy to Beat: 0.5002 (50.02%)
    Your Model Accuracy:       0.6601 (66.01%)
------------------------------
✅ SUCCESS: The model has beaten the majority class baseline.
   Improvement over baseline: +15.99%
