# Evolver Loop 5 Analysis: Why LDA Topic Modeling Hurt Performance

**Goal**: Analyze why exp_005 (LDA topics) scored 0.6505 in CV (-0.0155 vs. exp_004's 0.6660) and determine next steps.

**Key questions**:
1. Are LDA topics redundant with existing TF-IDF + SVD features?
2. Do topic probabilities add noise rather than signal?
3. Should we tune LDA hyperparameters or remove it entirely?
4. What should be our next priority: Stanford linguistic cues or something else?

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import StratifiedKFold
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD, LatentDirichletAllocation
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import roc_auc_score
import lightgbm as lgb
import warnings
warnings.filterwarnings('ignore')

# Read both CSV splits and pull the target column out of the training frame.
df_train, df_test = map(
    pd.read_csv,
    ('/home/data/train.csv', '/home/data/test.csv'),
)
y = df_train['requester_received_pizza']

# One print call, one line per fact; the experiment scores are fixed
# reference values copied from earlier runs, not computed here.
print(
    f"Training samples: {len(df_train)}",
    f"Test samples: {len(df_test)}",
    f"Positive rate: {y.mean():.3f}",
    "Best CV so far: 0.6660 (exp_004)",
    "LDA experiment CV: 0.6505 (exp_005)",
    "Performance drop: -0.0155",
    sep="\n",
)

## 1. Analyze LDA Topic Quality and Redundancy

In [None]:
# Load exp_005 predictions to analyze
# NOTE(review): neither joblib nor exp_005_path is referenced again in the
# cells visible here; nothing is actually loaded from disk. Confirm whether a
# later cell needs them before removing.
import joblib

# Try to load the experiment data
exp_005_path = '/home/code/experiments/005_lda_topic_modeling'

# Let's manually recreate the LDA pipeline to analyze topic quality
# Use the same preprocessing as exp_005
# Clean and combine text
def clean_text(text):
    """Return a normalized string for one text value.

    Missing values (anything ``pd.isna`` reports as missing) become the empty
    string; everything else is converted with ``str``, lowercased, and
    stripped of leading/trailing whitespace.
    """
    return "" if pd.isna(text) else str(text).lower().strip()

# Build one text field per request: title + body, with missing values treated
# as empty strings, then normalized through clean_text.
df_train['combined_text'] = df_train['request_title'].fillna('') + ' ' + df_train['request_text_edit_aware'].fillna('')
df_train['combined_text_clean'] = df_train['combined_text'].apply(clean_text)

# Create TF-IDF (same as exp_005)
# Word-level vocabulary: unigrams and bigrams, English stop words removed,
# capped at 5000 terms.
word_tfidf = TfidfVectorizer(
    max_features=5000,
    ngram_range=(1, 2),
    stop_words='english',
    lowercase=True,
    min_df=3,
    max_df=0.9
)

# Character-level vocabulary: 2- to 4-character n-grams, capped at 3000 terms.
char_tfidf = TfidfVectorizer(
    analyzer='char',
    max_features=3000,
    ngram_range=(2, 4),
    lowercase=True,
    min_df=2,
    max_df=0.95
)

# Fit on full data for analysis (not for CV)
# Every transformer below is fitted on all training rows, so anything derived
# from these matrices has seen the validation rows of any later CV split.
word_features = word_tfidf.fit_transform(df_train['combined_text_clean'])
char_features = char_tfidf.fit_transform(df_train['combined_text_clean'])

print(f"Word TF-IDF shape: {word_features.shape}")
print(f"Char TF-IDF shape: {char_features.shape}")

# Apply SVD (same as exp_005)
# 50 components for the word matrix and 25 for the char matrix (75 in total).
svd_word = TruncatedSVD(n_components=50, random_state=42)
svd_char = TruncatedSVD(n_components=25, random_state=42)

word_svd = svd_word.fit_transform(word_features)
char_svd = svd_char.fit_transform(char_features)

print(f"Word SVD shape: {word_svd.shape}")
print(f"Char SVD shape: {char_svd.shape}")
print(f"Total SVD features: {word_svd.shape[1] + char_svd.shape[1]}")

# Apply LDA (same as exp_005)
lda = LatentDirichletAllocation(
    n_components=15,
    random_state=42,
    learning_method='batch',
    max_iter=20
)

# Fit LDA on the word TF-IDF matrix, mirroring exp_005.
# NOTE(review): LDA is normally described as a model over raw term counts;
# feeding it TF-IDF weights may itself degrade topic quality. Consider
# CountVectorizer input as an alternative and confirm what exp_005 used.
lda_topics = lda.fit_transform(word_features)
print(f"LDA topics shape: {lda_topics.shape}")
print(f"LDA topic probabilities range: [{lda_topics.min():.4f}, {lda_topics.max():.4f}]")

In [None]:
# Analyze topic coherence and interpretability
print("=== LDA Topic Interpretation ===")
feature_names = word_tfidf.get_feature_names_out()

# Take the topic count from the fitted topic matrix instead of repeating the
# literal 15, so this cell keeps working when LDA is refit with a different
# number of topics.
n_topics = lda_topics.shape[1]

# Show the ten highest-weighted terms of each topic, strongest first.
for topic_idx in range(n_topics):
    top_features_idx = lda.components_[topic_idx].argsort()[-10:][::-1]
    top_features = [feature_names[i] for i in top_features_idx]
    print(f"\nTopic {topic_idx}: {', '.join(top_features)}")

# Pearson correlation between each topic's per-document weight and the target.
topic_correlations = [
    np.corrcoef(lda_topics[:, i], y)[0, 1] for i in range(n_topics)
]

print("\n=== LDA Topic Correlations with Target ===")
for i, corr in enumerate(topic_correlations):
    print(f"Topic {i}: {corr:.4f}")

# Check if any topics have meaningful correlation
strong_topics = [i for i, corr in enumerate(topic_correlations) if abs(corr) > 0.05]
print(f"\nTopics with |corr| > 0.05: {strong_topics}")
print(f"Max absolute correlation: {max(abs(c) for c in topic_correlations):.4f}")

# Visualize topic correlations
plt.figure(figsize=(12, 6))
plt.bar(range(n_topics), topic_correlations)
plt.axhline(y=0, color='r', linestyle='--', alpha=0.5)
plt.title('LDA Topic Correlations with Target (Pizza Request Success)')
plt.xlabel('Topic ID')
plt.ylabel('Correlation')
plt.show()

In [None]:
# Check redundancy between LDA topics and SVD components
from sklearn.metrics import pairwise_distances

# Stack word and char SVD components side by side so each LDA topic can be
# compared against every SVD component.
svd_combined = np.hstack([word_svd, char_svd])
print(f"SVD combined shape: {svd_combined.shape}")

# Compute maximum correlation between each LDA topic and any SVD component.
# One corrcoef call over the column-wise concatenation yields the full
# correlation matrix; its upper-right block holds the topic-vs-component
# correlations. The topic count comes from the matrix itself rather than a
# hard-coded 15.
n_lda_topics = lda_topics.shape[1]
full_corr = np.corrcoef(lda_topics, svd_combined, rowvar=False)
topic_vs_svd = np.abs(full_corr[:n_lda_topics, n_lda_topics:])
max_correlations = topic_vs_svd.max(axis=1).tolist()

print("\n=== LDA Topic Redundancy with SVD Components ===")
for i, max_corr in enumerate(max_correlations):
    print(f"Topic {i}: max |corr| with SVD = {max_corr:.4f}")

highly_redundant = [i for i, c in enumerate(max_correlations) if c > 0.7]
print(f"\nHighly redundant topics (|corr| > 0.7): {highly_redundant}")
print(f"Average max correlation: {np.mean(max_correlations):.4f}")

# Build the two feature matrices (with and without LDA topics) that the
# LightGBM comparison is run on.

# Raw requester-activity columns (same as exp_005)
numeric_features = [
    'requester_account_age_in_days',
    'requester_upvotes_plus_downvotes_at_request',
    'requester_number_of_posts_at_request',
    'requester_number_of_comments_at_request'
]

# log1p-transformed copy of every raw column, stored under '<name>_log'
log_feature_names = [f'{col}_log' for col in numeric_features]
for raw_col, log_col in zip(numeric_features, log_feature_names):
    df_train[log_col] = np.log1p(df_train[raw_col])

# Text-size features and per-post ratios; the +1 in the denominator avoids
# dividing by zero for requesters with no posts.
combined_text_col = df_train['combined_text']
posts_plus_one = df_train['requester_number_of_posts_at_request'] + 1

df_train['text_length'] = combined_text_col.str.len()
df_train['word_count'] = combined_text_col.str.split().str.len()
df_train['upvotes_per_post'] = (
    df_train['requester_upvotes_plus_downvotes_at_request'] / posts_plus_one
)
df_train['comments_per_post'] = (
    df_train['requester_number_of_comments_at_request'] / posts_plus_one
)
df_train['account_age_years'] = df_train['requester_account_age_in_days'] / 365.25

# Final numeric column order: log features first, then the derived ones.
numeric_features_final = log_feature_names + [
    'text_length',
    'word_count',
    'upvotes_per_post',
    'comments_per_post',
    'account_age_years'
]

# Missing values become 0 before standardization.
X_numeric = df_train[numeric_features_final].fillna(0)
scaler = StandardScaler()
X_numeric_scaled = scaler.fit_transform(X_numeric)

# Assemble both matrices from the same SVD blocks; only the LDA block differs.
svd_blocks = [word_svd, char_svd]
X_svd_only = np.hstack(svd_blocks + [X_numeric_scaled])
X_with_lda = np.hstack(svd_blocks + [lda_topics, X_numeric_scaled])

n_svd_only = X_svd_only.shape[1]
n_with_lda = X_with_lda.shape[1]
print("\n=== Feature Set Comparison ===")
print(f"SVD only: {n_svd_only} features")
print(f"SVD + LDA: {n_with_lda} features")
print(f"LDA adds: {n_with_lda - n_svd_only} features")

In [None]:
# Quick CV comparison to verify the performance drop
from sklearn.model_selection import StratifiedKFold

# Module-level 5-fold stratified splitter read by evaluate_features below.
# Shuffling with a fixed random_state is presumably meant to give both
# feature sets the same fold assignment — verify against the sklearn docs.
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

def evaluate_features(X, y, model_params=None, cv=None):
    """Cross-validate a LightGBM classifier and summarize its ROC AUC.

    Args:
        X: Feature matrix that supports row selection with an integer index
            array (``X[idx]``), e.g. a NumPy array.
        y: Target labels as any array-like (pandas Series, ndarray, list).
        model_params: Keyword arguments for ``lgb.LGBMClassifier``. When
            ``None``, the defaults defined below are used.
        cv: Optional splitter exposing ``split(X, y)``. When ``None``, the
            module-level ``skf`` splitter is used, which preserves the
            original behavior.

    Returns:
        Tuple ``(mean_auc, std_auc)`` over the validation folds.
    """
    if model_params is None:
        model_params = {
            'n_estimators': 500,
            'learning_rate': 0.05,
            'num_leaves': 64,
            'max_depth': 7,
            'min_child_samples': 50,
            'feature_fraction': 0.8,
            'bagging_fraction': 0.8,
            'bagging_freq': 5,
            'random_state': 42,
            'verbose': -1
        }

    splitter = skf if cv is None else cv
    # Positional indexing on a plain array works for Series, ndarray and list
    # inputs alike, so callers are not forced to pass a pandas Series.
    y_arr = np.asarray(y)

    scores = []
    for train_idx, val_idx in splitter.split(X, y_arr):
        # A fresh model per fold so no state carries over between folds.
        model = lgb.LGBMClassifier(**model_params)
        model.fit(X[train_idx], y_arr[train_idx])

        # Column 1 of predict_proba is the positive-class probability.
        pred = model.predict_proba(X[val_idx])[:, 1]
        scores.append(roc_auc_score(y_arr[val_idx], pred))

    return np.mean(scores), np.std(scores)

# Header plus the caveat that these scores come from features fitted on the
# full data set.
print(
    "=== Quick CV Comparison ===",
    "Note: This uses full-data fitted features (leakage), so scores will be inflated",
    "But it helps understand relative performance...",
    sep="\n",
)

# Evaluate the baseline first, then the LDA-augmented matrix.
(score_svd_only, std_svd_only), (score_with_lda, std_with_lda) = (
    evaluate_features(feature_matrix, y)
    for feature_matrix in (X_svd_only, X_with_lda)
)

print(f"SVD only: {score_svd_only:.4f} ± {std_svd_only:.4f}")
print(f"SVD + LDA: {score_with_lda:.4f} ± {std_with_lda:.4f}")
print(f"Difference: {score_with_lda - score_svd_only:.4f}")

# Verdict is based on the mean scores only; fold-to-fold spread is ignored.
verdict = (
    "\n❌ LDA topics hurt performance (consistent with exp_005)"
    if score_with_lda < score_svd_only
    else "\n✓ LDA topics help performance"
)
print(verdict)

In [None]:
# Summarize the findings of the previous cells as a text report.
# The topic count and LDA hyperparameters are read from the fitted objects
# rather than typed in, so the report cannot drift from the configuration
# that was actually run.
print("=== Analysis: Why LDA Hurt Performance ===\n")

print("1. REDUNDANCY WITH SVD:")
print(f"   - Average max correlation between LDA topics and SVD: {np.mean(max_correlations):.4f}")
print(f"   - {len(highly_redundant)} topics have |corr| > 0.7 with SVD components")
print("   - LDA and SVD both capture latent structure, but SVD is already optimized")

print("\n2. TOPIC QUALITY:")
print(f"   - Max topic correlation with target: {max(abs(c) for c in topic_correlations):.4f}")
print(f"   - Topics with |corr| > 0.05: {len(strong_topics)}")
if not strong_topics:
    print("   - ⚠️ NO topics have meaningful correlation with target!")
    print("   - LDA is capturing themes, but they don't predict pizza success")

print("\n3. NOISE ADDITION:")
print(f"   - LDA adds {lda_topics.shape[1]} features that are mostly noise")
print("   - LightGBM may be overfitting to these noisy features")
print("   - Feature selection or dimensionality reduction needed")

print("\n4. HYPERPARAMETER ISSUES:")
print(f"   - Used n_components={lda.n_components} (may be too many)")
print(f"   - Used max_iter={lda.max_iter} (may need more iterations)")
print("   - Used default learning method (batch)")
print("   - Topics may not be well-converged")

print("\n=== RECOMMENDATIONS ===")
print("1. REMOVE LDA entirely - SVD already captures latent structure well")
print("2. Focus on Stanford linguistic cues (counts, not binary)")
print("3. Add temporal features (hour, day of week)")
print("4. Add interaction features (text_length × karma, etc.)")
print("5. Try different SVD component counts (optimize this instead)")
print("6. If re-trying LDA: use fewer topics (5-10), more iterations, tune hyperparameters")

In [None]:
# Check feature importance from exp_005 to see if LDA topics were used
# (no model is loaded here; the statements below are expectations, not
# measurements).
print("=== Feature Importance Analysis ===")
print("(Would need to load exp_005 model to see actual importance)")
print("Based on typical LightGBM behavior:")
print("- LDA topics likely have low importance")
print("- SVD components and numeric features dominate")
print(f"- Adding {lda.n_components} low-importance features can hurt generalization")

# Let's also check if we should try different SVD component counts.
# Component counts are read from the fitted SVD objects so the report matches
# the configuration that was actually run.
n_word_comps = svd_word.n_components
n_char_comps = svd_char.n_components

print("\n=== SVD Component Optimization ===")
print(f"Current: {n_word_comps} word + {n_char_comps} char = {n_word_comps + n_char_comps} components")
print("Competition winners typically use 50-100 total components")
print("Recommend trying:")
print("- 75 word + 25 char = 100 total")
print("- 100 word + 50 char = 150 total (but risk overfitting)")
print("- 40 word + 20 char = 60 total (more aggressive reduction)")

# Variance explained analysis
word_var = svd_word.explained_variance_ratio_.sum()
char_var = svd_char.explained_variance_ratio_.sum()

print("\nCurrent SVD variance explained:")
print(f"Word SVD ({n_word_comps} comps): {word_var:.4f} ({word_var*100:.2f}%)")
print(f"Char SVD ({n_char_comps} comps): {char_var:.4f} ({char_var*100:.2f}%)")
# The two ratios are fractions of two different matrices' variance, so their
# sum is not a joint explained variance; label it as what it is.
print(f"Sum of ratios (separate matrices, so not a joint explained variance): {word_var + char_var:.4f}")

print("\nRecommend: Try 75 word + 25 char = 100 components total")
print("This should capture ~25-30% variance while reducing overfitting risk")