# Loop 1 Analysis: Investigating Data Leakage and Class Imbalance

This notebook investigates the suspicious perfect CV scores and severe underfitting to the minority class in the baseline experiment.

In [None]:
import json
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import StratifiedKFold, train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score, classification_report, confusion_matrix
from scipy.sparse import hstack, csr_matrix
import warnings
# Install a blanket "ignore" filter so no warnings are shown in cell output.
# NOTE(review): this suppresses every warning category, including any emitted by
# the libraries imported above; consider narrowing it while debugging.
warnings.filterwarnings('ignore')

# Seed NumPy's legacy global RNG so NumPy-based randomness is repeatable.
np.random.seed(42)

## 1. Load and Inspect Data

In [None]:
# Load training data
print("Loading training data...")
# Be explicit about the encoding instead of relying on the platform/locale default,
# so the JSON file decodes the same way on every machine.
with open('/home/data/train.json', 'r', encoding='utf-8') as f:
    train_data = json.load(f)

# One row per record in the JSON payload.
train_df = pd.DataFrame(train_data)
print(f"Training samples: {len(train_df)}")
print(f"Features: {train_df.columns.tolist()}")

# Class balance of the target column.
print("\nTarget distribution:")
print(train_df['requester_received_pizza'].value_counts())
print(f"Success rate: {train_df['requester_received_pizza'].mean():.3f}")

## 2. Investigate Metadata Features for Potential Leakage

The evaluator flagged that metadata features might contain indirect target information. Let's examine correlations.

In [None]:
# Extract metadata features function (from baseline)
def extract_metadata_features(df, is_train=True):
    """Build the dense metadata feature matrix used by the baseline model.

    Column layout of the result (24 columns; the order is kept stable):

    ===== ==========================================================
    index source
    ===== ==========================================================
    0     requester_account_age_in_days_at_request
    1     requester_days_since_first_post_on_raop_at_request
    2     requester_number_of_posts_on_raop_at_request
    3     requester_upvotes_plus_downvotes_at_request
    4     requester_upvotes_minus_downvotes_at_request
    5     request_number_of_comments_at_retrieval
    6     requester_upvotes_plus_downvotes_at_request (same as 3)
    7     requester_upvotes_minus_downvotes_at_request (same as 4)
    8-23  0/1 indicator per known ``requester_user_flair`` value
    ===== ==========================================================

    Args:
        df: DataFrame holding the request records.
        is_train: When True, the comment-count column must exist (KeyError
            otherwise). When False, the comment-count column and the second
            copy of the karma columns (6-7) fall back to zeros if absent.

    Returns:
        ``numpy.ndarray`` of shape ``(len(df), 24)``. Missing numeric values
        are replaced by 0 in both modes.

    Raises:
        KeyError: if a required column is missing from ``df``.
    """
    n_rows = len(df)

    def _required(col):
        # Direct indexing keeps the baseline's KeyError on a missing column.
        return df[col].fillna(0)

    def _optional(col):
        # Absent column -> zeros. Present column -> NaNs replaced by 0, so the
        # non-train path can no longer leak NaNs into the feature matrix.
        if col in df.columns:
            return df[col].fillna(0)
        return np.zeros(n_rows, dtype=int)

    # Columns that were only guarded for non-train frames in the baseline.
    _guarded = _required if is_train else _optional

    features = [
        # Account age in days at request time
        _required('requester_account_age_in_days_at_request'),
        # Days since the requester's first RAOP post
        _required('requester_days_since_first_post_on_raop_at_request'),
        # Number of RAOP posts by the requester
        _required('requester_number_of_posts_on_raop_at_request'),
        # Requester karma: total votes and net votes
        _required('requester_upvotes_plus_downvotes_at_request'),
        _required('requester_upvotes_minus_downvotes_at_request'),
        # Number of comments on the request.
        # NOTE(review): the "_at_retrieval" suffix suggests this is measured
        # after the request was posted -- possible target leakage; confirm.
        _guarded('request_number_of_comments_at_retrieval'),
        # NOTE(review): the baseline labelled these "request upvotes/downvotes",
        # but they read the same requester karma columns as indices 3 and 4.
        # Kept so the 24-column layout stays unchanged.
        _guarded('requester_upvotes_plus_downvotes_at_request'),
        _guarded('requester_upvotes_minus_downvotes_at_request'),
    ]

    # User flair (one-hot encode)
    flair_train = ['', 'PIZZA', 'shroom', 'trophy', 'favorite', 'mod', 'trusted', 'contest', 'seal', 'hooker', 'raop', 'elite', 'VIP', 'verified', 'helper', 'custom']
    flair_col = 'requester_user_flair'

    if flair_col in df.columns:
        flair_values = df[flair_col]
        # NOTE(review): None/NaN flair never equals '', so rows with a missing
        # flair get all-zero indicators (unchanged from the baseline).
        features.extend((flair_values == flair).astype(int) for flair in flair_train)
    else:
        features.extend(np.zeros(n_rows, dtype=int) for _ in flair_train)

    return np.column_stack(features)

# Build the metadata matrix for the training frame.
metadata_features = extract_metadata_features(train_df, is_train=True)

# Column labels: 8 numeric columns followed by 16 positional flair indicators.
# Note: per extract_metadata_features, the columns labelled 'request_upvotes' and
# 'request_downvotes' are read from the requester karma columns.
feature_names = [
    'account_age',
    'subreddit_age',
    'num_posts_raop',
    'karma_total',
    'karma_net',
    'request_comments',
    'request_upvotes',
    'request_downvotes',
]
feature_names.extend(f'flair_{i}' for i in range(16))

# Labelled frame of features plus the integer-encoded target as the last column.
metadata_df = pd.DataFrame(metadata_features, columns=feature_names).assign(
    target=train_df['requester_received_pizza'].astype(int)
)

print("Metadata features shape:", metadata_df.shape)
print("\nFirst few rows:")
print(metadata_df.head())

In [None]:
# Check correlation with target
correlations = metadata_df.corr()['target'].sort_values(ascending=False)

# Zero-variance columns (e.g. a flair indicator that never fires) have an undefined
# (NaN) correlation, and sort_values() places NaN last. Printing tail() of the raw
# series would therefore show NaN rows instead of the most negative correlations,
# so print from a NaN-free view and list the undefined features separately.
# `correlations` itself is left unfiltered.
correlations_valid = correlations.dropna()
undefined_features = correlations.index[correlations.isna()].tolist()

print("Correlations with target:")
print(correlations_valid.head(10))
print("\n")
print(correlations_valid.tail(10))

if undefined_features:
    print(f"\nUndefined (NaN) correlation for {len(undefined_features)} feature(s): {undefined_features}")

## 3. Visualize Feature Distributions by Class

In [None]:
# Plot distributions of top correlated features
fig, axes = plt.subplots(2, 2, figsize=(15, 10))
fig.suptitle('Feature Distributions by Class (Top Correlations)', fontsize=16)

# Get top positive and negative correlations.
# Drop the target itself and any NaN correlations (constant columns); NaNs sort last
# and would otherwise be picked up as the "most negative" features.
ranked = correlations.drop('target').dropna()
top_pos = ranked.head(3).index
top_neg = ranked.tail(3).index

# Three strongest positive features plus the single most negative one. `ranked` is
# sorted in descending order, so the most negative entry is the LAST element of
# top_neg. dict.fromkeys removes duplicates (possible when fewer than four features
# have a defined correlation) while preserving order.
plot_features = list(dict.fromkeys(list(top_pos) + list(top_neg)[-1:]))

# axes.flat walks the 2x2 grid row by row.
for ax, feature in zip(axes.flat, plot_features[:4]):
    # Per-class values of this feature
    success = metadata_df[metadata_df['target'] == 1][feature]
    failure = metadata_df[metadata_df['target'] == 0][feature]

    # density=True makes the two classes comparable despite different sizes
    ax.hist(failure, bins=50, alpha=0.7, label='No Pizza', density=True)
    ax.hist(success, bins=50, alpha=0.7, label='Got Pizza', density=True)

    ax.set_title(f'{feature} (corr: {correlations[feature]:.3f})')
    ax.legend()
    ax.set_xlabel('Value')
    ax.set_ylabel('Density')

plt.tight_layout()
plt.show()

## 4. Investigate Text Patterns in Successful vs Failed Requests

In [None]:
# Build a single text field per request: title, one space, then the edit-aware body.
title_part = train_df['request_title'].fillna('')
body_part = train_df['request_text_edit_aware'].fillna('')
train_df['combined_text'] = title_part + ' ' + body_part

# Split the combined text by outcome
outcome = train_df['requester_received_pizza']
successful = train_df.loc[outcome == 1, 'combined_text']
failed = train_df.loc[outcome == 0, 'combined_text']

print(f"Successful requests: {len(successful)}")
print(f"Failed requests: {len(failed)}")

# Average length per group, in characters and in whitespace-separated tokens
mean_chars_successful = successful.str.len().mean()
mean_chars_failed = failed.str.len().mean()
mean_words_successful = successful.str.split().str.len().mean()
mean_words_failed = failed.str.split().str.len().mean()

print("\n=== Text Length Statistics ===")
print(f"Successful - Mean length: {mean_chars_successful:.0f} chars")
print(f"Failed - Mean length: {mean_chars_failed:.0f} chars")
print(f"Successful - Mean words: {mean_words_successful:.0f} words")
print(f"Failed - Mean words: {mean_words_failed:.0f} words")

In [None]:
# Look at common words in successful vs failed requests
from collections import Counter
import re

def get_word_counts(texts, n=20):
    """Return the ``n`` most frequent words across a Series of texts.

    Missing entries are treated as empty strings. All texts are joined with
    single spaces and lower-cased; a word is a run of three or more ASCII
    letters delimited by word boundaries.

    Args:
        texts: pandas Series of strings (may contain missing values).
        n: Number of (word, count) pairs to return.

    Returns:
        List of ``(word, count)`` tuples, most frequent first.
    """
    documents = texts.fillna('').astype(str).tolist()
    corpus = ' '.join(documents).lower()
    tally = Counter(re.findall(r'\b[a-zA-Z]{3,}\b', corpus))
    return tally.most_common(n)

# Top-20 word counts for each outcome group
successful_words = get_word_counts(successful, 20)
failed_words = get_word_counts(failed, 20)

# Print each ranking under its heading, one "word: count" line per entry.
for heading, ranking in (
    ("=== Top words in SUCCESSFUL requests ===", successful_words),
    ("\n=== Top words in FAILED requests ===", failed_words),
):
    print(heading)
    for word, count in ranking:
        print(f"{word}: {count}")

## 5. Reproduce Baseline and Investigate Predictions

In [None]:
# Reproduce the baseline: TF-IDF text features plus scaled metadata, fit on all rows.
print("Creating TF-IDF features...")
vectorizer = TfidfVectorizer(stop_words='english', max_features=5000)
tfidf_features = vectorizer.fit_transform(train_df['combined_text'])

# Standardize the metadata matrix, then make it sparse so it can be stacked with TF-IDF
scaler = StandardScaler()
metadata_scaled = scaler.fit_transform(metadata_features)
metadata_sparse = csr_matrix(metadata_scaled)

# Text columns first, metadata columns last
X_train = hstack([tfidf_features, metadata_sparse])
y_train = train_df['requester_received_pizza'].astype(int).values

print(f"Combined feature shape: {X_train.shape}")

# Class-weighted logistic regression
model = LogisticRegression(class_weight='balanced', max_iter=1000, random_state=42)
model.fit(X_train, y_train)

# Positive-class probability for the same rows the model was fitted on
train_pred = model.predict_proba(X_train)[:, 1]

pred_min = train_pred.min()
pred_max = train_pred.max()
print(f"\nTraining predictions range: [{pred_min:.6f}, {pred_max:.6f}]")
print(f"Training predictions mean: {train_pred.mean():.6f}")
print(f"Training predictions std: {train_pred.std():.6f}")

# How the predictions spread across a few probability thresholds
n_predictions = len(train_pred)
near_zero = train_pred < 0.01
above_half = train_pred > 0.5
above_ninety = train_pred > 0.9
print(f"\nPredictions < 0.01: {near_zero.sum()} / {n_predictions} ({near_zero.mean():.1%})")
print(f"Predictions > 0.5: {above_half.sum()} / {n_predictions}")
print(f"Predictions > 0.9: {above_ninety.sum()} / {n_predictions}")

In [None]:
# Inspect the fitted coefficients: names are TF-IDF terms followed by metadata labels,
# matching the column order used when the two feature blocks were stacked.
feature_names_tfidf = vectorizer.get_feature_names_out()
feature_names_all = [*feature_names_tfidf, *feature_names]

# Single coefficient vector of the fitted model
coefficients = model.coef_[0]

# Rankings: by absolute magnitude, most positive first, most negative first
sorted_idx = np.argsort(np.abs(coefficients))[::-1]
positive_idx = np.argsort(coefficients)[::-1][:10]
negative_idx = np.argsort(coefficients)[:10]

for heading, indices in (
    ("=== Top 20 Most Important Features ===", sorted_idx[:20]),
    ("\n=== Top 10 Positive Features (predict pizza) ===", positive_idx),
    ("\n=== Top 10 Negative Features (predict no pizza) ===", negative_idx),
):
    print(heading)
    for idx in indices:
        print(f"{feature_names_all[idx]}: {coefficients[idx]:.4f}")

## 6. Test Cross-Validation with Proper Leakage Prevention

In [None]:
# Create a proper CV that fits vectorizer within each fold
# The TF-IDF vocabulary and the scaler statistics are learned from the training part
# of each fold only, then applied to the held-out part.
# NOTE(review): this guards against preprocessing leakage only. The columns of
# `metadata_features` are computed once outside the loop, so any feature that encodes
# the outcome directly is not addressed here.
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

cv_scores = []                 # one ROC AUC per fold
prediction_distributions = []  # held-out predicted probabilities per fold

for fold, (train_idx, val_idx) in enumerate(skf.split(train_df, y_train)):
    print(f"\n=== Fold {fold + 1} ===")
    
    # Split data
    train_text = train_df.iloc[train_idx]['combined_text']
    val_text = train_df.iloc[val_idx]['combined_text']
    y_tr = y_train[train_idx]
    y_val = y_train[val_idx]
    
    # Fit vectorizer only on training fold
    vectorizer_fold = TfidfVectorizer(max_features=5000, stop_words='english')
    tfidf_tr = vectorizer_fold.fit_transform(train_text)
    tfidf_val = vectorizer_fold.transform(val_text)
    
    # Scale metadata features (fit only on training)
    metadata_tr = metadata_features[train_idx]
    metadata_val = metadata_features[val_idx]
    
    scaler_fold = StandardScaler()
    metadata_tr_scaled = scaler_fold.fit_transform(metadata_tr)
    metadata_val_scaled = scaler_fold.transform(metadata_val)
    
    # Convert to sparse
    metadata_tr_sparse = csr_matrix(metadata_tr_scaled)
    metadata_val_sparse = csr_matrix(metadata_val_scaled)
    
    # Combine features (text columns first, metadata columns last)
    X_tr = hstack([tfidf_tr, metadata_tr_sparse])
    X_val = hstack([tfidf_val, metadata_val_sparse])
    
    # Train model
    model_fold = LogisticRegression(random_state=42, max_iter=1000, class_weight='balanced')
    model_fold.fit(X_tr, y_tr)
    
    # Predict positive-class probability on the held-out fold
    val_pred = model_fold.predict_proba(X_val)[:, 1]
    
    # Calculate AUC
    auc_score = roc_auc_score(y_val, val_pred)
    cv_scores.append(auc_score)
    
    # Store prediction distribution
    prediction_distributions.append(val_pred)
    
    print(f"ROC AUC: {auc_score:.4f}")
    print(f"Prediction range: [{val_pred.min():.6f}, {val_pred.max():.6f}]")
    print(f"Prediction mean: {val_pred.mean():.6f}")
    print(f"Predictions > 0.5: {(val_pred > 0.5).sum()} / {len(val_pred)}")

print("\n=== CV Results ===")
# Fixed mis-encoded plus/minus sign in the summary line.
print(f"Mean ROC AUC: {np.mean(cv_scores):.4f} ± {np.std(cv_scores):.4f}")
print(f"Individual folds: {cv_scores}")

In [None]:
# Visualize prediction distributions across folds
# Size the grid from the number of folds actually collected instead of hard-coding 5,
# so changing n_splits does not break the plot. With 5 folds this gives the same
# 1x5 layout and (20, 4) figure size as before.
n_panels = max(len(prediction_distributions), 1)
fig, axes = plt.subplots(1, n_panels, figsize=(4 * n_panels, 4))
# plt.subplots returns a bare Axes when there is a single panel; normalize to 1-D.
axes = np.atleast_1d(axes)
fig.suptitle('Prediction Distributions Across CV Folds', fontsize=16)

for i, (ax, preds, fold_auc) in enumerate(zip(axes, prediction_distributions, cv_scores)):
    ax.hist(preds, bins=50, alpha=0.7)
    ax.set_title(f'Fold {i+1}\nAUC: {fold_auc:.3f}')
    ax.set_xlabel('Prediction')
    ax.set_ylabel('Count')
    # Probabilities: fix the x-range so the panels are directly comparable
    ax.set_xlim(0, 1)

plt.tight_layout()
plt.show()

## 7. Key Findings Summary