# Evolver Loop 1 Analysis

Goal: Analyze the baseline results and identify high-impact improvements for the Random Acts of Pizza competition.

Current best CV: 0.6386 (far from gold 0.9791)

In [1]:
import json
import warnings

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from scipy.sparse import hstack
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score, classification_report
from sklearn.model_selection import StratifiedKFold, cross_val_predict
from sklearn.preprocessing import StandardScaler

warnings.filterwarnings('ignore')

# Load data
# The encoding is named explicitly so that decoding the JSON text does not
# depend on the locale default of whichever machine runs the notebook.
print("Loading data...")
with open('/home/data/train.json', 'r', encoding='utf-8') as f:
    train_data = json.load(f)

with open('/home/data/test.json', 'r', encoding='utf-8') as f:
    test_data = json.load(f)

# One row per parsed JSON record.
df_train = pd.DataFrame(train_data)
df_test = pd.DataFrame(test_data)

# Quick sanity summary: frame sizes and the mean of the target column.
print(f"Training samples: {len(df_train)}")
print(f"Test samples: {len(df_test)}")
print(f"Positive class rate: {df_train['requester_received_pizza'].mean():.3f}")

Loading data...
Training samples: 2878
Test samples: 1162
Positive class rate: 0.248


## 1. Analyze Text Features

Let's understand what makes a pizza request successful by analyzing the text.

In [2]:
# Combine text features
# Title and body are joined with a single space; missing values are replaced by
# empty strings first so the concatenation never produces NaN.
df_train['combined_text'] = df_train['request_title'].fillna('') + ' ' + df_train['request_text_edit_aware'].fillna('')
df_test['combined_text'] = df_test['request_title'].fillna('') + ' ' + df_test['request_text_edit_aware'].fillna('')

# Basic text features
df_train['text_length'] = df_train['combined_text'].str.len()
df_train['word_count'] = df_train['combined_text'].str.split().str.len()
df_train['exclamation_count'] = df_train['combined_text'].str.count('!')
# Regex patterns are written as raw strings: in a normal string literal '\?'
# is an invalid escape sequence that Python only tolerates with a warning.
df_train['question_count'] = df_train['combined_text'].str.count(r'\?')
# text_length is always >= 1 because combined_text contains the joining space,
# so this division cannot hit zero.
df_train['caps_ratio'] = df_train['combined_text'].str.count(r'[A-Z]') / df_train['text_length']
df_train['pizza_mention'] = df_train['combined_text'].str.lower().str.contains('pizza').astype(int)

# Pearson correlation of each hand-crafted text feature with the target.
print("Text feature correlations with target:")
text_features = ['text_length', 'word_count', 'exclamation_count', 'question_count', 'caps_ratio', 'pizza_mention']
correlations = df_train[text_features + ['requester_received_pizza']].corr()['requester_received_pizza'].drop('requester_received_pizza')
print(correlations.sort_values(ascending=False))

Text feature correlations with target:
text_length          0.121046
word_count           0.119426
exclamation_count    0.030441
question_count       0.029330
pizza_mention        0.011851
caps_ratio          -0.031435
Name: requester_received_pizza, dtype: float64


## 2. Analyze Tabular Features

Let's examine which metadata features are most predictive.

In [3]:
# Create tabular features
# Every derived column is built on BOTH frames. A later cell selects the same
# column list from df_train and df_test, so a feature that exists only on
# df_train makes that selection fail with a KeyError.
count_features = [
    'requester_number_of_comments_at_request',
    'requester_number_of_posts_at_request', 
    'requester_upvotes_plus_downvotes_at_request'
]

for frame in (df_train, df_test):
    # Log transforms for count features
    for feat in count_features:
        frame[f'{feat}_log'] = np.log1p(frame[feat])

    # Ratios (the +1 in the denominator avoids division by zero)
    frame['upvotes_per_comment'] = frame['requester_upvotes_plus_downvotes_at_request'] / (frame['requester_number_of_comments_at_request'] + 1)
    frame['comments_per_post'] = frame['requester_number_of_comments_at_request'] / (frame['requester_number_of_posts_at_request'] + 1)

    # Account age
    frame['account_age_years'] = frame['requester_account_age_in_days_at_request'] / 365.25

    # Other features
    # The column is only converted where it exists; whether df_test carries it
    # cannot be told from this notebook.
    # NOTE(review): casting through bool keeps the flag strictly 0/1 even if
    # some rows hold a non-boolean truthy value (e.g. an edit timestamp) --
    # confirm against the raw data. For pure True/False data this is identical
    # to a direct astype(int).
    if 'post_was_edited' in frame.columns:
        frame['post_was_edited'] = frame['post_was_edited'].astype(bool).astype(int)

# Also create text features for the test set
df_test['text_length'] = df_test['combined_text'].str.len()
df_test['word_count'] = df_test['combined_text'].str.split().str.len()
df_test['exclamation_count'] = df_test['combined_text'].str.count('!')
# Raw strings keep the regex escapes valid Python string literals.
df_test['question_count'] = df_test['combined_text'].str.count(r'\?')
df_test['caps_ratio'] = df_test['combined_text'].str.count(r'[A-Z]') / df_test['text_length']
df_test['pizza_mention'] = df_test['combined_text'].str.lower().str.contains('pizza').astype(int)

# Pearson correlation of each tabular feature with the target (train only,
# since only df_train carries the label).
print("Tabular feature correlations with target:")
tabular_features = [f'{feat}_log' for feat in count_features] + [
    'upvotes_per_comment', 'comments_per_post', 'account_age_years',
    'post_was_edited', 'requester_upvotes_plus_downvotes_at_request'
]
correlations = df_train[tabular_features + ['requester_received_pizza']].corr()['requester_received_pizza'].drop('requester_received_pizza')
print(correlations.sort_values(ascending=False))

Tabular feature correlations with target:
requester_upvotes_plus_downvotes_at_request_log    0.110471
post_was_edited                                    0.067192
requester_number_of_comments_at_request_log        0.062231
requester_number_of_posts_at_request_log           0.059366
comments_per_post                                  0.056384
upvotes_per_comment                                0.047702
account_age_years                                  0.043374
requester_upvotes_plus_downvotes_at_request        0.033247
Name: requester_received_pizza, dtype: float64


## 3. Analyze Successful vs Unsuccessful Requests

Let's look at examples of successful and unsuccessful requests to understand patterns.

In [None]:
# Draw three reproducible examples from each outcome class
received = df_train['requester_received_pizza']
successful = df_train[received == 1].sample(3, random_state=42)
unsuccessful = df_train[received == 0].sample(3, random_state=42)

rule = "=" * 80

# Each section is (opening rule, heading, rows). Only the second section's
# opening rule is preceded by a blank line.
sections = [
    (rule, "SUCCESSFUL REQUESTS:", successful),
    ("\n" + rule, "UNSUCCESSFUL REQUESTS:", unsuccessful),
]

for opening_rule, heading, examples in sections:
    print(opening_rule)
    print(heading)
    print(rule)
    for _, request in examples.iterrows():
        print(f"\n--- Request {request['request_id']} ---")
        print(f"Title: {request['request_title']}")
        # Only the first 200 characters of the combined text are shown.
        print(f"Text: {request['combined_text'][:200]}...")
        print(f"Account age: {request['requester_account_age_in_days_at_request']:.1f} days")
        print(f"Upvotes: {request['requester_upvotes_plus_downvotes_at_request']}")

## 4. Feature Importance from Baseline Model

Let's train the baseline model and extract feature importance to understand what's working.

In [None]:
# Prepare features for baseline model
y = df_train['requester_received_pizza'].values

# Text features
text_features = df_train['combined_text'].values
test_text_features = df_test['combined_text'].values

# Numeric features (exclude flair which doesn't exist in test)
numeric_features = [f'{feat}_log' for feat in count_features] + [
    'upvotes_per_comment', 'comments_per_post', 'account_age_years',
    'text_length', 'word_count', 'exclamation_count', 'question_count',
    'caps_ratio', 'pizza_mention', 'post_was_edited'
]

# Only columns present in BOTH frames can be used: selecting a missing column
# raises a KeyError, and a feature unavailable at test time is useless for a
# submission anyway. Anything dropped is reported rather than silently ignored.
unavailable = [
    col for col in numeric_features
    if col not in df_train.columns or col not in df_test.columns
]
if unavailable:
    print(f"Dropping numeric features missing from train or test: {unavailable}")
    numeric_features = [col for col in numeric_features if col not in unavailable]

train_numeric = df_train[numeric_features].fillna(0).values
test_numeric = df_test[numeric_features].fillna(0).values

# Standardise the numeric block (statistics are fitted on train only). Without
# this, raw-scale columns such as text_length sit next to TF-IDF weights in
# [0, 1], so coefficient magnitudes are not comparable across features.
scaler = StandardScaler()
train_numeric = scaler.fit_transform(train_numeric)
test_numeric = scaler.transform(test_numeric)

# TF-IDF
tfidf = TfidfVectorizer(
    max_features=5000,
    ngram_range=(1, 2),
    stop_words='english',
    lowercase=True,
    min_df=2,
    max_df=0.95
)

# Fit once on the full training set (no cross-validation here) to inspect coefficients
train_text_tfidf = tfidf.fit_transform(text_features)
test_text_tfidf = tfidf.transform(test_text_features)

# Column order is TF-IDF terms first, then the numeric features; CSR format
# keeps the matrix row-indexable.
train_combined = hstack([train_text_tfidf, train_numeric], format='csr')

# Train model
model = LogisticRegression(
    class_weight='balanced',
    max_iter=1000,
    random_state=42,
    n_jobs=-1
)

model.fit(train_combined, y)

# Get feature importance
# feature_names follows the same column order as train_combined. 'importance'
# is the absolute coefficient; 'coefficient' keeps the sign so the direction of
# each effect is visible as well.
feature_names = list(tfidf.get_feature_names_out()) + numeric_features
feature_importance = pd.DataFrame({
    'feature': feature_names,
    'coefficient': model.coef_[0],
    'importance': np.abs(model.coef_[0])
}).sort_values('importance', ascending=False)

print("Top 20 most important features:")
print(feature_importance.head(20))

## 5. Analyze Prediction Errors

Let's see what the model gets wrong to identify improvement opportunities.

In [None]:
# Get out-of-fold predictions on the training data
# Scoring the rows the model was fitted on understates its errors, so every row
# is instead predicted by a clone of `model` trained on the other folds.
# Caveat: train_combined was built from the full training set, so only the
# classifier (not the feature extraction) is refitted per fold.
y_true = df_train['requester_received_pizza'].astype(int).values
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
train_pred = cross_val_predict(
    model,
    train_combined.tocsr(),  # row-indexable format for fold slicing
    y_true,
    cv=cv,
    method='predict_proba'
)[:, 1]

print(f"Out-of-fold AUC: {roc_auc_score(y_true, train_pred):.4f}")

# Add predictions to dataframe
df_train['predicted_prob'] = train_pred

# Find false positives and false negatives (using 0.5 threshold)
# A row counts as predicted-positive only when its probability is strictly
# above the threshold; everything else (including exactly the threshold) is
# predicted-negative, so no row falls outside both groups.
threshold = 0.5
actual_positive = df_train['requester_received_pizza'] == 1
predicted_positive = df_train['predicted_prob'] > threshold
false_positives = df_train[~actual_positive & predicted_positive]
false_negatives = df_train[actual_positive & ~predicted_positive]

print(f"False positives: {len(false_positives)}")
print(f"False negatives: {len(false_negatives)}")

# Show up to two reproducible examples from each error group.
error_groups = [
    ("SAMPLE FALSE POSITIVES (predicted success, actually failed):", false_positives),
    ("SAMPLE FALSE NEGATIVES (predicted failure, actually succeeded):", false_negatives),
]
for heading, errors in error_groups:
    print("\n" + "="*80)
    print(heading)
    print("="*80)
    for idx, row in errors.sample(min(2, len(errors)), random_state=42).iterrows():
        print(f"\n--- Request {row['request_id']} (prob: {row['predicted_prob']:.3f}) ---")
        print(f"Title: {row['request_title']}")
        print(f"Text: {row['combined_text'][:150]}...")