# Baseline Experiment 001

Simple baseline using TF-IDF text features + basic tabular features with Logistic Regression.

Features:
- TF-IDF on request_text_edit_aware
- Basic tabular features (requester_account_age_in_days_at_retrieval, requester_upvotes_plus_downvotes_at_retrieval, request_number_of_comments_at_retrieval, etc.)
- Simple flair encoding
- Text length features

In [None]:
import pandas as pd
import numpy as np
import json
from pathlib import Path
from sklearn.model_selection import StratifiedKFold
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import roc_auc_score
import warnings
warnings.filterwarnings('ignore')

# Seed NumPy's global RNG so any NumPy-driven randomness is repeatable.
np.random.seed(42)

# Load the training records: the file is parsed as JSON and then iterated as a
# sequence of per-request mappings.
print("Loading training data...")
train_path = "/home/data/train.json"
# Decode explicitly as UTF-8 rather than relying on the platform's default
# locale encoding, which can mis-decode non-ASCII request text.
with open(train_path, 'r', encoding='utf-8') as f:
    train_data = json.load(f)

# Quick sanity check: sample count and the share of requests that got a pizza.
print(f"Training samples: {len(train_data)}")
print(f"Positive rate: {np.mean([d['requester_received_pizza'] for d in train_data]):.3f}")

In [None]:
# Build the training DataFrame and derive the hand-crafted tabular features.
print("Creating basic features...")

train_df = pd.DataFrame(train_data)

# Character-count features. Missing text is treated as an empty string (the
# same fill the TF-IDF step uses) so the lengths are 0 instead of NaN; NaN
# would otherwise flow through scaling and break model fitting.
train_df['text_length'] = train_df['request_text_edit_aware'].fillna('').str.len()
train_df['title_length'] = train_df['request_title'].fillna('').str.len()
train_df['total_length'] = train_df['text_length'] + train_df['title_length']

# Flair encoding. Work on boolean masks and cast to int only at the end:
# applying `~` to 0/1 integer columns is a bitwise NOT (-1 / -2) and yields the
# right answer only by accident of two's-complement arithmetic.
flair = train_df['requester_user_flair']
has_flair = flair.notna()
is_shroom = flair == 'shroom'
is_pif = flair == 'PIF'

train_df['has_flair'] = has_flair.astype(int)
train_df['flair_shroom'] = is_shroom.astype(int)
train_df['flair_pif'] = is_pif.astype(int)
# Any flair value that is present but is neither 'shroom' nor 'PIF'.
train_df['flair_other'] = (has_flair & ~is_shroom & ~is_pif).astype(int)

# Replace missing flair with the literal string 'None' (done after the
# indicators above so `has_flair` still reflects the raw missingness).
train_df['requester_user_flair'] = flair.fillna('None')

# NOTE(review): despite the name, this is x / (x + 1) of the *total* vote
# count, i.e. a saturating transform of vote volume rather than an upvote
# share. Kept as-is so train and test features stay aligned.
train_df['requester_upvotes_ratio'] = train_df['requester_upvotes_plus_downvotes_at_retrieval'] / (train_df['requester_upvotes_plus_downvotes_at_retrieval'] + 1)

# Account age in days (plain alias of the source column).
train_df['account_age_days'] = train_df['requester_account_age_in_days_at_retrieval']

print("Features created:")
print(train_df[['text_length', 'title_length', 'has_flair', 'flair_shroom', 'flair_pif', 'requester_upvotes_ratio', 'account_age_days']].head())

In [None]:
# Build the model matrix: TF-IDF text features stacked with scaled tabular
# features, plus the target vector.
print("Preparing TF-IDF features...")

# Use request_text_edit_aware to avoid leakage from success indicators
text_data = train_df['request_text_edit_aware'].fillna('')

# TF-IDF vectorizer with reasonable parameters
tfidf = TfidfVectorizer(
    max_features=5000,  # Limit features to prevent overfitting
    stop_words='english',
    ngram_range=(1, 2),  # Include bigrams
    min_df=2,  # Ignore very rare terms
    max_df=0.95  # Ignore very common terms
)

# NOTE: the vectorizer (and the scaler below) are fit on every training row
# before the cross-validation split, so validation folds contribute to the
# vocabulary/IDF and scaling statistics. CV scores may be slightly optimistic.
tfidf_features = tfidf.fit_transform(text_data)
print(f"TF-IDF shape: {tfidf_features.shape}")

# Tabular features (column order must match the test-time selection).
# NOTE(review): the *_at_retrieval and flair columns look like they are
# measured after the request outcome is known — confirm they are legitimate
# at prediction time.
tabular_features = train_df[[
    'text_length', 'title_length', 'total_length',
    'has_flair', 'flair_shroom', 'flair_pif', 'flair_other',
    'requester_upvotes_plus_downvotes_at_retrieval',
    'requester_upvotes_ratio',
    'account_age_days',
    'request_number_of_comments_at_retrieval'
]].values

print(f"Tabular features shape: {tabular_features.shape}")

# Scale tabular features
scaler = StandardScaler()
tabular_features_scaled = scaler.fit_transform(tabular_features)

# Combine features
from scipy.sparse import hstack, csr_matrix

# Convert tabular features to sparse format for efficient stacking
tabular_sparse = csr_matrix(tabular_features_scaled)

# Ask for CSR explicitly: with CSR inputs hstack returns a COO matrix by
# default, and COO does not support the row indexing (X[train_idx]) that the
# cross-validation loop performs.
X = hstack([tfidf_features, tabular_sparse], format='csr')
print(f"Combined feature shape: {X.shape}")

# Target
y = train_df['requester_received_pizza'].values
print(f"Target distribution: {np.bincount(y)}")

In [None]:
# Stratified 5-fold cross-validation of a class-weighted logistic regression.
# Produces per-fold AUCs, out-of-fold (OOF) predictions and summary scores.
print("Running Stratified 5-fold CV...")

skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Row-wise fancy indexing below needs CSR; a COO matrix (scipy's default
# hstack output) is not subscriptable. tocsr() is cheap if X is already CSR.
X = X.tocsr()

fold_scores = []
oof_predictions = np.zeros(len(train_df))

for fold, (train_idx, val_idx) in enumerate(skf.split(X, y)):
    print(f"\nFold {fold + 1}")

    X_train, X_val = X[train_idx], X[val_idx]
    y_train, y_val = y[train_idx], y[val_idx]

    # Report the class imbalance of this fold. The ratio is informational
    # only; the actual reweighting is done by class_weight='balanced' below.
    pos_rate = np.mean(y_train)
    scale_pos_weight = (1 - pos_rate) / pos_rate
    print(f"  Positive rate: {pos_rate:.3f}, Scale pos weight: {scale_pos_weight:.2f}")

    # Train logistic regression with class weighting
    model = LogisticRegression(
        class_weight='balanced',
        max_iter=1000,
        random_state=42,
        n_jobs=-1
    )

    model.fit(X_train, y_train)

    # Positive-class probability for the held-out rows, stored at their
    # original positions so the OOF vector lines up with y.
    val_pred = model.predict_proba(X_val)[:, 1]
    oof_predictions[val_idx] = val_pred

    # Calculate AUC
    fold_auc = roc_auc_score(y_val, val_pred)
    fold_scores.append(fold_auc)
    print(f"  Fold AUC: {fold_auc:.4f}")

# NOTE: after the loop, `model` is the estimator fit on the last fold's
# training split only, not on the full training set.

# Overall CV score (mean and standard deviation across folds)
mean_auc = np.mean(fold_scores)
std_auc = np.std(fold_scores)
print(f"\nCV Score: {mean_auc:.4f} ± {std_auc:.4f}")

# OOF AUC: a single AUC over all rows using each row's held-out prediction
oof_auc = roc_auc_score(y, oof_predictions)
print(f"OOF AUC: {oof_auc:.4f}")

In [None]:
# Load the test set, rebuild the same features as for training, fit the final
# model on all training rows, and write the submission CSV.
print("Loading test data...")
test_path = "/home/data/test.json"
# Decode explicitly as UTF-8 rather than relying on the platform default.
with open(test_path, 'r', encoding='utf-8') as f:
    test_data = json.load(f)

test_df = pd.DataFrame(test_data)
print(f"Test samples: {len(test_df)}")

# Create same features for test data
# NOTE(review): this assumes test.json carries the same columns as train.json
# (requester_user_flair and the *_at_retrieval fields) — confirm; a missing
# column raises KeyError here.
print("Creating test features...")

# Text features; missing text counts as length 0 instead of NaN, matching the
# empty-string fill used for the TF-IDF input.
test_df['text_length'] = test_df['request_text_edit_aware'].fillna('').str.len()
test_df['title_length'] = test_df['request_title'].fillna('').str.len()
test_df['total_length'] = test_df['text_length'] + test_df['title_length']

# Flair encoding from boolean masks (avoids bitwise NOT on 0/1 integers).
test_flair = test_df['requester_user_flair']
test_has_flair = test_flair.notna()
test_is_shroom = test_flair == 'shroom'
test_is_pif = test_flair == 'PIF'

test_df['has_flair'] = test_has_flair.astype(int)
test_df['flair_shroom'] = test_is_shroom.astype(int)
test_df['flair_pif'] = test_is_pif.astype(int)
test_df['flair_other'] = (test_has_flair & ~test_is_shroom & ~test_is_pif).astype(int)
test_df['requester_user_flair'] = test_flair.fillna('None')

# Vote ratios (same x / (x + 1) transform as applied to the training data)
test_df['requester_upvotes_ratio'] = test_df['requester_upvotes_plus_downvotes_at_retrieval'] / (test_df['requester_upvotes_plus_downvotes_at_retrieval'] + 1)

# Account age
test_df['account_age_days'] = test_df['requester_account_age_in_days_at_retrieval']

# TF-IDF features for test: transform only, reusing the vocabulary/IDF
# learned from the training text.
test_text = test_df['request_text_edit_aware'].fillna('')
test_tfidf = tfidf.transform(test_text)

# Tabular features for test (same columns, same order as in training)
test_tabular = test_df[[
    'text_length', 'title_length', 'total_length',
    'has_flair', 'flair_shroom', 'flair_pif', 'flair_other',
    'requester_upvotes_plus_downvotes_at_retrieval',
    'requester_upvotes_ratio',
    'account_age_days',
    'request_number_of_comments_at_retrieval'
]].values

# Reuse the scaler fit on the training data.
test_tabular_scaled = scaler.transform(test_tabular)

# Combine test features the same way as the training matrix: sparse tabular
# block, CSR result.
X_test = hstack([test_tfidf, csr_matrix(test_tabular_scaled)], format='csr')
print(f"Test feature shape: {X_test.shape}")

# Predict on test
print("Making predictions...")
# Fit a fresh model on ALL training rows. The `model` variable left over from
# cross-validation was trained on only the last fold's training split, so
# using it would discard a fifth of the labelled data.
final_model = LogisticRegression(
    class_weight='balanced',
    max_iter=1000,
    random_state=42,
    n_jobs=-1
)
final_model.fit(X, y)
test_pred = final_model.predict_proba(X_test)[:, 1]

# Create submission: one positive-class probability per request_id
submission = pd.DataFrame({
    'request_id': test_df['request_id'],
    'requester_received_pizza': test_pred
})

print("Submission preview:")
print(submission.head())
print(f"\nPrediction stats: min={test_pred.min():.3f}, max={test_pred.max():.3f}, mean={test_pred.mean():.3f}")

# Save submission; create the output directory first so to_csv cannot fail
# on a missing folder.
submission_path = "/home/submission/submission_001_baseline.csv"
Path(submission_path).parent.mkdir(parents=True, exist_ok=True)
submission.to_csv(submission_path, index=False)
print(f"\nSubmission saved to: {submission_path}")