# Baseline Experiment 001

Simple baseline using TF-IDF text features + basic tabular features with Logistic Regression.

Features:
- TF-IDF on request_text_edit_aware
- Basic tabular features (requester_account_age, requester_upvotes_plus_downvotes_at_retrieval, etc.)
- Simple flair encoding
- Text length features

In [5]:
import pandas as pd
import numpy as np
import json
from pathlib import Path
from sklearn.model_selection import StratifiedKFold
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import roc_auc_score
import warnings
warnings.filterwarnings('ignore')

# Set random seed for reproducibility
np.random.seed(42)

# Load data
print("Loading training data...")
train_path = "/home/data/train.json"
with open(train_path, 'r') as f:
    train_data = json.load(f)

print(f"Training samples: {len(train_data)}")
print(f"Positive rate: {np.mean([d['requester_received_pizza'] for d in train_data]):.3f}")

Loading training data...
Training samples: 2878
Positive rate: 0.248


In [6]:
# Create DataFrame and basic features
print("Creating basic features...")

train_df = pd.DataFrame(train_data)

# Basic text features (safe - available at request time)
train_df['text_length'] = train_df['request_text_edit_aware'].str.len()
train_df['title_length'] = train_df['request_title'].str.len()
train_df['total_length'] = train_df['text_length'] + train_df['title_length']

# Vote ratios - use _at_request (safe) instead of _at_retrieval (leaky)
train_df['requester_upvotes_ratio'] = train_df['requester_upvotes_plus_downvotes_at_request'] / (train_df['requester_upvotes_plus_downvotes_at_request'] + 1)

# Account age - use _at_request (safe)
train_df['account_age_days'] = train_df['requester_account_age_in_days_at_request']

# Comments at request (safe) - NOT at retrieval (leaky)
train_df['comments_at_request'] = train_df['requester_number_of_comments_in_raop_at_request']

# Posts at request (safe)
train_df['posts_at_request'] = train_df['requester_number_of_posts_on_raop_at_request']

print("Features created:")
print(train_df[['text_length', 'title_length', 'requester_upvotes_ratio', 
                'account_age_days', 'comments_at_request', 'posts_at_request']].head())

Creating basic features...
Features created:
   text_length  title_length  requester_upvotes_ratio  account_age_days  \
0          214            65                 0.875000          0.000000   
1          169           122                 0.999315         99.526863   
2          694            85                 0.750000          0.000000   
3         1028            39                 0.993976        491.088264   
4          163            33                 0.999476        369.417558   

   comments_at_request  posts_at_request  
0                    0                 0  
1                    0                 0  
2                    0                 0  
3                   33                 1  
4                    0                 0  


In [7]:
# Prepare text data for TF-IDF
print("Preparing TF-IDF features...")

# Use request_text_edit_aware to avoid leakage from success indicators
text_data = train_df['request_text_edit_aware'].fillna('')

# TF-IDF vectorizer with reasonable parameters
tfidf = TfidfVectorizer(
    max_features=5000,  # Limit features to prevent overfitting
    stop_words='english',
    ngram_range=(1, 2),  # Include bigrams
    min_df=2,  # Ignore very rare terms
    max_df=0.95  # Ignore very common terms
)

tfidf_features = tfidf.fit_transform(text_data)
print(f"TF-IDF shape: {tfidf_features.shape}")

# Tabular features - ONLY use _at_request features (safe)
tabular_features = train_df[[
    'text_length', 'title_length', 'total_length',
    'requester_upvotes_ratio',
    'account_age_days',
    'comments_at_request',
    'posts_at_request'
]].values

print(f"Tabular features shape: {tabular_features.shape}")

# Scale tabular features
scaler = StandardScaler()
tabular_features_scaled = scaler.fit_transform(tabular_features)

# Combine features
from scipy.sparse import hstack, csr_matrix
X = hstack([tfidf_features, csr_matrix(tabular_features_scaled)])

# Target
y = train_df['requester_received_pizza'].values

print(f"Combined feature shape: {X.shape}")
print(f"Target distribution: {np.bincount(y)}")

Preparing TF-IDF features...


TF-IDF shape: (2878, 5000)
Tabular features shape: (2878, 7)
Combined feature shape: (2878, 5007)
Target distribution: [2163  715]


In [8]:
# Stratified K-Fold CV
print("Running Stratified 5-fold CV...")

skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

fold_scores = []
oof_predictions = np.zeros(len(train_df))

for fold, (train_idx, val_idx) in enumerate(skf.split(X, y)):
    print(f"\nFold {fold + 1}")
    
    X_train, X_val = X[train_idx], X[val_idx]
    y_train, y_val = y[train_idx], y[val_idx]
    
    # Calculate class weight for imbalance
    pos_rate = np.mean(y_train)
    scale_pos_weight = (1 - pos_rate) / pos_rate
    print(f"  Positive rate: {pos_rate:.3f}, Scale pos weight: {scale_pos_weight:.2f}")
    
    # Train logistic regression with class weighting
    model = LogisticRegression(
        class_weight='balanced',
        max_iter=1000,
        random_state=42,
        n_jobs=-1
    )
    
    model.fit(X_train, y_train)
    
    # Predict
    val_pred = model.predict_proba(X_val)[:, 1]
    oof_predictions[val_idx] = val_pred
    
    # Calculate AUC
    fold_auc = roc_auc_score(y_val, val_pred)
    fold_scores.append(fold_auc)
    print(f"  Fold AUC: {fold_auc:.4f}")

# Overall CV score
mean_auc = np.mean(fold_scores)
std_auc = np.std(fold_scores)
print(f"\nCV Score: {mean_auc:.4f} ± {std_auc:.4f}")

# OOF AUC
oof_auc = roc_auc_score(y, oof_predictions)
print(f"OOF AUC: {oof_auc:.4f}")

Running Stratified 5-fold CV...

Fold 1
  Positive rate: 0.248, Scale pos weight: 3.02


  Fold AUC: 0.6454

Fold 2
  Positive rate: 0.248, Scale pos weight: 3.02


  Fold AUC: 0.6501

Fold 3
  Positive rate: 0.248, Scale pos weight: 3.02


  Fold AUC: 0.6534

Fold 4
  Positive rate: 0.248, Scale pos weight: 3.03


  Fold AUC: 0.5985

Fold 5
  Positive rate: 0.248, Scale pos weight: 3.03


  Fold AUC: 0.6556

CV Score: 0.6406 ± 0.0213
OOF AUC: 0.6398


In [9]:
# Load test data and create predictions
print("Loading test data...")
test_path = "/home/data/test.json"
with open(test_path, 'r') as f:
    test_data = json.load(f)

test_df = pd.DataFrame(test_data)
print(f"Test samples: {len(test_df)}")

# Create same features for test data - use ONLY _at_request features
print("Creating test features...")

# Text features
test_df['text_length'] = test_df['request_text_edit_aware'].str.len()
test_df['title_length'] = test_df['request_title'].str.len()
test_df['total_length'] = test_df['text_length'] + test_df['title_length']

# Vote ratios - use _at_request
test_df['requester_upvotes_ratio'] = test_df['requester_upvotes_plus_downvotes_at_request'] / (test_df['requester_upvotes_plus_downvotes_at_request'] + 1)

# Account age - use _at_request
test_df['account_age_days'] = test_df['requester_account_age_in_days_at_request']

# Comments and posts at request
test_df['comments_at_request'] = test_df['requester_number_of_comments_in_raop_at_request']
test_df['posts_at_request'] = test_df['requester_number_of_posts_on_raop_at_request']

# TF-IDF on test text
test_text_data = test_df['request_text_edit_aware'].fillna('')
test_tfidf_features = tfidf.transform(test_text_data)

# Tabular features - same order as training
test_tabular_features = test_df[[
    'text_length', 'title_length', 'total_length',
    'requester_upvotes_ratio',
    'account_age_days',
    'comments_at_request',
    'posts_at_request'
]].values

test_tabular_features_scaled = scaler.transform(test_tabular_features)

# Combine features
from scipy.sparse import hstack
X_test = hstack([test_tfidf_features, test_tabular_features_scaled])

print(f"Test feature shape: {X_test.shape}")

# Make predictions using the last fold model (for linear models, this is fine)
print("Making predictions...")
test_predictions = model.predict_proba(X_test)[:, 1]

# Create submission
submission_df = pd.DataFrame({
    'request_id': test_df['request_id'],
    'requester_received_pizza': test_predictions
})

print("\nSubmission preview:")
print(submission_df.head())
print(f"\nPrediction stats: min={test_predictions.min():.3f}, max={test_predictions.max():.3f}, mean={test_predictions.mean():.3f}")

# Save submission
submission_path = "/home/submission/submission_001_baseline_clean.csv"
submission_df.to_csv(submission_path, index=False)
print(f"\nSubmission saved to: {submission_path}")

Loading test data...
Test samples: 1162
Creating test features...
Test feature shape: (1162, 5007)
Making predictions...

Submission preview:
  request_id  requester_received_pizza
0  t3_1aw5zf                  0.305745
1   t3_roiuw                  0.367161
2   t3_mjnbq                  0.369929
3   t3_t8wd1                  0.467939
4  t3_1m4zxu                  0.332718

Prediction stats: min=0.103, max=0.988, mean=0.442

Submission saved to: /home/submission/submission_001_baseline_clean.csv
