# Baseline Model - Random Acts of Pizza

A simple baseline: TF-IDF text features plus requester metadata, fed to a logistic regression classifier.

In [1]:
import pandas as pd
import numpy as np
import json
from pathlib import Path
from sklearn.model_selection import StratifiedKFold
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.metrics import roc_auc_score
from scipy.sparse import hstack
import warnings
# Suppress every warning category for the rest of the session
warnings.simplefilter('ignore')

# Pin NumPy's global RNG so repeated runs draw the same numbers
np.random.seed(seed=42)

In [2]:
# Load data
train_path = '/home/data/train.json'
test_path = '/home/data/test.json'


def load_json_records(path):
    """Parse the JSON document stored at `path` and return the resulting object.

    The file is opened as UTF-8 explicitly: without an `encoding` argument,
    `open` falls back to the platform's locale encoding, which can mis-decode
    non-ASCII text on some systems.

    Raises:
        OSError: if the file cannot be opened.
        json.JSONDecodeError: if the content is not valid JSON.
    """
    with open(path, 'r', encoding='utf-8') as f:
        return json.load(f)


print("Loading training data...")
train_data = load_json_records(train_path)

print("Loading test data...")
test_data = load_json_records(test_path)

print(f"Train samples: {len(train_data)}")
print(f"Test samples: {len(test_data)}")

Loading training data...
Loading test data...
Train samples: 2878
Test samples: 1162


In [5]:
# Wrap the parsed JSON records in DataFrames so columns can be addressed by name
train_df = pd.DataFrame(train_data)
test_df = pd.DataFrame(test_data)

print("Train columns:", train_df.columns.tolist())

# Guard against an empty training list before peeking at its first record
first_row_keys = list(train_data[0].keys()) if train_data else "No data"
print("\nFirst row keys:", first_row_keys)

print("\nTarget distribution:")
if 'requester_received_pizza' not in train_df.columns:
    print("Target column not found in train data")
else:
    target = train_df['requester_received_pizza']
    print(target.value_counts())
    # The mean of the target column is reported as the share of positives
    print(f"Positive rate: {target.mean():.3f}")

Train columns: ['giver_username_if_known', 'number_of_downvotes_of_request_at_retrieval', 'number_of_upvotes_of_request_at_retrieval', 'post_was_edited', 'request_id', 'request_number_of_comments_at_retrieval', 'request_text', 'request_text_edit_aware', 'request_title', 'requester_account_age_in_days_at_request', 'requester_account_age_in_days_at_retrieval', 'requester_days_since_first_post_on_raop_at_request', 'requester_days_since_first_post_on_raop_at_retrieval', 'requester_number_of_comments_at_request', 'requester_number_of_comments_at_retrieval', 'requester_number_of_comments_in_raop_at_request', 'requester_number_of_comments_in_raop_at_retrieval', 'requester_number_of_posts_at_request', 'requester_number_of_posts_at_retrieval', 'requester_number_of_posts_on_raop_at_request', 'requester_number_of_posts_on_raop_at_retrieval', 'requester_number_of_subreddits_at_request', 'requester_received_pizza', 'requester_subreddits_at_request', 'requester_upvotes_minus_downvotes_at_request', '

In [8]:
# Debug: inspect the shape and schema of both frames
for label, frame in (("Train", train_df), ("Test", test_df)):
    print(f"{label} DataFrame shape:", frame.shape)

# Only the first ten column names are shown to keep the output short
print("\nTrain columns:", train_df.columns[:10].tolist(), "...")
print("Test columns:", test_df.columns[:10].tolist(), "...")

# Report, per split, whether each column the later cells rely on is present
required_cols = ['request_title', 'request_text', 'requester_received_pizza']
frames_by_split = {"train": train_df, "test": test_df}
for col in required_cols:
    for split, frame in frames_by_split.items():
        print(f"{col} in {split}: {col in frame.columns}")

# Show the identifying and text fields of the first training row
print("\nSample train row:")
first_row = train_df.iloc[0]
print(first_row[['request_id', 'request_title', 'request_text']])

Train DataFrame shape: (2878, 32)
Test DataFrame shape: (1162, 17)

Train columns: ['giver_username_if_known', 'number_of_downvotes_of_request_at_retrieval', 'number_of_upvotes_of_request_at_retrieval', 'post_was_edited', 'request_id', 'request_number_of_comments_at_retrieval', 'request_text', 'request_text_edit_aware', 'request_title', 'requester_account_age_in_days_at_request'] ...
Test columns: ['giver_username_if_known', 'request_id', 'request_text_edit_aware', 'request_title', 'requester_account_age_in_days_at_request', 'requester_days_since_first_post_on_raop_at_request', 'requester_number_of_comments_at_request', 'requester_number_of_comments_in_raop_at_request', 'requester_number_of_posts_at_request', 'requester_number_of_posts_on_raop_at_request'] ...
request_title in train: True
request_title in test: True
request_text in train: True
request_text in test: False
requester_received_pizza in train: True
requester_received_pizza in test: False

Sample train row:
request_id       

In [None]:
# Basic feature engineering
def create_features(df, is_train=True):
    """Build the feature table for one raw request DataFrame.

    The function depends only on `df` (no notebook globals) and always returns
    the same columns in the same order, whichever optional source columns `df`
    carries. That lets the train and test outputs be indexed with one shared
    column list.

    Args:
        df: Raw request data. Must contain 'request_title' and either
            'request_text_edit_aware' (preferred when present) or
            'request_text'.
        is_train: Accepted so existing call sites keep working; not used.

    Returns:
        pandas.DataFrame indexed like `df` with these columns:
          - 'text_combined': title and body joined by one space, with missing
            values treated as empty strings.
          - one numeric column per (feature, source) pair listed in the body,
            copied from the source column when it exists and NaN otherwise.
          - 'post_was_edited': the source column cast to int, or 0 when the
            source column is absent.
          - 'user_flair': the source column with missing values replaced by
            the string 'None', or 'None' everywhere when it is absent.

    Raises:
        KeyError: if 'request_title' is missing, or if neither text column
            is present.

    NOTE(review): the '*_at_retrieval' sources and 'post_was_edited' look like
    information recorded after the request was posted; when a frame lacks
    them they come back as NaN / 0 here. Confirm they are appropriate model
    inputs before using them downstream.
    """
    # (feature name, source column) pairs, listed in output order.
    requester_sources = [
        ('account_age_days', 'requester_account_age_in_days_at_request'),
        ('num_comments', 'requester_number_of_comments_at_request'),
        ('num_posts', 'requester_number_of_posts_at_request'),
        ('num_posts_raop', 'requester_number_of_posts_on_raop_at_request'),
        ('num_comments_raop', 'requester_number_of_comments_in_raop_at_request'),
        ('upvotes_minus_downvotes', 'requester_upvotes_minus_downvotes_at_request'),
        ('upvotes_plus_downvotes', 'requester_upvotes_plus_downvotes_at_request'),
        ('days_since_first_raop', 'requester_days_since_first_post_on_raop_at_request'),
        ('num_subreddits', 'requester_number_of_subreddits_at_request'),
    ]
    request_sources = [
        ('request_num_comments', 'request_number_of_comments_at_retrieval'),
        ('request_upvotes', 'number_of_upvotes_of_request_at_retrieval'),
        ('request_downvotes', 'number_of_downvotes_of_request_at_retrieval'),
        ('unix_timestamp', 'unix_timestamp_of_request'),
    ]

    def column_or_missing(source):
        # A scalar NaN is broadcast to a full column by the DataFrame
        # constructor, so absent sources still yield a column.
        return df[source] if source in df.columns else np.nan

    # Text features - combine title and text.
    # Prefer the edit-aware body; fall back to the raw body otherwise.
    text_col = 'request_text_edit_aware' if 'request_text_edit_aware' in df.columns else 'request_text'
    features = {
        'text_combined': df['request_title'].fillna('') + ' ' + df[text_col].fillna(''),
    }

    # Requester metadata.
    for name, source in requester_sources:
        features[name] = column_or_missing(source)

    # Post was edited - integer flag, 0 when the column is absent.
    if 'post_was_edited' in df.columns:
        features['post_was_edited'] = df['post_was_edited'].astype(int)
    else:
        features['post_was_edited'] = 0

    # User flair (categorical) - 'None' marks a missing flair.
    if 'requester_user_flair' in df.columns:
        features['user_flair'] = df['requester_user_flair'].fillna('None')
    else:
        features['user_flair'] = 'None'

    # Request metadata and time features.
    for name, source in request_sources:
        features[name] = column_or_missing(source)

    # Passing the index explicitly keeps scalar defaults aligned with df's rows.
    return pd.DataFrame(features, index=df.index)

# Run the feature builder once per split
print("Creating features...")
train_features = create_features(train_df, is_train=True)
test_features = create_features(test_df, is_train=False)

# Report the resulting shapes, then the column layout of the training frame
for label, frame in (("Train", train_features), ("Test", test_features)):
    print(f"{label} features shape: {frame.shape}")
print(f"Train feature columns: {train_features.columns.tolist()}")

In [None]:
# Prepare text and numeric features
# Text features: the vocabulary and IDF weights are learned from the training
# text only and then reused unchanged for the test text.
vectorizer = TfidfVectorizer(
    max_features=5000,
    stop_words='english',
    ngram_range=(1, 2),
    min_df=2,
    max_df=0.95
)

print("Fitting TF-IDF vectorizer...")
text_features_train = vectorizer.fit_transform(train_features['text_combined'])
text_features_test = vectorizer.transform(test_features['text_combined'])

print(f"Text features shape: {text_features_train.shape}")

# Numeric features (text and categorical columns are excluded for now).
# Only request-time columns are used. The retrieval-time ones
# (request_num_comments, request_upvotes, request_downvotes, post_was_edited)
# have no source column in the test frame, so selecting them raised a KeyError
# for the test features and would otherwise train on values unavailable at
# prediction time.
request_time_cols = ['account_age_days', 'num_comments', 'num_posts', 'num_posts_raop',
                     'num_comments_raop', 'upvotes_minus_downvotes', 'upvotes_plus_downvotes',
                     'days_since_first_raop', 'num_subreddits', 'unix_timestamp']

# Keep just the columns both feature frames actually provide.
numeric_cols = [col for col in request_time_cols
                if col in train_features.columns and col in test_features.columns]

numeric_train_filled = train_features[numeric_cols].fillna(0).astype(float)
numeric_test_filled = test_features[numeric_cols].fillna(0).astype(float)

# Standardise with training statistics only. Raw counts and the unix timestamp
# are orders of magnitude larger than TF-IDF weights, which hurts solver
# convergence and makes the L2 penalty act unevenly across features.
numeric_mean = numeric_train_filled.mean()
numeric_std = numeric_train_filled.std(ddof=0).replace(0, 1.0)  # constant column -> divide by 1
numeric_features_train = (numeric_train_filled - numeric_mean) / numeric_std
numeric_features_test = (numeric_test_filled - numeric_mean) / numeric_std

print(f"Numeric features shape: {numeric_features_train.shape}")

# Combine features. CSR is requested explicitly: hstack otherwise returns a
# COO matrix, which does not support the row indexing used for CV folds.
X_train = hstack([text_features_train, numeric_features_train.to_numpy()], format='csr')
X_test = hstack([text_features_test, numeric_features_test.to_numpy()], format='csr')

y_train = train_df['requester_received_pizza'].values

print(f"Final training features shape: {X_train.shape}")
print(f"Final test features shape: {X_test.shape}")

In [None]:
# Cross-validation setup
n_splits = 5
skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)

# Model - Logistic Regression as baseline
model = LogisticRegression(
    random_state=42,
    max_iter=1000,
    class_weight='balanced'  # Handle class imbalance
)

# Cross-validation
cv_scores = []         # one ROC AUC per fold
fold_predictions = []  # validation-set probabilities, one array per fold

# Selecting rows with an index array needs a row-indexable sparse format;
# tocsr() converts a COO matrix and returns a CSR input as it is.
X_train_csr = X_train.tocsr()

print(f"Running {n_splits}-fold cross-validation...")

for fold, (train_idx, val_idx) in enumerate(skf.split(X_train_csr, y_train)):
    X_tr, X_val = X_train_csr[train_idx], X_train_csr[val_idx]
    y_tr, y_val = y_train[train_idx], y_train[val_idx]

    # Train model (fit() re-estimates from scratch on each call)
    model.fit(X_tr, y_tr)

    # Predict the probability of the positive class
    val_pred = model.predict_proba(X_val)[:, 1]
    fold_predictions.append(val_pred)

    # Calculate ROC AUC
    auc_score = roc_auc_score(y_val, val_pred)
    cv_scores.append(auc_score)

    print(f"Fold {fold + 1} ROC AUC: {auc_score:.4f}")

print(f"\nCV ROC AUC: {np.mean(cv_scores):.4f} ± {np.std(cv_scores):.4f}")

In [None]:
# Refit the model on every training row before scoring the test set
print("Training final model on full training data...")
model.fit(X_train, y_train)

# Column 1 of predict_proba is taken as the positive-class probability
print("Generating test predictions...")
test_probabilities = model.predict_proba(X_test)
test_predictions = test_probabilities[:, 1]

# Pair each test request id with its predicted probability
submission = pd.DataFrame({
    'request_id': test_df['request_id'],
    'requester_received_pizza': test_predictions
})

print("Submission preview:")
print(submission.head())
print(f"\nSubmission shape: {submission.shape}")

# Quick sanity summary of the predicted probabilities
lowest, highest = test_predictions.min(), test_predictions.max()
print(f"Prediction range: [{lowest:.4f}, {highest:.4f}]")
print(f"Mean prediction: {test_predictions.mean():.4f}")

In [None]:
# Save submission, plus a second copy kept for experiment logging
submission_path = '/home/submission/submission.csv'
experiment_submission_path = '/home/code/experiments/001_baseline_submission.csv'

for label, out_path in (("Submission", submission_path),
                        ("Experiment submission", experiment_submission_path)):
    # to_csv does not create missing directories, so make sure the parent exists
    Path(out_path).parent.mkdir(parents=True, exist_ok=True)
    submission.to_csv(out_path, index=False)
    print(f"{label} saved to: {out_path}")