# Baseline Model for Random Acts of Pizza

This notebook creates a baseline model using:
1. Text features from request_title and request_text_edit_aware (title and body joined into one document)
2. Metadata features (account age, karma, etc.)
3. Simple TF-IDF + Logistic Regression approach

In [None]:
import json
import pandas as pd
import numpy as np
from sklearn.model_selection import StratifiedKFold
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score
from scipy.sparse import hstack
import warnings
warnings.filterwarnings('ignore')

# Load data
print("Loading training data...")
# Decode explicitly as UTF-8 so the result does not depend on the machine's
# locale default encoding.
with open('/home/data/train.json', 'r', encoding='utf-8') as f:
    train_data = json.load(f)

print(f"Training samples: {len(train_data)}")
# Report 0 fields for an empty file instead of failing with IndexError on [0].
print(f"Features per sample: {len(train_data[0]) if train_data else 0}")

In [None]:
# Tabulate the loaded records: one row per request, one column per JSON field.
df_train = pd.DataFrame(train_data)

# Target balance: counts per outcome and the overall share of positives.
received = df_train['requester_received_pizza']
print("Class distribution:")
print(received.value_counts())
print(f"Success rate: {received.mean():.3f}")

# Column means as a quick sanity check on the loaded values.
print("\nBasic statistics:")
mean_account_age = df_train['requester_account_age_in_days_at_request'].mean()
print(f"Average account age at request: {mean_account_age:.1f} days")
mean_upvotes = df_train['number_of_upvotes_of_request_at_retrieval'].mean()
print(f"Average upvotes: {mean_upvotes:.1f}")
mean_comments = df_train['request_number_of_comments_at_retrieval'].mean()
print(f"Average comments: {mean_comments:.1f}")

In [None]:
# Build the model inputs: text documents, numeric metadata, and the target.

# Text: title and edit-aware body joined into one document per request, the
# same recipe the prediction cell applies to the test set. Missing pieces
# become empty strings so the vectorizer never receives NaN.
df_train['combined_text'] = (
    df_train['request_title'].fillna('')
    + ' '
    + df_train['request_text_edit_aware'].fillna('')
)

# Metadata: requester statistics recorded at request time.
# NOTE(review): apart from the account-age column, these names are assumed
# from the dataset schema -- confirm against df_train.columns. Candidates
# that are not present are skipped instead of raising KeyError.
# NOTE(review): the *_at_retrieval columns are deliberately not listed; they
# appear to be collected after the request was posted and may be absent from
# the test file -- confirm before adding any of them.
candidate_metadata_features = [
    'requester_account_age_in_days_at_request',
    'requester_days_since_first_post_on_raop_at_request',
    'requester_number_of_comments_at_request',
    'requester_number_of_comments_in_raop_at_request',
    'requester_number_of_posts_at_request',
    'requester_number_of_posts_on_raop_at_request',
    'requester_number_of_subreddits_at_request',
    'requester_upvotes_minus_downvotes_at_request',
    'requester_upvotes_plus_downvotes_at_request',
]
metadata_features = [
    col for col in candidate_metadata_features if col in df_train.columns
]
assert metadata_features, "none of the candidate metadata columns exist in df_train"

# Missing values become 0, matching how the prediction cell fills test data.
X_meta = df_train[metadata_features].fillna(0).astype(float).values

# Target as a 0/1 NumPy array so fold indices select rows by position.
y = df_train['requester_received_pizza'].astype(int).values

# Create TF-IDF features for text
print("Creating TF-IDF features...")
vectorizer = TfidfVectorizer(
    max_features=5000,      # keep only the 5000 highest-frequency terms
    stop_words='english',
    ngram_range=(1, 2),     # unigrams and bigrams
    min_df=2,               # drop terms seen in fewer than 2 documents
    max_df=0.95             # drop terms seen in more than 95% of documents
)

X_text = vectorizer.fit_transform(df_train['combined_text'])
print(f"Text features shape: {X_text.shape}")

# Combine features; CSR output supports the row indexing used by the CV folds.
X_combined = hstack([X_text, X_meta], format='csr')
assert X_combined.shape[0] == len(y), "feature rows and labels are misaligned"
print(f"Combined features shape: {X_combined.shape}")

In [None]:
# Cross-validation: out-of-fold ROC AUC for the combined text + metadata model.
print("Running cross-validation...")
n_folds = 5
# Stratified, shuffled folds with a fixed seed so the split is reproducible.
skf = StratifiedKFold(n_splits=n_folds, shuffle=True, random_state=42)

# Plain array view of the labels so fold indices are always positional,
# whether y arrived as a NumPy array or a pandas Series.
y_arr = np.asarray(y)

fold_scores = []

for fold, (train_idx, val_idx) in enumerate(skf.split(X_combined, y_arr), start=1):
    # X_combined is CSR, so rows can be selected directly with an index array.
    X_train, X_val = X_combined[train_idx], X_combined[val_idx]
    y_train, y_val = y_arr[train_idx], y_arr[val_idx]

    # Train model
    model = LogisticRegression(
        C=1.0,
        max_iter=1000,
        class_weight='balanced',
        random_state=42,
        n_jobs=-1
    )
    model.fit(X_train, y_train)

    # Predict and evaluate: AUC is computed on the positive-class probability.
    y_pred = model.predict_proba(X_val)[:, 1]
    score = roc_auc_score(y_val, y_pred)
    fold_scores.append(score)

    print(f"Fold {fold}: AUC = {score:.4f}")

mean_score = np.mean(fold_scores)
std_score = np.std(fold_scores)
print(f"\nCV Results: {mean_score:.4f} ± {std_score:.4f}")

In [None]:
# Cross-validation summary
# The cross-validation cell directly above already ran the seeded 5-fold loop
# and left its per-fold AUCs in fold_scores. Refitting here with the same
# seeds would produce the same numbers, so this cell only reports them.
print("Cross-validation summary:")
for fold, score in enumerate(fold_scores, start=1):
    print(f"Fold {fold}: AUC = {score:.4f}")

mean_score = np.mean(fold_scores)
std_score = np.std(fold_scores)
print(f"\nCV Results: {mean_score:.4f} ± {std_score:.4f}")

In [None]:
# Train on full training data
print("Training final model on full data...")
final_model = LogisticRegression(
    C=1.0,
    max_iter=1000,
    class_weight='balanced',
    random_state=42,
    n_jobs=-1
)
final_model.fit(X_combined, y)

# Load test data
print("Loading test data...")
# Decode explicitly as UTF-8 so the result does not depend on the locale.
with open('/home/data/test.json', 'r', encoding='utf-8') as f:
    test_data = json.load(f)

df_test = pd.DataFrame(test_data)

# Create same features for test.
# A missing title or body becomes an empty string; otherwise the concatenation
# yields NaN, which the fitted vectorizer rejects as an invalid document.
df_test['combined_text'] = (
    df_test['request_title'].fillna('')
    + ' '
    + df_test['request_text_edit_aware'].fillna('')
)

# Metadata columns absent from the test file are filled with 0; say so rather
# than substituting silently.
missing_cols = [col for col in metadata_features if col not in df_test.columns]
if missing_cols:
    print(f"Warning: metadata columns missing from test data, filled with 0: {missing_cols}")

# reindex keeps the training column order and adds any absent column as 0;
# fillna then covers missing values inside the columns that do exist.
X_test_meta = (
    df_test.reindex(columns=metadata_features, fill_value=0)
    .fillna(0)
    .astype(float)
    .values
)
# transform (not fit_transform): reuse the vocabulary learned on training text.
X_test_text = vectorizer.transform(df_test['combined_text'])
X_test_combined = hstack([X_test_text, X_test_meta], format='csr')
assert X_test_combined.shape[1] == X_combined.shape[1], (
    "test and training feature matrices have different widths"
)

# Make predictions
print("Making predictions...")
# Column 1 of predict_proba is the probability of the positive class.
test_predictions = final_model.predict_proba(X_test_combined)[:, 1]

# Create submission
submission = pd.DataFrame({
    'request_id': df_test['request_id'],
    'requester_received_pizza': test_predictions
})

print(f"Submission shape: {submission.shape}")
print(submission.head())

# Save submission
submission.to_csv('/home/submission/submission.csv', index=False)
print("Submission saved to /home/submission/submission.csv")