# Baseline Model - Random Acts of Pizza

This notebook creates a baseline model for predicting pizza request success.

## Approach
- TF-IDF features from request_title and request_text
- Basic metadata features (numerical fields)
- Logistic Regression model
- 5-fold cross-validation

In [4]:
import json
import os
import warnings

import numpy as np
import pandas as pd
from scipy.sparse import hstack
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import StratifiedKFold
from sklearn.pipeline import Pipeline
warnings.filterwarnings('ignore')

# Read the parsed JSON training payload from disk.
print("Loading training data...")
with open('/home/data/train.json', 'r') as train_file:
    train_data = json.load(train_file)

print(f"Training samples: {len(train_data)}")

# Build the tabular view used by every later cell and report its layout.
df_train = pd.DataFrame(train_data)
print("Training data shape:", df_train.shape)
# sep="\n" puts the column list on its own line, below the heading.
print("\nAvailable columns:", df_train.columns.tolist(), sep="\n")

Loading training data...
Training samples: 2878
Training data shape: (2878, 32)

Available columns:
['giver_username_if_known', 'number_of_downvotes_of_request_at_retrieval', 'number_of_upvotes_of_request_at_retrieval', 'post_was_edited', 'request_id', 'request_number_of_comments_at_retrieval', 'request_text', 'request_text_edit_aware', 'request_title', 'requester_account_age_in_days_at_request', 'requester_account_age_in_days_at_retrieval', 'requester_days_since_first_post_on_raop_at_request', 'requester_days_since_first_post_on_raop_at_retrieval', 'requester_number_of_comments_at_request', 'requester_number_of_comments_at_retrieval', 'requester_number_of_comments_in_raop_at_request', 'requester_number_of_comments_in_raop_at_retrieval', 'requester_number_of_posts_at_request', 'requester_number_of_posts_at_retrieval', 'requester_number_of_posts_on_raop_at_request', 'requester_number_of_posts_on_raop_at_retrieval', 'requester_number_of_subreddits_at_request', 'requester_received_pizza',

In [None]:
# Read the parsed JSON test payload from disk.
print("Loading test data...")
with open('/home/data/test.json', 'r') as test_file:
    test_data = json.load(test_file)

# Build the tabular view of the test set and report its layout.
df_test = pd.DataFrame(test_data)
print("Test data shape:", df_test.shape)
# sep="\n" puts the column list on its own line, below the heading.
print("\nTest columns:", df_test.columns.tolist(), sep="\n")

In [None]:
# Prepare target variable (cast to int so .mean() is the positive rate)
y_train = df_train['requester_received_pizza'].astype(int)
print(f"Positive rate in training: {y_train.mean():.3f}")

# Basic feature engineering
# Text features: title and body joined by a single space. The separator must be
# the same in both frames; without it the last title token and the first body
# token fuse into one word in that frame only, so the vocabulary fitted on
# train no longer lines up with the test text.
# NOTE(review): train also carries 'request_text_edit_aware'; confirm that the
# test file really has 'request_text', otherwise the second line raises KeyError.
df_train['combined_text'] = df_train['request_title'].fillna('') + ' ' + df_train['request_text'].fillna('')
df_test['combined_text'] = df_test['request_title'].fillna('') + ' ' + df_test['request_text'].fillna('')

# Candidate metadata features.
# NOTE(review): the three *_at_retrieval columns are named as retrieval-time
# measurements; if they are recorded after the outcome is known they leak the
# target and may be absent from test -- confirm before trusting CV scores.
meta_features = [
    'requester_account_age_in_days_at_request',
    'requester_number_of_comments_at_request',
    'requester_number_of_posts_at_request',
    'requester_upvotes_minus_downvotes_at_request',
    'requester_upvotes_plus_downvotes_at_request',
    'requester_number_of_subreddits_at_request',
    'request_number_of_comments_at_retrieval',
    'number_of_upvotes_of_request_at_retrieval',
    'number_of_downvotes_of_request_at_retrieval'
]

# Keep only the columns present in BOTH frames. A column that exists in just
# one of them cannot be used consistently at fit and predict time (and indexing
# the frame that lacks it would raise KeyError).
columns_in_both = set(df_train.columns) & set(df_test.columns)
missing_meta = [col for col in meta_features if col not in columns_in_both]
if missing_meta:
    print("Dropping meta features not present in both train and test:", missing_meta)
meta_features = [col for col in meta_features if col in columns_in_both]

# Fill missing values with 0 in both frames
for col in meta_features:
    df_train[col] = df_train[col].fillna(0)
    df_test[col] = df_test[col].fillna(0)

X_meta_train = df_train[meta_features].values
X_meta_test = df_test[meta_features].values

print("Meta features shape:", X_meta_train.shape)

In [None]:
# Create TF-IDF features for text
print("Creating TF-IDF features...")

# Vectorizer configuration: unigrams and bigrams, English stop words removed,
# document-frequency cut-offs at both ends, vocabulary capped at 5000 terms.
tfidf_settings = dict(
    ngram_range=(1, 2),
    stop_words='english',
    min_df=2,
    max_df=0.95,
    max_features=5000,
)
tfidf = TfidfVectorizer(**tfidf_settings)

# The vocabulary and IDF weights are learned from the training text only;
# the test text is projected onto that same vocabulary.
train_corpus = df_train['combined_text']
test_corpus = df_test['combined_text']
X_text_train = tfidf.fit_transform(train_corpus)
X_text_test = tfidf.transform(test_corpus)

print("Text features shape:", X_text_train.shape)

In [None]:
# Combine text and metadata features column-wise.
# format='csr' is requested explicitly: stacking a sparse block with a dense
# array otherwise yields a COO matrix, which does not support the row indexing
# (X_train[train_idx]) used when slicing cross-validation folds.
X_train = hstack([X_text_train, X_meta_train], format='csr')
X_test = hstack([X_text_test, X_meta_test], format='csr')

print("Final training features shape:", X_train.shape)
print("Final test features shape:", X_test.shape)

In [None]:
# Cross-validation: stratified 5-fold, scored with ROC AUC on each held-out fold
print("Running 5-fold cross-validation...")
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Fold selection indexes rows with integer arrays, which COO sparse matrices do
# not support. Convert to CSR first (cheap if X_train is already CSR).
X_cv = X_train.tocsr()

fold_scores = []

for fold, (train_idx, val_idx) in enumerate(cv.split(X_cv, y_train), start=1):
    X_tr, X_val = X_cv[train_idx], X_cv[val_idx]
    # y_train is a pandas Series; the split yields positional indices, hence .iloc
    y_tr, y_val = y_train.iloc[train_idx], y_train.iloc[val_idx]

    # Train a fresh model on this fold's training portion
    model = LogisticRegression(
        max_iter=1000,
        C=1.0,
        class_weight='balanced',
        n_jobs=-1,
        random_state=42
    )
    model.fit(X_tr, y_tr)

    # Score the held-out portion using the probability of the second class
    val_pred = model.predict_proba(X_val)[:, 1]
    score = roc_auc_score(y_val, val_pred)
    fold_scores.append(score)

    print(f"Fold {fold}: ROC AUC = {score:.4f}")

# Summarise across folds (mean and population standard deviation)
mean_score = np.mean(fold_scores)
std_score = np.std(fold_scores)
print(f"\nCV Score: {mean_score:.4f} ± {std_score:.4f}")

In [None]:
# Train final model on all training data
print("Training final model on all data...")

# Same hyper-parameters as the model fitted inside the cross-validation loop.
final_model_params = dict(
    C=1.0,
    class_weight='balanced',
    max_iter=1000,
    n_jobs=-1,
    random_state=42,
)
final_model = LogisticRegression(**final_model_params)
final_model.fit(X_train, y_train)

# Generate predictions on test set
print("Generating predictions...")
class_probabilities = final_model.predict_proba(X_test)
# Column 1 holds the probability of the second class in final_model.classes_.
test_predictions = class_probabilities[:, 1]

In [None]:
# Create submission file: one row per test request with its predicted probability
submission = pd.DataFrame({
    'request_id': df_test['request_id'],
    'requester_received_pizza': test_predictions
})

# Ensure correct column order
submission = submission[['request_id', 'requester_received_pizza']]

# Save submission. The output directory is created first because to_csv does
# not create missing parent directories and would otherwise fail at this point.
submission_path = '/home/submission/submission.csv'
os.makedirs(os.path.dirname(submission_path), exist_ok=True)
submission.to_csv(submission_path, index=False)

print("Submission file created:", submission_path)
print("\nSubmission preview:")
print(submission.head())
# Quick sanity summary of the predicted probabilities
print(f"\nPredictions range: [{test_predictions.min():.4f}, {test_predictions.max():.4f}]")
print(f"Mean prediction: {test_predictions.mean():.4f}")