# Baseline: TF-IDF + Logistic Regression

This notebook implements a simple baseline: TF-IDF text features, plus a few scaled numeric features, fed into a logistic regression and evaluated with stratified 5-fold cross-validation.
It follows the Phase 1 roadmap from the strategy guide.

In [None]:
import json
import os
import warnings

import numpy as np
import pandas as pd
from scipy.sparse import hstack
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import StandardScaler
warnings.filterwarnings('ignore')

# Set random seed for reproducibility
np.random.seed(42)

## Load Data

In [None]:
def read_json_file(path):
    """Parse the JSON document stored at `path` and return the decoded object.

    Parameters
    ----------
    path : str
        Filesystem location of the JSON file.

    Returns
    -------
    object
        Whatever `json.load` produces for the file's contents.
    """
    # Pin the encoding: without it, open() falls back to the platform's
    # locale default, which can mis-decode non-ASCII text.
    with open(path, 'r', encoding='utf-8') as f:
        return json.load(f)


# Load training data
print("Loading training data...")
train_data = read_json_file('/home/data/train.json')

# Convert to DataFrame
train_df = pd.DataFrame(train_data)
print(f"Training data shape: {train_df.shape}")
print(f"Columns: {train_df.columns.tolist()}")

# Load test data
print("\nLoading test data...")
test_data = read_json_file('/home/data/test.json')

test_df = pd.DataFrame(test_data)
print(f"Test data shape: {test_df.shape}")

# Check class distribution (relative frequency of each target value)
print("\nClass distribution in training data:")
print(train_df['requester_received_pizza'].value_counts(normalize=True))

## Basic Feature Engineering

In [None]:
def add_text_features(df):
    """Add the text-derived columns used by the model to `df` (in place).

    Columns written:
      - combined_text: request title and body joined by a single space,
        with missing values treated as empty strings
      - text_length:   number of characters in combined_text
      - word_count:    number of whitespace-separated tokens in combined_text

    Returns the same DataFrame for convenience.
    """
    df['combined_text'] = df['request_title'].fillna('') + ' ' + df['request_text'].fillna('')
    df['text_length'] = df['combined_text'].str.len()
    df['word_count'] = df['combined_text'].str.split().str.len()
    return df


# Apply identical text feature logic to both frames
for frame in (train_df, test_df):
    add_text_features(frame)

# Candidate numeric features for concatenation with the TF-IDF matrix
# NOTE(review): the "*_at_retrieval" columns, judging by their names, may be
# measured after the request was posted - confirm they are legitimately
# available at prediction time before relying on them.
numeric_features = ['text_length', 'word_count', 'request_number_of_comments_at_retrieval',
                   'requester_number_of_comments_at_retrieval', 'requester_number_of_posts_at_retrieval',
                   'requester_upvotes_minus_downvotes_at_retrieval', 'requester_upvotes_plus_downvotes_at_retrieval']

# Keep only features present in BOTH frames: checking train alone would let a
# train-only column through and raise KeyError when test_df is indexed below.
available_numeric_features = [
    f for f in numeric_features
    if f in train_df.columns and f in test_df.columns
]
skipped_numeric_features = [f for f in numeric_features if f not in available_numeric_features]
print(f"Available numeric features: {available_numeric_features}")
if skipped_numeric_features:
    print(f"Skipped numeric features (missing from train or test): {skipped_numeric_features}")

# Coerce to numeric; unparseable or missing values become 0
for f in available_numeric_features:
    train_df[f] = pd.to_numeric(train_df[f], errors='coerce').fillna(0)
    test_df[f] = pd.to_numeric(test_df[f], errors='coerce').fillna(0)

## Cross-Validation Setup

In [None]:
# Fold configuration: 5 stratified, shuffled splits with a fixed seed
n_folds = 5
skf = StratifiedKFold(shuffle=True, random_state=42, n_splits=n_folds)

# Inputs shared by every fold: the binary target and the raw text
y = train_df['requester_received_pizza'].astype(int)
X_text = train_df['combined_text'].fillna('')

# Accumulators filled in by the cross-validation loop:
#   cv_scores        - one metric value per fold
#   test_predictions - running test-set prediction, one entry per test row
#   oof_predictions  - out-of-fold prediction, one entry per training row
cv_scores = list()
test_predictions = np.zeros(test_df.shape[0])
oof_predictions = np.zeros(train_df.shape[0])

## TF-IDF + Logistic Regression with Cross-Validation

In [None]:
print("Starting cross-validation...")

# Reset the accumulators here so this cell is idempotent: the loop uses
# `+=` / `.append`, so re-running it without a reset would double-count
# test predictions and duplicate fold scores.
oof_predictions = np.zeros(len(train_df))
test_predictions = np.zeros(len(test_df))
cv_scores = []

# Fold-independent inputs, prepared once instead of on every iteration
use_numeric = bool(available_numeric_features)
X_test_text = test_df['combined_text'].fillna('')
if use_numeric:
    train_numeric_all = train_df[available_numeric_features].values
    test_numeric_all = test_df[available_numeric_features].values

for fold, (train_idx, valid_idx) in enumerate(skf.split(X_text, y)):
    print(f"\nFold {fold + 1}/{n_folds}")

    # Split data (positional indices from the splitter)
    X_train_text = X_text.iloc[train_idx]
    X_valid_text = X_text.iloc[valid_idx]
    y_train = y.iloc[train_idx]
    y_valid = y.iloc[valid_idx]

    # Fit the vocabulary / IDF weights on the training fold only, then apply
    # the fitted vectorizer to the validation fold and the test set
    tfidf = TfidfVectorizer(max_features=5000, stop_words='english', ngram_range=(1, 2))
    X_train_tfidf = tfidf.fit_transform(X_train_text)
    X_valid_tfidf = tfidf.transform(X_valid_text)
    X_test_tfidf = tfidf.transform(X_test_text)

    if use_numeric:
        # Scaling statistics also come from the training fold only
        scaler = StandardScaler()
        X_train_numeric = scaler.fit_transform(train_numeric_all[train_idx])
        X_valid_numeric = scaler.transform(train_numeric_all[valid_idx])
        X_test_numeric = scaler.transform(test_numeric_all)

        # Combine TF-IDF and numeric features column-wise
        X_train_combined = hstack([X_train_tfidf, X_train_numeric])
        X_valid_combined = hstack([X_valid_tfidf, X_valid_numeric])
        X_test_combined = hstack([X_test_tfidf, X_test_numeric])
    else:
        X_train_combined = X_train_tfidf
        X_valid_combined = X_valid_tfidf
        X_test_combined = X_test_tfidf

    # Train logistic regression with class weighting to handle imbalance
    model = LogisticRegression(
        class_weight='balanced',
        max_iter=1000,
        random_state=42,
        n_jobs=-1
    )
    model.fit(X_train_combined, y_train)

    # Out-of-fold probabilities for the positive class
    valid_pred = model.predict_proba(X_valid_combined)[:, 1]
    oof_predictions[valid_idx] = valid_pred

    # Calculate AUC for this fold
    fold_auc = roc_auc_score(y_valid, valid_pred)
    cv_scores.append(fold_auc)
    print(f"Fold {fold + 1} AUC: {fold_auc:.4f}")

    # Each fold contributes 1/n_folds of the final test prediction (simple average)
    test_pred = model.predict_proba(X_test_combined)[:, 1]
    test_predictions += test_pred / n_folds

# Overall score over all out-of-fold predictions, plus the per-fold summary
overall_auc = roc_auc_score(y, oof_predictions)
print(f"\nOverall CV AUC: {overall_auc:.4f}")
print(f"Mean Fold AUC: {np.mean(cv_scores):.4f} (+/- {np.std(cv_scores):.4f})")

## Create Submission

In [None]:
# Build the submission frame: one averaged positive-class probability per test request
submission = pd.DataFrame({
    'request_id': test_df['request_id'],
    'requester_received_pizza': test_predictions
})

# Ensure the prediction column is stored as float
submission['requester_received_pizza'] = submission['requester_received_pizza'].astype(float)

# Save submission; create the target directory first so a missing folder
# does not fail the write after the whole cross-validation run has finished
submission_path = '/home/submission/submission_001_baseline.csv'
os.makedirs(os.path.dirname(submission_path), exist_ok=True)
submission.to_csv(submission_path, index=False)

print(f"Submission saved to: {submission_path}")
print(f"Submission shape: {submission.shape}")
print("Prediction distribution:")
print(submission['requester_received_pizza'].describe())

# Show first few predictions
print("\nFirst 5 predictions:")
print(submission.head())