# Baseline Experiment: TF-IDF + Logistic Regression

Following the seed prompt strategy, this baseline combines:
- TF-IDF features from request_text and request_title
- Basic tabular features (log transforms, ratios)
- Logistic Regression with class weighting for imbalance
- Stratified 5-fold CV for robust evaluation

In [None]:
import pandas as pd
import numpy as np
import json
from sklearn.model_selection import StratifiedKFold
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score
from scipy.sparse import hstack
import warnings
warnings.filterwarnings('ignore')

# Load the raw train/test records from JSON and wrap them in DataFrames.
print("Loading data...")

# JSON text is UTF-8; the encoding is passed explicitly so decoding does not
# depend on the platform's locale default.
with open('/home/data/train.json', 'r', encoding='utf-8') as f:
    train_data = json.load(f)

with open('/home/data/test.json', 'r', encoding='utf-8') as f:
    test_data = json.load(f)

df_train = pd.DataFrame(train_data)
df_test = pd.DataFrame(test_data)

# Quick sanity summary: row counts and the mean of the target column.
print(f"Training samples: {len(df_train)}")
print(f"Test samples: {len(df_test)}")
print(f"Positive class rate: {df_train['requester_received_pizza'].mean():.3f}")

In [None]:
# Create basic tabular features (this variant treats flair as train-only).
print("Creating tabular features...")

# Count columns that each get a log1p-transformed companion column
# (log1p(x) = log(1 + x), so zero counts map to 0 instead of -inf).
count_features = [
    'requester_number_of_comments_at_request',
    'requester_number_of_posts_at_request',
    'requester_upvotes_plus_downvotes_at_request'
]

# Apply the same transforms to both frames in one place so the train and
# test feature definitions cannot drift apart.
for frame in (df_train, df_test):
    for feat in count_features:
        frame[f'{feat}_log'] = np.log1p(frame[feat])

    # Ratios; the "+ 1" in each denominator avoids division by zero.
    frame['upvotes_per_comment'] = (
        frame['requester_upvotes_plus_downvotes_at_request']
        / (frame['requester_number_of_comments_at_request'] + 1)
    )
    frame['comments_per_post'] = (
        frame['requester_number_of_comments_at_request']
        / (frame['requester_number_of_posts_at_request'] + 1)
    )

    # Account age in years
    frame['account_age_years'] = frame['requester_account_age_in_days_at_request'] / 365.0

# Handle requester_user_flair (75% missing per the original note) - train only.
df_train['requester_user_flair'] = df_train['requester_user_flair'].fillna('Missing')

# One-hot encode flair (train only)
flair_dummies_train = pd.get_dummies(df_train['requester_user_flair'], prefix='flair')

# For test, create all-zero placeholder columns (flair is treated as not
# available there). The labels are taken from the train dummy frame so the
# names AND their order match; building them from unique() would follow order
# of appearance, which need not match get_dummies' column order.
flair_cols = list(flair_dummies_train.columns)
flair_dummies_test = pd.DataFrame(0, index=df_test.index, columns=flair_cols)

print(f"Tabular features created. Train shape: {flair_dummies_train.shape}, Test shape: {flair_dummies_test.shape}")

In [None]:
# Create basic tabular features (this variant one-hot encodes flair for both
# frames and aligns the test columns to the train columns).
print("Creating tabular features...")

# Count columns that each get a log1p-transformed companion column
# (log1p(x) = log(1 + x), so zero counts map to 0 instead of -inf).
count_features = [
    'requester_number_of_comments_at_request',
    'requester_number_of_posts_at_request',
    'requester_upvotes_plus_downvotes_at_request'
]

# Apply the same transforms to both frames in one place so the train and
# test feature definitions cannot drift apart.
for frame in (df_train, df_test):
    for feat in count_features:
        frame[f'{feat}_log'] = np.log1p(frame[feat])

    # Ratios; the "+ 1" in each denominator avoids division by zero.
    frame['upvotes_per_comment'] = (
        frame['requester_upvotes_plus_downvotes_at_request']
        / (frame['requester_number_of_comments_at_request'] + 1)
    )
    frame['comments_per_post'] = (
        frame['requester_number_of_comments_at_request']
        / (frame['requester_number_of_posts_at_request'] + 1)
    )

    # Account age in years
    frame['account_age_years'] = frame['requester_account_age_in_days_at_request'] / 365.0

# Handle requester_user_flair (75% missing per the original note)
df_train['requester_user_flair'] = df_train['requester_user_flair'].fillna('Missing')

# One-hot encode flair
flair_dummies_train = pd.get_dummies(df_train['requester_user_flair'], prefix='flair')

if 'requester_user_flair' in df_test.columns:
    df_test['requester_user_flair'] = df_test['requester_user_flair'].fillna('Missing')
    flair_dummies_test = pd.get_dummies(df_test['requester_user_flair'], prefix='flair')
    # Align dummy columns: same labels and order as train; flair values
    # unseen in train are dropped, values absent from test become 0.
    flair_dummies_test = flair_dummies_test.reindex(columns=flair_dummies_train.columns, fill_value=0)
else:
    # The test frame may not carry the flair column at all (the sibling
    # feature cell treats it as train-only). Indexing it unconditionally
    # would raise KeyError, so fall back to all-zero aligned columns.
    flair_dummies_test = pd.DataFrame(0, index=df_test.index, columns=flair_dummies_train.columns)

print(f"Tabular features created. Shape: {flair_dummies_train.shape}")

In [None]:
# Prepare features for modeling

# 'combined_text' is read below but is not created by any earlier step in
# this notebook, so build it here when it is missing: title + body text.
if 'combined_text' not in df_train.columns or 'combined_text' not in df_test.columns:
    # The notebook header names request_text; the second candidate is only a
    # fallback for when one frame lacks that column.
    # NOTE(review): confirm the fallback column name against the data schema.
    body_candidates = ('request_text', 'request_text_edit_aware')
    body_col = next(
        (c for c in body_candidates if c in df_train.columns and c in df_test.columns),
        None,
    )
    if body_col is None:
        raise KeyError(
            f"Cannot build 'combined_text': none of {body_candidates} is present in both train and test"
        )
    for frame in (df_train, df_test):
        if 'combined_text' not in frame.columns:
            # Missing titles/bodies become empty strings so concatenation
            # never yields NaN (TfidfVectorizer expects strings).
            frame['combined_text'] = (
                frame['request_title'].fillna('').astype(str)
                + ' '
                + frame[body_col].fillna('').astype(str)
            )

text_features = df_train['combined_text'].values
test_text_features = df_test['combined_text'].values

# Select numeric features
numeric_features = [
    'requester_number_of_comments_at_request_log',
    'requester_number_of_posts_at_request_log',
    'requester_upvotes_plus_downvotes_at_request_log',
    'upvotes_per_comment',
    'comments_per_post',
    'account_age_years'
]

# Missing numeric values are replaced with 0 before handing arrays to the model.
train_numeric = df_train[numeric_features].fillna(0).values
test_numeric = df_test[numeric_features].fillna(0).values

# Target
y = df_train['requester_received_pizza'].values

print(f"Numeric features shape: {train_numeric.shape}")
print(f"Target shape: {y.shape}")

In [None]:
# Shared objects for the cross-validation run: the fold splitter, the text
# vectorizer, the classifier, and the containers that collect predictions.

# Containers: one out-of-fold slot per training row, one averaged slot per
# test row, and a list of per-fold scores.
cv_scores = []
oof_predictions = np.zeros(df_train.shape[0])
test_predictions = np.zeros(df_test.shape[0])

# Shuffled, stratified 5-way splitter with a fixed seed.
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# TF-IDF over lowercased unigrams and bigrams with English stop words
# removed, limited to 5000 terms; terms must appear in at least 2 documents
# and in at most 95% of them.
tfidf_settings = {
    'max_features': 5000,
    'ngram_range': (1, 2),
    'stop_words': 'english',
    'lowercase': True,
    'min_df': 2,
    'max_df': 0.95,
}
tfidf = TfidfVectorizer(**tfidf_settings)

# Negative-to-positive ratio, printed for reference only; the classifier
# below handles imbalance itself through class_weight='balanced'.
pos_rate = y.mean()
scale_pos_weight = (1 - pos_rate) / pos_rate
print(f"Scale pos weight: {scale_pos_weight:.2f}")

# Logistic regression classifier with balanced class weights.
model_settings = {
    'class_weight': 'balanced',
    'max_iter': 1000,
    'random_state': 42,
    'n_jobs': -1,
}
model = LogisticRegression(**model_settings)

print("Starting 5-fold CV...")

In [None]:
# Cross-validation loop: for every fold, fit TF-IDF and the classifier on the
# training part, score the held-out part, and add this fold's share of the
# averaged test prediction.

# Take the fold count from the splitter so the progress message and the test
# averaging stay correct if n_splits is ever changed.
n_splits = skf.get_n_splits()

# Reset the accumulators here so that re-running this cell starts clean;
# test_predictions is built with "+=" and cv_scores with append, so a second
# run on stale state would double count.
oof_predictions = np.zeros(len(df_train))
test_predictions = np.zeros(len(df_test))
cv_scores = []

for fold, (train_idx, val_idx) in enumerate(skf.split(df_train, y), start=1):
    print(f"\nFold {fold}/{n_splits}")

    # Split data
    X_train_text, X_val_text = text_features[train_idx], text_features[val_idx]
    X_train_num, X_val_num = train_numeric[train_idx], train_numeric[val_idx]
    y_train, y_val = y[train_idx], y[val_idx]

    # Fit TF-IDF on the fold's training text only, so the vocabulary and IDF
    # weights never see the validation rows.
    X_train_text_tfidf = tfidf.fit_transform(X_train_text)
    X_val_text_tfidf = tfidf.transform(X_val_text)

    # Combine text and numeric features
    X_train_combined = hstack([X_train_text_tfidf, X_train_num])
    X_val_combined = hstack([X_val_text_tfidf, X_val_num])

    # Fit model
    model.fit(X_train_combined, y_train)

    # Predict probability of the positive class for the held-out rows
    val_pred = model.predict_proba(X_val_combined)[:, 1]
    oof_predictions[val_idx] = val_pred

    # Calculate score
    score = roc_auc_score(y_val, val_pred)
    cv_scores.append(score)
    print(f"Fold {fold} AUC: {score:.4f}")

    # Predict on the test set with this fold's vectorizer and model; each
    # fold contributes an equal 1/n_splits share of the average.
    test_text_tfidf = tfidf.transform(test_text_features)
    test_combined = hstack([test_text_tfidf, test_numeric])
    test_pred = model.predict_proba(test_combined)[:, 1]
    test_predictions += test_pred / n_splits

# Overall CV score over all out-of-fold predictions, plus per-fold mean/std.
overall_score = roc_auc_score(y, oof_predictions)
print(f"\nOverall CV AUC: {overall_score:.4f}")
# "\u00b1" is the plus-minus sign, written as an escape so the source stays
# ASCII and cannot be mangled by an encoding round-trip again.
print(f"Mean CV AUC: {np.mean(cv_scores):.4f} \u00b1 {np.std(cv_scores):.4f}")

In [None]:
# Train the final model on all training rows and predict the test set.

# Whether to append the one-hot flair columns to the final feature matrix.
# Off by default: the cross-validation loop evaluates text + numeric features
# only, so adding flair here would submit a model whose feature set was never
# validated. The feature-engineering step also describes flair as unavailable
# for the test rows, where its columns are all-zero placeholders.
# NOTE(review): set to True to restore the previous behaviour if flair is
# confirmed to be a legitimate test-time feature.
USE_FLAIR_FEATURES = False

# Fit TF-IDF on all training data (same settings as the CV vectorizer)
final_tfidf = TfidfVectorizer(
    max_features=5000,
    ngram_range=(1, 2),
    stop_words='english',
    lowercase=True,
    min_df=2,
    max_df=0.95
)

# Transform all text
train_text_tfidf = final_tfidf.fit_transform(text_features)
test_text_tfidf = final_tfidf.transform(test_text_features)

# Assemble the feature blocks: text and numeric always, flair optionally.
train_blocks = [train_text_tfidf, train_numeric]
test_blocks = [test_text_tfidf, test_numeric]
if USE_FLAIR_FEATURES:
    # Cast to float so boolean/integer dummy columns stack cleanly with the
    # floating-point TF-IDF and numeric blocks.
    train_blocks.append(flair_dummies_train.values.astype(float))
    test_blocks.append(flair_dummies_test.values.astype(float))

train_combined = hstack(train_blocks)
test_combined = hstack(test_blocks)

# Train final model (same settings as the CV classifier)
final_model = LogisticRegression(
    class_weight='balanced',
    max_iter=1000,
    random_state=42,
    n_jobs=-1
)

final_model.fit(train_combined, y)

# Generate final predictions: probability of the positive class
final_predictions = final_model.predict_proba(test_combined)[:, 1]

print(f"Final model trained on {train_combined.shape[1]} features")
print(f"Final predictions shape: {final_predictions.shape}")

In [None]:
# Create submission file: one row per test request with its predicted
# probability, written as CSV.
import os

submission = pd.DataFrame({
    'request_id': df_test['request_id'],
    'requester_received_pizza': final_predictions
})

# Ensure proper format
submission['requester_received_pizza'] = submission['requester_received_pizza'].astype(float)

# Save submission
submission_path = '/home/submission/submission.csv'
# to_csv does not create missing parent directories; make sure the target
# directory exists so the write cannot fail after all the training work.
os.makedirs(os.path.dirname(submission_path), exist_ok=True)
submission.to_csv(submission_path, index=False)

print(f"Submission saved to {submission_path}")
print(f"Submission shape: {submission.shape}")
print(f"Prediction range: {submission['requester_received_pizza'].min():.4f} to {submission['requester_received_pizza'].max():.4f}")
print("\nSubmission preview:")
print(submission.head())