# Baseline Model: Logistic Regression + TF-IDF

Simple baseline using TF-IDF features on combined text data with Logistic Regression.

## Approach
- Combine request_title and request_text_edit_aware (the edit-aware version of the request text)
- TF-IDF vectorization (unigrams and bigrams)
- Logistic Regression with class weighting for imbalance
- 5-fold stratified cross-validation

In [1]:
import json
import warnings
from pathlib import Path

import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import StratifiedKFold
from sklearn.pipeline import Pipeline
warnings.filterwarnings('ignore')

# Read the raw training records from JSON and wrap them in a DataFrame
print("Loading training data...")
with open('/home/data/train.json', 'r') as train_file:
    train_data = json.load(train_file)
train_df = pd.DataFrame(train_data)

# Report the frame dimensions and how the target column is distributed
pizza_flag = train_df['requester_received_pizza']
print(f"Training data shape: {train_df.shape}")
print(f"Target distribution: {pizza_flag.value_counts().to_dict()}")
print(f"Positive rate: {pizza_flag.mean():.3f}")

Loading training data...
Training data shape: (2878, 32)
Target distribution: {False: 2163, True: 715}
Positive rate: 0.248


In [6]:
# Build the text input for the model: the request title followed by the
# edit-aware request body, with missing values treated as empty strings.
title_part = train_df['request_title'].fillna('')
body_part = train_df['request_text_edit_aware'].fillna('')

# Lower-case, then collapse every whitespace run into a single space and trim the ends
train_df['combined_text'] = (
    (title_part + ' ' + body_part)
    .str.lower()
    .str.replace(r'\s+', ' ', regex=True)
    .str.strip()
)

# Show the first 200 characters of the first row as a sanity check
print("Combined text sample:")
first_text = train_df['combined_text'].iloc[0]
print(first_text[:200])

Combined text sample:
[request] oceanside, ca. usa- us marine getting ready to deploy. i will soon be going on a long deployment which i'm not aloud to discuss but willing to give some info if you ask. just wanna eat some 


In [7]:
# Length-based descriptors of the combined text: character count and
# whitespace-delimited token count
combined = train_df['combined_text']
train_df['text_length'] = combined.str.len()
train_df['word_count'] = combined.str.split().str.len()

# User flair: missing values become the string 'None'; has_flair is 1 for any
# other value and 0 otherwise.
# CAUTION: flair separates the classes perfectly and is very likely target
# leakage (the number of rows with flair equals the number of positives).
# It is derived here for inspection; treat it with suspicion as a model input.
train_df['user_flair'] = train_df['requester_user_flair'].fillna('None')
train_df['has_flair'] = train_df['user_flair'].ne('None').astype(int)

# Summarise the new columns
length_col = train_df['text_length']
words_col = train_df['word_count']
flair_col = train_df['has_flair']

print("Feature statistics:")
print(f"Text length - mean: {length_col.mean():.1f}, std: {length_col.std():.1f}")
print(f"Word count - mean: {words_col.mean():.1f}, std: {words_col.std():.1f}")
print(f"Has flair: {flair_col.sum()} ({flair_col.mean():.3f})")

Feature statistics:
Text length - mean: 464.8, std: 351.3
Word count - mean: 88.0, std: 67.8
Has flair: 715 (0.248)


In [8]:
# Model inputs: the combined text column and the target as a NumPy array
X_text = train_df['combined_text']
y = train_df['requester_received_pizza'].values

# Quantify the class imbalance as the negative-to-positive ratio.
# This value is only reported; the classifier below handles imbalance
# through class_weight='balanced'.
pos_rate = y.mean()
neg_rate = 1 - pos_rate
scale_pos_weight = neg_rate / pos_rate
print(f"Class imbalance - Positive rate: {pos_rate:.3f}, scale_pos_weight: {scale_pos_weight:.3f}")

# Hyperparameters used for the pipeline built in every fold
tfidf_params = dict(
    max_features=10000,
    ngram_range=(1, 2),   # unigrams and bigrams
    stop_words='english',
    lowercase=True,
    min_df=2,
    max_df=0.95,
)
logreg_params = dict(
    class_weight='balanced',
    max_iter=1000,
    C=1.0,
    random_state=42,
    n_jobs=-1,
)

# 5-fold stratified splitter, shuffled with a fixed seed
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Per-fold AUCs and the out-of-fold probability for every training row
fold_scores = []
oof_predictions = np.zeros(len(train_df))

print("\nStarting 5-fold cross-validation...")

for fold, (train_idx, val_idx) in enumerate(skf.split(X_text, y)):
    print(f"\nFold {fold + 1}")

    # Slice text and labels for this fold
    X_train = X_text.iloc[train_idx]
    X_val = X_text.iloc[val_idx]
    y_train = y[train_idx]
    y_val = y[val_idx]

    # A fresh TF-IDF + Logistic Regression pipeline per fold, so the vocabulary
    # and IDF weights are learned from the training part only
    pipeline = Pipeline([
        ('tfidf', TfidfVectorizer(**tfidf_params)),
        ('clf', LogisticRegression(**logreg_params)),
    ])
    pipeline.fit(X_train, y_train)

    # Probability of the positive class for the held-out rows
    val_pred = pipeline.predict_proba(X_val)[:, 1]
    oof_predictions[val_idx] = val_pred

    # Score this fold
    auc_score = roc_auc_score(y_val, val_pred)
    fold_scores.append(auc_score)

    print(f"  Validation AUC: {auc_score:.4f}")
    print(f"  Training samples: {len(X_train)}, Validation samples: {len(X_val)}")

# Aggregate the per-fold scores
mean_auc = np.mean(fold_scores)
std_auc = np.std(fold_scores)
formatted_scores = [f'{score:.4f}' for score in fold_scores]

print("\n" + "=" * 50)
print("Cross-Validation Results:")
print(f"Mean AUC: {mean_auc:.4f} ± {std_auc:.4f}")
print(f"Individual folds: {formatted_scores}")

# AUC over all out-of-fold predictions pooled together
oof_auc = roc_auc_score(y, oof_predictions)
print(f"OOF AUC: {oof_auc:.4f}")

Class imbalance - Positive rate: 0.248, scale_pos_weight: 3.025

Starting 5-fold cross-validation...

Fold 1


  Validation AUC: 0.6372
  Training samples: 2302, Validation samples: 576

Fold 2


  Validation AUC: 0.6128
  Training samples: 2302, Validation samples: 576

Fold 3


  Validation AUC: 0.6053
  Training samples: 2302, Validation samples: 576

Fold 4


  Validation AUC: 0.5653
  Training samples: 2303, Validation samples: 575

Fold 5


  Validation AUC: 0.6074
  Training samples: 2303, Validation samples: 575

Cross-Validation Results:
Mean AUC: 0.6056 ± 0.0232
Individual folds: ['0.6372', '0.6128', '0.6053', '0.5653', '0.6074']
OOF AUC: 0.6060


In [9]:
# Read the test records from JSON into a DataFrame
print("Loading test data...")
with open('/home/data/test.json', 'r') as test_file:
    test_data = json.load(test_file)
test_df = pd.DataFrame(test_data)

print(f"Test data shape: {test_df.shape}")

# Apply the same text preprocessing as for training: title + edit-aware body,
# lower-cased, whitespace runs collapsed to one space, ends trimmed
test_title = test_df['request_title'].fillna('')
test_body = test_df['request_text_edit_aware'].fillna('')
test_df['combined_text'] = (
    (test_title + ' ' + test_body)
    .str.lower()
    .str.replace(r'\s+', ' ', regex=True)
    .str.strip()
)

# Simple length features
test_df['text_length'] = test_df['combined_text'].str.len()
test_df['word_count'] = test_df['combined_text'].str.split().str.len()

# Flair columns: derived from requester_user_flair when the test set has that
# column, otherwise filled with the "no flair" defaults
if 'requester_user_flair' in test_df.columns:
    test_df['user_flair'] = test_df['requester_user_flair'].fillna('None')
    test_df['has_flair'] = (test_df['user_flair'] != 'None').astype(int)
else:
    test_df['user_flair'] = 'None'
    test_df['has_flair'] = 0

X_test_text = test_df['combined_text']

# Fit the final model on all training rows, using the same hyperparameters
# as the cross-validation pipeline
print("\nTraining final model on full training data...")
final_tfidf = TfidfVectorizer(
    max_features=10000,
    ngram_range=(1, 2),
    stop_words='english',
    lowercase=True,
    min_df=2,
    max_df=0.95,
)
final_clf = LogisticRegression(
    class_weight='balanced',
    max_iter=1000,
    C=1.0,
    random_state=42,
    n_jobs=-1,
)
final_pipeline = Pipeline([
    ('tfidf', final_tfidf),
    ('clf', final_clf),
])
final_pipeline.fit(X_text, y)

# Probability of the positive class for every test row
test_predictions = final_pipeline.predict_proba(X_test_text)[:, 1]

# Quick sanity summary of the predicted probabilities
pred_low = test_predictions.min()
pred_high = test_predictions.max()
print(f"Test predictions shape: {test_predictions.shape}")
print(f"Test predictions range: [{pred_low:.4f}, {pred_high:.4f}]")
print(f"Test predictions mean: {test_predictions.mean():.4f}")

Loading test data...
Test data shape: (1162, 17)

Training final model on full training data...


Test predictions shape: (1162,)
Test predictions range: [0.1515, 0.8548]
Test predictions mean: 0.4534


In [None]:
# Create submission file: one row per test request with its predicted
# probability of receiving a pizza
submission_df = pd.DataFrame({
    'request_id': test_df['request_id'],
    'requester_received_pizza': test_predictions
})

# Store the predictions as floats: these are probabilities in [0, 1],
# not hard 0/1 labels
submission_df['requester_received_pizza'] = submission_df['requester_received_pizza'].astype(float)

print("Submission file preview:")
print(submission_df.head())

# Save submission. DataFrame.to_csv does not create missing directories and
# raises OSError when the target folder is absent, so create it first.
submission_path = '/home/submission/submission_001_baseline.csv'
Path(submission_path).parent.mkdir(parents=True, exist_ok=True)
submission_df.to_csv(submission_path, index=False)
print(f"\nSubmission saved to: {submission_path}")

# Verify submission format against the provided sample: same columns, same
# shape, and the same set of request ids (row order is not compared)
sample_submission = pd.read_csv('/home/data/sampleSubmission.csv')
print("\nFormat verification:")
print(f"Sample columns: {sample_submission.columns.tolist()}")
print(f"Our columns: {submission_df.columns.tolist()}")
print(f"Sample shape: {sample_submission.shape}")
print(f"Our shape: {submission_df.shape}")
print(f"IDs match: {set(sample_submission['request_id']) == set(submission_df['request_id'])}")