# Baseline Experiment: TF-IDF + Logistic Regression

Following the seed prompt strategy, this baseline combines:
- TF-IDF features from request_text and request_title
- Basic tabular features (log transforms, ratios)
- Logistic Regression with class weighting for imbalance
- Stratified 5-fold CV for robust evaluation

In [4]:
import pandas as pd
import numpy as np
import json
from sklearn.model_selection import StratifiedKFold
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score
from scipy.sparse import hstack
import warnings
warnings.filterwarnings('ignore')

# Load data
print("Loading data...")
with open('/home/data/train.json', 'r') as f:
    train_data = json.load(f)

with open('/home/data/test.json', 'r') as f:
    test_data = json.load(f)

df_train = pd.DataFrame(train_data)
df_test = pd.DataFrame(test_data)

print(f"Training samples: {len(df_train)}")
print(f"Test samples: {len(df_test)}")
print(f"Positive class rate: {df_train['requester_received_pizza'].mean():.3f}")

Loading data...
Training samples: 2878
Test samples: 1162
Positive class rate: 0.248


In [5]:
# Combine text features
print("Combining text features...")

df_train['combined_text'] = df_train['request_title'].fillna('') + ' ' + df_train['request_text_edit_aware'].fillna('')
df_test['combined_text'] = df_test['request_title'].fillna('') + ' ' + df_test['request_text_edit_aware'].fillna('')

# Basic text length features
df_train['text_length'] = df_train['combined_text'].str.len()
df_test['text_length'] = df_test['combined_text'].str.len()

df_train['word_count'] = df_train['combined_text'].str.split().str.len()
df_test['word_count'] = df_test['combined_text'].str.split().str.len()

print(f"Combined text examples:")
print(df_train['combined_text'].head(2).values)

Combining text features...
Combined text examples:
["[REQUEST] Oceanside, Ca. USA-  US Marine getting ready to deploy. I will soon be going on a long deployment which I'm not aloud to discuss but willing to give some info if you ask. Just wanna eat some of the stuff America has to offer before I leave for a long time to Afganistan."
 "[REQUEST] Three (verified) medical students in Pittsburgh this summer doing research.  And we're almost out of loan money. We would all really appreciate it, and would even send a picture of the three of us enjoying the said pizza (if that's your thing).  Just curious to see if this works!!!"]


In [6]:
# Create basic tabular features
print("Creating tabular features...")

# Log transforms for count features (add 1 to avoid log(0))
count_features = [
    'requester_number_of_comments_at_request',
    'requester_number_of_posts_at_request', 
    'requester_upvotes_plus_downvotes_at_request'
]

for feat in count_features:
    df_train[f'{feat}_log'] = np.log1p(df_train[feat])
    df_test[f'{feat}_log'] = np.log1p(df_test[feat])

# Ratios
df_train['upvotes_per_comment'] = df_train['requester_upvotes_plus_downvotes_at_request'] / (df_train['requester_number_of_comments_at_request'] + 1)
df_test['upvotes_per_comment'] = df_test['requester_upvotes_plus_downvotes_at_request'] / (df_test['requester_number_of_comments_at_request'] + 1)

df_train['comments_per_post'] = df_train['requester_number_of_comments_at_request'] / (df_train['requester_number_of_posts_at_request'] + 1)
df_test['comments_per_post'] = df_test['requester_number_of_comments_at_request'] / (df_test['requester_number_of_posts_at_request'] + 1)

# Account age in years
df_train['account_age_years'] = df_train['requester_account_age_in_days_at_request'] / 365.25
df_test['account_age_years'] = df_test['requester_account_age_in_days_at_request'] / 365.25

print("Tabular features created successfully")

Creating tabular features...
Tabular features created successfully


In [7]:
# Prepare features for modeling
text_features = df_train['combined_text'].values
test_text_features = df_test['combined_text'].values

# Select numeric features
numeric_features = [
    'requester_number_of_comments_at_request_log',
    'requester_number_of_posts_at_request_log',
    'requester_upvotes_plus_downvotes_at_request_log',
    'upvotes_per_comment',
    'comments_per_post',
    'account_age_years'
]

train_numeric = df_train[numeric_features].fillna(0).values
test_numeric = df_test[numeric_features].fillna(0).values

# Target
y = df_train['requester_received_pizza'].values

print(f"Numeric features shape: {train_numeric.shape}")
print(f"Target shape: {y.shape}")

Numeric features shape: (2878, 6)
Target shape: (2878,)


In [8]:
# Stratified CV setup
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# TF-IDF vectorizer
tfidf = TfidfVectorizer(
    max_features=5000,
    ngram_range=(1, 2),
    stop_words='english',
    lowercase=True,
    min_df=2,
    max_df=0.95
)

# Logistic Regression with class weighting for imbalance
pos_rate = y.mean()
scale_pos_weight = (1 - pos_rate) / pos_rate
print(f"Scale pos weight: {scale_pos_weight:.2f}")

model = LogisticRegression(
    class_weight='balanced',
    max_iter=1000,
    random_state=42,
    n_jobs=-1
)

# Store predictions
oof_predictions = np.zeros(len(df_train))
test_predictions = np.zeros(len(df_test))
cv_scores = []

print("Starting 5-fold CV...")

Scale pos weight: 3.03
Starting 5-fold CV...


In [9]:
# Cross-validation loop
fold = 0
for train_idx, val_idx in skf.split(df_train, y):
    fold += 1
    print(f"\nFold {fold}/5")
    
    # Split data
    X_train_text, X_val_text = text_features[train_idx], text_features[val_idx]
    X_train_num, X_val_num = train_numeric[train_idx], train_numeric[val_idx]
    y_train, y_val = y[train_idx], y[val_idx]
    
    # Fit TF-IDF on training text
    X_train_text_tfidf = tfidf.fit_transform(X_train_text)
    X_val_text_tfidf = tfidf.transform(X_val_text)
    
    # Combine text and numeric features
    X_train_combined = hstack([X_train_text_tfidf, X_train_num])
    X_val_combined = hstack([X_val_text_tfidf, X_val_num])
    
    # Fit model
    model.fit(X_train_combined, y_train)
    
    # Predict
    val_pred = model.predict_proba(X_val_combined)[:, 1]
    oof_predictions[val_idx] = val_pred
    
    # Calculate score
    score = roc_auc_score(y_val, val_pred)
    cv_scores.append(score)
    print(f"Fold {fold} AUC: {score:.4f}")
    
    # Predict on test set for this fold
    test_text_tfidf = tfidf.transform(test_text_features)
    test_combined = hstack([test_text_tfidf, test_numeric])
    test_pred = model.predict_proba(test_combined)[:, 1]
    test_predictions += test_pred / 5

# Overall CV score
overall_score = roc_auc_score(y, oof_predictions)
print(f"\nOverall CV AUC: {overall_score:.4f}")
print(f"Mean CV AUC: {np.mean(cv_scores):.4f} ± {np.std(cv_scores):.4f}")


Fold 1/5


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Fold 1 AUC: 0.6703

Fold 2/5


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Fold 2 AUC: 0.6343

Fold 3/5


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Fold 3 AUC: 0.6376

Fold 4/5


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Fold 4 AUC: 0.5974

Fold 5/5


Fold 5 AUC: 0.6532

Overall CV AUC: 0.6377
Mean CV AUC: 0.6386 ± 0.0242


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [10]:
# Cross-validation loop
fold = 0
for train_idx, val_idx in skf.split(df_train, y):
    fold += 1
    print(f"\nFold {fold}/5")
    
    # Split data
    X_train_text, X_val_text = text_features[train_idx], text_features[val_idx]
    X_train_num, X_val_num = train_numeric[train_idx], train_numeric[val_idx]
    y_train, y_val = y[train_idx], y[val_idx]
    
    # Fit TF-IDF on training text
    X_train_text_tfidf = tfidf.fit_transform(X_train_text)
    X_val_text_tfidf = tfidf.transform(X_val_text)
    
    # Combine text and numeric features
    X_train_combined = hstack([X_train_text_tfidf, X_train_num])
    X_val_combined = hstack([X_val_text_tfidf, X_val_num])
    
    # Fit model
    model.fit(X_train_combined, y_train)
    
    # Predict
    val_pred = model.predict_proba(X_val_combined)[:, 1]
    oof_predictions[val_idx] = val_pred
    
    # Calculate score
    score = roc_auc_score(y_val, val_pred)
    cv_scores.append(score)
    print(f"Fold {fold} AUC: {score:.4f}")
    
    # Predict on test for this fold
    test_text_tfidf = tfidf.transform(test_text_features)
    test_combined = hstack([test_text_tfidf, test_numeric])
    fold_test_pred = model.predict_proba(test_combined)[:, 1]
    test_predictions += fold_test_pred

# Average test predictions across folds
test_predictions /= 5

# Overall CV score
overall_score = roc_auc_score(y, oof_predictions)
print(f"\nOverall CV AUC: {overall_score:.4f}")
print(f"CV scores: {cv_scores}")
print(f"Mean ± Std: {np.mean(cv_scores):.4f} ± {np.std(cv_scores):.4f}")


Fold 1/5


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Fold 1 AUC: 0.6703

Fold 2/5


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Fold 2 AUC: 0.6343

Fold 3/5


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Fold 3 AUC: 0.6376

Fold 4/5


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Fold 4 AUC: 0.5974

Fold 5/5


Fold 5 AUC: 0.6532

Overall CV AUC: 0.6377
CV scores: [0.6702627626415156, 0.634296419515819, 0.6376394967618987, 0.5974488474488474, 0.6532472157472158, 0.6702627626415156, 0.634296419515819, 0.6376394967618987, 0.5974488474488474, 0.6532472157472158]
Mean ± Std: 0.6386 ± 0.0242


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [12]:
# Generate final predictions using full training data
print("Training final model on full data...")

# Fit TF-IDF on all training data
final_tfidf = TfidfVectorizer(
    max_features=5000,
    ngram_range=(1, 2),
    stop_words='english',
    lowercase=True,
    min_df=2,
    max_df=0.95
)

# Transform all text
train_text_tfidf = final_tfidf.fit_transform(text_features)
test_text_tfidf = final_tfidf.transform(test_text_features)

# Combine with numeric features only (no flair in test data)
train_combined = hstack([train_text_tfidf, train_numeric])
test_combined = hstack([test_text_tfidf, test_numeric])

# Train final model
final_model = LogisticRegression(
    class_weight='balanced',
    max_iter=1000,
    random_state=42,
    n_jobs=-1
)

final_model.fit(train_combined, y)

# Generate final predictions
final_predictions = final_model.predict_proba(test_combined)[:, 1]

print(f"Final model trained on {train_combined.shape[1]} features")
print(f"Final predictions shape: {final_predictions.shape}")

Training final model on full data...


Final model trained on 5006 features
Final predictions shape: (1162,)


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
