# Experiment 004: Logistic Regression Test

**Goal**: Test Logistic Regression vs LightGBM on TF-IDF features

**Hypothesis**: Research shows Logistic Regression often outperforms tree models on TF-IDF features

**Expected Improvement**: 0.67-0.70 AUC (from current 0.6555)

In [1]:
import pandas as pd
import numpy as np
import json
from sklearn.model_selection import StratifiedKFold
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score
from scipy.sparse import hstack
import lightgbm as lgb
import re
from collections import Counter
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_selection import SelectKBest, chi2
import warnings
warnings.filterwarnings('ignore')

# Set random seed for reproducibility
SEED = 42
np.random.seed(SEED)

print("Loading data...")
# Load train data
with open('/home/data/train.json', 'r') as f:
    train_data = json.load(f)

# Load test data
with open('/home/data/test.json', 'r') as f:
    test_data = json.load(f)

print(f"Train samples: {len(train_data)}")
print(f"Test samples: {len(test_data)}")

# Extract target and create DataFrame
train_df = pd.DataFrame(train_data)
test_df = pd.DataFrame(test_data)

# Target variable
y = train_df['requester_received_pizza'].astype(int)
print(f"Positive rate: {y.mean():.3f}")
print(f"Class imbalance ratio: {1/y.mean():.3f}")

Loading data...
Train samples: 2878
Test samples: 1162
Positive rate: 0.248
Class imbalance ratio: 4.025


In [2]:
# Feature Engineering Functions

def create_psycholinguistic_features(df):
    """Create psycholinguistic features with phrase patterns"""
    features = {}
    
    # Reciprocity phrases (more specific patterns)
    reciprocity_patterns = [
        r'\bpay\s+(?:back|forward|it\s+forward)',
        r'\breturn\s+(?:the\s+)?favor',
        r'\bwhen\s+i\s+get\s+(?:paid|paycheck)',
        r'\bnext\s+(?:week|month|payday)',
        r'\bforward\s+the\s+kindness',
        r'\bpass\s+it\s+on'
    ]
    
    # Hardship phrases
    hardship_patterns = [
        r'\b(?:lost|lose|losing)\s+(?:my\s+)?job',
        r'\bmedical\s+(?:bills?|expenses?)',
        r'\bcar\s+(?:broke\s+down?|repair)',
        r'\beviction\s+(?:notice)?',
        r'\bsingle\s+(?:parent|mom|dad)',
        r'\bunexpected\s+(?:expenses?|bills?)',
        r'\bran\s+out\s+of\s+(?:money|cash)',
        r'\bno\s+(?:food|money|cash)'
    ]
    
    # Family phrases
    family_patterns = [
        r'\bfamily\b',
        r'\bkids?\b',
        r'\bchildren\b',
        r'\bmother\b',
        r'\bfather\b',
        r'\bdaughter\b',
        r'\bson\b',
        r'\bwife\b',
        r'\bhusband\b',
        r'\bbaby\b'
    ]
    
    # Gratitude phrases
    gratitude_patterns = [
        r'\bthank\s+(?:you|you\s+so\s+much|you\s+in\s+advance)',
        r'\bwould\s+(?:appreciate|be\s+grateful)',
        r'\bgrateful\s+(?:for|if)',
        r'\bappreciate\s+(?:it|any|your)',
        r'\bbless\s+(?:you|your\s+heart)',
        r'\bkind\s+(?:stranger|soul|person)',
        r'\bgod\s+bless',
        r'\bso\s+(?:thankful|grateful)'
    ]
    
    # Pizza-specific terms
    pizza_patterns = [
        r'\bpizza\b',
        r'\bhungry\b',
        r'\bfood\b',
        r'\bmeal\b',
        r'\beat\b',
        r'\bstarving\b',
        r'\bdinner\b',
        r'\blunch\b',
        r'\bsupper\b',
        r'\bdominos\b',
        r'\bpapa\s+johns?\b',
        r'\bpizza\s+hut\b'
    ]
    
    text_column = 'request_text_edit_aware'
    
    # Count patterns for each category
    features['reciprocity_phrases'] = df[text_column].apply(
        lambda x: sum(len(re.findall(pattern, x.lower())) for pattern in reciprocity_patterns)
    )
    
    features['hardship_phrases'] = df[text_column].apply(
        lambda x: sum(len(re.findall(pattern, x.lower())) for pattern in hardship_patterns)
    )
    
    features['family_phrases'] = df[text_column].apply(
        lambda x: sum(len(re.findall(pattern, x.lower())) for pattern in family_patterns)
    )
    
    features['gratitude_phrases'] = df[text_column].apply(
        lambda x: sum(len(re.findall(pattern, x.lower())) for pattern in gratitude_patterns)
    )
    
    features['pizza_terms'] = df[text_column].apply(
        lambda x: sum(len(re.findall(pattern, x.lower())) for pattern in pizza_patterns)
    )
    
    return pd.DataFrame(features)

def create_metadata_features(df):
    """Create metadata features from the dataframe"""
    features = {}
    
    # Text length features
    features['text_length'] = df['request_text_edit_aware'].str.len()
    features['word_count'] = df['request_text_edit_aware'].str.split().str.len()
    features['sentence_count'] = df['request_text_edit_aware'].str.count(r'[.!?]+') + 1
    
    # Account features
    features['account_age_days'] = df['requester_account_age_in_days_at_request']
    features['raop_activity_days'] = df['requester_days_since_first_post_on_raop_at_request']
    
    # Activity features
    features['total_comments'] = df['requester_number_of_comments_at_request']
    features['total_posts'] = df['requester_number_of_posts_at_request']
    features['raop_comments'] = df['requester_number_of_comments_in_raop_at_request']
    features['raop_posts'] = df['requester_number_of_posts_on_raop_at_request']
    features['subreddit_diversity'] = df['requester_number_of_subreddits_at_request']
    
    # Voting features (if available)
    if 'requester_upvotes_minus_downvotes_at_request' in df.columns:
        features['upvotes_minus_downvotes'] = df['requester_upvotes_minus_downvotes_at_request']
        features['upvotes_plus_downvotes'] = df['requester_upvotes_plus_downvotes_at_request']
    else:
        features['upvotes_minus_downvotes'] = 0
        features['upvotes_plus_downvotes'] = 0
    
    # Time features
    features['unix_timestamp'] = df['unix_timestamp_of_request']
    features['hour_of_day'] = (df['unix_timestamp_of_request'] // 3600) % 24
    features['day_of_week'] = (df['unix_timestamp_of_request'] // (24 * 3600)) % 7
    
    # Specific high-value features
    # Imgur links
    features['has_imgur_link'] = df['request_text_edit_aware'].str.contains(
        r'https?://(?:www\.)?imgur\.com', case=False, na=False
    ).astype(int)
    
    # Key phrases based on character n-gram analysis
    features['has_for_a'] = df['request_text_edit_aware'].str.contains(
        r'\bfor\s+a\b', case=False, na=False
    ).astype(int)
    
    features['has_please'] = df['request_text_edit_aware'].str.contains(
        r'\bplease\b', case=False, na=False
    ).astype(int)
    
    features['has_this'] = df['request_text_edit_aware'].str.contains(
        r'\bthis\b', case=False, na=False
    ).astype(int)
    
    return pd.DataFrame(features)

In [3]:
# Create TF-IDF features
print("Creating TF-IDF features...")

# Word n-grams (1-3)
word_vectorizer = TfidfVectorizer(
    ngram_range=(1, 3),
    max_features=3000,
    min_df=2,
    max_df=0.9,
    sublinear_tf=True,
    norm='l2'
)

# Character n-grams (3-5) - captures patterns like 'pizza', 'please', 'help'
char_vectorizer = TfidfVectorizer(
    analyzer='char',
    ngram_range=(3, 5),
    max_features=2000,
    min_df=2,
    max_df=0.9,
    sublinear_tf=True,
    norm='l2'
)

# Fit on combined train + test to ensure consistent vocabulary
combined_text = pd.concat([
    train_df['request_text_edit_aware'],
    test_df['request_text_edit_aware']
], axis=0)

word_vectorizer.fit(combined_text)
char_vectorizer.fit(combined_text)

# Transform train and test
X_train_word = word_vectorizer.transform(train_df['request_text_edit_aware'])
X_test_word = word_vectorizer.transform(test_df['request_text_edit_aware'])

X_train_char = char_vectorizer.transform(train_df['request_text_edit_aware'])
X_test_char = char_vectorizer.transform(test_df['request_text_edit_aware'])

print(f"Word TF-IDF shape: {X_train_word.shape}")
print(f"Char TF-IDF shape: {X_train_char.shape}")

Creating TF-IDF features...


Word TF-IDF shape: (2878, 3000)
Char TF-IDF shape: (2878, 2000)


In [4]:
# Create all features
print("Creating psycholinguistic features...")
psy_train = create_psycholinguistic_features(train_df)
psy_test = create_psycholinguistic_features(test_df)

print("Creating metadata features...")
meta_train = create_metadata_features(train_df)
meta_test = create_metadata_features(test_df)

# Combine all features
X_train_dense = pd.concat([psy_train, meta_train], axis=1)
X_test_dense = pd.concat([psy_test, meta_test], axis=1)

print(f"Dense features shape: {X_train_dense.shape}")
print(f"Dense feature columns: {list(X_train_dense.columns)}")

Creating psycholinguistic features...


Creating metadata features...
Dense features shape: (2878, 24)
Dense feature columns: ['reciprocity_phrases', 'hardship_phrases', 'family_phrases', 'gratitude_phrases', 'pizza_terms', 'text_length', 'word_count', 'sentence_count', 'account_age_days', 'raop_activity_days', 'total_comments', 'total_posts', 'raop_comments', 'raop_posts', 'subreddit_diversity', 'upvotes_minus_downvotes', 'upvotes_plus_downvotes', 'unix_timestamp', 'hour_of_day', 'day_of_week', 'has_imgur_link', 'has_for_a', 'has_please', 'has_this']


In [5]:
# Combine sparse and dense features
print("Combining features...")

# For sparse features (TF-IDF)
X_train_sparse = hstack([X_train_word, X_train_char])
X_test_sparse = hstack([X_test_word, X_test_char])

print(f"Sparse features shape: {X_train_sparse.shape}")

# For dense features, convert to sparse matrix
from scipy.sparse import csr_matrix

X_train_dense_sparse = csr_matrix(X_train_dense.values)
X_test_dense_sparse = csr_matrix(X_test_dense.values)

# Combine all features
X_train = hstack([X_train_sparse, X_train_dense_sparse])
X_test = hstack([X_test_sparse, X_test_dense_sparse])

print(f"Final training matrix shape: {X_train.shape}")
print(f"Final test matrix shape: {X_test.shape}")

Combining features...


Sparse features shape: (2878, 5000)
Final training matrix shape: (2878, 5024)
Final test matrix shape: (1162, 5024)


In [6]:
# Cross-validation setup
n_folds = 5
skf = StratifiedKFold(n_splits=n_folds, shuffle=True, random_state=SEED)

# Store results
logreg_scores = []
lgbm_scores = []
oof_predictions_lr = np.zeros(len(train_df))
oof_predictions_lgb = np.zeros(len(train_df))

In [7]:
# Cross-validation loop
print(f"Starting {n_folds}-fold CV...")

for fold, (train_idx, val_idx) in enumerate(skf.split(X_train, y)):
    print(f"\nFold {fold + 1}/{n_folds}")
    
    # Split data
    X_tr, X_val = X_train[train_idx], X_train[val_idx]
    y_tr, y_val = y.iloc[train_idx], y.iloc[val_idx]
    
    # Train Logistic Regression
    print("  Training Logistic Regression...")
    lr_model = LogisticRegression(
        class_weight='balanced',
        solver='saga',
        max_iter=1000,
        C=1.0,
        random_state=SEED,
        n_jobs=-1
    )
    lr_model.fit(X_tr, y_tr)
    
    # Predict with Logistic Regression
    val_pred_lr = lr_model.predict_proba(X_val)[:, 1]
    score_lr = roc_auc_score(y_val, val_pred_lr)
    logreg_scores.append(score_lr)
    oof_predictions_lr[val_idx] = val_pred_lr
    print(f"  Logistic Regression AUC: {score_lr:.4f}")
    
    # Train LightGBM
    print("  Training LightGBM...")
    lgb_model = lgb.LGBMClassifier(
        n_estimators=500,
        learning_rate=0.05,
        max_depth=7,
        num_leaves=31,
        scale_pos_weight=3.025,
        random_state=SEED,
        n_jobs=-1,
        verbose=-1
    )
    lgb_model.fit(X_tr, y_tr)
    
    # Predict with LightGBM
    val_pred_lgb = lgb_model.predict_proba(X_val)[:, 1]
    score_lgb = roc_auc_score(y_val, val_pred_lgb)
    lgbm_scores.append(score_lgb)
    oof_predictions_lgb[val_idx] = val_pred_lgb
    print(f"  LightGBM AUC: {score_lgb:.4f}")

# Print results
print(f"\n{'='*50}")
print("CROSS-VALIDATION RESULTS")
print(f"{'='*50}")
print(f"Logistic Regression: {np.mean(logreg_scores):.4f} ± {np.std(logreg_scores):.4f}")
print(f"LightGBM:          {np.mean(lgbm_scores):.4f} ± {np.std(lgbm_scores):.4f}")
print(f"\nOOF Logistic Regression AUC: {roc_auc_score(y, oof_predictions_lr):.4f}")
print(f"OOF LightGBM AUC:            {roc_auc_score(y, oof_predictions_lgb):.4f}")

Starting 5-fold CV...

Fold 1/5
  Training Logistic Regression...


  Logistic Regression AUC: 0.6005
  Training LightGBM...


  LightGBM AUC: 0.6903

Fold 2/5
  Training Logistic Regression...


  Logistic Regression AUC: 0.5737
  Training LightGBM...


  LightGBM AUC: 0.6631

Fold 3/5
  Training Logistic Regression...


  Logistic Regression AUC: 0.5618
  Training LightGBM...


  LightGBM AUC: 0.6587

Fold 4/5
  Training Logistic Regression...


  Logistic Regression AUC: 0.5540
  Training LightGBM...


  LightGBM AUC: 0.6344

Fold 5/5
  Training Logistic Regression...


  Logistic Regression AUC: 0.5818
  Training LightGBM...


  LightGBM AUC: 0.6532

CROSS-VALIDATION RESULTS
Logistic Regression: 0.5743 ± 0.0162
LightGBM:          0.6599 ± 0.0181

OOF Logistic Regression AUC: 0.5260
OOF LightGBM AUC:            0.6591


In [8]:
# Train final models on full data
print("\nTraining final models on full training data...")

# Logistic Regression
final_lr = LogisticRegression(
    class_weight='balanced',
    solver='saga',
    max_iter=1000,
    C=1.0,
    random_state=SEED,
    n_jobs=-1
)
final_lr.fit(X_train, y)

# LightGBM
final_lgb = lgb.LGBMClassifier(
    n_estimators=500,
    learning_rate=0.05,
    max_depth=7,
    num_leaves=31,
    scale_pos_weight=3.025,
    random_state=SEED,
    n_jobs=-1,
    verbose=-1
)
final_lgb.fit(X_train, y)

# Generate predictions
print("\nGenerating predictions on test data...")
pred_lr = final_lr.predict_proba(X_test)[:, 1]
pred_lgb = final_lgb.predict_proba(X_test)[:, 1]

# Simple average ensemble
pred_ensemble = (pred_lr + pred_lgb) / 2

print("Predictions generated successfully!")


Training final models on full training data...



Generating predictions on test data...
Predictions generated successfully!


In [9]:
# Create submission files
print("\nCreating submission files...")

# Create submission directory
import os
os.makedirs('/home/submission', exist_ok=True)

# Logistic Regression submission
submission_lr = pd.DataFrame({
    'request_id': test_df['request_id'],
    'requester_received_pizza': pred_lr
})
submission_lr.to_csv('/home/submission/submission_lr.csv', index=False)
print("Logistic Regression submission saved: /home/submission/submission_lr.csv")

# LightGBM submission
submission_lgb = pd.DataFrame({
    'request_id': test_df['request_id'],
    'requester_received_pizza': pred_lgb
})
submission_lgb.to_csv('/home/submission/submission_lgb.csv', index=False)
print("LightGBM submission saved: /home/submission/submission_lgb.csv")

# Ensemble submission
submission_ensemble = pd.DataFrame({
    'request_id': test_df['request_id'],
    'requester_received_pizza': pred_ensemble
})
submission_ensemble.to_csv('/home/submission/submission_ensemble.csv', index=False)
print("Ensemble submission saved: /home/submission/submission_ensemble.csv")

print(f"\nAll submissions have {len(submission_lr)} rows")
print("\nSubmission files created successfully!")


Creating submission files...
Logistic Regression submission saved: /home/submission/submission_lr.csv
LightGBM submission saved: /home/submission/submission_lgb.csv
Ensemble submission saved: /home/submission/submission_ensemble.csv

All submissions have 1162 rows

Submission files created successfully!


In [10]:
# Feature importance analysis
print("\n" + "="*50)
print("FEATURE IMPORTANCE ANALYSIS")
print("="*50)

# For Logistic Regression - get top coefficients
feature_names = []
feature_names.extend([f'word_{i}' for i in range(X_train_word.shape[1])])
feature_names.extend([f'char_{i}' for i in range(X_train_char.shape[1])])
feature_names.extend(list(X_train_dense.columns))

coefficients = final_lr.coef_[0]
top_positive_idx = np.argsort(coefficients)[-10:][::-1]
top_negative_idx = np.argsort(coefficients)[:10]

print("\nTop 10 Positive Coefficients (Logistic Regression):")
for idx in top_positive_idx:
    print(f"  {feature_names[idx]:<30} {coefficients[idx]:.4f}")

print("\nTop 10 Negative Coefficients (Logistic Regression):")
for idx in top_negative_idx:
    print(f"  {feature_names[idx]:<30} {coefficients[idx]:.4f}")

# For LightGBM - get feature importances
lgb_importances = final_lgb.feature_importances_
top_lgb_idx = np.argsort(lgb_importances)[-10:][::-1]

print("\nTop 10 LightGBM Feature Importances:")
for idx in top_lgb_idx:
    print(f"  {feature_names[idx]:<30} {lgb_importances[idx]:.2f}")

print("\nAnalysis complete!")


FEATURE IMPORTANCE ANALYSIS

Top 10 Positive Coefficients (Logistic Regression):
  upvotes_plus_downvotes         0.0000
  upvotes_minus_downvotes        0.0000
  text_length                    0.0000
  account_age_days               0.0000
  word_count                     0.0000
  raop_activity_days             0.0000
  total_comments                 0.0000
  total_posts                    0.0000
  subreddit_diversity            0.0000
  sentence_count                 0.0000

Top 10 Negative Coefficients (Logistic Regression):
  unix_timestamp                 -0.0000
  day_of_week                    -0.0000
  word_1167                      -0.0000
  char_1076                      -0.0000
  char_1600                      -0.0000
  char_1601                      -0.0000
  char_923                       -0.0000
  char_1077                      -0.0000
  char_922                       -0.0000
  word_182                       -0.0000

Top 10 LightGBM Feature Importances:
  unix_timestamp 