# Baseline Experiment 001

## Phase 1 - Simple Baselines

Following the strategy:
1. LightGBM on tabular features only
2. TF-IDF + LightGBM on text features only
3. Basic ensemble

Key findings from EDA:
- 24.8% success rate (moderate class imbalance)
- request_number_of_comments_at_retrieval has highest correlation (0.291)
- Requesters with user flair 'shroom' or 'PIF' have a 100% success rate (potential leakage)
- Text features: request_title and request_text

In [None]:
# Prepare target
y = train_df['requester_received_pizza'].astype(int)

# Tabular features - ONLY request-time snapshots are used.
# (features with "at_retrieval" are not available in test data)
tabular_features = [
    'requester_account_age_in_days_at_request',
    'requester_number_of_comments_at_request',
    'requester_number_of_posts_at_request',
    'requester_number_of_subreddits_at_request',
    'requester_upvotes_minus_downvotes_at_request',
    'requester_upvotes_plus_downvotes_at_request',
    'requester_days_since_first_post_on_raop_at_request'
]

# User flair is a leak: per the EDA, 'shroom' and 'PIF' flair have a 100%
# success rate in training, while the test data has no flair at all (the
# indicators below are constant 0 there). Feeding them to the model would
# inflate cross-validation scores and teach the model to rely on a signal that
# never fires at prediction time. The indicator columns are still created so
# they remain available for inspection, but they are deliberately NOT added
# to `tabular_features`.
if 'requester_user_flair' in train_df.columns:
    train_df['has_shroom_flair'] = (train_df['requester_user_flair'] == 'shroom').astype(int)
    train_df['has_pif_flair'] = (train_df['requester_user_flair'] == 'PIF').astype(int)
    # Test data has no flair information, so the indicators are all zero.
    test_df['has_shroom_flair'] = 0
    test_df['has_pif_flair'] = 0
    print("User flair indicators created but excluded from model features "
          "(leakage: test data has no flair info)")
else:
    print("User flair not available in training data")

# Text feature: title + edit-aware body, built identically for train and test
# so the vectorizer sees the same kind of text on both sides.
train_df['full_text'] = train_df['request_title'].fillna('') + ' ' + train_df['request_text_edit_aware'].fillna('')
test_df['full_text'] = test_df['request_title'].fillna('') + ' ' + test_df['request_text_edit_aware'].fillna('')

print("Features prepared:")
print(f"Tabular features: {len(tabular_features)}")
print("Text feature: full_text")

In [None]:
# Prepare target
y = train_df['requester_received_pizza'].astype(int)

# Tabular features (numeric) - request-time snapshots only.
# The three "*_at_retrieval" columns that used to be listed here
# (request_number_of_comments_at_retrieval, number_of_upvotes_of_request_at_retrieval,
# number_of_downvotes_of_request_at_retrieval) were removed: they are not
# present in the test data, so selecting them from test_df fails, and they
# describe the request after it was posted rather than at request time.
tabular_features = [
    'requester_account_age_in_days_at_request',
    'requester_number_of_comments_at_request',
    'requester_number_of_posts_at_request',
    'requester_number_of_subreddits_at_request',
    'requester_upvotes_minus_downvotes_at_request',
    'requester_upvotes_plus_downvotes_at_request',
    'requester_days_since_first_post_on_raop_at_request'
]

# User flair is a leak: per the EDA, 'shroom' and 'PIF' flair have a 100%
# success rate in training, while the test data has no flair at all (the
# indicators below are constant 0 there). The indicator columns are still
# created for inspection, but they are deliberately NOT added to
# `tabular_features`, so the model cannot lean on them.
if 'requester_user_flair' in train_df.columns:
    train_df['has_shroom_flair'] = (train_df['requester_user_flair'] == 'shroom').astype(int)
    train_df['has_pif_flair'] = (train_df['requester_user_flair'] == 'PIF').astype(int)
    # Test data has no flair information, so the indicators are all zero.
    test_df['has_shroom_flair'] = 0
    test_df['has_pif_flair'] = 0
    print("User flair indicators created but excluded from model features "
          "(leakage: test data has no flair info)")
else:
    print("User flair not available in training data")

# Text feature: title + edit-aware body, built identically for train and test.
train_df['full_text'] = train_df['request_title'].fillna('') + ' ' + train_df['request_text_edit_aware'].fillna('')
test_df['full_text'] = test_df['request_title'].fillna('') + ' ' + test_df['request_text_edit_aware'].fillna('')

print("Features prepared:")
print(f"Tabular features: {len(tabular_features)}")
print("Text feature: full_text")

In [None]:
# Prepare tabular data

# Filter the feature list BEFORE indexing: selecting a column that test_df
# does not have raises KeyError, so the availability check has to run first
# for the fallback to be reachable at all.
missing_in_test = [col for col in tabular_features if col not in test_df.columns]
if missing_in_test:
    print(f"Warning: These features missing in test data: {missing_in_test}")
    # Keep only features present on both sides so train and test matrices
    # always have identical columns.
    tabular_features = [col for col in tabular_features if col not in missing_in_test]
    print(f"Updated tabular features: {len(tabular_features)}")

# Column selection with a list returns a new frame; missing values become 0.
X_tabular = train_df[tabular_features].fillna(0)
X_test_tabular = test_df[tabular_features].fillna(0)

print("Tabular data prepared")
print(f"Shape: {X_tabular.shape}")
print(f"Test shape: {X_test_tabular.shape}")

In [None]:
# Model 1: LightGBM on tabular features only
print("="*50)
print("Model 1: LightGBM on tabular features only")
print("="*50)

# Stratified K-Fold. `cv` is reused by the text model cell, so both models are
# evaluated on the same splits.
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
# Derive the averaging divisor from the splitter so changing n_splits above
# cannot silently mis-scale the averaged test predictions.
n_splits = cv.get_n_splits()

# LightGBM parameters (identical for every fold, so defined once)
params = {
    'objective': 'binary',
    'metric': 'auc',
    'boosting_type': 'gbdt',
    'num_leaves': 31,
    'learning_rate': 0.05,
    'feature_fraction': 0.9,
    'bagging_fraction': 0.8,
    'bagging_freq': 5,
    'verbose': -1,
    'is_unbalance': True  # Handle class imbalance
}

tabular_scores = []                           # per-fold validation AUC
tabular_predictions = np.zeros(len(test_df))  # fold-averaged test probabilities

for fold, (train_idx, val_idx) in enumerate(cv.split(X_tabular, y)):
    X_train, X_val = X_tabular.iloc[train_idx], X_tabular.iloc[val_idx]
    y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]

    train_data = lgb.Dataset(X_train, label=y_train)
    val_data = lgb.Dataset(X_val, label=y_val)

    # NOTE: the validation fold is used both for early stopping and for the
    # reported AUC, so the fold scores are mildly optimistic.
    model = lgb.train(
        params,
        train_data,
        num_boost_round=1000,
        valid_sets=[val_data],
        valid_names=['val'],
        callbacks=[lgb.early_stopping(50), lgb.log_evaluation(0)]
    )

    # Predict with the early-stopped iteration, stated explicitly.
    val_pred = model.predict(X_val, num_iteration=model.best_iteration)
    score = roc_auc_score(y_val, val_pred)
    tabular_scores.append(score)

    # Each fold's model contributes an equal share of the test prediction.
    fold_pred = model.predict(X_test_tabular, num_iteration=model.best_iteration)
    tabular_predictions += fold_pred / n_splits

    print(f"Fold {fold+1} AUC: {score:.4f}")

print(f"\nMean AUC: {np.mean(tabular_scores):.4f} ± {np.std(tabular_scores):.4f}")

In [None]:
# Model 2: TF-IDF + LightGBM on text features only
from sklearn.feature_selection import SelectKBest, chi2

print("="*50)
print("Model 2: TF-IDF + LightGBM on text features only")
print("="*50)

# TF-IDF Vectorization
# NOTE(review): the vocabulary/IDF are fitted on all training rows before the
# CV split. This uses no labels, but it is still a mild source of optimism in
# the fold scores.
vectorizer = TfidfVectorizer(
    max_features=5000,
    ngram_range=(1, 2),
    stop_words='english',
    lowercase=True,
    min_df=2,
    max_df=0.95
)

X_text = vectorizer.fit_transform(train_df['full_text'])
X_test_text = vectorizer.transform(test_df['full_text'])

print(f"TF-IDF shape: {X_text.shape}")

# Averaging divisor comes from the splitter rather than a hard-coded 5.
n_text_folds = cv.get_n_splits()
# SelectKBest rejects k larger than the number of columns, so cap it at the
# vocabulary size actually produced.
n_selected_terms = min(2000, X_text.shape[1])

# LightGBM parameters for text (identical for every fold, so defined once)
params_text = {
    'objective': 'binary',
    'metric': 'auc',
    'boosting_type': 'gbdt',
    'num_leaves': 31,
    'learning_rate': 0.05,
    'feature_fraction': 0.8,
    'bagging_fraction': 0.8,
    'bagging_freq': 5,
    'verbose': -1,
    'is_unbalance': True
}

text_scores = []                           # per-fold validation AUC
text_predictions = np.zeros(len(test_df))  # fold-averaged test probabilities

for fold, (train_idx, val_idx) in enumerate(cv.split(X_text, y)):
    X_train = X_text[train_idx]
    X_val = X_text[val_idx]
    y_train = y.iloc[train_idx]
    y_val = y.iloc[val_idx]

    # Chi-square selection uses the labels, so it is refitted on THIS fold's
    # training rows only. Reusing a selector fitted on another fold's training
    # split would let labels of the current validation rows influence which
    # terms are kept, leaking the target into the fold score.
    text_selector = SelectKBest(chi2, k=n_selected_terms)
    # The "_dense" names are historical: the selector only drops columns;
    # nothing here converts the matrices to dense arrays.
    X_train_dense = text_selector.fit_transform(X_train, y_train)
    X_val_dense = text_selector.transform(X_val)
    X_test_text_dense = text_selector.transform(X_test_text)

    train_data = lgb.Dataset(X_train_dense, label=y_train)
    val_data = lgb.Dataset(X_val_dense, label=y_val)

    # NOTE: the validation fold is used both for early stopping and for the
    # reported AUC, so the fold scores are mildly optimistic.
    model = lgb.train(
        params_text,
        train_data,
        num_boost_round=500,
        valid_sets=[val_data],
        valid_names=['val'],
        callbacks=[lgb.early_stopping(50), lgb.log_evaluation(0)]
    )

    # Predict with the early-stopped iteration, stated explicitly.
    val_pred = model.predict(X_val_dense, num_iteration=model.best_iteration)
    score = roc_auc_score(y_val, val_pred)
    text_scores.append(score)

    # Test rows are transformed with this fold's selector so the columns match
    # what this fold's model was trained on.
    fold_pred = model.predict(X_test_text_dense, num_iteration=model.best_iteration)
    text_predictions += fold_pred / n_text_folds

    print(f"Fold {fold+1} AUC: {score:.4f}")

print(f"\nMean AUC: {np.mean(text_scores):.4f} ± {np.std(text_scores):.4f}")

In [None]:
# Model 3: Simple ensemble (average of predictions)
print("="*50)
print("Model 3: Simple Ensemble (Average)")
print("="*50)

# Equal-weight blend of the two models' fold-averaged test probabilities.
ensemble_predictions = (tabular_predictions + text_predictions) / 2

# CAUTION: the value below is the mean of the two models' per-fold AUCs. It is
# NOT the AUC of the blended predictions: AUC depends on how the combined
# scores rank the rows, which cannot be derived from the component AUCs.
# Measuring the blend properly requires out-of-fold predictions from both
# models on the same splits. The variable name is kept for compatibility, but
# the printed labels say what the number really is.
ensemble_scores = [(t1 + t2) / 2 for t1, t2 in zip(tabular_scores, text_scores)]
print(f"Mean of component AUCs (proxy, not a blend AUC): {np.mean(ensemble_scores):.4f} ± {np.std(ensemble_scores):.4f}")

print("\n" + "="*50)
print("SUMMARY OF RESULTS")
print("="*50)
print(f"Tabular-only Model:  {np.mean(tabular_scores):.4f} ± {np.std(tabular_scores):.4f}")
print(f"Text-only Model:     {np.mean(text_scores):.4f} ± {np.std(text_scores):.4f}")
print(f"Ensemble (proxy):    {np.mean(ensemble_scores):.4f} ± {np.std(ensemble_scores):.4f}")

In [None]:
# Create submission
import os

# One row per test request: its id and the blended probability.
submission = pd.DataFrame({
    'request_id': test_df['request_id'],
    'requester_received_pizza': ensemble_predictions
})

# Ensure proper format
submission['requester_received_pizza'] = submission['requester_received_pizza'].astype(float)
submission = submission.sort_values('request_id')

print("Submission preview:")
print(submission.head())
print(f"\nSubmission shape: {submission.shape}")

# Save submission. The path is defined once so the file written and the
# message printed cannot drift apart, and the target directory is created
# first because to_csv does not create missing parent directories.
submission_path = '/home/submission/submission_001_baseline.csv'
os.makedirs(os.path.dirname(submission_path), exist_ok=True)
submission.to_csv(submission_path, index=False)
print(f"\nSubmission saved to: {submission_path}")

In [4]:
# Debug: report which of the modelling features the test frame contains
test_columns = test_df.columns.tolist()
print("Available features in test data:")
print(test_columns)
print(f"\nTotal test columns: {len(test_columns)}")

# Walk the candidate features once: print a status line for each and collect
# the ones that exist in the test data (i.e. usable in both train and test).
print("\nChecking tabular features availability:")
available_features = []
for name in tabular_features:
    if name not in test_columns:
        print(f"  ✗ {name} - MISSING")
        continue
    print(f"  ✓ {name}")
    available_features.append(name)

print(f"\nAvailable features: {len(available_features)}")
print(available_features)

Available features in test data:
['giver_username_if_known', 'request_id', 'request_text_edit_aware', 'request_title', 'requester_account_age_in_days_at_request', 'requester_days_since_first_post_on_raop_at_request', 'requester_number_of_comments_at_request', 'requester_number_of_comments_in_raop_at_request', 'requester_number_of_posts_at_request', 'requester_number_of_posts_on_raop_at_request', 'requester_number_of_subreddits_at_request', 'requester_subreddits_at_request', 'requester_upvotes_minus_downvotes_at_request', 'requester_upvotes_plus_downvotes_at_request', 'requester_username', 'unix_timestamp_of_request', 'unix_timestamp_of_request_utc', 'has_shroom_flair', 'has_pif_flair', 'full_text']

Total test columns: 20

Checking tabular features availability:
  ✓ requester_account_age_in_days_at_request
  ✓ requester_number_of_comments_at_request
  ✓ requester_number_of_posts_at_request
  ✓ requester_number_of_subreddits_at_request
  ✓ requester_upvotes_minus_downvotes_at_request
  