# Baseline Experiment: TF-IDF + LightGBM

This notebook implements a simple baseline following the seed strategy:
- TF-IDF vectorization of text features (request_title + request_text)
- Basic meta features engineering
- LightGBM classifier with class imbalance handling
- Stratified K-Fold validation

Key features from EDA to leverage:
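- requester_user_flair (very strong signal: per the EDA, the `shroom` flair value has a 0.96 correlation with the target; a signal this strong should be checked for target leakage, i.e. whether flair is known at request time and available at prediction time)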
- Text length features
- Cleaned post_was_edited boolean

In [None]:
import json
import os
import warnings

import lightgbm as lgb
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import StratifiedKFold
warnings.filterwarnings('ignore')

# Single seed for the whole notebook: it seeds NumPy's global RNG here and is
# also passed explicitly to StratifiedKFold (random_state) and to LightGBM
# ('seed' parameter) in the cross-validation cell.
RANDOM_SEED = 42
np.random.seed(RANDOM_SEED)

## Load and Prepare Data

In [None]:
# Locations of the raw JSON dumps for both splits
train_path = '/home/data/train.json'
test_path = '/home/data/test.json'

# Parse each file into a list of records, then wrap it in a DataFrame
with open(train_path, 'r') as f:
    train_data = json.load(f)
with open(test_path, 'r') as f:
    test_data = json.load(f)

train_df = pd.DataFrame(train_data)
test_df = pd.DataFrame(test_data)

# Size summary: rows are samples, columns are raw fields
n_train_rows, n_train_cols = train_df.shape
n_test_rows, n_test_cols = test_df.shape
print(f"Training samples: {n_train_rows}")
print(f"Test samples: {n_test_rows}")
print(f"Features in train: {n_train_cols}")
print(f"Features in test: {n_test_cols}")

# Share of each target class (fractions, not counts)
target_share = train_df['requester_received_pizza'].value_counts(normalize=True)
print("\nTarget distribution:")
print(target_share)

## Feature Engineering

In [None]:
# Text preprocessing and feature engineering
def _was_edited(value):
    """Return True when a raw ``post_was_edited`` value means "edited".

    The raw field is not a clean boolean: besides True/False it can hold a
    numeric timestamp. Reddit's ``edited`` field is either false or the epoch
    time of the edit, so any non-zero number is treated as "edited".
    Missing values (None/NaN) and unparseable strings count as not edited.
    """
    if isinstance(value, str):
        value = value.strip().lower()
        if value == 'true':
            return True
    try:
        number = float(value)  # handles bool, int, float and numeric strings
    except (TypeError, ValueError):
        return False
    return bool(pd.notna(number) and number != 0)


def engineer_features(df, is_train=True, flair_values=('None', 'PIF', 'shroom')):
    """Engineer the non-TF-IDF features (plus the raw ``full_text`` column).

    The returned frame always has the same columns in the same order, no
    matter which optional input columns are present, so that frames built
    from train and test data line up column-for-column.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw request records. Every input column is optional; see below for
        how a missing column is encoded.
    is_train : bool, default True
        Currently unused; kept so existing call sites keep working.
    flair_values : sequence of str, default ('None', 'PIF', 'shroom')
        Flair categories to one-hot encode as ``flair_<value>``. Missing
        flair is encoded as the category ``'None'``.

    Returns
    -------
    pandas.DataFrame
        Indexed like ``df`` with columns, in order: ``full_text``, five text
        length/word-count columns, one ``flair_<value>`` per entry of
        ``flair_values``, ``post_was_edited_clean``, five request-time
        activity columns and ``edit_aware_length``.

        Encoding of absent inputs: text columns -> empty strings
        (``request_text`` falls back to ``request_text_edit_aware``),
        flair -> category ``'None'``, ``post_was_edited`` -> 0,
        numeric pass-through columns and ``edit_aware_length`` -> NaN.
    """
    features = pd.DataFrame(index=df.index)

    def text_column(*candidates):
        """First available candidate column with NaN -> '', else all ''."""
        for name in candidates:
            if name in df.columns:
                return df[name].fillna('')
        return pd.Series('', index=df.index, dtype=object)

    title = text_column('request_title')
    # Fall back to the edit-aware text when the raw text field is not shipped
    body = text_column('request_text', 'request_text_edit_aware')

    # 1. Text features - combine title and text for full context
    features['full_text'] = title + ' ' + body

    # 2. Text length features
    features['text_length'] = body.str.len()
    features['title_length'] = title.str.len()
    features['total_text_length'] = features['text_length'] + features['title_length']
    features['text_word_count'] = body.str.split().str.len()
    features['title_word_count'] = title.str.split().str.len()

    # 3. User flair, one-hot encoded against a FIXED category list so the
    #    columns do not depend on which values happen to occur in `df`.
    #    A missing value and a missing column are both encoded as 'None'.
    # NOTE(review): if flair is absent at prediction time, a model trained on
    # it only ever sees the 'None' category on test - confirm this feature is
    # not target leakage before relying on it.
    if 'requester_user_flair' in df.columns:
        flair = df['requester_user_flair'].fillna('None').astype(str)
    else:
        flair = pd.Series('None', index=df.index, dtype=object)
    for value in flair_values:
        features[f'flair_{value}'] = (flair == value).astype(int)

    # 4. Clean post_was_edited: True / 'true' / any non-zero timestamp -> 1
    if 'post_was_edited' in df.columns:
        features['post_was_edited_clean'] = (
            df['post_was_edited'].apply(_was_edited).astype(int)
        )
    else:
        features['post_was_edited_clean'] = 0

    # 5-7. Request-time activity, account age and time since first RAOP post
    #      are copied through unchanged; NaN marks a column that is absent.
    passthrough = (
        ('requester_number_of_comments_at_request', 'comments_at_request'),
        ('requester_number_of_posts_at_request', 'posts_at_request'),
        ('requester_upvotes_plus_downvotes_at_request', 'votes_at_request'),
        ('requester_account_age_in_days_at_request', 'account_age_days'),
        ('requester_days_since_first_post_on_raop_at_request', 'days_since_first_raop'),
    )
    for source, target in passthrough:
        features[target] = df[source] if source in df.columns else np.nan

    # 8. Request text edit aware (alternative text field)
    if 'request_text_edit_aware' in df.columns:
        features['edit_aware_length'] = df['request_text_edit_aware'].fillna('').str.len()
    else:
        features['edit_aware_length'] = np.nan

    return features

# Engineer features for train and test
train_features = engineer_features(train_df)
test_features = engineer_features(test_df, is_train=False)

# The model is fit on the training columns, so the test frame must expose
# exactly those columns in exactly that order. Columns that only exist in
# train are added to test filled with 0; columns that only exist in test are
# dropped. Report any difference so it is not silently hidden.
missing_in_test = train_features.columns.difference(test_features.columns).tolist()
extra_in_test = test_features.columns.difference(train_features.columns).tolist()
if missing_in_test or extra_in_test:
    print(f"Aligning test features to train: adding {missing_in_test}, dropping {extra_in_test}")
test_features = test_features.reindex(columns=train_features.columns, fill_value=0)

print(f"Engineered features shape: {train_features.shape}")
print(f"Feature columns: {train_features.columns.tolist()}")

## TF-IDF Vectorization

In [None]:
# TF-IDF vectorizer: word uni- and bigrams, vocabulary capped for the
# baseline, English stop words removed, terms seen in fewer than 2 documents
# or in more than 95% of documents ignored.
tfidf = TfidfVectorizer(
    ngram_range=(1, 2),
    max_features=5000,
    min_df=2,
    max_df=0.95,
    stop_words='english',
    lowercase=True,
)

# Vocabulary and IDF weights are learned from train and test text together
combined_text = pd.concat(
    [train_features['full_text'], test_features['full_text']],
    axis=0,
)
tfidf.fit(combined_text)

# Project each split onto the fitted vocabulary
train_tfidf = tfidf.transform(train_features['full_text'])
test_tfidf = tfidf.transform(test_features['full_text'])

print(f"TF-IDF features shape: {train_tfidf.shape}")

# Densify into DataFrames; columns are named by vocabulary position
tfidf_feature_names = ['tfidf_' + str(i) for i in range(train_tfidf.shape[1])]
train_tfidf_df = pd.DataFrame(
    train_tfidf.toarray(),
    index=train_features.index,
    columns=tfidf_feature_names,
)
test_tfidf_df = pd.DataFrame(
    test_tfidf.toarray(),
    index=test_features.index,
    columns=tfidf_feature_names,
)

# Final matrices: engineered features (without the raw text) followed by TF-IDF
train_features_combined = pd.concat(
    [train_features.drop(columns='full_text'), train_tfidf_df],
    axis=1,
)
test_features_combined = pd.concat(
    [test_features.drop(columns='full_text'), test_tfidf_df],
    axis=1,
)

print(f"Final feature matrix shape: {train_features_combined.shape}")
print(f"Test feature matrix shape: {test_features_combined.shape}")

## Model Training with Cross-Validation

In [None]:
# Prepare data for training
X = train_features_combined
y = train_df['requester_received_pizza'].astype(int)

# The boosters are fit on X's columns, so predict on a test matrix with exactly
# those columns in that order. Train-only columns are filled with 0 and
# test-only columns are dropped; when the columns already match this is a no-op.
if test_features_combined.columns.equals(X.columns):
    X_test = test_features_combined
else:
    X_test = test_features_combined.reindex(columns=X.columns, fill_value=0)

# Stratified K-Fold to handle class imbalance
n_splits = 5
skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=RANDOM_SEED)

# Out-of-fold predictions for train, fold-averaged predictions for test
train_predictions = np.zeros(len(X))
test_predictions = np.zeros(len(X_test))
fold_scores = []

# LightGBM parameters shared by every fold - conservative for baseline.
# 'scale_pos_weight' is added per fold because it depends on the fold's labels.
base_params = {
    'objective': 'binary',
    'metric': 'auc',
    'boosting_type': 'gbdt',
    'num_leaves': 31,
    'learning_rate': 0.05,
    'feature_fraction': 0.8,
    'bagging_fraction': 0.8,
    'bagging_freq': 5,
    'verbose': -1,
    'seed': RANDOM_SEED
}

print(f"Starting {n_splits}-fold cross-validation...")

for fold, (train_idx, valid_idx) in enumerate(skf.split(X, y)):
    print(f"\nFold {fold + 1}/{n_splits}")

    # Split data
    X_train, X_valid = X.iloc[train_idx], X.iloc[valid_idx]
    y_train, y_valid = y.iloc[train_idx], y.iloc[valid_idx]

    # Negative/positive ratio for class imbalance; guard against a fold
    # without positives so the weight stays finite
    n_pos = int((y_train == 1).sum())
    n_neg = int((y_train == 0).sum())
    scale_pos_weight = n_neg / max(n_pos, 1)
    print(f"Scale pos weight: {scale_pos_weight:.2f}")

    # LightGBM datasets (named lgb_* so they do not overwrite the raw
    # `train_data` records loaded at the top of the notebook)
    lgb_train = lgb.Dataset(X_train, label=y_train)
    lgb_valid = lgb.Dataset(X_valid, label=y_valid, reference=lgb_train)

    params = {**base_params, 'scale_pos_weight': scale_pos_weight}

    # Train with early stopping on the validation fold.
    # Note: best_iteration is chosen on the same fold that is scored below,
    # so the reported fold AUC is slightly optimistic.
    model = lgb.train(
        params,
        lgb_train,
        num_boost_round=1000,
        valid_sets=[lgb_valid],
        valid_names=['valid'],
        callbacks=[
            lgb.early_stopping(50),
            lgb.log_evaluation(0)
        ]
    )

    # Predict on validation set
    valid_pred = model.predict(X_valid, num_iteration=model.best_iteration)
    train_predictions[valid_idx] = valid_pred

    # Calculate AUC for this fold
    fold_auc = roc_auc_score(y_valid, valid_pred)
    fold_scores.append(fold_auc)
    print(f"Fold {fold + 1} AUC: {fold_auc:.4f}")

    # Predict on test set and accumulate the fold average
    test_pred = model.predict(X_test, num_iteration=model.best_iteration)
    test_predictions += test_pred / n_splits

# Overall CV score on the out-of-fold predictions
cv_score = roc_auc_score(y, train_predictions)
print(f"\n{'='*50}")
print("Cross-Validation Results:")
print(f"Mean AUC: {np.mean(fold_scores):.4f} \u00b1 {np.std(fold_scores):.4f}")
print(f"Overall CV AUC: {cv_score:.4f}")
print(f"{'='*50}")

## Feature Importance Analysis

In [None]:
# Gain-based importances from `model`, i.e. the booster trained in the final
# CV fold (earlier folds' boosters are not kept)
feature_names = X.columns.tolist()
feature_importance = model.feature_importance(importance_type='gain')

# One row per feature, largest gain first
importance_df = pd.DataFrame(
    {'feature': feature_names, 'importance': feature_importance}
).sort_values(by='importance', ascending=False)

print("Top 20 Most Important Features:")
print(importance_df.head(20))

# Restrict to the flair columns (expected to rank high based on EDA)
is_flair_row = importance_df['feature'].str.contains('flair_')
flair_importance = importance_df[is_flair_row]
print("\nFlair features importance:")
print(flair_importance)

## Create Submission

In [None]:
# Sanity check before writing: a NaN/inf prediction would produce an invalid
# submission, so fail here with a clear message instead
assert np.isfinite(test_predictions).all(), "test_predictions contains NaN or inf"

# Create submission file: one row per test request with its predicted probability
submission = pd.DataFrame({
    'request_id': test_df['request_id'],
    'requester_received_pizza': test_predictions
})

# Ensure the format matches sample submission
print("Submission shape:", submission.shape)
print("\nFirst few rows:")
print(submission.head())

# Save submission (create the output directory first; to_csv does not)
submission_path = '/home/submission/submission_001_baseline.csv'
os.makedirs(os.path.dirname(submission_path), exist_ok=True)
submission.to_csv(submission_path, index=False)
print(f"\nSubmission saved to: {submission_path}")

# Check distribution of predictions
predicted = submission['requester_received_pizza']
print(f"\nPrediction distribution:")
print(f"Mean: {predicted.mean():.4f}")
print(f"Std: {predicted.std():.4f}")
print(f"Min: {predicted.min():.4f}")
print(f"Max: {predicted.max():.4f}")