# Baseline Model: LightGBM with Basic Features

This notebook implements a baseline model using:
- Basic text features (length, word count)
- Numerical features with log transforms
- User flair encoding
- TF-IDF for text representation
- LightGBM with class weighting for imbalance

Based on existing findings:
- Class imbalance: 75.2% False, 24.8% True
- User flair is highly predictive ('shroom' = received pizza); because the flair reflects the outcome itself, treat it as a potential source of target leakage
- Numerical features are skewed and need log transforms

In [5]:
import pandas as pd
import numpy as np
import json
from sklearn.model_selection import StratifiedKFold
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder
import lightgbm as lgb
from scipy import sparse
import warnings
warnings.filterwarnings('ignore')

# Single seed reused below wherever a seed is accepted (e.g. the
# StratifiedKFold splitter). The call here seeds NumPy's global RNG only.
RANDOM_SEED = 42
np.random.seed(RANDOM_SEED)

## Load Data

In [6]:
def _read_json_records(path):
    """Return the parsed contents of the JSON file at *path*."""
    # Decode explicitly as UTF-8: without `encoding`, open() falls back to the
    # platform's locale encoding, which can mis-decode non-ASCII request text.
    with open(path, 'r', encoding='utf-8') as f:
        return json.load(f)


# Load training data
train_data = _read_json_records('/home/data/train.json')
train_df = pd.DataFrame(train_data)
print(f"Training data shape: {train_df.shape}")
print(f"Columns: {list(train_df.columns)}")

# Load test data
test_data = _read_json_records('/home/data/test.json')
test_df = pd.DataFrame(test_data)
print(f"Test data shape: {test_df.shape}")

# Check target distribution (normalized, i.e. class proportions)
print("\nTarget distribution:")
print(train_df['requester_received_pizza'].value_counts(normalize=True))

Training data shape: (2878, 32)
Columns: ['giver_username_if_known', 'number_of_downvotes_of_request_at_retrieval', 'number_of_upvotes_of_request_at_retrieval', 'post_was_edited', 'request_id', 'request_number_of_comments_at_retrieval', 'request_text', 'request_text_edit_aware', 'request_title', 'requester_account_age_in_days_at_request', 'requester_account_age_in_days_at_retrieval', 'requester_days_since_first_post_on_raop_at_request', 'requester_days_since_first_post_on_raop_at_retrieval', 'requester_number_of_comments_at_request', 'requester_number_of_comments_at_retrieval', 'requester_number_of_comments_in_raop_at_request', 'requester_number_of_comments_in_raop_at_retrieval', 'requester_number_of_posts_at_request', 'requester_number_of_posts_at_retrieval', 'requester_number_of_posts_on_raop_at_request', 'requester_number_of_posts_on_raop_at_retrieval', 'requester_number_of_subreddits_at_request', 'requester_received_pizza', 'requester_subreddits_at_request', 'requester_upvotes_minu

## Feature Engineering

### 1. Text Features

In [None]:
def extract_text_features(df):
    """Extract character-length and word-count features from the request text.

    Args:
        df: DataFrame with 'request_id', 'request_title' and
            'request_text_edit_aware' columns; 'request_text' is optional.

    Returns:
        DataFrame with 'request_id' followed by length / word-count columns
        for the title, the body, the edit-aware body, and title + body totals.
        Missing text values count as empty strings (length 0, 0 words).
    """
    def _length_and_words(column):
        # Missing text is treated as '' so both measures come out as 0.
        text = df[column].fillna('')
        return text.str.len(), text.str.split().str.len()

    features = pd.DataFrame()
    features['request_id'] = df['request_id']

    # Title features
    title_length, title_words = _length_and_words('request_title')
    features['title_length'] = title_length
    features['title_word_count'] = title_words

    # Edit-aware text features
    edit_aware_length, edit_aware_words = _length_and_words('request_text_edit_aware')

    # Body text features. When the raw 'request_text' column is absent, fall
    # back to the edit-aware text instead of a constant 0: a constant would
    # make text_* and total_* carry no signal for that split and differ
    # systematically from a split that does have the column.
    if 'request_text' in df.columns:
        text_length, text_words = _length_and_words('request_text')
    else:
        text_length, text_words = edit_aware_length, edit_aware_words
    features['text_length'] = text_length
    features['text_word_count'] = text_words

    features['edit_aware_length'] = edit_aware_length
    features['edit_aware_word_count'] = edit_aware_words

    # Combined title + body totals
    features['total_text_length'] = features['title_length'] + features['text_length']
    features['total_word_count'] = features['title_word_count'] + features['text_word_count']

    return features

### 2. Numerical Features with Log Transforms

In [None]:
# (output feature name, source column, apply log1p) -- order defines the
# column order of the returned frame, which downstream code uses positionally.
_NUMERIC_FEATURE_SPECS = (
    # Account age
    ('account_age_at_request', 'requester_account_age_in_days_at_request', False),
    ('account_age_at_retrieval', 'requester_account_age_in_days_at_retrieval', False),
    # Upvotes + downvotes (log1p-transformed due to skewness)
    ('log_upvotes_plus_downvotes_at_request', 'requester_upvotes_plus_downvotes_at_request', True),
    ('log_upvotes_plus_downvotes_at_retrieval', 'requester_upvotes_plus_downvotes_at_retrieval', True),
    # Upvotes minus downvotes (net score)
    ('upvotes_minus_downvotes_at_request', 'requester_upvotes_minus_downvotes_at_request', False),
    ('upvotes_minus_downvotes_at_retrieval', 'requester_upvotes_minus_downvotes_at_retrieval', False),
    # Number of comments
    ('num_comments_at_request', 'requester_number_of_comments_at_request', False),
    ('num_comments_at_retrieval', 'requester_number_of_comments_at_retrieval', False),
    ('num_comments_in_raop_at_request', 'requester_number_of_comments_in_raop_at_request', False),
    ('num_comments_in_raop_at_retrieval', 'requester_number_of_comments_in_raop_at_retrieval', False),
    # Number of posts
    ('num_posts_at_request', 'requester_number_of_posts_at_request', False),
    ('num_posts_at_retrieval', 'requester_number_of_posts_at_retrieval', False),
    ('num_posts_on_raop_at_request', 'requester_number_of_posts_on_raop_at_request', False),
    ('num_posts_on_raop_at_retrieval', 'requester_number_of_posts_on_raop_at_retrieval', False),
    # Number of subreddits
    ('num_subreddits_at_request', 'requester_number_of_subreddits_at_request', False),
    # Days since first post on RAOP
    ('days_since_first_raop_post_at_request', 'requester_days_since_first_post_on_raop_at_request', False),
    ('days_since_first_raop_post_at_retrieval', 'requester_days_since_first_post_on_raop_at_retrieval', False),
    # Request comments and votes
    ('request_num_comments', 'request_number_of_comments_at_retrieval', False),
    ('request_upvotes', 'number_of_upvotes_of_request_at_retrieval', False),
    ('request_downvotes', 'number_of_downvotes_of_request_at_retrieval', False),
)


def _numeric_column(df, column):
    """Return df[column] with NaN replaced by 0, or all zeros if the column is absent."""
    if column in df.columns:
        return df[column].fillna(0)
    return pd.Series(0, index=df.index)


def extract_numerical_features(df):
    """Extract and transform numerical features.

    Every source column is optional: an absent column yields a constant-0
    feature, and missing values inside a present column are replaced by 0
    (this now includes the account-age columns, which were previously the
    only ones left unfilled).

    NOTE(review): many sources are *_at_retrieval columns; if one split lacks
    them they become constant 0 there while carrying real values in the other
    split -- confirm this train/test difference is intended.

    Args:
        df: DataFrame with a 'request_id' column plus any subset of the
            source columns listed in _NUMERIC_FEATURE_SPECS and
            'post_was_edited'.

    Returns:
        DataFrame with 'request_id', one column per entry of
        _NUMERIC_FEATURE_SPECS (in that order), 'request_total_votes'
        and 'post_was_edited'.
    """
    features = pd.DataFrame()
    features['request_id'] = df['request_id']

    for feature_name, source_column, use_log in _NUMERIC_FEATURE_SPECS:
        values = _numeric_column(df, source_column)
        features[feature_name] = np.log1p(values) if use_log else values

    features['request_total_votes'] = features['request_upvotes'] + features['request_downvotes']

    # Fill before the integer cast: astype(int) raises on missing values.
    features['post_was_edited'] = _numeric_column(df, 'post_was_edited').astype(int)

    return features

### 3. Categorical Features

In [None]:
def extract_categorical_features(df):
    """Extract categorical features: requester flair and a giver-known flag.

    Both source columns are optional, matching the other extractors in this
    notebook: a frame without them gets flair 'None' and giver_known 0
    instead of raising KeyError.

    NOTE(review): the flair is described here as 'shroom' = received pizza,
    'PIF' = pay-it-forward, None = no pizza, i.e. it reflects the outcome --
    confirm it is legitimate to use as a feature.

    Args:
        df: DataFrame with a 'request_id' column and, optionally,
            'requester_user_flair' and 'giver_username_if_known'.

    Returns:
        DataFrame with columns 'request_id', 'user_flair' (missing -> 'None')
        and 'giver_known' (1 when a giver username other than 'N/A' is
        present, else 0).
    """
    features = pd.DataFrame()
    features['request_id'] = df['request_id']

    # User flair (highly predictive according to EDA)
    if 'requester_user_flair' in df.columns:
        features['user_flair'] = df['requester_user_flair'].fillna('None')
    else:
        features['user_flair'] = 'None'

    # Giver username (if known). Missing values are mapped to the 'N/A'
    # sentinel first: NaN != 'N/A' is True and would otherwise count a
    # missing giver as known.
    if 'giver_username_if_known' in df.columns:
        giver = df['giver_username_if_known'].fillna('N/A')
        features['giver_known'] = (giver != 'N/A').astype(int)
    else:
        features['giver_known'] = 0

    return features

# Run the categorical extractor over both splits in one pass.
train_categorical_features, test_categorical_features = (
    extract_categorical_features(split) for split in (train_df, test_df)
)

# Show how often each flair value occurs in the training split.
flair_counts = train_categorical_features['user_flair'].value_counts()
print("Categorical features extracted:")
print(flair_counts)

### 4. TF-IDF Features for Text

In [None]:
def _title_and_body(frame):
    """Join title and edit-aware body with one space; missing values count as ''."""
    titles = frame['request_title'].fillna('')
    bodies = frame['request_text_edit_aware'].fillna('')
    return (titles + ' ' + bodies).tolist()


# One combined document per request, for each split
train_combined_text = _title_and_body(train_df)
test_combined_text = _title_and_body(test_df)

# Vectorizer settings: unigrams + bigrams, English stop words removed,
# vocabulary capped to keep the feature matrix manageable.
tfidf_settings = {
    'max_features': 5000,
    'stop_words': 'english',
    'ngram_range': (1, 2),
    'min_df': 2,
    'max_df': 0.95,
}
tfidf = TfidfVectorizer(**tfidf_settings)

# The vocabulary and IDF weights are learned from the full training split
# (all rows, before any cross-validation split) and reused for the test split.
tfidf_features_train = tfidf.fit_transform(train_combined_text)
tfidf_features_test = tfidf.transform(test_combined_text)

print(f"TF-IDF features shape: {tfidf_features_train.shape}")

## Combine All Features

In [None]:
# Build the text and numerical feature frames. The cells above only define
# extract_text_features / extract_numerical_features; nothing assigns these
# four names, so the merges below would otherwise fail with NameError.
train_text_features = extract_text_features(train_df)
test_text_features = extract_text_features(test_df)
train_numerical_features = extract_numerical_features(train_df)
test_numerical_features = extract_numerical_features(test_df)


def _merge_feature_frames(text_features, numerical_features, categorical_features):
    """Join the three per-request feature frames on 'request_id'.

    validate='one_to_one' makes pandas raise if a request_id is duplicated.
    A duplicate would multiply rows and silently break the positional
    alignment with the target vector and the TF-IDF matrix.
    """
    merged = text_features.merge(numerical_features, on='request_id', validate='one_to_one')
    return merged.merge(categorical_features, on='request_id', validate='one_to_one')


# Merge all feature dataframes
train_features = _merge_feature_frames(
    train_text_features, train_numerical_features, train_categorical_features
)
test_features = _merge_feature_frames(
    test_text_features, test_numerical_features, test_categorical_features
)

# Encode categorical features. The encoder is fitted on the flair values of
# both splits because LabelEncoder.transform raises ValueError for a label it
# has not seen during fit.
le_flair = LabelEncoder()
le_flair.fit(pd.concat([train_features['user_flair'], test_features['user_flair']]))
train_features['user_flair_encoded'] = le_flair.transform(train_features['user_flair'])
test_features['user_flair_encoded'] = le_flair.transform(test_features['user_flair'])

# Drop original categorical columns
train_features = train_features.drop(['user_flair'], axis=1)
test_features = test_features.drop(['user_flair'], axis=1)

# Separate target and features for training
y = train_df['requester_received_pizza'].astype(int)
X = train_features.drop(['request_id'], axis=1)
X_test = test_features.drop(['request_id'], axis=1)

print(f"Final training features shape: {X.shape}")
print(f"Final test features shape: {X_test.shape}")

# Check for any missing values
print(f"\nMissing values in training features: {X.isnull().sum().sum()}")
print(f"Missing values in test features: {X_test.isnull().sum().sum()}")

# Fill any remaining missing values with 0
X = X.fillna(0)
X_test = X_test.fillna(0)

## Model Training with Cross-Validation

In [None]:
# Combine dense features with sparse TF-IDF features.
# Cast to float64 up front so a non-numeric column fails here with a clear
# error rather than inside the sparse-matrix constructor.
X_dense = X.to_numpy(dtype=np.float64)
X_test_dense = X_test.to_numpy(dtype=np.float64)

# Stack dense and sparse features. Request CSR explicitly: hstack only
# promises "an appropriate" format by default, and the cross-validation code
# selects rows with X_combined[idx], which CSR supports.
X_combined = sparse.hstack(
    [sparse.csr_matrix(X_dense), tfidf_features_train], format='csr'
)
X_test_combined = sparse.hstack(
    [sparse.csr_matrix(X_test_dense), tfidf_features_test], format='csr'
)

print(f"Combined training features shape: {X_combined.shape}")
print(f"Combined test features shape: {X_test_combined.shape}")

# Set up stratified k-fold cross-validation
n_splits = 5
skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=RANDOM_SEED)

# Calculate scale_pos_weight for handling class imbalance
# (ratio of negative to positive training labels)
scale_pos_weight = (y == 0).sum() / (y == 1).sum()
print(f"Scale pos weight for class imbalance: {scale_pos_weight:.2f}")

# Model parameters are the same for every fold, so build them once.
params = {
    'objective': 'binary',
    'metric': 'auc',
    'boosting_type': 'gbdt',
    'num_leaves': 31,
    'learning_rate': 0.05,
    'feature_fraction': 0.8,
    'bagging_fraction': 0.8,
    'bagging_freq': 5,
    'verbose': -1,
    'scale_pos_weight': scale_pos_weight,
    # np.random.seed() does not reach LightGBM; pass the seed explicitly so
    # bagging / feature sub-sampling follow RANDOM_SEED as well.
    'seed': RANDOM_SEED,
}

# Store predictions: out-of-fold for train rows, fold-averaged for test rows
oof_predictions = np.zeros(len(X))
test_predictions = np.zeros(len(X_test))
cv_scores = []

# Train model on each fold
for fold, (train_idx, valid_idx) in enumerate(skf.split(X_combined, y)):
    print(f"\nTraining fold {fold + 1}/{n_splits}")

    # Split data
    X_train_fold = X_combined[train_idx]
    X_valid_fold = X_combined[valid_idx]
    y_train_fold = y.iloc[train_idx]
    y_valid_fold = y.iloc[valid_idx]

    # Create LightGBM datasets. Named *_set so they do not overwrite the raw
    # `train_data` records loaded at the top of the notebook; the validation
    # set references the training set so both share the same feature binning.
    train_set = lgb.Dataset(X_train_fold, label=y_train_fold)
    valid_set = lgb.Dataset(X_valid_fold, label=y_valid_fold, reference=train_set)

    # Train model with early stopping on the validation AUC
    model = lgb.train(
        params,
        train_set,
        num_boost_round=1000,
        valid_sets=[valid_set],
        callbacks=[lgb.early_stopping(50), lgb.log_evaluation(100)]
    )

    # Make predictions at the best iteration found by early stopping
    oof_predictions[valid_idx] = model.predict(X_valid_fold, num_iteration=model.best_iteration)
    test_predictions += model.predict(X_test_combined, num_iteration=model.best_iteration) / n_splits

    # Calculate fold score
    fold_score = model.best_score['valid_0']['auc']
    cv_scores.append(fold_score)
    print(f"Fold {fold + 1} AUC: {fold_score:.4f}")

# Calculate overall CV score (mean of the per-fold validation AUCs)
overall_score = np.mean(cv_scores)
print(f"\nOverall CV AUC: {overall_score:.4f} ± {np.std(cv_scores):.4f}")

## Generate Submission

In [None]:
# Assemble the submission: one row per test request, with the predicted
# probability bounded to the [0, 1] range.
submission_path = '/home/submission/submission.csv'
submission = pd.DataFrame({
    'request_id': test_df['request_id'],
    'requester_received_pizza': np.clip(test_predictions, 0, 1),
})

print("Submission preview:")
print(submission.head())

# Write the CSV without the DataFrame index, then report what was written.
submission.to_csv(submission_path, index=False)
print(f"\nSubmission saved to {submission_path}")
print(f"Submission shape: {submission.shape}")