# Baseline Model: LightGBM with Text and Metadata Features

This notebook creates a baseline model for the Random Acts of Pizza competition using LightGBM with both text and metadata features.

In [None]:
import json
import pandas as pd
import numpy as np
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score
import lightgbm as lgb
from sklearn.feature_extraction.text import TfidfVectorizer
import warnings
warnings.filterwarnings('ignore')

# Fix the global NumPy RNG so repeated runs are reproducible. SEED is also
# reused further down for the StratifiedKFold splitter and the LightGBM params.
SEED = 42
np.random.seed(SEED)

## Load Data

In [None]:
# Build one text field per request: title, a single space, then the body.
# The edit-aware body is used because the test set only provides that variant.
for frame in (train_df, test_df):
    title_part = frame['request_title'].fillna('')
    body_part = frame['request_text_edit_aware'].fillna('')
    frame['combined_text'] = title_part + ' ' + body_part

# Basic text statistics
def extract_text_features(df):
    """Add simple surface statistics of ``combined_text`` as new columns.

    The frame is modified in place and also returned, so both
    ``df = extract_text_features(df)`` and a bare call work.

    Columns added:
        text_length        number of characters
        word_count         number of whitespace-separated tokens
        exclamation_count  occurrences of '!'
        question_count     occurrences of '?'
        caps_count         occurrences of ASCII upper-case letters
        caps_ratio         caps_count / (text_length + 1)

    Args:
        df: DataFrame with a string column named ``combined_text``.

    Returns:
        The same DataFrame object, with the columns above added.
    """
    text = df['combined_text']

    df['text_length'] = text.str.len()
    df['word_count'] = text.str.split().str.len()
    df['exclamation_count'] = text.str.count('!')
    # Series.str.count takes a regex, so '?' must be escaped. A raw string is
    # used because '\?' in a normal literal is an invalid escape sequence.
    df['question_count'] = text.str.count(r'\?')
    df['caps_count'] = text.str.count(r'[A-Z]')
    # +1 in the denominator avoids division by zero for empty texts.
    df['caps_ratio'] = df['caps_count'] / (df['text_length'] + 1)
    return df

# Compute the text statistics for train first, then test, rebinding both names
# to the returned frames.
train_df, test_df = (extract_text_features(frame) for frame in (train_df, test_df))

print("Text features extracted")

# Metadata features - candidate numeric features (only those available at request time)
candidate_numeric_features = [
    'requester_account_age_in_days_at_request',
    'requester_number_of_comments_at_request',
    'requester_number_of_posts_at_request',
    'requester_number_of_subreddits_at_request',
    'requester_upvotes_minus_downvotes_at_request',
    'requester_upvotes_plus_downvotes_at_request',
    'requester_number_of_comments_in_raop_at_request',
    'requester_number_of_posts_on_raop_at_request',
    'requester_days_since_first_post_on_raop_at_request',
]

# Keep only the candidates present in BOTH frames. The kept list is built
# separately rather than calling .remove() on the list being iterated, because
# removing during iteration makes the loop skip the element that follows.
numeric_features = []
for col in candidate_numeric_features:
    if col not in train_df.columns or col not in test_df.columns:
        print(f"Warning: {col} not found in data")
        continue

    # Coerce unparseable entries to NaN so they are imputed below.
    train_df[col] = pd.to_numeric(train_df[col], errors='coerce')
    test_df[col] = pd.to_numeric(test_df[col], errors='coerce')

    # Impute both frames with the TRAINING median so no test statistic is used.
    median_val = train_df[col].median()
    train_df[col] = train_df[col].fillna(median_val)
    test_df[col] = test_df[col].fillna(median_val)

    numeric_features.append(col)

print(f"Numeric features processed: {len(numeric_features)}")

In [None]:
# Categorical features: only columns that exist in both train and test are used.
available_categorical = [
    name
    for name in ('requester_user_flair', 'post_was_edited')
    if name in train_df.columns and name in test_df.columns
]

for name in available_categorical:
    # Missing values become their own category.
    train_df[name] = train_df[name].fillna('missing')
    test_df[name] = test_df[name].fillna('missing')

    # Integer codes follow order of first appearance across train then test,
    # so both frames share one mapping.
    levels = pd.concat([train_df[name], test_df[name]]).unique()
    codes = dict(zip(levels, range(len(levels))))
    train_df[name + '_encoded'] = train_df[name].map(codes)
    test_df[name + '_encoded'] = test_df[name].map(codes)

# Names of the encoded columns (empty when no categorical column is available).
categorical_features = [name + '_encoded' for name in available_categorical]

print(f"Categorical features encoded: {len(categorical_features)}")

In [None]:
# Metadata features - candidate numeric features
# NOTE(review): the three *_at_retrieval columns appear to be measured after the
# request was posted, so they may leak the outcome and may be absent from the
# test set - confirm before relying on them.
candidate_numeric_features = [
    'requester_account_age_in_days_at_request',
    'requester_number_of_comments_at_request',
    'requester_number_of_posts_at_request',
    'requester_number_of_subreddits_at_request',
    'requester_upvotes_minus_downvotes_at_request',
    'requester_upvotes_plus_downvotes_at_request',
    'requester_number_of_comments_in_raop_at_request',
    'requester_number_of_posts_on_raop_at_request',
    'requester_days_since_first_post_on_raop_at_request',
    'number_of_upvotes_of_request_at_retrieval',
    'number_of_downvotes_of_request_at_retrieval',
    'request_number_of_comments_at_retrieval',
]

# Only columns present in BOTH frames are kept. Indexing a column that is
# missing from test_df would raise KeyError, and a feature the test set lacks
# cannot be used at prediction time anyway.
numeric_features = []
for col in candidate_numeric_features:
    if col not in train_df.columns or col not in test_df.columns:
        print(f"Warning: {col} not found in data")
        continue

    # Coerce unparseable entries to NaN so they are imputed below.
    train_df[col] = pd.to_numeric(train_df[col], errors='coerce')
    test_df[col] = pd.to_numeric(test_df[col], errors='coerce')

    # Impute both frames with the TRAINING median.
    median_val = train_df[col].median()
    train_df[col] = train_df[col].fillna(median_val)
    test_df[col] = test_df[col].fillna(median_val)

    numeric_features.append(col)

print("Numeric features processed")

In [None]:
# Assemble the model inputs: tabular columns first, then the TF-IDF columns.
feature_cols = [
    *numeric_features,
    *categorical_features,
    'text_length', 'word_count', 'exclamation_count', 'question_count', 'caps_count', 'caps_ratio',
]

# The tabular part gets a fresh 0..n-1 index before the column-wise concat
# with the TF-IDF frames.
train_tabular = train_df[feature_cols].reset_index(drop=True)
test_tabular = test_df[feature_cols].reset_index(drop=True)
X_train = pd.concat([train_tabular, tfidf_train_df], axis=1)
X_test = pd.concat([test_tabular, tfidf_test_df], axis=1)

# Binary target as 0/1 integers.
y_train = train_df['requester_received_pizza'].astype(int)

print(f"Final training shape: {X_train.shape}")
print(f"Final test shape: {X_test.shape}")
print(f"Feature columns: {len(feature_cols)} + {tfidf_train_df.shape[1]} TF-IDF features")

## TF-IDF Features

In [None]:
# TF-IDF over unigrams and bigrams, capped at 1000 terms, with English stop
# words removed and very rare / very common terms dropped via min_df / max_df.
tfidf_settings = dict(
    max_features=1000,
    stop_words='english',
    ngram_range=(1, 2),
    min_df=5,
    max_df=0.9,
)
vectorizer = TfidfVectorizer(**tfidf_settings)

# The vectorizer is fitted on the training text only; the test text is merely
# transformed with it.
tfidf_train = vectorizer.fit_transform(train_df['combined_text'])
tfidf_test = vectorizer.transform(test_df['combined_text'])

print(f"TF-IDF features shape: {tfidf_train.shape}")


def _dense_tfidf_frame(matrix):
    """Return *matrix* as a dense DataFrame with columns tfidf_0, tfidf_1, ..."""
    column_names = [f'tfidf_{i}' for i in range(matrix.shape[1])]
    return pd.DataFrame(matrix.toarray(), columns=column_names)


# Dense frames for easier handling alongside the tabular features.
tfidf_train_df = _dense_tfidf_frame(tfidf_train)
tfidf_test_df = _dense_tfidf_frame(tfidf_test)

## Prepare Final Feature Matrix

In [None]:
# Combine all features.
# categorical_features already holds the '*_encoded' column names, so it is
# used as-is. Appending '_encoded' again would produce '*_encoded_encoded'
# names that do not exist and make the column selection below raise KeyError.
text_stat_features = [
    'text_length', 'word_count', 'exclamation_count',
    'question_count', 'caps_count', 'caps_ratio',
]
feature_cols = list(numeric_features) + list(categorical_features) + text_stat_features

# Give the tabular part a fresh 0..n-1 index before the column-wise concat
# with the TF-IDF frames.
X_train = pd.concat([train_df[feature_cols].reset_index(drop=True), tfidf_train_df], axis=1)
X_test = pd.concat([test_df[feature_cols].reset_index(drop=True), tfidf_test_df], axis=1)

# Binary target as 0/1 integers.
y_train = train_df['requester_received_pizza'].astype(int)

print(f"Final training shape: {X_train.shape}")
print(f"Final test shape: {X_test.shape}")
print(f"Feature columns: {len(feature_cols)} + {tfidf_train_df.shape[1]} TF-IDF features")

## Model Training with Cross-Validation

In [None]:
# Stratified K-Fold CV
# The fold count is named once so the splitter, the progress message and the
# test-prediction averaging cannot drift apart.
N_SPLITS = 5
skf = StratifiedKFold(n_splits=N_SPLITS, shuffle=True, random_state=SEED)

cv_scores = []
oof_predictions = np.zeros(len(train_df))   # out-of-fold predictions, one per training row
test_predictions = np.zeros(len(test_df))   # running mean of the per-fold test predictions

feature_names = X_train.columns.tolist()

# Parameters shared by every fold (hoisted out of the loop; only
# scale_pos_weight depends on the fold).
base_params = {
    'objective': 'binary',
    'metric': 'auc',
    'boosting_type': 'gbdt',
    'num_leaves': 31,
    'learning_rate': 0.05,
    'feature_fraction': 0.8,
    'bagging_fraction': 0.8,
    'bagging_freq': 5,
    'verbose': -1,
    'seed': SEED,
}

for fold, (train_idx, valid_idx) in enumerate(skf.split(X_train, y_train)):
    print(f"\nFold {fold + 1}/{N_SPLITS}")

    X_tr, X_val = X_train.iloc[train_idx], X_train.iloc[valid_idx]
    y_tr, y_val = y_train.iloc[train_idx], y_train.iloc[valid_idx]

    # LightGBM dataset
    train_data = lgb.Dataset(X_tr, label=y_tr)
    valid_data = lgb.Dataset(X_val, label=y_val)

    # Handle class imbalance: weight positives by the negative/positive ratio
    # of this fold's training part.
    scale_pos_weight = (y_tr == 0).sum() / (y_tr == 1).sum()
    params = {**base_params, 'scale_pos_weight': scale_pos_weight}

    # Train model; stop when validation AUC has not improved for 50 rounds.
    model = lgb.train(
        params,
        train_data,
        num_boost_round=1000,
        valid_sets=[valid_data],
        callbacks=[lgb.early_stopping(50), lgb.log_evaluation(0)],
    )

    # Predictions at the best iteration found by early stopping
    val_pred = model.predict(X_val, num_iteration=model.best_iteration)
    test_pred = model.predict(X_test, num_iteration=model.best_iteration)

    # Store results
    oof_predictions[valid_idx] = val_pred
    test_predictions += test_pred / N_SPLITS

    # Calculate fold score
    fold_score = roc_auc_score(y_val, val_pred)
    cv_scores.append(fold_score)
    print(f"Fold {fold + 1} AUC: {fold_score:.4f}")

# Overall CV score (AUC over all out-of-fold predictions) and per-fold spread.
overall_score = roc_auc_score(y_train, oof_predictions)
print(f"\nOverall CV AUC: {overall_score:.4f}")
# \u00b1 is the plus-minus sign; the previous literal was mis-encoded text.
print(f"Mean CV AUC: {np.mean(cv_scores):.4f} \u00b1 {np.std(cv_scores):.4f}")

## Feature Importance

In [None]:
# Gain-based feature importance. `model` is whatever the training loop last
# assigned, i.e. the model of the final fold only.
gain_values = model.feature_importance(importance_type='gain')
importance_df = pd.DataFrame({'feature': feature_names, 'importance': gain_values})
importance_df = importance_df.sort_values('importance', ascending=False)

print("Top 20 most important features:")
print(importance_df.head(20))

## Create Submission

In [None]:
# Build the submission: one row per test request with its averaged predicted
# probability, stored as float.
target_col = 'requester_received_pizza'
submission = pd.DataFrame({
    'request_id': test_df['request_id'],
    target_col: test_predictions,
}).astype({target_col: float})

# Write the CSV without the index column.
submission.to_csv('/home/submission/submission.csv', index=False)

# Quick sanity report on what was written.
print("Submission file created:")
print(submission.head())
print(f"\nSubmission shape: {submission.shape}")
print("Prediction distribution:")
print(submission[target_col].describe())