# Baseline Model: Meta Features Only

This notebook implements a simple baseline using only numerical/meta features available in the test set.

**Features used:**
- User activity metrics (at_request only)
- Vote counts (at_request only)
- Temporal features
- Account age features

**Model:** LightGBM with class imbalance handling

In [None]:
import json
import os
import warnings

import lightgbm as lgb
import numpy as np
import pandas as pd
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import StratifiedKFold
warnings.filterwarnings('ignore')

# Read the raw train/test JSON files, then wrap each one in a DataFrame.
print("Loading data...")
with open('/home/data/train.json', 'r') as train_file:
    train_data = json.load(train_file)
with open('/home/data/test.json', 'r') as test_file:
    test_data = json.load(test_file)

train_df, test_df = pd.DataFrame(train_data), pd.DataFrame(test_data)

# Quick summary: frame sizes and the balance of the target column.
target_col = train_df['requester_received_pizza']
print(f"Train shape: {train_df.shape}")
print(f"Test shape: {test_df.shape}")
print(f"Target distribution: {target_col.value_counts().to_dict()}")
print(f"Positive rate: {target_col.mean():.3f}")

In [None]:
# The 11 numeric request-time columns used by this baseline, grouped by theme.
# Deliberately left out: request_id, requester_username, giver_username_if_known
# and requester_subreddits_at_request (high cardinality).
_feature_groups = {
    'User activity (at_request only)': [
        'requester_number_of_comments_at_request',
        'requester_number_of_posts_at_request',
        'requester_number_of_comments_in_raop_at_request',
        'requester_number_of_posts_on_raop_at_request',
        'requester_number_of_subreddits_at_request',
    ],
    'Vote counts (at_request only)': [
        'requester_upvotes_minus_downvotes_at_request',
        'requester_upvotes_plus_downvotes_at_request',
    ],
    'Temporal features': [
        'unix_timestamp_of_request',
        'unix_timestamp_of_request_utc',
    ],
    'Account age': [
        'requester_account_age_in_days_at_request',
        'requester_days_since_first_post_on_raop_at_request',
    ],
}

# Flatten group by group; dict insertion order keeps the column order stable.
meta_features = [name for names in _feature_groups.values() for name in names]

print(f"Using {len(meta_features)} meta features:")
print("\n".join(f"  - {feat}" for feat in meta_features))

In [None]:
# Derive calendar features from the UTC request timestamp (seconds since epoch).
print("Engineering temporal features...")

# Apply the same three column additions to both frames so they stay in sync.
for frame in (train_df, test_df):
    request_time = pd.to_datetime(frame['unix_timestamp_of_request_utc'], unit='s')
    frame['request_datetime'] = request_time
    frame['request_hour'] = request_time.dt.hour            # 0-23
    frame['request_dayofweek'] = request_time.dt.dayofweek  # Monday=0 ... Sunday=6

# Full model input: the raw meta columns followed by the derived ones.
engineered_features = ['request_hour', 'request_dayofweek']
all_features = [*meta_features, *engineered_features]

print(f"Total features: {len(all_features)}")

# Total count of missing cells across all model columns, per data set.
train_missing = train_df[all_features].isna().sum().sum()
test_missing = test_df[all_features].isna().sum().sum()

print("\nMissing values in training data:")
print(train_missing)

print("\nMissing values in test data:")
print(test_missing)

In [None]:
# Build the modeling matrices: X / X_test hold the feature columns, y the 0/1 target.
X = train_df[all_features].copy()
y = train_df['requester_received_pizza'].astype(int).values
X_test = test_df[all_features].copy()

print(f"X shape: {X.shape}")
print(f"y shape: {y.shape}")
print(f"X_test shape: {X_test.shape}")

# Impute missing values with the per-column median of the TRAINING data only,
# and reuse those same medians for the test set.
# The result is reassigned rather than written with
# `X[col].fillna(..., inplace=True)`: that form is chained assignment on a
# column selection, which warns on recent pandas and does not modify the
# parent frame at all once Copy-on-Write is active.
train_medians = X.median()
X = X.fillna(train_medians)
X_test = X_test.fillna(train_medians)

In [None]:
# Cross-validation setup: shuffled, stratified folds with a fixed seed so the
# split is reproducible.
n_folds = 5
skf = StratifiedKFold(n_splits=n_folds, shuffle=True, random_state=42)

# Parameters for the native LightGBM training API (lgb.train).
params = {
    'objective': 'binary',
    'metric': 'auc',
    'boosting_type': 'gbdt',
    'num_leaves': 31,
    'learning_rate': 0.05,
    'feature_fraction': 0.9,
    'bagging_fraction': 0.8,
    'bagging_freq': 5,
    'verbose': -1,
    'random_state': 42,
    # Class imbalance handling. `class_weight='balanced'` is an argument of the
    # scikit-learn wrapper (LGBMClassifier), not a native LightGBM parameter:
    # lgb.train ignores it, and verbose=-1 hides the "unknown parameter"
    # warning. `is_unbalance` is the native option for binary objectives.
    'is_unbalance': True
}

print(f"Training with {n_folds}-fold stratified CV...")
print(f"Model parameters: {params}")

In [None]:
# Train one LightGBM booster per stratified fold.
#   oof_predictions  - out-of-fold prediction for every training row
#   test_predictions - test-set predictions averaged over the folds
#   fold_scores      - ROC-AUC of each fold on its own validation part
fold_scores = []
oof_predictions = np.zeros(len(X))
test_predictions = np.zeros(len(X_test))

for fold, (train_idx, val_idx) in enumerate(skf.split(X, y)):
    print(f"\nFold {fold + 1}/{n_folds}")

    X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
    y_train, y_val = y[train_idx], y[val_idx]

    # Create LightGBM datasets. They are named lgb_train / lgb_val so the raw
    # JSON records bound to `train_data` by the loading cell are not
    # overwritten with an object of a different kind.
    lgb_train = lgb.Dataset(X_train, label=y_train)
    lgb_val = lgb.Dataset(X_val, label=y_val)

    # Train with early stopping on the validation fold (patience: 50 rounds).
    # NOTE: the same fold is used for early stopping and for scoring below, so
    # the reported AUC is slightly optimistic.
    model = lgb.train(
        params,
        lgb_train,
        num_boost_round=1000,
        valid_sets=[lgb_val],
        valid_names=['val'],
        callbacks=[
            lgb.early_stopping(50),
            lgb.log_evaluation(0)
        ]
    )

    # Predict with the best iteration found by early stopping.
    val_pred = model.predict(X_val, num_iteration=model.best_iteration)
    test_pred = model.predict(X_test, num_iteration=model.best_iteration)

    # Each training row is filled exactly once; test predictions are averaged.
    oof_predictions[val_idx] = val_pred
    test_predictions += test_pred / n_folds

    # Per-fold score
    fold_score = roc_auc_score(y_val, val_pred)
    fold_scores.append(fold_score)
    print(f"Fold {fold + 1} ROC-AUC: {fold_score:.4f}")

# Overall CV score computed on the pooled out-of-fold predictions.
cv_score = roc_auc_score(y, oof_predictions)
print(f"\n{'='*50}")
print(f"Cross-Validation Results:")
print(f"Mean ROC-AUC: {np.mean(fold_scores):.4f} ± {np.std(fold_scores):.4f}")
print(f"OOF ROC-AUC: {cv_score:.4f}")
print(f"Fold scores: {fold_scores}")
print(f"{'='*50}")

In [None]:
# Gain-based feature importance, ranked from most to least important.
# NOTE(review): `model` is whatever booster the training cell left bound last,
# so this reflects a single fold rather than an average over all folds.
feature_names = X.columns
feature_importance = model.feature_importance(importance_type='gain')

importance_df = (
    pd.DataFrame({'feature': feature_names, 'importance': feature_importance})
    .sort_values(by='importance', ascending=False)
)

print("Top 10 features by importance:")
print(importance_df.head(10))

In [None]:
# Save predictions for submission: one row per test request, holding the
# fold-averaged predicted probability of receiving a pizza.
submission_df = pd.DataFrame({
    'request_id': test_df['request_id'],
    'requester_received_pizza': test_predictions
})

submission_path = '/home/submission/submission_001_baseline_meta_only.csv'
# Create the output directory first so a missing folder does not throw away
# the results of the whole cross-validation run at the very last step.
os.makedirs(os.path.dirname(submission_path), exist_ok=True)
submission_df.to_csv(submission_path, index=False)

print(f"\nSubmission saved to: {submission_path}")
print(f"Submission shape: {submission_df.shape}")
print(f"Prediction range: [{test_predictions.min():.4f}, {test_predictions.max():.4f}]")
print(f"Mean prediction: {test_predictions.mean():.4f}")