# Baseline Experiment: Tabular Features Only with LightGBM

This notebook implements a baseline model using only tabular features with LightGBM.
This follows the planned strategy of starting with gradient boosting on tabular features alone, to establish a strong baseline before adding more complex features.

In [None]:
# Imports and data loading
# No other cell in this notebook imports anything, so every name used below
# and by the later cells is brought into scope here; this keeps
# "Restart Kernel -> Run All" working from a fresh kernel.
import json

import lightgbm as lgb
import numpy as np
import pandas as pd
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import StratifiedKFold

# Load data and check structure
print("Loading data...")
train_path = "/home/data/train.json"
test_path = "/home/data/test.json"

# Decode explicitly as UTF-8 instead of relying on the platform default
# encoding, which can differ between machines.
with open(train_path, 'r', encoding='utf-8') as f:
    train_data = json.load(f)

with open(test_path, 'r', encoding='utf-8') as f:
    test_data = json.load(f)

# One row per parsed JSON record.
train_df = pd.DataFrame(train_data)
test_df = pd.DataFrame(test_data)

print(f"Train shape: {train_df.shape}")
print(f"Test shape: {test_df.shape}")
# Mean of the target column, i.e. the positive rate if the target is boolean/0-1.
print(f"Target distribution: {train_df['requester_received_pizza'].mean():.3f}")
print(f"Train columns: {list(train_df.columns)}")

In [None]:
# Define tabular features based on EDA insights
# Focus on strong predictive signals identified in eda.ipynb
#
# Outputs of this cell: X / y (training matrix and integer target), X_test,
# and the feature-name lists used by the modelling cell. train_df / test_df
# gain the derived columns 'title_length' and 'text_length', and their
# 'requester_user_flair' column has missing values replaced by 'missing'.
#
# NOTE(review): every '*_at_retrieval' column is, by its name, a snapshot taken
# when the data was collected rather than when the request was posted, and the
# very high correlations quoted below are consistent with that. Presumably
# these (and possibly user flair) encode information from after the outcome;
# confirm they are legitimately available at prediction time, otherwise this
# is target leakage.

# Numerical features with high correlation to target
numerical_features = [
    'requester_number_of_posts_on_raop_at_retrieval',  # corr 0.46 - strongest predictor
    'request_number_of_comments_at_retrieval',         # corr 0.29 - strong predictor
    'requester_number_of_comments_in_raop_at_retrieval',
    'requester_number_of_comments_at_retrieval',
    'requester_number_of_posts_at_retrieval',
    'number_of_upvotes_of_request_at_retrieval',
    'number_of_downvotes_of_request_at_retrieval',
    'requester_upvotes_minus_downvotes_at_retrieval',
    'requester_upvotes_plus_downvotes_at_retrieval',
    'requester_account_age_in_days_at_retrieval',
    'requester_days_since_first_post_on_raop_at_retrieval'
]

# Derived text length features (simple but effective)
text_length_features = ['title_length', 'text_length']

# User flair is mostly missing per the EDA (about 75%); it is treated as a
# categorical feature with missing values as their own category.
categorical_features = ['requester_user_flair']

# Combine all features
feature_columns = numerical_features + text_length_features + categorical_features

# Fail fast, before mutating either frame, if a raw input column is absent.
required_raw_columns = numerical_features + categorical_features + ['request_title', 'request_text']
for frame_name, frame in (('train', train_df), ('test', test_df)):
    absent = [col for col in required_raw_columns if col not in frame.columns]
    if absent:
        raise KeyError(f"{frame_name} data is missing required columns: {absent}")

# Apply identical feature engineering to both frames so they cannot drift apart.
for frame in (train_df, test_df):
    # Character counts; a missing title/text yields NaN rather than 0.
    frame['title_length'] = frame['request_title'].str.len()
    frame['text_length'] = frame['request_text'].str.len()
    frame['requester_user_flair'] = frame['requester_user_flair'].fillna('missing')

print(f"Using {len(feature_columns)} features: {feature_columns}")

# Prepare data
X = train_df[feature_columns].copy()
y = train_df['requester_received_pizza'].astype(int)
X_test = test_df[feature_columns].copy()

# Handle categorical encoding. The test column reuses the dtype learned from
# train so both share one category set; a test value never seen in train
# becomes NaN (missing) instead of silently getting its own code.
for col in categorical_features:
    X[col] = X[col].astype('category')
    X_test[col] = X_test[col].astype(X[col].dtype)

print(f"Final feature matrix shape: {X.shape}")

In [None]:
# Cross-validation setup
# Stratified k-fold CV of a LightGBM binary classifier on X / y.
# Outputs: oof_predictions (out-of-fold predictions for every training row),
# test_predictions (mean of the fold models' predictions on X_test),
# cv_scores (per-fold AUC) and feature_importance (gain, averaged over folds).
n_folds = 5
skf = StratifiedKFold(n_splits=n_folds, shuffle=True, random_state=42)

# Model parameters (conservative baseline)
params = {
    'objective': 'binary',
    'metric': 'auc',
    'boosting_type': 'gbdt',
    'num_leaves': 31,
    'learning_rate': 0.05,
    'feature_fraction': 0.9,
    'bagging_fraction': 0.8,
    'bagging_freq': 5,
    'verbose': -1,
    'num_threads': 4
}

# Store predictions
oof_predictions = np.zeros(len(X))
test_predictions = np.zeros(len(X_test))
cv_scores = []
# Gain importance accumulated over all fold models, so the final table is not
# just a description of whichever fold happened to be trained last.
mean_gain_importance = np.zeros(len(feature_columns))

print(f"Starting {n_folds}-fold cross-validation...")

for fold, (train_idx, valid_idx) in enumerate(skf.split(X, y)):
    print(f"\nFold {fold + 1}/{n_folds}")

    # Split data
    X_train, X_valid = X.iloc[train_idx], X.iloc[valid_idx]
    y_train, y_valid = y.iloc[train_idx], y.iloc[valid_idx]

    # Create LightGBM datasets. Named *_set so they do not rebind `train_data`,
    # which holds the raw JSON records loaded earlier in the notebook.
    train_set = lgb.Dataset(X_train, label=y_train, categorical_feature=categorical_features)
    valid_set = lgb.Dataset(X_valid, label=y_valid, categorical_feature=categorical_features)

    # Train model.
    # NOTE: early stopping selects best_iteration on the same fold that is
    # scored below, so the reported fold AUCs are optimistically biased.
    model = lgb.train(
        params,
        train_set,
        num_boost_round=1000,
        valid_sets=[valid_set],
        valid_names=['valid'],
        callbacks=[lgb.early_stopping(50), lgb.log_evaluation(100)]
    )

    # Predictions
    oof_predictions[valid_idx] = model.predict(X_valid, num_iteration=model.best_iteration)
    test_predictions += model.predict(X_test, num_iteration=model.best_iteration) / n_folds

    # Fold-averaged gain importance
    mean_gain_importance += model.feature_importance(importance_type='gain') / n_folds

    # Calculate fold score
    fold_score = roc_auc_score(y_valid, oof_predictions[valid_idx])
    cv_scores.append(fold_score)
    print(f"Fold {fold + 1} AUC: {fold_score:.4f}")

# Overall CV score (mean and standard deviation of the per-fold AUCs)
cv_mean = np.mean(cv_scores)
cv_std = np.std(cv_scores)
print(f"\nCV Score: {cv_mean:.4f} ± {cv_std:.4f}")

# Feature importance (mean gain across the fold models)
feature_importance = pd.DataFrame({
    'feature': feature_columns,
    'importance': mean_gain_importance
}).sort_values('importance', ascending=False)

print("\nTop 10 Features by Importance:")
print(feature_importance.head(10))

In [None]:
# Create submission
# Writes one row per test request: its id and the fold-averaged predicted
# value for 'requester_received_pizza'.
import os

submission = pd.DataFrame({
    'request_id': test_df['request_id'],
    'requester_received_pizza': test_predictions
})

# Keep predictions inside the valid probability range [0, 1]. This bounds the
# values; it does not round them to hard 0/1 labels.
submission['requester_received_pizza'] = submission['requester_received_pizza'].clip(0, 1)

submission_path = "/home/submission/submission_001_baseline_tabular.csv"
# Create the output directory if needed so to_csv does not fail on a fresh machine.
os.makedirs(os.path.dirname(submission_path), exist_ok=True)
submission.to_csv(submission_path, index=False)

print(f"Submission saved to: {submission_path}")
print(f"Submission shape: {submission.shape}")
print(f"Submission predictions range: {submission['requester_received_pizza'].min():.4f} to {submission['requester_received_pizza'].max():.4f}")

# Show first few predictions
print("\nFirst 5 predictions:")
print(submission.head())