# Baseline Experiment: Tabular Features Only with LightGBM

This notebook implements a baseline model using only tabular features with LightGBM.
Following the strategy: start with gradient boosting on tabular features alone as a strong baseline.

In [3]:
# Load data and check structure
print("Loading data...")
train_path = "/home/data/train.json"
test_path = "/home/data/test.json"

with open(train_path, 'r') as f:
    train_data = json.load(f)
    
with open(test_path, 'r') as f:
    test_data = json.load(f)

train_df = pd.DataFrame(train_data)
test_df = pd.DataFrame(test_data)

print(f"Train shape: {train_df.shape}")
print(f"Test shape: {test_df.shape}")
print(f"Target distribution: {train_df['requester_received_pizza'].mean():.3f}")
print(f"Train columns: {list(train_df.columns)}")

Loading data...
Train shape: (2878, 32)
Test shape: (1162, 17)
Target distribution: 0.248
Train columns: ['giver_username_if_known', 'number_of_downvotes_of_request_at_retrieval', 'number_of_upvotes_of_request_at_retrieval', 'post_was_edited', 'request_id', 'request_number_of_comments_at_retrieval', 'request_text', 'request_text_edit_aware', 'request_title', 'requester_account_age_in_days_at_request', 'requester_account_age_in_days_at_retrieval', 'requester_days_since_first_post_on_raop_at_request', 'requester_days_since_first_post_on_raop_at_retrieval', 'requester_number_of_comments_at_request', 'requester_number_of_comments_at_retrieval', 'requester_number_of_comments_in_raop_at_request', 'requester_number_of_comments_in_raop_at_retrieval', 'requester_number_of_posts_at_request', 'requester_number_of_posts_at_retrieval', 'requester_number_of_posts_on_raop_at_request', 'requester_number_of_posts_on_raop_at_retrieval', 'requester_number_of_subreddits_at_request', 'requester_received_pi

In [6]:
# Use "at_request" features since "at_retrieval" features are not in test data
# This is a critical finding from data inspection

numerical_features = [
    'requester_number_of_posts_on_raop_at_request',  # Use at_request version
    'requester_number_of_comments_in_raop_at_request',
    'requester_number_of_comments_at_request',
    'requester_number_of_posts_at_request',
    'requester_upvotes_minus_downvotes_at_request',
    'requester_upvotes_plus_downvotes_at_request',
    'requester_account_age_in_days_at_request',
    'requester_days_since_first_post_on_raop_at_request',
    'requester_number_of_subreddits_at_request'
]

# Filter to only features that exist in both train and test
available_numerical_features = [f for f in numerical_features if f in test_df.columns]
print(f"Available numerical features: {len(available_numerical_features)}")
print(available_numerical_features)

# Text features - request_title is available in both
text_features_available = ['request_title']  # request_text is NOT in test data

# Create text length features
for text_col in text_features_available:
    train_df[f'{text_col}_length'] = train_df[text_col].str.len()
    test_df[f'{text_col}_length'] = test_df[text_col].str.len()

text_length_features = [f'{col}_length' for col in text_features_available]

# No categorical features available in test (requester_user_flair missing)
categorical_features = []

# Combine all features
feature_columns = available_numerical_features + text_length_features

print(f"\nUsing {len(feature_columns)} features: {feature_columns}")

# Prepare data
X = train_df[feature_columns].copy()
y = train_df['requester_received_pizza'].astype(int)
X_test = test_df[feature_columns].copy()

print(f"Final feature matrix shape: {X.shape}")
print(f"Test feature matrix shape: {X_test.shape}")

# Check for missing values
print(f"\nMissing values in train: {X.isnull().sum().sum()}")
print(f"Missing values in test: {X_test.isnull().sum().sum()}")

Available numerical features: 9
['requester_number_of_posts_on_raop_at_request', 'requester_number_of_comments_in_raop_at_request', 'requester_number_of_comments_at_request', 'requester_number_of_posts_at_request', 'requester_upvotes_minus_downvotes_at_request', 'requester_upvotes_plus_downvotes_at_request', 'requester_account_age_in_days_at_request', 'requester_days_since_first_post_on_raop_at_request', 'requester_number_of_subreddits_at_request']

Using 10 features: ['requester_number_of_posts_on_raop_at_request', 'requester_number_of_comments_in_raop_at_request', 'requester_number_of_comments_at_request', 'requester_number_of_posts_at_request', 'requester_upvotes_minus_downvotes_at_request', 'requester_upvotes_plus_downvotes_at_request', 'requester_account_age_in_days_at_request', 'requester_days_since_first_post_on_raop_at_request', 'requester_number_of_subreddits_at_request', 'request_title_length']
Final feature matrix shape: (2878, 10)
Test feature matrix shape: (1162, 10)

Miss

In [7]:
# Cross-validation setup
n_folds = 5
skf = StratifiedKFold(n_splits=n_folds, shuffle=True, random_state=42)

# Model parameters (conservative baseline)
params = {
    'objective': 'binary',
    'metric': 'auc',
    'boosting_type': 'gbdt',
    'num_leaves': 31,
    'learning_rate': 0.05,
    'feature_fraction': 0.9,
    'bagging_fraction': 0.8,
    'bagging_freq': 5,
    'verbose': -1,
    'num_threads': 4
}

# Store predictions
oof_predictions = np.zeros(len(X))
test_predictions = np.zeros(len(X_test))
cv_scores = []

print(f"Starting {n_folds}-fold cross-validation...")

for fold, (train_idx, valid_idx) in enumerate(skf.split(X, y)):
    print(f"\nFold {fold + 1}/{n_folds}")
    
    # Split data
    X_train, X_valid = X.iloc[train_idx], X.iloc[valid_idx]
    y_train, y_valid = y.iloc[train_idx], y.iloc[valid_idx]
    
    # Create LightGBM datasets
    train_data = lgb.Dataset(X_train, label=y_train, categorical_feature=categorical_features)
    valid_data = lgb.Dataset(X_valid, label=y_valid, categorical_feature=categorical_features)
    
    # Train model
    model = lgb.train(
        params,
        train_data,
        num_boost_round=1000,
        valid_sets=[valid_data],
        valid_names=['valid'],
        callbacks=[lgb.early_stopping(50), lgb.log_evaluation(100)]
    )
    
    # Predictions
    oof_predictions[valid_idx] = model.predict(X_valid, num_iteration=model.best_iteration)
    test_predictions += model.predict(X_test, num_iteration=model.best_iteration) / n_folds
    
    # Calculate fold score
    fold_score = roc_auc_score(y_valid, oof_predictions[valid_idx])
    cv_scores.append(fold_score)
    print(f"Fold {fold + 1} AUC: {fold_score:.4f}")

# Overall CV score
cv_mean = np.mean(cv_scores)
cv_std = np.std(cv_scores)
print(f"\nCV Score: {cv_mean:.4f} ± {cv_std:.4f}")

# Feature importance
feature_importance = pd.DataFrame({
    'feature': feature_columns,
    'importance': model.feature_importance(importance_type='gain')
}).sort_values('importance', ascending=False)

print("\nTop 10 Features by Importance:")
print(feature_importance.head(10))

Starting 5-fold cross-validation...

Fold 1/5
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[5]	valid's auc: 0.639093
Fold 1 AUC: 0.6391

Fold 2/5
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[41]	valid's auc: 0.59141
Fold 2 AUC: 0.5914

Fold 3/5
Training until validation scores don't improve for 50 rounds


Early stopping, best iteration is:
[8]	valid's auc: 0.624114
Fold 3 AUC: 0.6241

Fold 4/5
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[2]	valid's auc: 0.585252
Fold 4 AUC: 0.5853

Fold 5/5
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[26]	valid's auc: 0.618784
Fold 5 AUC: 0.6188

CV Score: 0.6117 ± 0.0203

Top 10 Features by Importance:
                                             feature  importance
4       requester_upvotes_minus_downvotes_at_request  605.061149
6           requester_account_age_in_days_at_request  587.366292
3               requester_number_of_posts_at_request  513.558150
5        requester_upvotes_plus_downvotes_at_request  513.403731
9                               request_title_length  476.606741
2            requester_number_of_comments_at_request  466.445330
7  requester_days_since_first_post_on_raop_at_req...  338.616499
8          requester_number_of_subredd

In [8]:
# Create submission
submission = pd.DataFrame({
    'request_id': test_df['request_id'],
    'requester_received_pizza': test_predictions
})

# Ensure proper format (0/1 probabilities)
submission['requester_received_pizza'] = submission['requester_received_pizza'].clip(0, 1)

submission_path = "/home/submission/submission_001_baseline_tabular.csv"
submission.to_csv(submission_path, index=False)

print(f"Submission saved to: {submission_path}")
print(f"Submission shape: {submission.shape}")
print(f"Submission predictions range: {submission['requester_received_pizza'].min():.4f} to {submission['requester_received_pizza'].max():.4f}")

# Show first few predictions
print("\nFirst 5 predictions:")
print(submission.head())

Submission saved to: /home/submission/submission_001_baseline_tabular.csv
Submission shape: (1162, 2)
Submission predictions range: 0.1789 to 0.4598

First 5 predictions:
  request_id  requester_received_pizza
0  t3_1aw5zf                  0.254531
1   t3_roiuw                  0.254034
2   t3_mjnbq                  0.244290
3   t3_t8wd1                  0.232418
4  t3_1m4zxu                  0.266252
