# AutoGluon Multimodal Model

Using AutoGluon's TabularPredictor with multimodal features:
- Text features: request_title, request_text, request_text_edit_aware
- Categorical: requester_user_flair
- Numeric: all other features
- Class imbalance: no resampling or class weighting is applied; models are compared by ROC AUC
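- Uses an ensemble of tabular models (LightGBM, CatBoost, XGBoost, random forest, extra trees, linear model); no text transformer is configured in the `hyperparameters` below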

In [3]:
import pandas as pd
import numpy as np
import json
from pathlib import Path
import warnings
# Mute every warning category for the remainder of the session.
warnings.filterwarnings(action='ignore')

# Make sure the folder for experiment artifacts exists; missing parents are
# created and an already-existing folder is left untouched.
_experiments_dir = Path('/home/code/experiments')
_experiments_dir.mkdir(parents=True, exist_ok=True)

print("Loading data...")

Loading data...


In [None]:
def _read_json_records(path):
    """Return the records stored in the JSON file at *path* as a list.

    Two layouts are accepted:
    - a single JSON document (an array of records, or one lone object);
    - JSON Lines, i.e. one JSON value per non-blank line.
    """
    with open(path, 'r', encoding='utf-8') as f:
        raw = f.read()
    try:
        parsed = json.loads(raw)
    except json.JSONDecodeError:
        # Not one JSON document: parse line by line, skipping blank lines
        # (text mode has already normalised line endings to '\n').
        return [json.loads(line) for line in raw.split('\n') if line.strip()]
    return parsed if isinstance(parsed, list) else [parsed]


# Load training data
train_data = _read_json_records('/home/data/train.json')
train_df = pd.DataFrame(train_data)
print(f"Training data shape: {train_df.shape}")

# Load test data
test_data = _read_json_records('/home/data/test.json')
test_df = pd.DataFrame(test_data)
print(f"Test data shape: {test_df.shape}")

# Check target distribution
target = 'requester_received_pizza'
print("\nTarget distribution:")
print(train_df[target].value_counts())
print(f"Positive rate: {train_df[target].mean():.3f}")

In [None]:
# Make sure AutoGluon's tabular package is importable, installing it on demand.
import importlib
import subprocess
import sys

try:
    # Probe the subpackage this notebook actually uses; this also binds the
    # top-level name `autogluon`.
    import autogluon.tabular
except ImportError:
    print("Installing AutoGluon...")
    # Run pip through the running interpreter so the package is installed into
    # this kernel's environment (a bare `pip` shell command may target another).
    subprocess.check_call([sys.executable, '-m', 'pip', 'install', '-q', 'autogluon'])
    # Let the import system notice the freshly installed package.
    importlib.invalidate_caches()
    import autogluon.tabular
    print(f"AutoGluon installed: {autogluon.tabular.__version__}")
else:
    # NOTE(review): the version is read from autogluon.tabular because the
    # top-level `autogluon` namespace is not guaranteed to expose __version__.
    print(f"AutoGluon version: {autogluon.tabular.__version__}")

In [None]:
from autogluon.tabular import TabularPredictor

# Prepare data - AutoGluon can handle text directly, so columns are passed
# through as-is and preprocessing is left to AutoGluon.

# Columns never used as features: identifiers and the target itself
exclude_cols = ['request_id', 'requester_received_pizza', 'giver_username_if_known']

# A feature is only usable if it is also available at prediction time:
# selecting a train-only column from test_df would raise a KeyError later.
test_columns = set(test_df.columns)
candidate_cols = [col for col in train_df.columns if col not in exclude_cols]
feature_cols = [col for col in candidate_cols if col in test_columns]
dropped_cols = [col for col in candidate_cols if col not in test_columns]
if dropped_cols:
    print(f"Dropping {len(dropped_cols)} train-only columns: {dropped_cols}")

print(f"Using {len(feature_cols)} features")
print("Feature types:")
# Show the dtype of the first ten features as a quick sanity check
for col in feature_cols[:10]:
    print(f"  {col}: {train_df[col].dtype}")

In [None]:
# Train an AutoGluon binary classifier whose models are ranked by ROC AUC.
# The 'best_quality' preset is requested and training is capped by a time limit.

# Model families to train, keyed by AutoGluon's model codes.
_model_zoo = {
    'GBM': [{'extra_trees': True, 'ag_args': {'name_suffix': 'XT'}}],
    'CAT': {},
    'XGB': {},
    'RF': [{'criterion': 'gini', 'ag_args': {'name_suffix': 'Gini', 'problem_types': ['binary', 'multiclass']}}],
    'XT': [{'criterion': 'gini', 'ag_args': {'name_suffix': 'Gini', 'problem_types': ['binary', 'multiclass']}}],
    'LR': [{}],
}

# Training frame: the selected features plus the label column.
_fit_frame = train_df[feature_cols + [target]]

predictor = TabularPredictor(
    label=target,
    problem_type='binary',
    eval_metric='roc_auc',
    path='/home/code/experiments/autogluon_models',
)
# fit() hands back the predictor, so the name is simply rebound to its result.
predictor = predictor.fit(
    train_data=_fit_frame,
    presets='best_quality',
    time_limit=1200,  # seconds, i.e. 20 minutes
    hyperparameters=_model_zoo,
    verbosity=2,
)

In [None]:
# Summarize the fit: name of the best model and its validation score
cv_results = predictor.fit_summary()
best_model = cv_results['model_best']
# fit_summary() stores per-model validation scores under 'model_performance'
# (keyed by model name); it has no top-level 'val_score' entry.
best_score = cv_results['model_performance'][best_model]
print("\nCross-validation results:")
print(f"Best model: {best_model}")
print(f"Best validation score: {best_score:.4f}")

# Get leaderboard (silent=True: return the table without printing it)
leaderboard = predictor.leaderboard(silent=True)
print("\nModel leaderboard:")
print(leaderboard[['model', 'score_val', 'pred_time_val']].head())

In [None]:
# Make predictions on test set.
# as_multiclass=False asks predict_proba for a single Series holding the
# positive-class probability of each row, instead of one column per class
# that would then have to be selected by class label.
test_predictions = predictor.predict_proba(test_df[feature_cols], as_multiclass=False)

# Create submission: one row per request with its predicted probability
submission = pd.DataFrame({
    'request_id': test_df['request_id'],
    'requester_received_pizza': test_predictions,
})

print("Submission preview:")
print(submission.head())
print(f"\nSubmission shape: {submission.shape}")

# Check prediction distribution
predicted = submission['requester_received_pizza']
print("\nPrediction distribution:")
print(f"Mean: {predicted.mean():.4f}")
print(f"Std: {predicted.std():.4f}")
print(f"Min: {predicted.min():.4f}")
print(f"Max: {predicted.max():.4f}")

# Save submission; create the target folder first so to_csv cannot fail on a
# missing directory.
submission_path = '/home/submission/submission_autogluon.csv'
Path(submission_path).parent.mkdir(parents=True, exist_ok=True)
submission.to_csv(submission_path, index=False)
print(f"\nSubmission saved to: {submission_path}")