# AutoGluon Multimodal Model

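Using AutoGluon's TabularPredictor on mixed text and numeric features (only columns available in both train and test):
- Text features: request_title, request_text_edit_aware (request_text is not used)
- Categorical: requester_subreddits_at_request, converted from a list to a comma-joined string (requester_user_flair is not used)
- Numeric: the requester "at_request" statistics and unix_timestamp_of_request
- Class imbalance (about 25% positives): no reweighting or resampling is configured; models are scored with ROC AUC
- Uses a weighted ensemble of tree-based and neural-network models; text is featurized with n-gram counts and summary statistics, not text transformers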

In [7]:
import pandas as pd
import numpy as np
import json
from pathlib import Path
import warnings
warnings.filterwarnings('ignore')

# Make sure the experiments directory exists (no error if it is already there).
experiments_dir = Path('/home/code/experiments')
experiments_dir.mkdir(parents=True, exist_ok=True)

print("Loading data...")

Loading data...


In [8]:
# Load the train and test sets. Each file may be either a single JSON
# document (e.g. one array) or line-delimited JSON (one object per line).
def _load_json_records(path):
    """Parse a JSON file, falling back to JSON Lines when it is not one document.

    Parameters
    ----------
    path : str or Path
        File to read.

    Returns
    -------
    The parsed JSON document, or, when the file is not a single valid JSON
    document, a list with one parsed object per non-blank line.

    Raises
    ------
    json.JSONDecodeError
        If the content is neither valid JSON nor valid JSON Lines.
    """
    with open(path, 'r') as f:
        try:
            return json.load(f)
        except json.JSONDecodeError:
            # Only a parse failure triggers the fallback; any other error
            # (interrupts, I/O problems) propagates instead of being hidden.
            f.seek(0)
            return [json.loads(line) for line in f if line.strip()]


train_data = _load_json_records('/home/data/train.json')
train_df = pd.DataFrame(train_data)
print(f"Training data shape: {train_df.shape}")

test_data = _load_json_records('/home/data/test.json')
test_df = pd.DataFrame(test_data)
print(f"Test data shape: {test_df.shape}")

# Check target distribution
target = 'requester_received_pizza'
print(f"\nTarget distribution:")
print(train_df[target].value_counts())
print(f"Positive rate: {train_df[target].mean():.3f}")

Training data shape: (4040, 32)
Test data shape: (1631, 17)

Target distribution:
requester_received_pizza
False    3046
True      994
Name: count, dtype: int64
Positive rate: 0.246


In [9]:
# Feature selection: restrict to columns present in BOTH train and test.
# The test set only carries the "at_request" fields, not the "at_retrieval" ones.
numeric_feature_cols = [
    'requester_account_age_in_days_at_request',
    'requester_days_since_first_post_on_raop_at_request',
    'requester_number_of_comments_at_request',
    'requester_number_of_comments_in_raop_at_request',
    'requester_number_of_posts_at_request',
    'requester_number_of_posts_on_raop_at_request',
    'requester_number_of_subreddits_at_request',
    'requester_upvotes_minus_downvotes_at_request',
    'requester_upvotes_plus_downvotes_at_request',
    'unix_timestamp_of_request',
]

# Object-typed columns: two free-text fields plus the list of subreddits.
object_feature_cols = [
    'request_text_edit_aware',
    'request_title',
    'requester_subreddits_at_request',
]

# Order matters for the numbered printout below: numeric first, then object columns.
feature_cols = numeric_feature_cols + object_feature_cols

# Label column
target = 'requester_received_pizza'

print(f"Using {len(feature_cols)} features")
print("Features:")
for position, column_name in enumerate(feature_cols, start=1):
    column_dtype = train_df[column_name].dtype
    print(f"  {position}. {column_name}: {column_dtype}")

Using 13 features
Features:
  1. requester_account_age_in_days_at_request: float64
  2. requester_days_since_first_post_on_raop_at_request: float64
  3. requester_number_of_comments_at_request: int64
  4. requester_number_of_comments_in_raop_at_request: int64
  5. requester_number_of_posts_at_request: int64
  6. requester_number_of_posts_on_raop_at_request: int64
  7. requester_number_of_subreddits_at_request: int64
  8. requester_upvotes_minus_downvotes_at_request: int64
  9. requester_upvotes_plus_downvotes_at_request: int64
  10. unix_timestamp_of_request: float64
  11. request_text_edit_aware: object
  12. request_title: object
  13. requester_subreddits_at_request: object


In [10]:
# Inspect the dtypes of the selected features.
print("Data types:")
print(train_df[feature_cols].dtypes.value_counts())

# Show one sample value per object column to spot columns that hold Python lists.
object_cols = train_df[feature_cols].select_dtypes(include=['object']).columns
print(f"\nObject columns ({len(object_cols)}):")
for col in object_cols:
    sample_val = train_df[col].iloc[0]
    print(f"  {col}: {type(sample_val)} - {str(sample_val)[:100]}...")


def _list_to_csv_string(value):
    """Flatten one cell value into a comma-joined string.

    Parameters
    ----------
    value : object
        A list of items, an already-converted string, or a missing value.

    Returns
    -------
    str
        Items joined with ',' for a list, '' for None/NaN (the same result as
        an empty list), and ``str(value)`` for anything else.
    """
    if isinstance(value, list):
        # str() each item so a non-string entry cannot make join() raise.
        return ','.join(str(item) for item in value)
    # NaN is the only float that is not equal to itself.
    if value is None or (isinstance(value, float) and value != value):
        return ''
    return str(value)


# Convert the list column to a string representation. Strings pass through
# unchanged, so re-running this cell leaves the column as it is.
list_col = 'requester_subreddits_at_request'
if list_col in feature_cols:
    train_df[list_col] = train_df[list_col].apply(_list_to_csv_string)
    test_df[list_col] = test_df[list_col].apply(_list_to_csv_string)
    print(f"\nConverted {list_col} from list to string")

Data types:
int64      7
float64    3
object     3
Name: count, dtype: int64

Object columns (3):
  request_text_edit_aware: <class 'str'> - Hi I am in need of food for my 4 children we are a military family that has really hit hard times an...
  request_title: <class 'str'> - Request Colorado Springs Help Us Please...
  requester_subreddits_at_request: <class 'list'> - []...

Converted requester_subreddits_at_request from list to string


In [11]:
# Import AutoGluon
from autogluon.tabular import TabularPredictor

print("AutoGluon imported successfully")

Data types:
int64      7
float64    3
object     3
Name: count, dtype: int64

Object columns (3):
  request_text_edit_aware: <class 'str'> - Hi I am in need of food for my 4 children we are a military family that has really hit hard times an...
  request_title: <class 'str'> - Request Colorado Springs Help Us Please...
  requester_subreddits_at_request: <class 'str'> - ...

No list columns found


In [14]:
# Train an AutoGluon tabular ensemble on the selected features.
# - eval_metric='roc_auc': models are scored on ROC AUC; no class weighting or
#   resampling is configured in this call.
# - presets='medium_quality_faster_train': chosen for faster training.
# - time_limit: caps the fit so it does not run too long.
predictor_init_kwargs = dict(
    label=target,
    problem_type='binary',
    eval_metric='roc_auc',
    path='/home/code/experiments/autogluon_models_v2',
)

predictor_fit_kwargs = dict(
    train_data=train_df[feature_cols + [target]],
    presets='medium_quality_faster_train',
    time_limit=600,  # seconds (10 minutes)
    verbosity=2,
)

predictor = TabularPredictor(**predictor_init_kwargs).fit(**predictor_fit_kwargs)

Preset alias specified: 'medium_quality_faster_train' maps to 'medium_quality'.


Verbosity: 2 (Standard Logging)


AutoGluon Version:  1.5.0
Python Version:     3.11.14
Operating System:   Linux
Platform Machine:   x86_64
Platform Version:   #21~24.04.1-Ubuntu SMP Fri Oct 17 00:56:30 UTC 2025
CPU Count:          12
Pytorch Version:    2.9.1+cu128
CUDA Version:       12.8
GPU Memory:         GPU 0: 79.25/79.25 GB
Total GPU Memory:   Free: 79.25 GB, Allocated: 0.00 GB, Total: 79.25 GB
GPU Count:          1
Memory Avail:       160.18 GB / 167.04 GB (95.9%)
Disk Space Avail:   938.67 GB / 3389.36 GB (27.7%)


Presets specified: ['medium_quality_faster_train']


Using hyperparameters preset: hyperparameters='default'


Beginning AutoGluon training ... Time limit = 600s


AutoGluon will save models to "/home/code/experiments/autogluon_models_v2"


Train Data Rows:    4040


Train Data Columns: 13


Label Column:       requester_received_pizza


Problem Type:       binary


Preprocessing data ...


Selected class <--> label mapping:  class 1 = True, class 0 = False


Using Feature Generators to preprocess the data ...


Fitting AutoMLPipelineFeatureGenerator...


	Available Memory:                    164028.33 MB


	Train Data (Original)  Memory Usage: 3.58 MB (0.0% of available memory)


	Inferring data type of each feature based on column values. Set feature_metadata_in to manually specify special dtypes of the features.


	Stage 1 Generators:


		Fitting AsTypeFeatureGenerator...


	Stage 2 Generators:


		Fitting FillNaFeatureGenerator...


	Stage 3 Generators:


		Fitting IdentityFeatureGenerator...


		Fitting CategoryFeatureGenerator...


			Fitting CategoryMemoryMinimizeFeatureGenerator...


		Fitting TextSpecialFeatureGenerator...


			Fitting BinnedFeatureGenerator...


			Fitting DropDuplicatesFeatureGenerator...


		Fitting TextNgramFeatureGenerator...


			Fitting CountVectorizer for text features: ['request_text_edit_aware', 'request_title']


			CountVectorizer fit with vocabulary size = 2442


	Stage 4 Generators:


		Fitting DropUniqueFeatureGenerator...


	Stage 5 Generators:


		Fitting DropDuplicatesFeatureGenerator...


	Types of features in original data (raw dtype, special dtypes):


		('float', [])        : 3 | ['requester_account_age_in_days_at_request', 'requester_days_since_first_post_on_raop_at_request', 'unix_timestamp_of_request']


		('int', [])          : 7 | ['requester_number_of_comments_at_request', 'requester_number_of_comments_in_raop_at_request', 'requester_number_of_posts_at_request', 'requester_number_of_posts_on_raop_at_request', 'requester_number_of_subreddits_at_request', ...]


		('object', [])       : 1 | ['requester_subreddits_at_request']


		('object', ['text']) : 2 | ['request_text_edit_aware', 'request_title']


	Types of features in processed data (raw dtype, special dtypes):


		('category', [])                    :    1 | ['requester_subreddits_at_request']


		('category', ['text_as_category'])  :    2 | ['request_text_edit_aware', 'request_title']


		('float', [])                       :    3 | ['requester_account_age_in_days_at_request', 'requester_days_since_first_post_on_raop_at_request', 'unix_timestamp_of_request']


		('int', [])                         :    7 | ['requester_number_of_comments_at_request', 'requester_number_of_comments_in_raop_at_request', 'requester_number_of_posts_at_request', 'requester_number_of_posts_on_raop_at_request', 'requester_number_of_subreddits_at_request', ...]


		('int', ['binned', 'text_special']) :   56 | ['request_text_edit_aware.char_count', 'request_text_edit_aware.word_count', 'request_text_edit_aware.capital_ratio', 'request_text_edit_aware.lower_ratio', 'request_text_edit_aware.digit_ratio', ...]


		('int', ['text_ngram'])             : 2430 | ['__nlp__.10', '__nlp__.100', '__nlp__.11', '__nlp__.12', '__nlp__.15', ...]


	5.0s = Fit runtime


	13 features in original data used to generate 2499 features in processed data.


	Train Data (Processed) Memory Usage: 19.26 MB (0.0% of available memory)


Data preprocessing and feature engineering runtime = 5.19s ...


AutoGluon will gauge predictive performance using evaluation metric: 'roc_auc'


	This metric expects predicted probabilities rather than predicted class labels, so you'll need to use predict_proba() instead of predict()


	To change this, specify the eval_metric parameter of Predictor()


Automatically generating train/validation split with holdout_frac=0.1238, Train Rows: 3539, Val Rows: 501


User-specified model hyperparameters to be fit:
{
	'NN_TORCH': [{}],
	'GBM': [{'extra_trees': True, 'ag_args': {'name_suffix': 'XT'}}, {}, {'learning_rate': 0.03, 'num_leaves': 128, 'feature_fraction': 0.9, 'min_data_in_leaf': 3, 'ag_args': {'name_suffix': 'Large', 'priority': 0, 'hyperparameter_tune_kwargs': None}}],
	'CAT': [{}],
	'XGB': [{}],
	'FASTAI': [{}],
	'RF': [{'criterion': 'gini', 'ag_args': {'name_suffix': 'Gini', 'problem_types': ['binary', 'multiclass']}}, {'criterion': 'entropy', 'ag_args': {'name_suffix': 'Entr', 'problem_types': ['binary', 'multiclass']}}, {'criterion': 'squared_error', 'ag_args': {'name_suffix': 'MSE', 'problem_types': ['regression', 'quantile']}}],
	'XT': [{'criterion': 'gini', 'ag_args': {'name_suffix': 'Gini', 'problem_types': ['binary', 'multiclass']}}, {'criterion': 'entropy', 'ag_args': {'name_suffix': 'Entr', 'problem_types': ['binary', 'multiclass']}}, {'criterion': 'squared_error', 'ag_args': {'name_suffix': 'MSE', 'problem_types': ['regressi

Fitting 11 L1 models, fit_strategy="sequential" ...


Fitting model: LightGBMXT ... Training model for up to 594.81s of the 594.81s of remaining time.


	Fitting with cpus=6, gpus=0, mem=0.5/160.0 GB


	0.7086	 = Validation score   (roc_auc)


	2.06s	 = Training   runtime


	0.01s	 = Validation runtime


Fitting model: LightGBM ... Training model for up to 592.73s of the 592.73s of remaining time.


	Fitting with cpus=6, gpus=0, mem=0.5/160.0 GB


	0.6986	 = Validation score   (roc_auc)


	2.06s	 = Training   runtime


	0.01s	 = Validation runtime


Fitting model: RandomForestGini ... Training model for up to 590.64s of the 590.64s of remaining time.


	Fitting with cpus=12, gpus=0, mem=0.0/160.0 GB


	0.6849	 = Validation score   (roc_auc)


	1.97s	 = Training   runtime


	0.08s	 = Validation runtime


Fitting model: RandomForestEntr ... Training model for up to 588.55s of the 588.55s of remaining time.


	Fitting with cpus=12, gpus=0, mem=0.0/160.0 GB


	0.6932	 = Validation score   (roc_auc)


	1.89s	 = Training   runtime


	0.08s	 = Validation runtime


Fitting model: CatBoost ... Training model for up to 586.54s of the 586.54s of remaining time.


	Fitting with cpus=6, gpus=0, mem=2.6/160.0 GB


	0.6874	 = Validation score   (roc_auc)


	7.98s	 = Training   runtime


	0.36s	 = Validation runtime


Fitting model: ExtraTreesGini ... Training model for up to 578.19s of the 578.19s of remaining time.


	Fitting with cpus=12, gpus=0, mem=0.0/159.9 GB


	0.6643	 = Validation score   (roc_auc)


	2.02s	 = Training   runtime


	0.08s	 = Validation runtime


Fitting model: ExtraTreesEntr ... Training model for up to 576.04s of the 576.04s of remaining time.


	Fitting with cpus=12, gpus=0, mem=0.0/159.9 GB


	0.6518	 = Validation score   (roc_auc)


	1.97s	 = Training   runtime


	0.08s	 = Validation runtime


Fitting model: NeuralNetFastAI ... Training model for up to 573.95s of the 573.95s of remaining time.


	Fitting with cpus=6, gpus=0, mem=0.2/159.9 GB


No improvement since epoch 7: early stopping


	0.6463	 = Validation score   (roc_auc)


	3.69s	 = Training   runtime


	0.01s	 = Validation runtime


Fitting model: XGBoost ... Training model for up to 570.23s of the 570.23s of remaining time.


	Fitting with cpus=6, gpus=0, mem=1.0/159.9 GB


	0.6949	 = Validation score   (roc_auc)


	5.06s	 = Training   runtime


	0.03s	 = Validation runtime


Fitting model: NeuralNetTorch ... Training model for up to 565.13s of the 565.13s of remaining time.


	Fitting with cpus=6, gpus=0, mem=0.1/159.9 GB


	0.6563	 = Validation score   (roc_auc)


	7.89s	 = Training   runtime


	0.03s	 = Validation runtime


Fitting model: LightGBMLarge ... Training model for up to 557.20s of the 557.20s of remaining time.


	Fitting with cpus=6, gpus=0, mem=2.0/159.9 GB


	0.6923	 = Validation score   (roc_auc)


	5.13s	 = Training   runtime


	0.02s	 = Validation runtime


Fitting model: WeightedEnsemble_L2 ... Training model for up to 360.00s of the 552.04s of remaining time.


	Fitting 1 model on all data | Fitting with cpus=12, gpus=0, mem=0.0/159.9 GB


	Ensemble Weights: {'LightGBMXT': 0.56, 'RandomForestEntr': 0.2, 'XGBoost': 0.12, 'ExtraTreesGini': 0.08, 'NeuralNetFastAI': 0.04}


	0.7157	 = Validation score   (roc_auc)


	0.06s	 = Training   runtime


	0.0s	 = Validation runtime


AutoGluon training complete, total runtime = 48.1s ... Best model: WeightedEnsemble_L2 | Estimated inference throughput: 2228.5 rows/s (501 batch size)


TabularPredictor saved. To load, use: predictor = TabularPredictor.load("/home/code/experiments/autogluon_models_v2")


In [None]:
# Score the test set, report the leaderboard, and write the submission file.
test_predictions = predictor.predict_proba(test_df[feature_cols])

# Keep only the probability of the positive class (True).
if isinstance(test_predictions, pd.DataFrame):
    # One column per class label; pick the one AutoGluon treats as positive.
    positive_class = predictor.positive_class
    test_pred_proba = test_predictions[positive_class]
else:
    # Already a single Series/array of probabilities.
    test_pred_proba = test_predictions

print(f"Test predictions shape: {test_pred_proba.shape}")
print(f"Test predictions sample:\n{test_pred_proba.head()}")

# Get leaderboard to see model performance
leaderboard = predictor.leaderboard(silent=True)
print("\nLeaderboard (top 5 models):")
print(leaderboard.head())

# Validation score of the first leaderboard row
best_model_score = leaderboard.iloc[0]['score_val']
print(f"\nBest model validation score: {best_model_score:.4f}")

# Create submission
submission_df = pd.DataFrame({
    'request_id': test_df['request_id'],
    'requester_received_pizza': test_pred_proba
})

# Sanity checks: one row per test request, and no prediction lost to index
# misalignment when the two columns were combined.
assert len(submission_df) == len(test_df), "submission row count differs from test set"
assert submission_df['requester_received_pizza'].notna().all(), "missing predictions in submission"

print(f"\nSubmission shape: {submission_df.shape}")
print(f"Submission sample:\n{submission_df.head()}")

# Save submission; create the output directory first because nothing earlier
# in the notebook does, and to_csv does not create missing directories.
submission_path = '/home/submission/autogluon_submission.csv'
Path(submission_path).parent.mkdir(parents=True, exist_ok=True)
submission_df.to_csv(submission_path, index=False)
print(f"\nSubmission saved to: {submission_path}")

In [None]:
# Make predictions on the test set and keep the positive-class probability.
# The probability columns are labelled by class, so select the positive class
# by name. A list selection such as [[1]] would not give the 1-D Series that
# the submission column and the summary statistics below require.
test_predictions = predictor.predict_proba(test_df[feature_cols])[predictor.positive_class]

# Create submission
submission = pd.DataFrame({
    'request_id': test_df['request_id'],
    'requester_received_pizza': test_predictions
})

print("Submission preview:")
print(submission.head())
print(f"\nSubmission shape: {submission.shape}")

# Check prediction distribution
print(f"\nPrediction distribution:")
print(f"Mean: {submission['requester_received_pizza'].mean():.4f}")
print(f"Std: {submission['requester_received_pizza'].std():.4f}")
print(f"Min: {submission['requester_received_pizza'].min():.4f}")
print(f"Max: {submission['requester_received_pizza'].max():.4f}")

# Save submission; create the output directory first because to_csv does not
# create missing directories.
submission_path = '/home/submission/submission_autogluon.csv'
Path(submission_path).parent.mkdir(parents=True, exist_ok=True)
submission.to_csv(submission_path, index=False)
print(f"\nSubmission saved to: {submission_path}")

In [16]:
# Debug: compare the column sets of train and test, and confirm that every
# selected feature exists in the test set.
train_columns = train_df.columns.tolist()
test_columns = test_df.columns.tolist()

print("Train columns:", train_columns)
print("\nTest columns:", test_columns)

missing_from_test = set(feature_cols) - set(test_columns)
print("\nMissing from test:", missing_from_test)

extra_in_test = set(test_columns) - set(train_columns)
print("\nExtra in test:", extra_in_test)

Train columns: ['giver_username_if_known', 'number_of_downvotes_of_request_at_retrieval', 'number_of_upvotes_of_request_at_retrieval', 'post_was_edited', 'request_id', 'request_number_of_comments_at_retrieval', 'request_text', 'request_text_edit_aware', 'request_title', 'requester_account_age_in_days_at_request', 'requester_account_age_in_days_at_retrieval', 'requester_days_since_first_post_on_raop_at_request', 'requester_days_since_first_post_on_raop_at_retrieval', 'requester_number_of_comments_at_request', 'requester_number_of_comments_at_retrieval', 'requester_number_of_comments_in_raop_at_request', 'requester_number_of_comments_in_raop_at_retrieval', 'requester_number_of_posts_at_request', 'requester_number_of_posts_at_retrieval', 'requester_number_of_posts_on_raop_at_request', 'requester_number_of_posts_on_raop_at_retrieval', 'requester_number_of_subreddits_at_request', 'requester_received_pizza', 'requester_subreddits_at_request', 'requester_upvotes_minus_downvotes_at_request', '