In [7]:
import pandas as pd
import numpy as np
# Read the raw training table from disk.
df = pd.read_csv('data/train.csv')

# The identifier column, when present, is dropped before any further processing.
if 'id' in df.columns:
    df = df.drop(columns='id')

# Parse the date column once, then derive day / month / year columns from it.
if 'date' in df.columns:
    parsed_dates = pd.to_datetime(df['date'])
    df = df.assign(
        date=parsed_dates,
        day=parsed_dates.dt.day,
        month=parsed_dates.dt.month,
        year=parsed_dates.dt.year,
    )

# Discard every row that still contains a missing value.
df = df.dropna()


In [8]:
from autogluon.tabular import TabularPredictor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

# Column the predictor is trained to estimate.
LABEL = 'num_sold'
# Last calendar year used for training; rows from later years are held out.
LAST_TRAIN_YEAR = 2015

# Prepare train/test split based on time (uses the `year` column of `df`).
train_data = df[df['year'] <= LAST_TRAIN_YEAR].copy()
test_data = df[df['year'] > LAST_TRAIN_YEAR].copy()

# Convert date to string before handing the frames to AutoGluon.
# NOTE(review): the original notebook calls this an AutoGluon requirement —
# confirm against the AutoGluon docs before relying on that statement.
train_data['date'] = train_data['date'].astype(str)
test_data['date'] = test_data['date'].astype(str)

# Initialize and train AutoGluon predictor
predictor = TabularPredictor(
    label=LABEL,
    eval_metric='root_mean_squared_error',
    path='agModels'  # Directory to store models
).fit(
    train_data,
    time_limit=3600,  # Time limit in seconds
    presets='best_quality',  # Or 'high_quality' for faster training
    excluded_model_types=['KNN']  # Optional: exclude specific models
)

# Evaluate all models on the held-out years.
# The result is bound to a name and printed explicitly: a bare expression in
# the middle of a cell is never rendered by Jupyter, so it would be discarded.
leaderboard = predictor.leaderboard(test_data, silent=True)
print("\nLeaderboard (test set):")
print(leaderboard.to_string())

# Get feature importance on the held-out years (kept and printed for the same reason).
feature_importance = predictor.feature_importance(test_data)
print("\nFeature Importance (test set):")
print(feature_importance.to_string())

# Make predictions
predictions = predictor.predict(test_data)

# Calculate metrics against the true label values of the held-out rows.
y_true = test_data[LABEL]
print("\nTest Set Metrics:")
print(f"RMSE: {np.sqrt(mean_squared_error(y_true, predictions)):.2f}")
print(f"MAE: {mean_absolute_error(y_true, predictions):.2f}")
print(f"R2: {r2_score(y_true, predictions):.2f}")

Verbosity: 2 (Standard Logging)
AutoGluon Version:  1.2
Python Version:     3.12.3
Operating System:   Darwin
Platform Machine:   arm64
Platform Version:   Darwin Kernel Version 23.2.0: Wed Nov 15 21:59:33 PST 2023; root:xnu-10002.61.3~2/RELEASE_ARM64_T8112
CPU Count:          8
Memory Avail:       1.35 GB / 8.00 GB (16.8%)
Disk Space Avail:   124.31 GB / 228.27 GB (54.5%)
Presets specified: ['best_quality']
Setting dynamic_stacking from 'auto' to True. Reason: Enable dynamic_stacking when use_bag_holdout is disabled. (use_bag_holdout=False)
Stack configuration (auto_stack=True): num_stack_levels=1, num_bag_folds=8, num_bag_sets=1
DyStack is enabled (dynamic_stacking=True). AutoGluon will try to determine whether the input data is affected by stacked overfitting and enable or disable stacking as a consequence.
	This is used to identify the optimal `num_stack_levels` value. Copies of AutoGluon will be fit on subsets of the data. Then holdout validation data is used to detect stacked ove