In [1]:
# 1. Import library
from autogluon.tabular import TabularPredictor
from sklearn.model_selection import train_test_split
import pandas as pd
import warnings
from datetime import datetime


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Silence only warnings whose message matches the "load_learner ... insecure pickle"
# pattern; every other warning stays visible.
warnings.filterwarnings("ignore", message=".*load_learner.*insecure pickle.*")

# 2. Load the data
# The claim identifier is removed from the training frame as soon as it is read.
train_df = pd.read_csv('../Data/processed/0427_01/train_2025.csv').drop(columns="claim_number")

# For the test frame the identifier is moved out into `test_id` (it is needed
# again when the submission file is written); `test_df` keeps the other columns.
test_df = pd.read_csv('../Data/processed/0427_01/test_2025.csv')
test_id = test_df.pop('claim_number')

In [3]:
# Columns removed from both frames before training. The same list is applied
# to train and test.
drop_columns = [
    'median_home_value',
    'occupied_housing_units',
    'housing_units',
    'population_density',
    'population',
    'vacancy_rate',
    'pop_per_occupied_housing_units',
    'home_value_v_median_household_income',
    'log_occupied_housing_per_sqmi',
]

# Reassign instead of mutating in place, and skip columns that are already
# absent, so this cell can be re-run without a KeyError (a strict drop fails
# the second time because the columns are gone).
# Trade-off: a misspelled name in `drop_columns` is skipped silently.
train_df = train_df.drop(columns=drop_columns, errors="ignore")
test_df = test_df.drop(columns=drop_columns, errors="ignore")

In [4]:
# Run tag in MMDD_HHMM form; it becomes part of the model output directory name.
timestamp = datetime.now().strftime("%m%d_%H%M")

# Configure the predictor: binary target column "fraud", scored with F1.
predictor = TabularPredictor(
    label="fraud",
    problem_type="binary",
    eval_metric="f1",
    path=f"../AutogluonModels/Model_{timestamp}",
)

# Train. `fit` is called on the predictor built above, so `predictor` refers
# to the fitted object afterwards.
# NOTE(review): the fit log reports use_bag_holdout=False, so `holdout_frac`
# may have no effect on this bagged run; confirm against the AutoGluon docs.
predictor.fit(
    train_data=train_df,
    # Or use "high_quality_fast_inference_only_refit" if you want lighter models
    presets="best",
    holdout_frac=0.2,
    # Only the three gradient-boosting families are trained, each asking for one GPU.
    included_model_types=["GBM", "CAT", "XGB"],
    hyperparameters={
        'GBM': {'ag_args_fit': {'num_gpus': 1}},  # LightGBM
        'CAT': {'ag_args_fit': {'num_gpus': 1}},  # CatBoost
        'XGB': {'ag_args_fit': {'num_gpus': 1}},  # XGBoost
    },
    verbosity=2,
)


Preset alias specified: 'best' maps to 'best_quality'.
Verbosity: 2 (Standard Logging)
AutoGluon Version:  1.2
Python Version:     3.11.12
Operating System:   Windows
Platform Machine:   AMD64
Platform Version:   10.0.26100
CPU Count:          16
Memory Avail:       15.63 GB / 31.93 GB (49.0%)
Disk Space Avail:   284.22 GB / 935.97 GB (30.4%)
Presets specified: ['best']
Setting dynamic_stacking from 'auto' to True. Reason: Enable dynamic_stacking when use_bag_holdout is disabled. (use_bag_holdout=False)
Stack configuration (auto_stack=True): num_stack_levels=1, num_bag_folds=8, num_bag_sets=1
DyStack is enabled (dynamic_stacking=True). AutoGluon will try to determine whether the input data is affected by stacked overfitting and enable or disable stacking as a consequence.
	This is used to identify the optimal `num_stack_levels` value. Copies of AutoGluon will be fit on subsets of the data. Then holdout validation data is used to detect stacked overfitting.
	Running DyStack for up to 90

In [8]:
# Fetch the predictor's model leaderboard and keep it in `temp`.
temp = predictor.leaderboard()
# Bare expression on the last line: the notebook renders it as the cell output.
temp

Unnamed: 0,model,score_val,eval_metric,pred_time_val,fit_time,pred_time_val_marginal,fit_time_marginal,stack_level,can_infer,fit_order
0,XGBoost_BAG_L2,0.132773,f1,1.096547,339.533042,0.333666,76.490993,2,True,7
1,WeightedEnsemble_L3,0.132773,f1,1.10155,340.257819,0.005003,0.724777,3,True,8
2,XGBoost_BAG_L1,0.116592,f1,0.192348,50.764673,0.192348,50.764673,1,True,3
3,WeightedEnsemble_L2,0.116592,f1,0.196348,51.108705,0.004,0.344032,2,True,4
4,LightGBM_BAG_L1,0.095149,f1,0.530352,69.804777,0.530352,69.804777,1,True,1
5,LightGBM_BAG_L2,0.09005,f1,1.169547,326.124204,0.406666,63.082155,2,True,5
6,CatBoost_BAG_L1,0.024088,f1,0.040181,142.472599,0.040181,142.472599,1,True,2
7,CatBoost_BAG_L2,0.008336,f1,0.801175,345.671312,0.038294,82.629263,2,True,6


In [9]:
  # MonthDay_HourMinute format
# Tag for the submission file name (MMDD_HHMM), taken when this cell runs.
timestamp = datetime.now().strftime("%m%d_%H%M")

# 4. Predict on the test set
# Predict on the `test_df` prepared in the earlier cells (claim_number and the
# `drop_columns` features already removed). Re-reading the raw CSV here would
# throw that preprocessing away and hand the predictor columns it was not
# trained on.
predictions = predictor.predict(test_df)

# `test_id` comes from the same CSV read as `test_df`, so rows should pair up
# one-to-one; fail loudly if the two have drifted apart (e.g. stale kernel state).
assert len(predictions) == len(test_id), "prediction count does not match claim_number count"

# 5. Save predictions to CSV
submission = pd.DataFrame({
    "claim_number": test_id,  # original claim identifiers
    "fraud": predictions,     # predicted fraud labels
})
submission.to_csv(f"../Submit/submissions/submission_{timestamp}.csv", index=False)

In [7]:
# Feature importance on a 2000-row subsample, using 3 shuffle sets.
# NOTE(review): `train_df` is the frame the predictor was fit on. AutoGluon's
# documentation advises computing importance on held-out data because training
# rows can bias the scores; confirm, and switch to a held-out frame if one exists.
importances = predictor.feature_importance(
    data=train_df,
    num_shuffle_sets=3,
    subsample_size=2000,
)

Computing feature importance via permutation shuffling for 47 features using 2000 rows with 3 shuffle sets...
	116.24s	= Expected runtime (38.75s per shuffle set)
	40.75s	= Actual runtime (Completed 3 of 3 shuffle sets)
