In [1]:
# 1. Import library
from autogluon.tabular import TabularPredictor
from sklearn.model_selection import train_test_split
import pandas as pd
import warnings
from datetime import datetime


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Suppress any warning whose message matches the load_learner / insecure-pickle pattern.
warnings.filterwarnings("ignore", message=".*load_learner.*insecure pickle.*")

# 2. Load the data
# claim_number is stripped from both frames; the test-set values are kept
# separately in test_id.
train_df = pd.read_csv('../Data/processed/0427_01/train_2025.csv').drop(columns="claim_number")

test_df = pd.read_csv('../Data/processed/0427_01/test_2025.csv')
test_id = test_df.pop('claim_number')  # pop() removes the column and returns it as a Series

In [None]:
# Timestamp (MonthDay_HourMinute) gives each training run its own model directory.
timestamp = datetime.now().strftime("%m%d_%H%M")

# Train a binary fraud classifier optimised for F1.
predictor = TabularPredictor(
    label="fraud",
    eval_metric="f1",
    problem_type="binary",
    path=f"../AutogluonModels/Model_{timestamp}"
).fit(
    train_data=train_df,
    # "best" is an alias of "best_quality", which trains with bagging and stacking.
    # holdout_frac is intentionally not passed: AutoGluon ignores it when bagging
    # is active (num_bag_folds >= 2) unless use_bag_holdout=True, so it had no effect.
    presets="best",
    hyperparameters={
        # In AutoGluon's hyperparameters dict the 'GBM' key selects LightGBM only.
        # CatBoost ('CAT') and XGBoost ('XGB') are separate keys and are NOT
        # trained by this configuration; add them here and to
        # included_model_types if they are wanted.
        'GBM': {
            'ag_args_fit': {'num_gpus': 1}  # request one GPU for each LightGBM fit
        }
    },
    included_model_types=["GBM"],  # restrict training to the LightGBM family
    verbosity=2
)


No path specified. Models will be saved in: "AutogluonModels\ag-20250428_185003"
Preset alias specified: 'best' maps to 'best_quality'.
Verbosity: 2 (Standard Logging)
AutoGluon Version:  1.2
Python Version:     3.11.12
Operating System:   Windows
Platform Machine:   AMD64
Platform Version:   10.0.26100
CPU Count:          16
Memory Avail:       15.34 GB / 31.93 GB (48.0%)
Disk Space Avail:   285.19 GB / 935.97 GB (30.5%)
Presets specified: ['best']
Setting dynamic_stacking from 'auto' to True. Reason: Enable dynamic_stacking when use_bag_holdout is disabled. (use_bag_holdout=False)
Stack configuration (auto_stack=True): num_stack_levels=1, num_bag_folds=8, num_bag_sets=1
DyStack is enabled (dynamic_stacking=True). AutoGluon will try to determine whether the input data is affected by stacked overfitting and enable or disable stacking as a consequence.
	This is used to identify the optimal `num_stack_levels` value. Copies of AutoGluon will be fit on subsets of the data. Then holdout val

KeyboardInterrupt: 

[36m(_ray_fit pid=32660)[0m No improvement since epoch 2: early stopping


In [8]:
# Summary table of the fitted models: one row per model, with its validation
# score, eval metric, prediction/fit times, stack level and fit order.
temp = predictor.leaderboard()
# Bare name as the last expression so the notebook renders the table.
temp

Unnamed: 0,model,score_val,eval_metric,pred_time_val,fit_time,pred_time_val_marginal,fit_time_marginal,stack_level,can_infer,fit_order
0,WeightedEnsemble_L3,0.372626,f1,12.431560,497.427063,0.004000,2.927385,3,True,128
1,NeuralNetFastAI_r143_BAG_L2,0.366606,f1,10.817639,375.918469,0.200080,10.570300,2,True,121
2,NeuralNetFastAI_r102_BAG_L2,0.363663,f1,10.852048,374.476654,0.234489,9.128484,2,True,107
3,NeuralNetFastAI_r156_BAG_L2,0.362118,f1,10.837050,376.610555,0.219491,11.262386,2,True,123
4,NeuralNetFastAI_BAG_L2,0.361727,f1,10.985593,382.348483,0.368034,17.000314,2,True,96
...,...,...,...,...,...,...,...,...,...,...
123,ExtraTreesEntr_BAG_L1,0.000000,f1,0.587954,0.699074,0.587954,0.699074,1,True,7
124,ExtraTrees_r126_BAG_L1,0.000000,f1,0.758054,1.026744,0.758054,1.026744,1,True,80
125,XGBoost_r31_BAG_L1,0.000000,f1,0.803297,2.912572,0.803297,2.912572,1,True,62
126,LightGBM_r96_BAG_L2,0.000000,f1,10.662560,367.158300,0.045001,1.810131,2,True,103


In [9]:
  # MonthDay_HourMinute format
timestamp = datetime.now().strftime("%m%d_%H%M")

# 4. Predict on the test set
# Read the file once and split ids from features in this cell, so the submission
# ids always come from the same rows that are scored and the identifier column
# is not handed to the model (it was dropped from the training frame as well).
test_raw = pd.read_csv('../Data/processed/0427_01/test_2025.csv')
test_id = test_raw["claim_number"]
test_df = test_raw.drop(columns=["claim_number"])
predictions = predictor.predict(test_df)

# 5. Save predictions to CSV
submission = pd.DataFrame({
    "claim_number": test_id,  # original claim identifiers, row-aligned with predictions
    "fraud": predictions,     # predicted fraud labels (0 or 1)
})
submission.to_csv(f"../Submit/submissions/submission_{timestamp}.csv", index=False)

In [6]:
# Permutation feature importance on a 1000-row subsample with 2 shuffle sets.
# `data` is deliberately not passed: the AutoGluon docs warn against supplying
# the training frame here because the resulting scores are biased by
# overfitting. With no `data`, the predictor falls back to the data it cached
# during fit() (requires the default cache_data=True).
importances = predictor.feature_importance(subsample_size=1000, num_shuffle_sets=2)

Computing feature importance via permutation shuffling for 47 features using 1000 rows with 2 shuffle sets...
	462.5s	= Expected runtime (231.25s per shuffle set)
	99.2s	= Actual runtime (Completed 2 of 2 shuffle sets)
