In [1]:
# ===== Install =====
!pip -q install -U pip
!pip -q install -U autogluon.tabular==1.4.0


In [2]:
# ===== Load & Merge (sampled) =====
import pandas as pd, numpy as np
from autogluon.tabular import TabularPredictor

INPUT = "/kaggle/input/ieee-fraud-detection"

print("Loading...")
train_trans = pd.read_csv(f"{INPUT}/train_transaction.csv")
test_trans  = pd.read_csv(f"{INPUT}/test_transaction.csv")
train_id    = pd.read_csv(f"{INPUT}/train_identity.csv")
test_id     = pd.read_csv(f"{INPUT}/test_identity.csv")

# Merge identity info
train = train_trans.merge(train_id, how="left", on="TransactionID")
test  = test_trans.merge(test_id,  how="left", on="TransactionID")

print("Shapes before sampling:", train.shape, test.shape)

# === Reduce size ===
# 1) Use 10% of training (random sample)
train = train.sample(frac=0.1, random_state=42)

# 2) Drop high-NA columns (>95%)
na_thresh = 0.95
drop_cols = [c for c in train.columns if c != "isFraud" and train[c].isna().mean() > na_thresh]
train = train.drop(columns=drop_cols)
test  = test.drop(columns=[c for c in drop_cols if c in test.columns])

# 3) Fill remaining NaNs with -999 (simple, fast)
train = train.fillna(-999)
test  = test.fillna(-999)

label = "isFraud"
print("Train shape after sampling & cleanup:", train.shape)
print("Positive rate:", train[label].mean().round(4))


Loading...
Shapes before sampling: (590540, 434) (506691, 433)
Train shape after sampling & cleanup: (59054, 425)
Positive rate: 0.0357


In [3]:
# ===== Train (fast preset, CPU only) =====
predictor = TabularPredictor(label=label, problem_type="binary", path="ag_ieee_small")

predictor.fit(
    train_data=train,
    time_limit=480,  # 8 minutes
    presets="medium_quality_faster_train",  # lighter preset
    num_bag_folds=0,
    num_stack_levels=0,
    verbosity=2
)

print("\n=== Leaderboard ===")
leader = predictor.leaderboard(silent=True)
print(leader.head(10))


Preset alias specified: 'medium_quality_faster_train' maps to 'medium_quality'.
Verbosity: 2 (Standard Logging)
AutoGluon Version:  1.4.0
Python Version:     3.11.13
Operating System:   Linux
Platform Machine:   x86_64
Platform Version:   #1 SMP PREEMPT_DYNAMIC Sun Nov 10 10:07:59 UTC 2024
CPU Count:          4
Memory Avail:       23.41 GB / 31.35 GB (74.7%)
Disk Space Avail:   17.74 GB / 19.52 GB (90.9%)
Presets specified: ['medium_quality_faster_train']
Using hyperparameters preset: hyperparameters='default'
Beginning AutoGluon training ... Time limit = 480s
AutoGluon will save models to "/kaggle/working/ag_ieee_small"
Train Data Rows:    59054
Train Data Columns: 424
Label Column:       isFraud
Problem Type:       binary
Preprocessing data ...
Selected class <--> label mapping:  class 1 = 1, class 0 = 0
Using Feature Generators to preprocess the data ...
Fitting AutoMLPipelineFeatureGenerator...
	Available Memory:                    24035.22 MB
	Train Data (Original)  Memory Usage: 


=== Leaderboard ===
                 model  score_val eval_metric  pred_time_val    fit_time  \
0        LightGBMLarge     0.9780    accuracy       0.051179   20.344257   
1  WeightedEnsemble_L2     0.9780    accuracy       0.052334   20.489179   
2             CatBoost     0.9748    accuracy       0.027523   35.548538   
3              XGBoost     0.9744    accuracy       0.073711   10.597123   
4             LightGBM     0.9740    accuracy       0.019649    8.131025   
5       NeuralNetTorch     0.9736    accuracy       0.320346  129.467934   
6           LightGBMXT     0.9724    accuracy       0.014170   12.547744   
7      NeuralNetFastAI     0.9724    accuracy       0.087279  114.459386   
8     RandomForestGini     0.9720    accuracy       0.155100   41.313329   
9     RandomForestEntr     0.9716    accuracy       0.144364   31.411101   

   pred_time_val_marginal  fit_time_marginal  stack_level  can_infer  \
0                0.051179          20.344257            1       True  

In [7]:
# ===== Align test columns and Predict =====
# works for all AG >= 1.0

train_cols = predictor.feature_metadata_in.get_features()  # returns list of training feature names

# ensure the test set matches the same feature space
missing = [c for c in train_cols if c not in test.columns]
for c in missing:
    test[c] = -999
extra = [c for c in test.columns if c not in train_cols + ["TransactionID"]]
test = test.drop(columns=extra, errors="ignore")
test = test[train_cols]  # reorder columns

print(f"Aligned: kept {len(train_cols)}, added {len(missing)}, dropped {len(extra)}")

# predict probabilities for the positive class
pred_proba = predictor.predict_proba(test)[1]

# build submission
sub = pd.DataFrame({
    "TransactionID": test_id["TransactionID"],
    "isFraud": pred_proba
})
sub.to_csv("/kaggle/working/submission.csv", index=False)
print("✅ Saved: /kaggle/working/submission.csv")


Aligned: kept 421, added 29, dropped 41


  X_datetime[datetime_feature + "." + feature] = getattr(
  X_datetime[datetime_feature + "." + feature] = getattr(
  X_datetime[datetime_feature + "." + feature] = getattr(
  X_datetime[datetime_feature + "." + feature] = getattr(
  X_datetime[datetime_feature] = self.normalize_timeseries(X, datetime_feature, is_fit=is_fit)
  X_datetime[datetime_feature + "." + feature] = getattr(
  X_datetime[datetime_feature + "." + feature] = getattr(
  X_datetime[datetime_feature + "." + feature] = getattr(
  X_datetime[datetime_feature + "." + feature] = getattr(
  X_datetime[datetime_feature] = self.normalize_timeseries(X, datetime_feature, is_fit=is_fit)
  X_datetime[datetime_feature + "." + feature] = getattr(
  X_datetime[datetime_feature + "." + feature] = getattr(
  X_datetime[datetime_feature + "." + feature] = getattr(
  X_datetime[datetime_feature + "." + feature] = getattr(
  X_datetime[datetime_feature] = self.normalize_timeseries(X, datetime_feature, is_fit=is_fit)
  X_datetime[dateti

✅ Saved: /kaggle/working/submission.csv
