# 1. Setup & installs

In [6]:
# Colab: install extra libraries
!pip install -q imbalanced-learn xgboost fastapi uvicorn[standard] joblib


In [7]:
# Core libraries
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import roc_auc_score, classification_report, confusion_matrix

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier

from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from imblearn.pipeline import Pipeline as ImbPipeline

import joblib


# 2. Load dataset

In [13]:
from google.colab import files
import zipfile, os, glob
import pandas as pd

# 1) Upload the zip file (choose dataset.zip)
uploaded = files.upload()

# 2) Get the name of the uploaded zip
zip_name = next(iter(uploaded))   # e.g. "dataset.zip"
print("Uploaded file:", zip_name)

# 3) Extract it into /content/data
extract_dir = "/content/data"
os.makedirs(extract_dir, exist_ok=True)

with zipfile.ZipFile(zip_name, "r") as z:
    z.extractall(extract_dir)

print("\nExtracted contents:\n")
for root, dirs, files_in_dir in os.walk(extract_dir):
    print(root)
    for f in files_in_dir:
        print("   ", f)

# 4) Find all .pkl files (your fraud data is stored as daily .pkl files)
pkl_files = sorted(glob.glob(os.path.join(extract_dir, "**", "*.pkl"),
                             recursive=True))

if len(pkl_files) == 0:
    raise ValueError("No .pkl files found inside the uploaded zip. Check the zip structure.")
else:
    print(f"\nFound {len(pkl_files)} pickle files. Example:")
    print(pkl_files[:5])

# 5) Load and concatenate all .pkl files into a single DataFrame
dfs = [pd.read_pickle(p) for p in pkl_files]
df = pd.concat(dfs, ignore_index=True)

print("\nFinal DataFrame:")
print("Shape:", df.shape)
print(df.head())
print(df.columns)


Saving dataset.zip to dataset (4).zip
Uploaded file: dataset (4).zip

Extracted contents:

/content/data
/content/data/data
    2018-09-28.pkl
    2018-05-24.pkl
    2018-07-16.pkl
    2018-06-16.pkl
    2018-07-11.pkl
    2018-09-30.pkl
    2018-08-01.pkl
    2018-06-20.pkl
    2018-05-21.pkl
    2018-04-15.pkl
    2018-08-22.pkl
    2018-07-24.pkl
    2018-06-30.pkl
    2018-08-16.pkl
    2018-07-15.pkl
    2018-08-29.pkl
    2018-07-21.pkl
    2018-04-18.pkl
    2018-07-01.pkl
    2018-08-02.pkl
    2018-05-04.pkl
    2018-09-23.pkl
    2018-09-10.pkl
    2018-04-17.pkl
    2018-05-07.pkl
    2018-07-13.pkl
    2018-08-20.pkl
    2018-08-28.pkl
    2018-07-26.pkl
    2018-05-01.pkl
    2018-04-05.pkl
    2018-04-29.pkl
    2018-08-12.pkl
    2018-06-19.pkl
    2018-08-06.pkl
    2018-05-15.pkl
    2018-05-20.pkl
    2018-06-14.pkl
    2018-04-08.pkl
    2018-09-05.pkl
    2018-08-25.pkl
    2018-05-05.pkl
    2018-06-08.pkl
    2018-04-13.pkl
    2018-04-21.pkl
    2018-04-01.pkl
  

# 3. Parse datetime & sort + basic time-based split

In [14]:
# Parse TX_DATETIME and sort
df['TX_DATETIME'] = pd.to_datetime(df['TX_DATETIME'])
df = df.sort_values('TX_DATETIME').reset_index(drop=True)

# Choose last 30 days as test set (you can tweak this)
max_date = df['TX_DATETIME'].max().normalize()
split_date = max_date - pd.Timedelta(days=30)

train_df = df[df['TX_DATETIME'] < split_date].copy()
test_df  = df[df['TX_DATETIME'] >= split_date].copy()

print("Train shape:", train_df.shape, "Test shape:", test_df.shape)
print("Train fraud rate:", train_df['TX_FRAUD'].mean())
print("Test fraud rate:", test_df['TX_FRAUD'].mean())


Train shape: (1456778, 9) Test shape: (297377, 9)
Train fraud rate: 0.008268246774731633
Test fraud rate: 0.008864169051406127


# 4. Feature engineering – velocity & aggregates
We’ll add:



1.   Transaction hour, day of week

2.  Customer 1-day transaction count & total amount

3.   Terminal 1-day transaction count



In [16]:

# 1) Add simple time features (hour, day-of-week, day)
def add_time_features(data):
    data['TX_HOUR'] = data['TX_DATETIME'].dt.hour
    data['TX_DOW']  = data['TX_DATETIME'].dt.dayofweek
    data['TX_DAY']  = data['TX_DATETIME'].dt.date
    return data

train_df = add_time_features(train_df)
test_df  = add_time_features(test_df)


# 2) Add velocity features for a given key (CUSTOMER_ID / TERMINAL_ID)
def add_velocity_features(data, key_col, amount_col='TX_AMOUNT', window='1D'):
    """
    For each key (customer/terminal), compute in the past 'window':
    - number of past transactions (key_col_1d_txn_count)
    - total past transaction amount (key_col_1d_txn_amount)

    Uses groupby().rolling(on='TX_DATETIME') to avoid index issues.
    """
    # Sort by time so rolling is chronological
    data = data.sort_values(['TX_DATETIME'])

    # Rolling window per key, using TX_DATETIME as time column
    grouped = (
        data
        .groupby(key_col)
        .rolling(window=window, on='TX_DATETIME')[amount_col]
    )

    # grouped.count() / grouped.sum() include current txn -> subtract current
    count_roll = grouped.count().values - 1
    sum_roll   = grouped.sum().values - data[amount_col].values

    # Assign back (order matches data after .values)
    data[f'{key_col}_1d_txn_count']  = count_roll
    data[f'{key_col}_1d_txn_amount'] = sum_roll

    # No past history -> 0
    data[f'{key_col}_1d_txn_count']  = data[f'{key_col}_1d_txn_count'].fillna(0)
    data[f'{key_col}_1d_txn_amount'] = data[f'{key_col}_1d_txn_amount'].fillna(0)

    return data


# 3) Apply velocity features first on train_df (for inspection, if needed)
train_df = add_velocity_features(train_df, 'CUSTOMER_ID')
train_df = add_velocity_features(train_df, 'TERMINAL_ID')

# 4) Recompute velocity features on full data to avoid leakage,
#    then re-split into train and test
full_df = pd.concat([train_df, test_df], axis=0)
full_df = full_df.sort_values('TX_DATETIME').reset_index(drop=True)

full_df = add_velocity_features(full_df, 'CUSTOMER_ID')
full_df = add_velocity_features(full_df, 'TERMINAL_ID')

# 5) Final train/test after adding all features
train_df = full_df[full_df['TX_DATETIME'] < split_date].copy()
test_df  = full_df[full_df['TX_DATETIME'] >= split_date].copy()

print("Train shape with features:", train_df.shape)
print("Test shape with features:", test_df.shape)


Train shape with features: (1456778, 16)
Test shape with features: (297377, 16)


# 5. Prepare features & encoders
We’ll label-encode IDs and scale numeric features.

In [19]:
# Target
y_train = train_df['TX_FRAUD']
y_test  = test_df['TX_FRAUD']

# Feature columns
numeric_cols = [
    'TX_AMOUNT', 'TX_HOUR', 'TX_DOW',
    'CUSTOMER_ID_1d_txn_count', 'CUSTOMER_ID_1d_txn_amount',
    'TERMINAL_ID_1d_txn_count', 'TERMINAL_ID_1d_txn_amount'
]

# Fit LabelEncoders on ALL IDs (train + test) to avoid "unseen label" errors
cust_le = LabelEncoder()
term_le = LabelEncoder()

all_cust_ids = pd.concat([train_df['CUSTOMER_ID'], test_df['CUSTOMER_ID']])
all_term_ids = pd.concat([train_df['TERMINAL_ID'], test_df['TERMINAL_ID']])

cust_le.fit(all_cust_ids)
term_le.fit(all_term_ids)

train_df['CUSTOMER_ID_ENC'] = cust_le.transform(train_df['CUSTOMER_ID'])
test_df['CUSTOMER_ID_ENC']  = cust_le.transform(test_df['CUSTOMER_ID'])

train_df['TERMINAL_ID_ENC'] = term_le.transform(train_df['TERMINAL_ID'])
test_df['TERMINAL_ID_ENC']  = term_le.transform(test_df['TERMINAL_ID'])

categorical_enc_cols = ['CUSTOMER_ID_ENC', 'TERMINAL_ID_ENC']

X_train = train_df[categorical_enc_cols + numeric_cols]
X_test  = test_df[categorical_enc_cols + numeric_cols]

# Preprocessor: scale numeric, pass IDs through
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numeric_cols),
        ('cat', 'passthrough', categorical_enc_cols),
    ]
)

# 6. Helper to train & evaluate models

In [21]:
from sklearn.metrics import roc_curve
def evaluate_model(name, model, X_train, y_train, X_test, y_test):
    model.fit(X_train, y_train)
    y_proba = model.predict_proba(X_test)[:, 1]
    y_pred  = (y_proba >= 0.5).astype(int)

    auc = roc_auc_score(y_test, y_proba)
    print(f"\n==== {name} ====")
    print("ROC-AUC:", round(auc, 4))
    print("\nClassification report:")
    print(classification_report(y_test, y_pred, digits=4))
    print("Confusion matrix:\n", confusion_matrix(y_test, y_pred))

    return auc

# 7. Models with imbalance handling
7.1 Logistic Regression with class weights

In [22]:
log_reg_clf = Pipeline(steps=[
    ('preprocess', preprocessor),
    ('clf', LogisticRegression(
        max_iter=2000,
        class_weight='balanced',  # handles imbalance
        n_jobs=-1
    ))
])

auc_logreg = evaluate_model("Logistic Regression (class_weight=balanced)",
                            log_reg_clf, X_train, y_train, X_test, y_test)



==== Logistic Regression (class_weight=balanced) ====
ROC-AUC: 0.6491

Classification report:
              precision    recall  f1-score   support

           0     0.9937    0.7560    0.8587    294741
           1     0.0168    0.4651    0.0324      2636

    accuracy                         0.7535    297377
   macro avg     0.5052    0.6106    0.4455    297377
weighted avg     0.9851    0.7535    0.8514    297377

Confusion matrix:
 [[222835  71906]
 [  1410   1226]]



7.2 RandomForest + undersampling

In [23]:
rf = RandomForestClassifier(
    n_estimators=200,
    max_depth=None,
    n_jobs=-1,
    class_weight=None,  # we'll use undersampling instead
    random_state=42
)

rf_pipeline = ImbPipeline(steps=[
    ('preprocess', preprocessor),
    ('undersample', RandomUnderSampler(random_state=42)),
    ('clf', rf)
])

auc_rf = evaluate_model("RandomForest + RandomUnderSampler",
                        rf_pipeline, X_train, y_train, X_test, y_test)



==== RandomForest + RandomUnderSampler ====
ROC-AUC: 0.6327

Classification report:
              precision    recall  f1-score   support

           0     0.9934    0.7891    0.8795    294741
           1     0.0173    0.4162    0.0333      2636

    accuracy                         0.7858    297377
   macro avg     0.5054    0.6026    0.4564    297377
weighted avg     0.9848    0.7858    0.8720    297377

Confusion matrix:
 [[232572  62169]
 [  1539   1097]]



7.3 XGBoost + SMOTE

In [24]:
xgb = XGBClassifier(
    n_estimators=300,
    max_depth=5,
    learning_rate=0.05,
    subsample=0.8,
    colsample_bytree=0.8,
    objective='binary:logistic',
    eval_metric='auc',
    n_jobs=-1,
    random_state=42
)

xgb_pipeline = ImbPipeline(steps=[
    ('preprocess', preprocessor),
    ('smote', SMOTE(random_state=42)),
    ('clf', xgb)
])

auc_xgb = evaluate_model("XGBoost + SMOTE",
                         xgb_pipeline, X_train, y_train, X_test, y_test)

print("\nAUC summary:")
print("LogReg:", auc_logreg)
print("RF    :", auc_rf)
print("XGB   :", auc_xgb)



==== XGBoost + SMOTE ====
ROC-AUC: 0.6425

Classification report:
              precision    recall  f1-score   support

           0     0.9930    0.9999    0.9965    294741
           1     0.9686    0.2105    0.3459      2636

    accuracy                         0.9929    297377
   macro avg     0.9808    0.6052    0.6712    297377
weighted avg     0.9928    0.9929    0.9907    297377

Confusion matrix:
 [[294723     18]
 [  2081    555]]

AUC summary:
LogReg: 0.6490699179685131
RF    : 0.6327111990698204
XGB   : 0.6425251393138203


# 8. Save the best model & encoders

In [25]:
BEST_MODEL_PATH = "/content/fraud_model_xgb.joblib"
ID_ENCODERS_PATH = "/content/id_encoders.joblib"

joblib.dump(xgb_pipeline, BEST_MODEL_PATH)
joblib.dump(
    {
        "cust_le": cust_le,
        "term_le": term_le
    },
    ID_ENCODERS_PATH
)

print("Saved model to:", BEST_MODEL_PATH)
print("Saved encoders to:", ID_ENCODERS_PATH)


Saved model to: /content/fraud_model_xgb.joblib
Saved encoders to: /content/id_encoders.joblib


# Export model from Google Colab

In [26]:
from google.colab import files

files.download('/content/fraud_model_xgb.joblib')
files.download('/content/id_encoders.joblib')


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>