In [1]:
import pandas as pd
import numpy as np
from imblearn.over_sampling import SMOTE
from pathlib import Path

In [2]:
# Locations of the input (imputed) dataset and of the augmented output.
# Both files sit in the same processed-data folder, addressed relative to
# the notebook's working directory.
data_dir = Path("../../data/processed")
original_csv = data_dir.joinpath("imputed_dataset.csv")
augmented_csv = data_dir.joinpath("imputed_dataset_with_synthetic.csv")

In [3]:
# Read the imputed dataset that the rest of the notebook augments.
# NOTE(review): the head() output further down shows an 'Unnamed: 0' column,
# so the CSV appears to carry a saved index -- confirm whether it should be
# read with index_col=0 instead.
df = pd.read_csv(original_csv)

# Raw columns used as model inputs (and, later, as the SMOTE feature space).
# The order here fixes the column order of the resampled feature matrix.
features = [
    'credit_limit_used(%)', 'credit_score',
    'prev_defaults', 'default_in_last_6months',
    'no_of_children', 'owns_car',
    'no_of_days_employed', 'yearly_debt_payments',
    'migrant_worker', 'total_family_members',
]

In [4]:
# Non-linear transforms of the two credit columns: squares, their product,
# and the ratio of utilisation to score. Assigning to df is safe to repeat,
# since each column is simply recomputed from the raw columns.
df['credit_score_squared'] = df['credit_score'] ** 2
df['credit_limit_used_squared'] = df['credit_limit_used(%)'] ** 2
df['credit_score_x_credit_limit_used'] = df['credit_score'] * df['credit_limit_used(%)']
# assumes credit_score is never 0 -- TODO confirm; a zero would produce inf here
df['credit_ratio_limit'] = df['credit_limit_used(%)'] / df['credit_score']

engineered_features = [
    'credit_score_squared',
    'credit_limit_used_squared',
    'credit_score_x_credit_limit_used',
    'credit_ratio_limit'
]

# Append only the names that are not already present, so re-running this cell
# does not add duplicate entries to `features` (duplicates would make
# df[features] return repeated columns). The list is still extended in place.
features.extend([name for name in engineered_features if name not in features])

# Binary label column: the notebook treats 1 as a default and 0 as a non-default.
target = 'credit_card_default'

In [5]:
# First-time defaulters: rows labelled as a default that have no previous
# defaults on record (prev_defaults == 0).
is_default = df[target].eq(1)
has_no_prior_default = df['prev_defaults'].eq(0)
first_time_mask = is_default & has_no_prior_default
first_time_df = df.loc[first_time_mask]

# Non-defaulters are taken as they are; repeat defaulters fall in neither subset.
non_default_df = df.loc[df[target].eq(0)]

In [6]:
# Build the SMOTE input from non-defaulters followed by first-time defaulters
# only. The index is renumbered 0..n-1 so it is contiguous after stacking.
smote_df = pd.concat((non_default_df, first_time_df), ignore_index=True)

# Feature matrix and label vector handed to the resampler.
X_smote, y_smote = smote_df[features], smote_df[target]

In [7]:
# Oversample the minority class (first-time defaults) until it reaches
# 50% of the majority-class count; the seed is fixed for reproducibility.
# NOTE(review): SMOTE interpolates every column linearly, so synthetic rows
# can carry fractional values in binary/count columns and engineered columns
# that no longer equal the transform of their base columns -- confirm this is
# acceptable for downstream use.
smote = SMOTE(
    sampling_strategy=0.5,  # adjust ratio of synthetic first-time defaults to non-defaults
    random_state=42
)
X_res, y_res = smote.fit_resample(X_smote, y_smote)

# Only take the synthetic first-time defaults. This relies on the resampler
# returning the original rows first and the generated rows after them.
# Slicing from the original row count (rather than with a negative offset)
# keeps the result empty when nothing was generated: `[-0:]` would otherwise
# select every row and treat the whole resampled set as synthetic.
n_original = X_smote.shape[0]
n_synthetic = X_res.shape[0] - n_original
synthetic_X = X_res[n_original:]
synthetic_y = y_res[n_original:]

In [8]:
# Turn the synthetic samples into a frame with the feature columns plus the label.
# NOTE(review): synthetic rows only have values for `features` and `target`;
# every other column of df will be NaN for them after the concat -- confirm
# downstream code expects that.
synthetic_rows = pd.DataFrame(synthetic_X, columns=features).assign(
    **{target: synthetic_y}
)

# Original dataset first (it still includes repeat defaulters), synthetic
# rows appended after it, with a fresh 0..n-1 index.
frames = [df, synthetic_rows]
augmented_df = pd.concat(frames, ignore_index=True)

augmented_df.head()

Unnamed: 0,age,owns_car,owns_house,no_of_children,net_yearly_income,no_of_days_employed,occupation_type,total_family_members,migrant_worker,yearly_debt_payments,credit_limit,credit_limit_used(%),credit_score,prev_defaults,default_in_last_6months,credit_card_default,credit_score_squared,credit_limit_used_squared,credit_score_x_credit_limit_used,credit_ratio_limit
0,46.0,0.0,1.0,0.0,107934.04,612.0,17.0,1.0,1.0,33070.28,18690.93,73.0,544.0,2.0,1.0,1.0,295936.0,5329.0,39712.0,0.134191
1,29.0,0.0,1.0,0.0,109862.62,2771.0,8.0,2.0,0.0,15329.53,37745.19,52.0,857.0,0.0,0.0,0.0,734449.0,2704.0,44564.0,0.060677
2,37.0,0.0,1.0,0.0,230153.17,204.0,8.0,2.0,0.0,48416.6,41598.36,43.0,650.0,0.0,0.0,0.0,422500.0,1849.0,27950.0,0.066154
3,39.0,0.0,1.0,0.0,122325.82,11941.0,3.0,2.0,0.0,22574.36,32627.76,20.0,754.0,0.0,0.0,0.0,568516.0,400.0,15080.0,0.026525
4,46.0,1.0,1.0,0.0,387286.0,1459.0,3.0,1.0,0.0,38282.95,52950.64,75.0,927.0,0.0,0.0,0.0,859329.0,5625.0,69525.0,0.080906


In [19]:
# Sanity check: (rows, columns) of the combined original + synthetic frame.
augmented_df.shape

(65214, 20)

In [21]:
# Persist the augmented dataset without writing the row index as a column,
# then report where it went and how many synthetic rows it contains.
augmented_df.to_csv(augmented_csv, index=False)
print(
    f"Augmented dataset saved to {augmented_csv}, "
    f"{n_synthetic} synthetic first-time defaults added."
)

Augmented dataset saved to ..\..\data\processed\imputed_dataset_with_synthetic.csv, 19686 synthetic first-time defaults added.
