# Prepare creditcard.csv for Modeling

In [1]:
import warnings
from pathlib import Path

import joblib
import numpy as np
import pandas as pd
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

## 1. Load creditcard.csv

In [2]:
# Read the raw credit-card transactions into a DataFrame.
df = pd.read_csv('../data/raw/creditcard.csv')

# Preview the first rows.
print(df.head())
# DataFrame.info() writes its report to stdout itself and returns None,
# so it is called directly; wrapping it in print() emitted a stray "None".
df.info()
# Relative frequency of each value in the 'Class' column.
print(df['Class'].value_counts(normalize=True))

   Time        V1        V2        V3        V4        V5        V6        V7  \
0   0.0 -1.359807 -0.072781  2.536347  1.378155 -0.338321  0.462388  0.239599   
1   0.0  1.191857  0.266151  0.166480  0.448154  0.060018 -0.082361 -0.078803   
2   1.0 -1.358354 -1.340163  1.773209  0.379780 -0.503198  1.800499  0.791461   
3   1.0 -0.966272 -0.185226  1.792993 -0.863291 -0.010309  1.247203  0.237609   
4   2.0 -1.158233  0.877737  1.548718  0.403034 -0.407193  0.095921  0.592941   

         V8        V9  ...       V21       V22       V23       V24       V25  \
0  0.098698  0.363787  ... -0.018307  0.277838 -0.110474  0.066928  0.128539   
1  0.085102 -0.255425  ... -0.225775 -0.638672  0.101288 -0.339846  0.167170   
2  0.247676 -1.514654  ...  0.247998  0.771679  0.909412 -0.689281 -0.327642   
3  0.377436 -1.387024  ... -0.108300  0.005274 -0.190321 -1.175575  0.647376   
4 -0.270533  0.817739  ... -0.009431  0.798278 -0.137458  0.141267 -0.206010   

        V26       V27       V28 

## 2. Separate Features & Target

In [3]:
# Target is the 'Class' column; features are every remaining column of df.
y = df['Class']
X = df.drop(columns=['Class'])

## 3. Train-Test Split

In [4]:
# Hold out 30% of the rows for testing. Stratifying on y keeps the class
# proportions the same in both partitions; the fixed seed makes the split
# reproducible.
split_parts = train_test_split(X, y, test_size=0.3, stratify=y, random_state=42)
X_train, X_test, y_train, y_test = split_parts

print(f"Train shape: {X_train.shape}, Test shape: {X_test.shape}")

Train shape: (199364, 30), Test shape: (85443, 30)


## 4. Scale Numeric Features

In [5]:
# Every column of X is standardised here. The original note lists V1–V28 and
# Amount; X is df minus 'Class', so the Time column is scaled as well.
# The scaler learns its statistics from the training split only, so nothing
# from the test split leaks into the transformation.
scaler = StandardScaler().fit(X_train)

# Apply the training-set statistics to both splits.
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

## 5. Handle Class Imbalance (SMOTE)

In [7]:
# Oversample the training split with SMOTE. Only the training data is
# resampled; the test split is left untouched.
# `warnings` is imported in the notebook's imports cell rather than here, so
# all dependencies are visible in one place.
with warnings.catch_warnings():
    # Suppress joblib loky UserWarning about physical cores; the filter is
    # scoped to this `with` block and does not affect later cells.
    warnings.filterwarnings("ignore", message="Could not find the number of physical cores*")
    smote = SMOTE(random_state=42)
    X_train_res, y_train_res = smote.fit_resample(X_train_scaled, y_train)

print(f"Train shape after SMOTE: {X_train_res.shape}, {y_train_res.shape}")

Train shape after SMOTE: (398040, 30), (398040,)


## 6. Save Processed Data

In [8]:
# Scaling returned plain arrays, so the feature names were lost. Restore them
# from X_train so the saved CSVs carry real column headers instead of 0..29.
feature_names = list(X_train.columns)
X_train_res_df = pd.DataFrame(X_train_res, columns=feature_names)
X_test_scaled_df = pd.DataFrame(X_test_scaled, columns=feature_names)

# Build the target frames from raw values: this yields a fresh 0..n-1 index
# that lines up with the feature frames, and does not depend on the incoming
# object being a Series that happens to be named 'Class'.
y_train_res_df = pd.DataFrame({'Class': np.asarray(y_train_res)})
y_test_df = pd.DataFrame({'Class': np.asarray(y_test)})

# Guard against silently misaligned rows before concatenating side by side.
assert len(X_train_res_df) == len(y_train_res_df), "train features/target length mismatch"
assert len(X_test_scaled_df) == len(y_test_df), "test features/target length mismatch"

# Combine X and y for easy saving
train_ready = pd.concat([X_train_res_df, y_train_res_df], axis=1)
test_ready = pd.concat([X_test_scaled_df, y_test_df], axis=1)

# Make sure the output directory exists; to_csv does not create it.
processed_dir = Path("../data/processed")
processed_dir.mkdir(parents=True, exist_ok=True)

# Save
train_ready.to_csv(processed_dir / "creditcard_train_ready.csv", index=False)
test_ready.to_csv(processed_dir / "creditcard_test_ready.csv", index=False)

# NOTE(review): the fitted `scaler` is never persisted although joblib is
# imported; new data cannot be transformed consistently without it. Consider
# a joblib.dump(scaler, ...) once an artifact location is agreed.