# Week 9 — Day 4: Train/Test Split + Leakage Safety

In [1]:
# Imports + load (Code cell)
import pandas as pd
import numpy as np
from pathlib import Path

# Raw CSV location, resolved relative to the current working directory.
DATA_PATH = Path("..", "data", "raw", "creditcard.csv")
df = pd.read_csv(DATA_PATH)

# Final expression of the cell: shows (rows, columns) of the loaded frame.
df.shape

(284807, 31)

### Train/Test split (stratified)

In [2]:
# Separate the label column ("Class") from the remaining feature columns.
y = df["Class"]
X = df.drop(columns="Class")

In [4]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows; stratifying on y keeps the class ratio the same
# in both partitions, and the fixed seed makes the split repeatable.
split_kwargs = dict(test_size=0.2, random_state=42, stratify=y)
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_kwargs)

print("Train shape:", X_train.shape, y_train.shape)
print("Test shape:", X_test.shape, y_test.shape)

Train shape: (227845, 30) (227845,)
Test shape: (56962, 30) (56962,)


In [5]:
# verifying the fraud percentage
def fraud_pct(y_series: pd.Series) -> float:
    """Return the mean of ``y_series`` expressed as a percentage.

    For a 0/1 label vector this is the share of rows labelled 1.

    Parameters
    ----------
    y_series : pd.Series
        Label values; anything exposing ``.mean()`` works.

    Returns
    -------
    float
        ``y_series.mean()`` multiplied by 100.
    """
    positive_rate = y_series.mean()
    return positive_rate * 100

# Report the label percentage for the full target and for each partition,
# so the effect of stratification can be compared side by side.
for split_name, labels in (("Overall", y), ("Train", y_train), ("Test", y_test)):
    print(f"{split_name} fraud %:", fraud_pct(labels))

Overall fraud %: 0.1727485630620034
Train fraud %: 0.17292457591783889
Test fraud %: 0.17204452090867595


### Leakage Safety

In [6]:
# baseline pipeline
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

# Single-step preprocessing pipeline: the "scaler" step is a StandardScaler.
preprocess = Pipeline([("scaler", StandardScaler())])

In [7]:
# Fit the pipeline on the training rows only, then apply that same fitted
# transform to both partitions, so the test rows never take part in fitting.
preprocess.fit(X_train)
X_train_scaled = preprocess.transform(X_train)
X_test_scaled = preprocess.transform(X_test)

print("Scaled train shape:", X_train_scaled.shape)
print("Scaled test shape:", X_test_scaled.shape)

Scaled train shape: (227845, 30)
Scaled test shape: (56962, 30)


In [8]:
# saving the pipeline and split for later
import joblib
from pathlib import Path

# Output directory for saved artifacts; created if it does not exist yet.
ARTIFACTS_DIR = Path("..", "models")
ARTIFACTS_DIR.mkdir(exist_ok=True)

# Persist the fitted preprocessing pipeline and the unscaled split as two files.
preprocess_path = ARTIFACTS_DIR / "preprocess_v1.joblib"
split_path = ARTIFACTS_DIR / "split_v1.joblib"

joblib.dump(preprocess, preprocess_path)
joblib.dump((X_train, X_test, y_train, y_test), split_path)

print("Saved: preprocess_v1.joblib and split_v1.joblib")

Saved: preprocess_v1.joblib and split_v1.joblib
