In [1]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import RobustScaler

from pathlib import Path


In [2]:
# Location of the raw transactions file (relative path).
data_path = Path("../data/raw/creditcard.csv")

# Read the whole CSV into a DataFrame.
df = pd.read_csv(data_path)

# Sanity check: overall dimensions, then how many rows carry each "Class" label.
class_counts = df["Class"].value_counts()
print("Shape:", df.shape)
print("\nClass Distribution:")
print(class_counts)


Shape: (284807, 31)

Class Distribution:
Class
0    284315
1       492
Name: count, dtype: int64


In [3]:
# Separate the "Class" label from the remaining predictor columns.
# drop() returns a new frame, so df itself is left untouched.
y = df["Class"]
X = df.drop(columns="Class")

print("Feature shape:", X.shape)
print("Target shape:", y.shape)

# normalize=True reports each label as a proportion of all rows rather than a count.
label_proportions = y.value_counts(normalize=True)
print("\nFraud Ratio:")
print(label_proportions)

Feature shape: (284807, 30)
Target shape: (284807,)

Fraud Ratio:
Class
0    0.998273
1    0.001727
Name: proportion, dtype: float64


In [4]:
# Hold out 20% of the rows for testing.
# - stratify=y asks the splitter to keep the label proportions of y in both partitions
# - random_state pins the shuffle so the split is reproducible
split_options = {"test_size": 0.2, "random_state": 42, "stratify": y}
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

# Report the dimensions of each feature partition.
for heading, features in (("Train shape:", X_train), ("Test shape:", X_test)):
    print(heading, features.shape)

# Report the label proportions of each target partition to confirm the stratification.
for heading, target in (
    ("\nTrain fraud ratio:", y_train),
    ("\nTest fraud ratio:", y_test),
):
    print(heading)
    print(target.value_counts(normalize=True))

Train shape: (227845, 30)
Test shape: (56962, 30)

Train fraud ratio:
Class
0    0.998271
1    0.001729
Name: proportion, dtype: float64

Test fraud ratio:
Class
0    0.99828
1    0.00172
Name: proportion, dtype: float64


In [5]:
# Remove the "Time" column from both partitions.
# errors="ignore" keeps this cell re-runnable: X_train / X_test are reassigned
# in place, so without it a second execution raises KeyError because "Time"
# has already been dropped.
X_train = X_train.drop(columns="Time", errors="ignore")
X_test = X_test.drop(columns="Time", errors="ignore")

# Show the first five remaining column names as a quick check.
print("Columns after dropping Time:", X_train.columns[:5])

Columns after dropping Time: Index(['V1', 'V2', 'V3', 'V4', 'V5'], dtype='object')


In [6]:
# Rescale the "Amount" column with RobustScaler.
# The scaler is fitted on the training partition ONLY and then applied
# unchanged to both partitions, so nothing from the test set enters the fit.
scaler = RobustScaler()
scaler.fit(X_train[["Amount"]])

# Double brackets keep the selection two-dimensional, as the scaler expects.
X_train["Amount"] = scaler.transform(X_train[["Amount"]])
X_test["Amount"] = scaler.transform(X_test[["Amount"]])

In [7]:
# Dimensions of both partitions after preprocessing.
for heading, features in (
    ("Final Train Shape:", X_train),
    ("Final Test Shape:", X_test),
):
    print(heading, features.shape)

# Summary statistics of the rescaled "Amount" column in the training partition.
amount_summary = X_train["Amount"].describe()
print("\nSample of scaled Amount:")
print(amount_summary)

Final Train Shape: (227845, 29)
Final Test Shape: (56962, 29)

Sample of scaled Amount:
count    227845.000000
mean          0.921034
std           3.489528
min          -0.306193
25%          -0.227697
50%           0.000000
75%           0.772303
max         357.260404
Name: Amount, dtype: float64


In [8]:
# Output directory for the preprocessed partitions; create it (and any
# missing parents) if needed, without failing when it already exists.
processed_path = Path("../data/processed")
processed_path.mkdir(parents=True, exist_ok=True)

# Write each partition to its own CSV file, in this order.
outputs = (
    ("X_train.csv", X_train),
    ("X_test.csv", X_test),
    ("y_train.csv", y_train),
    ("y_test.csv", y_test),
)
for filename, data in outputs:
    # index=False leaves the row index out of the written file.
    data.to_csv(processed_path / filename, index=False)

print("Processed files saved successfully.")

Processed files saved successfully.
