### Data Preprocessing

**Step 1: Load the Cleaned Dataset**
In this step, we'll load the cleaned dataset from the "cleaned_dataset.csv" file.

In [1]:
import pandas as pd

# Load the cleaned dataset into a DataFrame. The path is relative to the
# notebook's working directory.
data = pd.read_csv(filepath_or_buffer="../data/cleaned_dataset.csv")

**Step 2: Split Data into Features and Target**

In [2]:
# The 'isFraud' column is the prediction target; every other column stays in
# the feature frame.
y = data['isFraud']
X = data.drop('isFraud', axis=1)

**Step 3: Train-Test Split**

In [3]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for evaluation. `random_state` pins the shuffle so
# the split is reproducible across runs.
# `stratify=y` keeps the proportion of each `isFraud` class the same in the
# train and test partitions; without it, a rare class can end up over- or
# under-represented in the held-out set purely by chance.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

**Step 4: Define Data Preprocessing Steps**

In [4]:
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

# Column groups, each routed to its own transformer.
categorical_columns = ['type']
numeric_columns = ['step', 'amount', 'oldbalanceOrg', 'newbalanceOrig', 'oldbalanceDest', 'newbalanceDest']

# scikit-learn 1.2 renamed OneHotEncoder's `sparse` argument to
# `sparse_output`, and 1.4 removed the old name. Try the current keyword first
# and fall back to the legacy one so the notebook runs on either side of the
# rename. Both request a dense array as output.
try:
    onehot_encoder = OneHotEncoder(sparse_output=False, drop='first')  # Use drop='first' to prevent multicollinearity
except TypeError:
    onehot_encoder = OneHotEncoder(sparse=False, drop='first')

# One-hot encode the categorical columns.
categorical_transformer = Pipeline(steps=[
    ('onehot', onehot_encoder)
])

# Standardize the numeric columns.
numeric_transformer = Pipeline(steps=[
    ('scaler', StandardScaler())
])

**Step 5: Apply Transformers to Columns**

In [5]:
# Combine the per-group pipelines: each (name, transformer, columns) entry
# applies one pipeline to its own list of columns.
preprocessor = ColumnTransformer([
    ('cat', categorical_transformer, categorical_columns),
    ('num', numeric_transformer, numeric_columns),
])

**Step 6: Preprocess the Data and Save as NumPy Arrays**

In [6]:
import numpy as np

# Fit the preprocessor on the training split only, then reuse the fitted
# transformers on the test split so no test-set statistics influence the
# fitted parameters.
X_train_preprocessed = preprocessor.fit_transform(X_train)
X_test_preprocessed = preprocessor.transform(X_test)

# Convert the training labels to a plain array. `np.asarray` accepts both a
# pandas Series and an ndarray, so this cell can be re-run safely; the previous
# `y_train.values` raised AttributeError on a second run because `y_train` had
# already been rebound to an ndarray.
y_train = np.asarray(y_train)

# Persist the preprocessed features and the labels as .npy files.
np.save("../data/X_train.npy", X_train_preprocessed)
np.save("../data/X_test.npy", X_test_preprocessed)
np.save("../data/y_train.npy", y_train)
np.save("../data/y_test.npy", np.asarray(y_test))

