In [None]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
import os
# `__file__` exists only when this code runs as a script or imported module.
# In an interactive notebook kernel the name is undefined, so the unguarded
# call raised NameError and stopped the very first cell. Best effort: switch
# to the file's own directory when it is known, otherwise keep the kernel's
# current working directory.
try:
    os.chdir(os.path.dirname(os.path.abspath(__file__)))
except NameError:
    pass

import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.decomposition import PCA

In [None]:
### Data
# Read the training transactions (path is relative to the working directory),
# then work on a separate copy so `train_df` keeps the data exactly as loaded.
train_df = pd.read_csv("train_transaction.csv")
data = train_df.copy(deep=True)

In [None]:
### Pre-process
# Separate the `isFraud` label column from the remaining feature columns.
y = data["isFraud"]
X = data.drop("isFraud", axis="columns")

In [None]:
## split
# Hold out 20% of the rows for testing. Stratifying on `y` keeps the class
# proportions approximately equal in both parts; the fixed seed makes the
# split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    stratify=y,
    test_size=0.2,
    random_state=7406,
)

In [None]:
## Missing data
# Fill gaps in the numeric features with each column's median. The medians
# are learned from the training rows only and then reused for the test rows.
num_cols = X_train.select_dtypes(include=[np.number]).columns
imputer = SimpleImputer(strategy="median")
imputer.fit(X_train[num_cols])
X_train[num_cols] = imputer.transform(X_train[num_cols])
X_test[num_cols] = imputer.transform(X_test[num_cols])

In [None]:
# scale
# Standardize the numeric features. The scaling parameters are learned from
# the training rows only and then applied unchanged to the test rows.
scaler = StandardScaler()
scaler.fit(X_train[num_cols])
X_train[num_cols] = scaler.transform(X_train[num_cols])
X_test[num_cols] = scaler.transform(X_test[num_cols])

In [None]:
### EDA
# Shared look for the figures below: seaborn theme first, then the default
# figure size.
sns.set(font_scale=1.1, palette="pastel", style="whitegrid")
plt.rcParams.update({"figure.figsize": (10, 6)})

In [None]:
# fraud vs legit
# Share of each class among the training labels, drawn as one bar per class.
fraud_counts = y_train.value_counts(normalize=True)
ax = sns.barplot(x=fraud_counts.index, y=fraud_counts.values)
ax.set_title("Target Distribution: Fraud vs Legit")
ax.set_xlabel("isFraud")
ax.set_ylabel("Proportion")
plt.show()

In [None]:
# missing data
# Measure missingness on the raw (pre-imputation) training rows. The numeric
# columns of X_train have already been median-imputed by the time this cell
# runs, so X_train.isnull() would no longer show the gaps that were in the
# data originally. X is not touched by the imputation step, and
# X_train.index selects exactly the training rows from it.
missing_ratio = X.isna().loc[X_train.index].mean().sort_values(ascending=False)
top_missing = missing_ratio.head(15)
sns.barplot(x=top_missing.values, y=top_missing.index)
plt.title("Top 15 Features by Missing Ratio")
plt.xlabel("Missing Fraction")
plt.ylabel("Feature")
plt.show()

In [None]:
# corr graph
# Correlate every numeric feature with the target, discard undefined (NaN)
# correlations, and keep the 15 with the largest absolute value.
num_cols = X_train.select_dtypes(include=np.number).columns
feature_corr = X_train[num_cols].corrwith(y_train).dropna()
corr = feature_corr.sort_values(key=abs, ascending=False).head(15)
ax = sns.barplot(x=corr.values, y=corr.index)
ax.set_title("Top 15 Features Most Correlated with isFraud")
ax.set_xlabel("Correlation with Target")
ax.set_ylabel("Feature")
plt.show()

## PCA

In [None]:
# Fit PCA
# No component limit here, so the whole variance spectrum is available.
pca = PCA()
X_train_pca = pca.fit_transform(X_train[num_cols])

# Explained variance ratio
# Running total: entry k is the share of variance covered by the first k + 1
# components.
explained_variance = pca.explained_variance_ratio_.cumsum()


In [None]:
# Scree Plot
# Cumulative explained variance against the number of components kept.
fig, ax = plt.subplots(figsize=(10, 6))
component_counts = np.arange(1, explained_variance.size + 1)
ax.plot(component_counts, explained_variance, marker='o', linestyle='--')
ax.set_title('Scree Plot: Cumulative Explained Variance by PCA Components')
ax.set_xlabel('Number of Principal Components')
ax.set_ylabel('Cumulative Explained Variance')
ax.grid(True)
plt.show()


In [None]:
# Find the number of components explaining 95% variance
# argmax on the boolean mask returns the position of the first True, i.e. the
# first cumulative total at or above 0.95; adding 1 turns that zero-based
# position into a component count.
reaches_95 = explained_variance >= 0.95
n_components_95 = reaches_95.argmax() + 1
print(f"Number of components explaining 95% variance: {n_components_95}")


In [None]:
# Refit PCA keeping only the components needed for 95% of the variance. The
# projection is learned from the training rows and reused for the test rows.
# random_state is pinned (same seed as the train/test split) because
# scikit-learn's default solver selection may pick a randomized SVD when
# n_components is smaller than the full dimensionality, which would otherwise
# let the reduced features differ from one run to the next.
pca_final = PCA(n_components=n_components_95, random_state=7406)
X_train_reduced = pca_final.fit_transform(X_train[num_cols])
X_test_reduced = pca_final.transform(X_test[num_cols])

print(f"Reduced shape: {X_train_reduced.shape}")
