In [4]:
import sys
import os
sys.path.append(os.path.abspath(".."))

import numpy as np
import pandas as pd


In [5]:
from src.data_preprocessing import full_preprocessing_pipeline


In [6]:
df = pd.read_csv("../data/home_credit/application_train_working.csv")
df.shape


(307511, 122)

In [7]:
missing_percent = df.isnull().mean()
missing_percent.sort_values(ascending=False).head(10)
cols_to_drop = missing_percent[missing_percent > 0.65].index
len(cols_to_drop)
df = df.drop(columns=cols_to_drop)
df.shape


(307511, 105)

In [8]:
num_cols = df.select_dtypes(include=['float64', 'int64']).columns

df[num_cols] = df[num_cols].fillna(df[num_cols].median())
cat_cols = df.select_dtypes(include=['object']).columns

df[cat_cols] = df[cat_cols].fillna("Unknown")
df.isnull().sum().sum()


0

In [9]:
financial_cols = [
    "AMT_INCOME_TOTAL",
    "AMT_CREDIT",
    "AMT_ANNUITY"
]

for col in financial_cols:
    df[col] = np.log1p(df[col])
df[financial_cols].describe()


Unnamed: 0,AMT_INCOME_TOTAL,AMT_CREDIT,AMT_ANNUITY
count,307511.0,307511.0,307511.0
mean,11.909245,13.070108,10.067677
std,0.488906,0.715193,0.545872
min,10.152338,10.71444,7.388019
25%,11.630717,12.506181,9.71263
50%,11.899215,13.149068,10.122784
75%,12.2185,13.603123,10.451522
max,18.577685,15.214228,12.460818


In [10]:
df["CREDIT_TO_INCOME"] = df["AMT_CREDIT"] / df["AMT_INCOME_TOTAL"]


  df["CREDIT_TO_INCOME"] = df["AMT_CREDIT"] / df["AMT_INCOME_TOTAL"]


In [11]:
df["ANNUITY_TO_INCOME"] = df["AMT_ANNUITY"] / df["AMT_INCOME_TOTAL"]


  df["ANNUITY_TO_INCOME"] = df["AMT_ANNUITY"] / df["AMT_INCOME_TOTAL"]


In [12]:
df["AGE_YEARS"] = df["DAYS_BIRTH"] / -365


  df["AGE_YEARS"] = df["DAYS_BIRTH"] / -365


In [13]:
df["EMPLOYMENT_YEARS"] = df["DAYS_EMPLOYED"] / -365


  df["EMPLOYMENT_YEARS"] = df["DAYS_EMPLOYED"] / -365


In [14]:
df[["CREDIT_TO_INCOME", "ANNUITY_TO_INCOME", "AGE_YEARS"]].head()


Unnamed: 0,CREDIT_TO_INCOME,ANNUITY_TO_INCOME,AGE_YEARS
0,1.057051,0.827812,25.920548
1,1.125273,0.838217,45.931507
2,1.062333,0.792943,52.180822
3,1.0711,0.871789,52.068493
4,1.123027,0.853518,54.608219


In [15]:
df = pd.get_dummies(df, drop_first=True)
df.shape


(307511, 220)

In [16]:
X = df.drop("TARGET", axis=1)
y = df["TARGET"]

X.shape, y.shape


((307511, 219), (307511,))

In [30]:
import joblib
import os

os.makedirs("../models", exist_ok=True)

feature_names = list(X.columns)

joblib.dump(feature_names, "../models/feature_names.pkl")

print("Saved", len(feature_names), "feature names.")


Saved 219 feature names.


In [17]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

X_scaled.shape


(307511, 219)

In [18]:
np.save("../data/home_credit/X_clean.npy", X_scaled)
np.save("../data/home_credit/y_clean.npy", y.values)


In [19]:
import joblib
joblib.dump(scaler, "../models/standard_scaler.joblib")


['../models/standard_scaler.joblib']

In [20]:
import src.data_preprocessing as dp

print(dir(dp))


['StandardScaler', '__builtins__', '__cached__', '__doc__', '__file__', '__loader__', '__name__', '__package__', '__spec__', 'encode_features', 'engineer_features', 'full_preprocessing_pipeline', 'impute_missing', 'joblib', 'log_transform', 'np', 'os', 'pd', 'remove_high_missing', 'scale_features']


In [21]:
import os
print(os.getcwd())


/teamspace/studios/this_studio/notebooks


In [22]:
# Create data_splitting.py inside src folder

file_path = "../src/data_splitting.py"

code = """
import numpy as np
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE

def stratified_split(X, y, test_size=0.2, random_state=42):
    X_train, X_test, y_train, y_test = train_test_split(
        X,
        y,
        test_size=test_size,
        stratify=y,
        random_state=random_state
    )
    return X_train, X_test, y_train, y_test

def apply_smote(X_train, y_train, random_state=42):
    smote = SMOTE(random_state=random_state)
    X_resampled, y_resampled = smote.fit_resample(X_train, y_train)
    return X_resampled, y_resampled
"""

with open(file_path, "w") as f:
    f.write(code)

print("data_splitting.py created successfully.")


data_splitting.py created successfully.


In [23]:
!pip install imbalanced-learn



[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m26.0[0m[39;49m -> [0m[32;49m26.0.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


In [24]:
import sys
import os
sys.path.append(os.path.abspath(".."))

from src.data_splitting import stratified_split, apply_smote
import numpy as np


In [25]:
X_scaled, y = full_preprocessing_pipeline(
    "../data/home_credit/application_train_working.csv"
)

print("X shape:", X_scaled.shape)
print("y shape:", y.shape)


X shape: (307511, 219)
y shape: (307511,)


In [26]:
from src.data_splitting import stratified_split, apply_smote


In [27]:
X_train, X_test, y_train, y_test = stratified_split(X_scaled, y)

print("Train shape:", X_train.shape)
print("Test shape:", X_test.shape)

print("\nTrain distribution:")
print(np.bincount(y_train))

print("\nTest distribution:")
print(np.bincount(y_test))


Train shape: (246008, 219)
Test shape: (61503, 219)

Train distribution:
[226148  19860]

Test distribution:
[56538  4965]


In [28]:
X_train_bal, y_train_bal = apply_smote(X_train, y_train)

print("After SMOTE distribution:")
print(np.bincount(y_train_bal))

print("Balanced train shape:", X_train_bal.shape)


After SMOTE distribution:
[226148 226148]
Balanced train shape: (452296, 219)
