# Load the dataset

In [13]:
import sys
import os
import pandas as pd
from sklearn.model_selection import train_test_split


# Make the parent folder importable so the `src` package resolves from this
# notebook. The membership check keeps the cell idempotent: re-running it no
# longer appends a duplicate entry to sys.path each time.
project_root = os.path.abspath("../")
if project_root not in sys.path:
    sys.path.append(project_root)

# Create the directory that the processed splits are written to later on.
# NOTE(review): this path is relative to the notebook's working directory,
# whereas the raw data below is read from "../data/raw" — confirm that the
# processed files are meant to live beside the notebook and not under "../data".
os.makedirs("data/processed", exist_ok=True)

# Project-local imports are placed after the sys.path tweak so `src` can be found.
# NOTE(review): `RANDON_STATE` is spelled this way in the import; presumably it
# matches the name defined in src.config — verify before renaming.
from src.data_loader import load_data
from src.config import NUM_FEATURES, CAT_FEATURES, TARGET, TEST_SIZE, RANDON_STATE
from src.preprocessing import build_preprocessor

loan_file_path = "../data/raw/Loan_Default.csv"
loan_data = load_data(loan_file_path)

# Last expression of the cell: preview the first rows.
loan_data.head()

Unnamed: 0,LoanID,Age,Income,LoanAmount,CreditScore,MonthsEmployed,NumCreditLines,InterestRate,LoanTerm,DTIRatio,Education,EmploymentType,MaritalStatus,HasMortgage,HasDependents,LoanPurpose,HasCoSigner,Default
0,I38PQUQS96,56,85994,50587,520,80,4,15.23,36,0.44,Bachelor's,Full-time,Divorced,Yes,Yes,Other,Yes,0
1,HPSK72WA7R,69,50432,124440,458,15,1,4.81,60,0.68,Master's,Full-time,Married,No,No,Other,Yes,0
2,C1OZ6DPJ8Y,46,84208,129188,451,26,3,21.17,24,0.31,Master's,Unemployed,Divorced,Yes,Yes,Auto,No,1
3,V2KKSFM3UN,32,31713,44799,743,0,3,7.07,24,0.23,High School,Full-time,Married,No,No,Business,No,0
4,EY08JDHTZP,60,20437,9139,633,8,4,6.51,48,0.73,Bachelor's,Unemployed,Divorced,No,Yes,Auto,No,0


# Separate features and target

In [14]:
# Model inputs: the configured numeric columns followed by the categorical ones.
feature_columns = NUM_FEATURES + CAT_FEATURES
X = loan_data[feature_columns]

# Label column to predict.
y = loan_data[TARGET]

# Train/Test split 

In [15]:
# Hold out a test partition. The split size and the seed come from src.config,
# and `stratify=y` asks for a split stratified on the target column.
split_parts = train_test_split(
    X,
    y,
    test_size=TEST_SIZE,
    random_state=RANDON_STATE,
    stratify=y,
)
X_train, X_test, y_train, y_test = split_parts

# Fit & Transform

## Fit

In [16]:
# Build the preprocessing transformer for the configured feature groups.
preprocessor = build_preprocessor(NUM_FEATURES, CAT_FEATURES)

# Fit on the training split only. The call is kept as the cell's last
# expression so the fitted estimator is displayed as the cell output.
preprocessor.fit(X_train)

0,1,2
,transformers,"[('num', ...), ('cat', ...)]"
,remainder,'drop'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,categories,'auto'
,drop,
,sparse_output,True
,dtype,<class 'numpy.float64'>
,handle_unknown,'ignore'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'


## Transform

In [17]:
# `preprocessor` has already been fitted on X_train in the "Fit" cell, so it is
# only applied here. Calling fit_transform again would repeat the same fit on
# the same data for no change in the result.
X_train_processed = preprocessor.transform(X_train)
X_test_processed = preprocessor.transform(X_test)

# Convert back to DataFrame

In [18]:
# Names of the encoded columns produced by the fitted 'cat' transformer.
cat_columns = preprocessor.named_transformers_['cat'].get_feature_names_out(CAT_FEATURES)
# Column order follows the transformer order shown when the preprocessor was
# fitted: the 'num' block first, then the 'cat' block.
columns = NUM_FEATURES + list(cat_columns)


def _as_dense(matrix):
    """Return `matrix` as a dense array, converting it if it is sparse.

    The fitted preprocessor reports sparse_threshold=0.3 with a one-hot encoder
    using sparse_output=True, so the transformed data may come back as a SciPy
    sparse matrix when it is sparse enough. Anything exposing `toarray()` is
    densified; other inputs are returned unchanged.
    """
    return matrix.toarray() if hasattr(matrix, "toarray") else matrix


# Convert to DataFrame
# NOTE: the frames get a fresh 0..n-1 index, so they no longer share row labels
# with y_train / y_test (which keep the index from the split).
X_train_processed_df = pd.DataFrame(_as_dense(X_train_processed), columns=columns)
X_test_processed_df = pd.DataFrame(_as_dense(X_test_processed), columns=columns)

# Save processed dataset

In [19]:
# Destination file -> object to write. Insertion order is the write order.
processed_outputs = {
    "data/processed/X_train_processed.csv": X_train_processed_df,
    "data/processed/X_test_processed.csv": X_test_processed_df,
    "data/processed/y_train.csv": y_train,
    "data/processed/y_test.csv": y_test,
}

# Write each object as CSV without its index column.
for output_path, table in processed_outputs.items():
    table.to_csv(output_path, index=False)