**Importing Libraries**

In [2]:
import os
from zipfile import ZipFile

import joblib
import numpy as np
import pandas as pd

# Sklearn libraries
from sklearn.model_selection import train_test_split, cross_val_score, RandomizedSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, classification_report, roc_auc_score
from sklearn.preprocessing import MinMaxScaler, PowerTransformer, power_transform
from sklearn.ensemble import RandomForestClassifier

# Ignore warnings
# NOTE(review): with no category or module filter this silences every warning
# for the rest of the session, e.g. deprecation or convergence warnings raised
# by the libraries imported above. Consider narrowing the filter.
import warnings
warnings.filterwarnings('ignore')

# Confirmation message shown once this cell has run to the end
print("Libraries loaded successfully!")

Libraries loaded successfully!


**Loading the dataset**

In [3]:
# Location of the competition archive. Set the PLAYGROUND_S5E11_ZIP environment
# variable to point at your own copy of the zip; when it is not set, the
# original local path below is used unchanged.
zip_path = os.environ.get(
    "PLAYGROUND_S5E11_ZIP",
    r"C:\Users\Welcome Sir\Downloads\playground-series-s5e11.zip",
)

# Print the archive's table of contents (file name, modified time, size)
with ZipFile(zip_path, "r") as zip_ref:
    zip_ref.printdir()

File Name                                             Modified             Size
sample_submission.csv                          2025-10-28 23:08:48      2291139
test.csv                                       2025-10-28 23:08:48     23021430
train.csv                                      2025-10-28 23:08:50     55988519


In [4]:
# Read train.csv and test.csv directly from the zip archive into DataFrames

with ZipFile(zip_path) as archive:
    with archive.open('train.csv') as train_file:
        train_data = pd.read_csv(train_file)
    print("\nTrain data successfully loaded...")

    with archive.open('test.csv') as test_file:
        test_data = pd.read_csv(test_file)
    print("\nTest data successfully loaded...")


Train data successfully loaded...

Test data successfully loaded...


In [5]:
# Work on copies so the frames loaded from the archive are left untouched
train_df, test_df = train_data.copy(), test_data.copy()

**From the EDA, a `power transformation` is recommended**

In [6]:
# Let's split our data: `loan_paid_back` is the target, and `id` is dropped
# from the features together with it.
X = train_df.drop(columns=['loan_paid_back', 'id'])
y = train_df['loan_paid_back']

# Let's do the same to the test data.
# Built from `test_data` (which `test_df` was copied from) instead of from
# `test_df` itself, so re-running this cell does not raise a KeyError on an
# `id` column that was already dropped.
test_df = test_data.drop(columns='id')

In [7]:
# Let's reduce the skewness of the numerical columns using power transformation
def handle_skewness(df, columns, transformer=None):
    """Return a copy of ``df`` with ``columns`` Yeo-Johnson power-transformed.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the columns to transform. It is copied, not modified.
    columns : list-like of column labels
        Columns to transform (for example a list or a ``pandas.Index``).
    transformer : object with a ``transform`` method, optional
        An already-fitted transformer (e.g. ``PowerTransformer``). When given,
        its learned parameters are applied to ``df[columns]``. When omitted,
        the parameters are estimated from ``df`` itself with
        ``power_transform(..., method='yeo-johnson')``.

    Returns
    -------
    pandas.DataFrame
        The transformed copy of ``df``.
    """
    df = df.copy()
    columns = list(columns)

    # Nothing to transform: return the untouched copy instead of letting
    # scikit-learn fail on a frame with zero columns.
    if not columns:
        return df

    if transformer is None:
        df[columns] = power_transform(df[columns], method='yeo-johnson')
    else:
        df[columns] = transformer.transform(df[columns])
    return df


# Numerical columns of the training features
num_col = X.select_dtypes(include='number').columns

# Numerical columns of the test data (name kept so later cells can still use it)
test_num_col = test_df.select_dtypes(include='number').columns

# The transformer fitted on the training columns is applied to the test data
# below, so every one of those columns must also be numeric in the test data.
missing_in_test = [col for col in num_col if col not in test_num_col]
if missing_in_test:
    raise ValueError(
        f"Numeric training columns missing from the test data: {missing_in_test}"
    )

# Learn the Yeo-Johnson parameters from the training features ONLY. Estimating
# them separately on the test set would put train and test on different scales,
# so a model trained on X would see inconsistent inputs at prediction time.
skew_transformer = PowerTransformer(method='yeo-johnson').fit(X[num_col])

# Let's treat the skewness of numerical in X
X = handle_skewness(X, num_col, transformer=skew_transformer)

# Let's do same for the test data as well, re-using the training parameters.
test_df = handle_skewness(test_df, num_col, transformer=skew_transformer)

**Encoding Categorical Data**

In [8]:
# Labels of the object-dtype (categorical) columns in the training frame
cat_col = (
    train_df
    .select_dtypes(include=['object'])
    .columns
)
cat_col

Index(['gender', 'marital_status', 'education_level', 'employment_status',
       'loan_purpose', 'grade_subgrade'],
      dtype='object')