In [1]:
#Importing Packages
import import_ipynb
import Required_functions as req
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from imblearn.under_sampling import RandomUnderSampler
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix


In [2]:
application_train_df = pd.read_csv("application_train.csv")
application_test_df = pd.read_csv("application_test.csv")
credit_card_df = pd.read_csv("credit_card_balance.csv")
installment_df = pd.read_csv("installments_payments.csv")
previous_app_df = pd.read_csv("previous_application.csv")
bureau_df = pd.read_csv('bureau.csv')

## Application Train 

In [3]:
application_clean_df = application_train_df.copy()
req.reduce_memory_usage(application_clean_df)
req.factorize_EXT_SOURCE(application_clean_df)
req.bin_CAR_AGE(application_clean_df)
req.fill_OCCUPATION_col(application_clean_df)
req.dropna_over65(application_clean_df)

cat_cols_app = req.factorize_cat_cols(application_clean_df)

req.redundant_data(application_clean_df)
req.imputing_na(application_clean_df)
req.standardize_children_col(application_clean_df)

req.cap_outliers_sd(application_clean_df, exclude_columns = ['CNT_CHILDREN', 'TARGET'])

Memory usage of dataframe is 286.23 MB
Memory usage after optimization is: 92.38 MB
Decreased by 67.7%
Factorized Columns: ['EXT_SOURCE_1_Category', 'EXT_SOURCE_2_Category', 'EXT_SOURCE_3_Category']
        OWN_CAR_AGE CAR_AGE_BIN
0               NaN        none
1               NaN        none
2              26.0    very old
3               NaN        none
4               NaN        none
...             ...         ...
307506          NaN        none
307507          NaN        none
307508          NaN        none
307509          NaN        none
307510          NaN        none

[307511 rows x 2 columns]
0             none
1             none
2         very old
3             none
4             none
            ...   
307506        none
307507        none
307508        none
307509        none
307510        none
Name: CAR_AGE_BIN, Length: 307511, dtype: category
Categories (5, object): ['none' < 'very old' < 'quite old' < 'old' < 'new']
There are: 17 columns missing data over 65%
COMMONAREA

Unnamed: 0,SK_ID_CURR,TARGET,NAME_CONTRACT_TYPE,CODE_GENDER,FLAG_OWN_CAR,FLAG_OWN_REALTY,CNT_CHILDREN,AMT_INCOME_TOTAL,AMT_CREDIT,AMT_ANNUITY,...,FLAG_DOCUMENT_21,AMT_REQ_CREDIT_BUREAU_HOUR,AMT_REQ_CREDIT_BUREAU_DAY,AMT_REQ_CREDIT_BUREAU_WEEK,AMT_REQ_CREDIT_BUREAU_MON,AMT_REQ_CREDIT_BUREAU_QRT,AMT_REQ_CREDIT_BUREAU_YEAR,EXT_SOURCE_1_Category,EXT_SOURCE_2_Category,EXT_SOURCE_3_Category
0,100002,1,0.00000,0,0,0,0,202500.0,406597.5,24700.5,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0,0,0
1,100003,0,0.00000,1,0,1,0,270000.0,1293502.5,35698.5,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1,1,0
2,100004,0,0.97574,0,1,0,0,67500.0,135000.0,6750.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,2,1
3,100006,0,0.00000,1,0,0,0,135000.0,312682.5,29686.5,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,1,0
4,100007,0,0.00000,0,0,0,0,121500.0,513000.0,21865.5,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,2,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
307506,456251,0,0.00000,0,0,1,0,157500.0,254700.0,27558.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,1,0
307507,456252,0,0.00000,1,0,0,0,72000.0,269550.0,12001.5,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0
307508,456253,0,0.00000,1,0,0,0,153000.0,677664.0,29979.0,...,0.0,0.0,0.0,0.0,1.0,0.0,1.0,2,2,0
307509,456254,1,0.00000,1,0,0,0,171000.0,370107.0,20205.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,2,1


## Bureau

In [4]:
bureau_df.head()

Unnamed: 0,SK_ID_CURR,SK_ID_BUREAU,CREDIT_ACTIVE,CREDIT_CURRENCY,DAYS_CREDIT,CREDIT_DAY_OVERDUE,DAYS_CREDIT_ENDDATE,DAYS_ENDDATE_FACT,AMT_CREDIT_MAX_OVERDUE,CNT_CREDIT_PROLONG,AMT_CREDIT_SUM,AMT_CREDIT_SUM_DEBT,AMT_CREDIT_SUM_LIMIT,AMT_CREDIT_SUM_OVERDUE,CREDIT_TYPE,DAYS_CREDIT_UPDATE,AMT_ANNUITY
0,215354,5714462,Closed,currency 1,-497,0,-153.0,-153.0,,0,91323.0,0.0,,0.0,Consumer credit,-131,
1,215354,5714463,Active,currency 1,-208,0,1075.0,,,0,225000.0,171342.0,,0.0,Credit card,-20,
2,215354,5714464,Active,currency 1,-203,0,528.0,,,0,464323.5,,,0.0,Consumer credit,-16,
3,215354,5714465,Active,currency 1,-203,0,,,,0,90000.0,,,0.0,Credit card,-16,
4,215354,5714466,Active,currency 1,-629,0,1197.0,,77674.5,0,2700000.0,,,0.0,Consumer credit,-21,


In [5]:
# Convert CREDIT_ACTIVE to binary indicators
bureau_df['closed_credit'] = (bureau_df['CREDIT_ACTIVE'] == 0).astype(int)
bureau_df['active_credit'] = (bureau_df['CREDIT_ACTIVE'] == 1).astype(int)

# Compute credit duration
bureau_df['credit_duration'] = bureau_df['DAYS_CREDIT_ENDDATE'] - bureau_df['DAYS_CREDIT']

# Aggregate statistics at SK_ID_CURR level
bureau_agg = bureau_df.groupby('SK_ID_CURR').agg(
    closed_credit_number=('closed_credit', 'sum'),
    active_credit_number=('active_credit', 'sum'),
    
    # Median values
    median_DAYS_CREDIT_ENDDATE=('DAYS_CREDIT_ENDDATE', 'median'),
    median_CREDIT_DAY_OVERDUE=('CREDIT_DAY_OVERDUE', 'median'),
    median_DURATION_OF_CREDIT=('credit_duration', 'median'),
    median_AMT_CREDIT_SUM=('AMT_CREDIT_SUM', 'median'),
    median_AMT_CREDIT_SUM_DEBT=('AMT_CREDIT_SUM_DEBT', 'median'),

    # Average values
    avg_DAYS_CREDIT_ENDDATE=('DAYS_CREDIT_ENDDATE', 'mean'),
    avg_CREDIT_DAY_OVERDUE=('CREDIT_DAY_OVERDUE', 'mean'),
    avg_DURATION_OF_CREDIT=('credit_duration', 'mean'),
    avg_AMT_CREDIT_SUM=('AMT_CREDIT_SUM', 'mean'),
    avg_AMT_CREDIT_SUM_DEBT=('AMT_CREDIT_SUM_DEBT', 'mean')
).reset_index()

# Display first 5 rows
bureau_agg.head()

Unnamed: 0,SK_ID_CURR,closed_credit_number,active_credit_number,median_DAYS_CREDIT_ENDDATE,median_CREDIT_DAY_OVERDUE,median_DURATION_OF_CREDIT,median_AMT_CREDIT_SUM,median_AMT_CREDIT_SUM_DEBT,avg_DAYS_CREDIT_ENDDATE,avg_CREDIT_DAY_OVERDUE,avg_DURATION_OF_CREDIT,avg_AMT_CREDIT_SUM,avg_AMT_CREDIT_SUM_DEBT
0,100001,0,0,-179.0,0.0,730.0,168345.0,0.0,82.428571,0.0,817.428571,207623.571429,85240.928571
1,100002,0,0,-424.5,0.0,547.5,54130.5,0.0,-349.0,0.0,719.833333,108131.945625,49156.2
2,100003,0,0,-480.0,0.0,725.5,92576.25,0.0,-544.5,0.0,856.25,254350.125,0.0
3,100004,0,0,-488.5,0.0,378.5,94518.9,0.0,-488.5,0.0,378.5,94518.9,0.0
4,100005,0,0,122.0,0.0,245.0,58500.0,25321.5,439.333333,0.0,630.0,219042.0,189469.5


In [6]:

req.dropna_over65(bureau_agg)
req.factorize_cat_cols(bureau_agg)

req.cap_outliers_sd(bureau_agg, threshold=3, exclude_columns=None)

req.redundant_data(bureau_agg)
req.imputing_na(bureau_agg)

req.cap_outliers_sd(bureau_agg, threshold=3, exclude_columns=None)


There are: 0 columns missing data over 65%
Series([], dtype: float64)


Shape of the df after removing missing data over 65% : (305811, 13)
Dropped 3 redundant columns
The columns names that were dropped are :['avg_AMT_CREDIT_SUM', 'median_DURATION_OF_CREDIT', 'avg_DURATION_OF_CREDIT']


Shape of the dataset after removing multicolinearitly: (305811, 10)
Imputed all na values


Unnamed: 0,SK_ID_CURR,closed_credit_number,active_credit_number,median_DAYS_CREDIT_ENDDATE,median_CREDIT_DAY_OVERDUE,median_AMT_CREDIT_SUM,median_AMT_CREDIT_SUM_DEBT,avg_DAYS_CREDIT_ENDDATE,avg_CREDIT_DAY_OVERDUE,avg_AMT_CREDIT_SUM_DEBT
0,100001,0,0,-179.0,0.0,168345.00,0.000,82.428571,0.0,85240.928571
1,100002,0,0,-424.5,0.0,54130.50,0.000,-349.000000,0.0,49156.200000
2,100003,0,0,-480.0,0.0,92576.25,0.000,-544.500000,0.0,0.000000
3,100004,0,0,-488.5,0.0,94518.90,0.000,-488.500000,0.0,0.000000
4,100005,0,0,122.0,0.0,58500.00,25321.500,439.333333,0.0,189469.500000
...,...,...,...,...,...,...,...,...,...,...
305806,456249,0,0,-1339.0,0.0,248692.50,0.000,-1232.333333,0.0,16307.100000
305807,456250,0,0,1797.0,0.0,483349.50,391731.615,1288.333333,0.0,744013.365000
305808,456253,0,0,99.0,0.0,675000.00,85518.000,280.500000,0.0,448958.250000
305809,456254,0,0,-859.0,0.0,45000.00,0.000,-859.000000,0.0,0.000000


In [7]:
bureau_agg.isnull().sum()

SK_ID_CURR                    0
closed_credit_number          0
active_credit_number          0
median_DAYS_CREDIT_ENDDATE    0
median_CREDIT_DAY_OVERDUE     0
median_AMT_CREDIT_SUM         0
median_AMT_CREDIT_SUM_DEBT    0
avg_DAYS_CREDIT_ENDDATE       0
avg_CREDIT_DAY_OVERDUE        0
avg_AMT_CREDIT_SUM_DEBT       0
dtype: int64

### Bureau Analysis

In [8]:
application_train_df_bureau = application_train_df.copy()

# Perform an inner join on SK_ID_CURR
merged_bureau = application_train_df_bureau.merge(bureau_agg, on='SK_ID_CURR', how='inner')

In [9]:
correlation = merged_bureau[['TARGET',
       'median_DAYS_CREDIT_ENDDATE', 'median_CREDIT_DAY_OVERDUE',
       'median_AMT_CREDIT_SUM', 'median_AMT_CREDIT_SUM_DEBT',
       'avg_DAYS_CREDIT_ENDDATE', 'avg_CREDIT_DAY_OVERDUE',
       'avg_AMT_CREDIT_SUM_DEBT']].corr()

# Keep only the 'TARGET' column, but show it as the first column
correlation_with_target = correlation[['TARGET']].drop(index='TARGET')
print(correlation_with_target)


                              TARGET
median_DAYS_CREDIT_ENDDATE  0.056303
median_CREDIT_DAY_OVERDUE   0.018015
median_AMT_CREDIT_SUM      -0.022451
median_AMT_CREDIT_SUM_DEBT  0.027937
avg_DAYS_CREDIT_ENDDATE     0.052741
avg_CREDIT_DAY_OVERDUE      0.023580
avg_AMT_CREDIT_SUM_DEBT     0.018690


The Bureau predictors have low corelations with Target, therefore the bureau dataset will be discarded.

## Credit Card

In [10]:
credit_card_clean = credit_card_df.copy()
req.reduce_memory_usage(credit_card_clean)
req.dropna_over65(credit_card_clean)

cat_cols_cc = req.factorize_cat_cols(credit_card_clean)

req.redundant_data(credit_card_clean)
req.imputing_na(credit_card_clean)

req.cap_outliers_sd(credit_card_clean)


Memory usage of dataframe is 673.88 MB
Memory usage after optimization is: 289.33 MB
Decreased by 57.1%
There are: 0 columns missing data over 65%
Series([], dtype: float64)


Shape of the df after removing missing data over 65% : (3840312, 23)
Dropped 7 redundant columns
The columns names that were dropped are :['AMT_RECIVABLE', 'AMT_PAYMENT_TOTAL_CURRENT', 'CNT_DRAWINGS_POS_CURRENT', 'AMT_RECEIVABLE_PRINCIPAL', 'AMT_DRAWINGS_CURRENT', 'AMT_TOTAL_RECEIVABLE', 'AMT_INST_MIN_REGULARITY']


Shape of the dataset after removing multicolinearitly: (3840312, 16)
Imputed all na values


Unnamed: 0,SK_ID_PREV,SK_ID_CURR,MONTHS_BALANCE,AMT_BALANCE,AMT_CREDIT_LIMIT_ACTUAL,AMT_DRAWINGS_ATM_CURRENT,AMT_DRAWINGS_OTHER_CURRENT,AMT_DRAWINGS_POS_CURRENT,AMT_PAYMENT_CURRENT,CNT_DRAWINGS_ATM_CURRENT,CNT_DRAWINGS_CURRENT,CNT_DRAWINGS_OTHER_CURRENT,CNT_INSTALMENT_MATURE_CUM,NAME_CONTRACT_STATUS,SK_DPD,SK_DPD_DEF
0,2562384,378907,-6,56.970001,135000.0,0.000000,0.0,877.5,1800.000000,0.0,1.0,0.0,35.0,0.000000,0.0,0.0
1,2582071,363914,-1,63975.554688,45000.0,2250.000000,0.0,0.0,2250.000000,1.0,1.0,0.0,69.0,0.000000,0.0,0.0
2,1740877,371185,-7,31815.224609,450000.0,0.000000,0.0,0.0,2250.000000,0.0,0.0,0.0,30.0,0.000000,0.0,0.0
3,1389973,337855,-4,236572.109375,225000.0,2250.000000,0.0,0.0,11925.000000,1.0,1.0,0.0,10.0,0.000000,0.0,0.0
4,1891521,126868,-1,377221.250000,450000.0,0.000000,0.0,11547.0,27000.000000,0.0,1.0,0.0,101.0,0.000000,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3840307,1036507,328243,-9,0.000000,45000.0,0.000000,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.000000,0.0,0.0
3840308,1714892,347207,-9,0.000000,45000.0,0.000000,0.0,0.0,1879.109985,0.0,0.0,0.0,23.0,0.000000,0.0,0.0
3840309,1302323,215757,-9,275784.968750,585000.0,81089.492188,0.0,0.0,105816.281250,2.0,2.0,0.0,18.0,0.000000,0.0,0.0
3840310,1624872,430337,-10,0.000000,450000.0,0.000000,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.000000,0.0,0.0


In [11]:
credit_card_clean = credit_card_clean.sort_values('SK_ID_PREV')
# Define aggregation functions with both median and mean
agg_funcs = {
    'SK_ID_CURR': 'first',  # Since SK_ID_CURR is the same for a given SK_ID_PREV
    'MONTHS_BALANCE': lambda x: abs(x.min() - x.max()),  # Duration calculation
    
    # Median and Mean for selected numerical features
    'AMT_BALANCE': ['median', 'mean'],
    'AMT_CREDIT_LIMIT_ACTUAL': ['median', 'mean'],
    'AMT_DRAWINGS_OTHER_CURRENT': ['median', 'mean'],
    'AMT_DRAWINGS_POS_CURRENT': ['median', 'mean'],
    'SK_DPD': ['median', 'mean'],
    'SK_DPD_DEF': ['median', 'mean'],
    
    # Max for installment count
    'CNT_INSTALMENT_MATURE_CUM': 'max',
}

# Perform aggregation
ccard_agg = credit_card_clean.groupby('SK_ID_PREV').agg(agg_funcs).reset_index()

# Flatten multi-level column names
ccard_agg.columns = ['_'.join(col).strip('_') if isinstance(col, tuple) else col for col in ccard_agg.columns]

# Rename the duration column explicitly
ccard_agg.rename(columns={'MONTHS_BALANCE_<lambda>': 'DURATION'}, inplace=True)

# Get the most recent NAME_CONTRACT_STATUS (based on max MONTHS_BALANCE)
latest_status = credit_card_clean.loc[credit_card_clean.groupby('SK_ID_PREV')['MONTHS_BALANCE'].idxmax(), 
                                ['SK_ID_PREV', 'NAME_CONTRACT_STATUS']]

# Merge with the aggregated data
ccard_agg = ccard_agg.merge(latest_status, on='SK_ID_PREV', how='left')

# Rename columns to include 'MEDIAN_' and 'MEAN_' prefixes
ccard_agg.rename(columns={
    'AMT_BALANCE_median': 'MEDIAN_AMT_BALANCE',
    'AMT_BALANCE_mean': 'MEAN_AMT_BALANCE',
    'AMT_CREDIT_LIMIT_ACTUAL_median': 'MEDIAN_AMT_CREDIT_LIMIT_ACTUAL',
    'AMT_CREDIT_LIMIT_ACTUAL_mean': 'MEAN_AMT_CREDIT_LIMIT_ACTUAL',
    'AMT_DRAWINGS_OTHER_CURRENT_median': 'MEDIAN_AMT_DRAWINGS_OTHER_CURRENT',
    'AMT_DRAWINGS_OTHER_CURRENT_mean': 'MEAN_AMT_DRAWINGS_OTHER_CURRENT',
    'AMT_DRAWINGS_POS_CURRENT_median': 'MEDIAN_AMT_DRAWINGS_POS_CURRENT',
    'AMT_DRAWINGS_POS_CURRENT_mean': 'MEAN_AMT_DRAWINGS_POS_CURRENT',
    'SK_DPD_median': 'MEDIAN_SK_DPD',
    'SK_DPD_mean': 'MEAN_SK_DPD',
    'SK_DPD_DEF_median': 'MEDIAN_SK_DPD_DEF',
    'SK_DPD_DEF_mean': 'MEAN_SK_DPD_DEF'
}, inplace=True)

# Display result
ccard_agg.head()


Unnamed: 0,SK_ID_PREV,SK_ID_CURR_first,DURATION,MEDIAN_AMT_BALANCE,MEAN_AMT_BALANCE,MEDIAN_AMT_CREDIT_LIMIT_ACTUAL,MEAN_AMT_CREDIT_LIMIT_ACTUAL,MEDIAN_AMT_DRAWINGS_OTHER_CURRENT,MEAN_AMT_DRAWINGS_OTHER_CURRENT,MEDIAN_AMT_DRAWINGS_POS_CURRENT,MEAN_AMT_DRAWINGS_POS_CURRENT,MEDIAN_SK_DPD,MEAN_SK_DPD,MEDIAN_SK_DPD_DEF,MEAN_SK_DPD_DEF,CNT_INSTALMENT_MATURE_CUM_max,NAME_CONTRACT_STATUS
0,1000018,394447,4,44360.503906,74946.289062,45000.0,81000.0,0.0,0.0,22827.330078,24078.996094,0.0,0.0,0.0,0.0,4.0,0.0
1,1000030,361282,7,48036.667969,55991.0625,78750.0,81562.5,0.0,0.0,13381.650146,16694.939453,0.0,0.0,0.0,0.0,5.0,0.0
2,1000031,131335,15,2902.747559,52394.4375,144000.0,149625.0,0.0,0.0,513.0,9290.833984,0.0,0.0,0.0,0.0,10.0,0.0
3,1000035,436351,4,0.0,0.0,225000.0,225000.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,1000077,181153,10,0.0,0.0,135000.0,94090.909091,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## Installment Payments Table

In [12]:
installment_clean_df = installment_df.copy()
req.reduce_memory_usage(installment_clean_df)
req.dropna_over65(installment_clean_df)

cat_cols_ip = req.factorize_cat_cols(installment_clean_df)

req.imputing_na(installment_clean_df)

req.cap_outliers_sd(installment_clean_df)


Memory usage of dataframe is 830.41 MB
Memory usage after optimization is: 311.40 MB
Decreased by 62.5%
There are: 0 columns missing data over 65%
Series([], dtype: float64)


Shape of the df after removing missing data over 65% : (13605401, 8)
Imputed all na values


Unnamed: 0,SK_ID_PREV,SK_ID_CURR,NUM_INSTALMENT_VERSION,NUM_INSTALMENT_NUMBER,DAYS_INSTALMENT,DAYS_ENTRY_PAYMENT,AMT_INSTALMENT,AMT_PAYMENT
0,1054186,161674,1.0,6.0,-1180.0,-1187.0,6948.359863,6948.359863
1,1330831,151639,0.0,34.0,-2156.0,-2156.0,1716.525024,1716.525024
2,2085231,193053,2.0,1.0,-63.0,-63.0,25425.000000,25425.000000
3,2452527,199697,1.0,3.0,-2418.0,-2426.0,24350.130859,24350.130859
4,2714724,167756,1.0,2.0,-1383.0,-1366.0,2165.040039,2160.584961
...,...,...,...,...,...,...,...,...
13605396,2186857,428057,0.0,66.0,-1624.0,-91.0,67.500000,9000.000000
13605397,1310347,414406,0.0,47.0,-1539.0,-91.0,67.500000,9000.000000
13605398,1308766,402199,0.0,43.0,-7.0,-91.0,43737.433594,9000.000000
13605399,1062206,409297,0.0,43.0,-1986.0,-91.0,67.500000,9000.000000


In [13]:
installment_clean_df = installment_clean_df.sort_values('SK_ID_PREV')
# Add DIFF_PAY_VS_INSTAL_DAY
installment_clean_df['DIFF_PAY_VS_INSTAL_DAY'] = installment_clean_df['DAYS_INSTALMENT'] - installment_clean_df['DAYS_ENTRY_PAYMENT']

# Add PAY_ONTIME
#1 <- ontime, 2 <- early, 3 <- late
installment_clean_df['PAY_ONTIME'] = installment_clean_df['DIFF_PAY_VS_INSTAL_DAY'].apply(
    lambda x: 1 if x == 0 else (2 if x > 0 else 3)
)
installment_clean_df['PAY_ONTIME']

# Add DIFF_PAY_VS_INSTAL_AMT
installment_clean_df['DIFF_PAY_VS_INSTAL_AMT'] = installment_clean_df['AMT_INSTALMENT'] - installment_clean_df['AMT_PAYMENT']

# Add ENOUGH_PAY
#1 <- enough, 2 <- less_pay, 3 <- more_pay
installment_clean_df['ENOUGH_PAY'] = installment_clean_df['DIFF_PAY_VS_INSTAL_AMT'].apply(
    lambda x: 1 if x == 0 else (2 if x > 0 else 3)
)

#dropping the other columns
installment_clean_df.drop(columns = ['DIFF_PAY_VS_INSTAL_DAY'], inplace = True)
installment_clean_df.drop(columns = ['DIFF_PAY_VS_INSTAL_AMT'], inplace = True)

# Display the updated DataFrame
installment_clean_df.head()

Unnamed: 0,SK_ID_PREV,SK_ID_CURR,NUM_INSTALMENT_VERSION,NUM_INSTALMENT_NUMBER,DAYS_INSTALMENT,DAYS_ENTRY_PAYMENT,AMT_INSTALMENT,AMT_PAYMENT,PAY_ONTIME,ENOUGH_PAY
512588,1000001,158271,1.0,1.0,-268.0,-294.0,6404.310059,6404.310059,2,1
2159480,1000001,158271,2.0,2.0,-238.0,-244.0,62039.113281,62039.113281,2,1
3411021,1000002,101962,1.0,3.0,-1540.0,-1559.0,6264.0,6264.0,2,1
1214732,1000002,101962,1.0,1.0,-1600.0,-1611.0,6264.0,6264.0,2,1
1006868,1000002,101962,2.0,4.0,-1510.0,-1554.0,18443.564453,18443.564453,2,1


In [14]:
# Use .mode() only once per column and convert to a dictionary for fast lookup
pay_ontime_mode = installment_clean_df.groupby('SK_ID_PREV')['PAY_ONTIME'].agg(lambda x: pd.Series.mode(x)[0] if not x.mode().empty else None)
enough_pay_mode = installment_clean_df.groupby('SK_ID_PREV')['ENOUGH_PAY'].agg(lambda x: pd.Series.mode(x)[0] if not x.mode().empty else None)

# Perform groupby and aggregation for other columns
instalment_agg = installment_clean_df.groupby('SK_ID_PREV').agg(
    SK_ID_CURR=('SK_ID_CURR', 'first'),
    max_num_instalment_version=('NUM_INSTALMENT_VERSION', 'max'),
    total_instalment_number=('NUM_INSTALMENT_NUMBER', 'max')
).reset_index()

# Merge pre-computed modes into the aggregated DataFrame
instalment_agg = instalment_agg.merge(pay_ontime_mode.rename('PAY_ONTIME'), on='SK_ID_PREV')
instalment_agg = instalment_agg.merge(enough_pay_mode.rename('ENOUGH_PAY'), on='SK_ID_PREV')

# Display the result
instalment_agg.head()



Unnamed: 0,SK_ID_PREV,SK_ID_CURR,max_num_instalment_version,total_instalment_number,PAY_ONTIME,ENOUGH_PAY
0,1000001,158271,2.0,2.0,2,1
1,1000002,101962,2.0,4.0,2,1
2,1000003,252457,1.0,3.0,2,1
3,1000004,260094,2.0,7.0,2,1
4,1000005,176456,1.0,10.0,2,1


In [15]:
#write to csv
instalment_agg.to_csv("installment_agg.csv", index = False)

## Previous Application

In [16]:
# Drop rows where contract type is "XNA"
previous_app_df = previous_app_df[previous_app_df['NAME_CONTRACT_TYPE'] != 'XNA']

# For 'NAME_CONTRACT_TYPE' being 'Cash loans' or 'Revolving loans', fill NAs with 0
previous_app_df.loc[previous_app_df['NAME_CONTRACT_TYPE'].isin(['Cash loans', 'Revolving loans']), 'AMT_APPLICATION'] = previous_app_df.loc[previous_app_df['NAME_CONTRACT_TYPE'].isin(['Cash loans', 'Revolving loans']), 'AMT_APPLICATION'].fillna(0)

# For 'NAME_CONTRACT_TYPE' being 'Consumer loans', fill missing values in 'AMT_APPLICATION' with the median
consumer_loan_median = previous_app_df[previous_app_df['NAME_CONTRACT_TYPE'] == 'Consumer loans']['AMT_APPLICATION'].median()
previous_app_df.loc[previous_app_df['NAME_CONTRACT_TYPE'] == 'Consumer loans', 'AMT_APPLICATION'] = previous_app_df.loc[previous_app_df['NAME_CONTRACT_TYPE'] == 'Consumer loans', 'AMT_APPLICATION'].fillna(consumer_loan_median)

# Fill missing values in 'NAME_TYPE_SUITE' with 'Other'
previous_app_df['NAME_TYPE_SUITE'].fillna('Other', inplace=True)

# Drop the 'PRODUCT_COMBINATION' column
previous_app_df.drop(columns=['PRODUCT_COMBINATION'], inplace=True)

In [57]:
previous_app_clean = previous_app_df.copy()
req.reduce_memory_usage(previous_app_clean)
req.dropna_over65(previous_app_clean)

cat_cols_prev = req.factorize_cat_cols(previous_app_clean)

req.redundant_data(previous_app_clean)
req.imputing_na(previous_app_clean)

req.cap_outliers_sd(previous_app_clean)
previous_app_clean.head()


Memory usage of dataframe is 471.38 MB
Memory usage after optimization is: 308.95 MB
Decreased by 34.5%
There are: 2 columns missing data over 65%
RATE_INTEREST_PRIVILEGED    99.643625
RATE_INTEREST_PRIMARY       99.643625
dtype: float64


Shape of the df after removing missing data over 65% : (1669868, 34)
Dropped 6 redundant columns
The columns names that were dropped are :['AMT_GOODS_PRICE', 'DAYS_LAST_DUE_1ST_VERSION', 'DAYS_TERMINATION', 'NAME_PORTFOLIO', 'AMT_CREDIT', 'AMT_APPLICATION']


Shape of the dataset after removing multicolinearitly: (1669868, 28)
Imputed all na values


Unnamed: 0,SK_ID_PREV,SK_ID_CURR,NAME_CONTRACT_TYPE,AMT_ANNUITY,AMT_DOWN_PAYMENT,WEEKDAY_APPR_PROCESS_START,HOUR_APPR_PROCESS_START,FLAG_LAST_APPL_PER_CONTRACT,NFLAG_LAST_APPL_IN_DAY,RATE_DOWN_PAYMENT,...,NAME_PRODUCT_TYPE,CHANNEL_TYPE,SELLERPLACE_AREA,NAME_SELLER_INDUSTRY,CNT_PAYMENT,NAME_YIELD_GROUP,DAYS_FIRST_DRAWING,DAYS_FIRST_DUE,DAYS_LAST_DUE,NFLAG_INSURED_ON_APPROVAL
0,2030495,271877,0,1730.430054,0.0,0,15.0,0.0,1.0,0.0,...,0,0.0,35.0,0.0,12.0,0,365243.0,-42.0,-42.0,0.0
1,2802425,108129,1,25188.615234,0.0,1,11.0,0.0,1.0,0.0,...,1,1.0,-1.0,1.0,36.0,1,365243.0,-134.0,365243.0,1.0
2,2523466,122040,1,15060.735352,0.0,2,11.0,0.0,1.0,0.0,...,1,2.0,-1.0,1.0,12.0,2,365243.0,-271.0,365243.0,1.0
3,2819243,176158,1,47041.335938,0.0,3,7.0,0.0,1.0,0.0,...,1,2.0,-1.0,1.0,12.0,0,365243.0,-482.0,-182.0,1.0
4,1784265,202054,1,31924.394531,0.0,1,9.0,0.0,1.0,0.0,...,2,2.0,-1.0,1.0,24.0,2,365243.0,365243.0,365243.0,0.0


In [18]:
previous_app_clean.shape

(1669868, 28)

## Joining to previous dfs for agg

In [19]:
installment_agg = pd.read_csv("installment_agg.csv")

In [20]:
# Perform left join with ccard_agg using SK_ID_PREV
joined_prev = previous_app_df.merge(ccard_agg, on="SK_ID_PREV", how="left")

# Perform left join with instalment_agg using SK_ID_PREV
joined_prev = joined_prev.merge(installment_agg, on="SK_ID_PREV", how="left")

In [21]:
# Drop the duplicate SK_ID_CURR columns
joined_prev = joined_prev.drop(columns=["SK_ID_CURR_first", "SK_ID_CURR_y"], errors = 'ignore')
joined_prev = joined_prev.drop(columns=["NAME_CONTRACT_STATUS_y"], errors = 'ignore')

## Aggregation on the Joined df

In [22]:
joined_prev.sort_values('SK_ID_CURR_x')
# Explicitly convert numeric columns
numeric_cols = [
    "AMT_ANNUITY",  
    "CNT_PAYMENT"
]

for col in numeric_cols:
    joined_prev[col] = pd.to_numeric(joined_prev[col], errors='coerce')

# Simplified aggregation dictionary (added mean directly to agg_dict)
agg_dict = {
    "SK_ID_PREV": "count",
    "AMT_ANNUITY": ["median", "sum"],
    "CNT_PAYMENT": "median",
    "NAME_CONTRACT_TYPE": lambda x: x.mode().iloc[0] if not x.mode().empty else None,
    "NAME_CASH_LOAN_PURPOSE": lambda x: x.mode().iloc[0] if not x.mode().empty else None,
    "NAME_PAYMENT_TYPE": lambda x: x.mode().iloc[0] if not x.mode().empty else None,
    "MEDIAN_AMT_BALANCE": "median",
    "MEDIAN_AMT_CREDIT_LIMIT_ACTUAL": "median",
    "MEDIAN_AMT_DRAWINGS_OTHER_CURRENT": "median",
    "MEDIAN_AMT_DRAWINGS_POS_CURRENT": "median",
    "MEDIAN_SK_DPD": "median",
    "MEDIAN_SK_DPD_DEF": "median",
    "max_num_instalment_version": "mean",
    "total_instalment_number": "mean",
    "PAY_ONTIME": lambda x: x.mode().iloc[0] if not x.mode().empty else None,
    "ENOUGH_PAY": lambda x: x.mode().iloc[0] if not x.mode().empty else None,
    # Add the mean aggregation directly for those specific columns
    "MEAN_AMT_BALANCE": "mean",
    "MEAN_AMT_CREDIT_LIMIT_ACTUAL": "mean",
    "MEAN_AMT_DRAWINGS_OTHER_CURRENT": "mean",
    "MEAN_AMT_DRAWINGS_POS_CURRENT": "mean",
    "MEAN_SK_DPD": "mean",
    "MEAN_SK_DPD_DEF": "mean"
}

# Perform the aggregation
joined_prev_agg = joined_prev.groupby('SK_ID_CURR_x').agg(agg_dict).reset_index()

# Flatten MultiIndex columns after aggregation
joined_prev_agg.columns = ['_'.join(col).strip('_') if isinstance(col, tuple) else col for col in joined_prev_agg.columns]

# Display the result
joined_prev_agg.head()


Unnamed: 0,SK_ID_CURR_x,SK_ID_PREV_count,AMT_ANNUITY_median,AMT_ANNUITY_sum,CNT_PAYMENT_median,NAME_CONTRACT_TYPE_<lambda>,NAME_CASH_LOAN_PURPOSE_<lambda>,NAME_PAYMENT_TYPE_<lambda>,MEDIAN_AMT_BALANCE_median,MEDIAN_AMT_CREDIT_LIMIT_ACTUAL_median,...,max_num_instalment_version_mean,total_instalment_number_mean,PAY_ONTIME_<lambda>,ENOUGH_PAY_<lambda>,MEAN_AMT_BALANCE_mean,MEAN_AMT_CREDIT_LIMIT_ACTUAL_mean,MEAN_AMT_DRAWINGS_OTHER_CURRENT_mean,MEAN_AMT_DRAWINGS_POS_CURRENT_mean,MEAN_SK_DPD_mean,MEAN_SK_DPD_DEF_mean
0,100001,1,3951.0,3951.0,8.0,Consumer loans,XAP,Cash through the bank,,,...,2.0,4.0,2.0,1.0,,,,,,
1,100002,1,9251.775,9251.775,24.0,Consumer loans,XAP,XNA,,,...,2.0,19.0,2.0,1.0,,,,,,
2,100003,3,64567.665,169661.97,12.0,Consumer loans,XAP,Cash through the bank,,,...,1.333333,8.333333,2.0,1.0,,,,,,
3,100004,1,5357.25,5357.25,4.0,Consumer loans,XAP,Cash through the bank,,,...,2.0,3.0,2.0,1.0,,,,,,
4,100005,2,4813.2,4813.2,12.0,Cash loans,XAP,Cash through the bank,,,...,2.0,9.0,2.0,1.0,,,,,,


In [23]:
joined_prev_agg.rename(columns ={'SK_ID_CURR_x': 'SK_ID_CURR'}, inplace = True)

In [24]:
joined_prev_agg.to_csv("joined_prev_agg.csv", index = False)

## JOINING ALL DATAFRAMES

In [25]:
joined_prev_agg = pd.read_csv("joined_prev_agg.csv")

In [26]:
merged_df = pd.merge(application_clean_df,joined_prev_agg, on = 'SK_ID_CURR', how = "left")
merged_df.head()

Unnamed: 0,SK_ID_CURR,TARGET,NAME_CONTRACT_TYPE,CODE_GENDER,FLAG_OWN_CAR,FLAG_OWN_REALTY,CNT_CHILDREN,AMT_INCOME_TOTAL,AMT_CREDIT,AMT_ANNUITY,...,max_num_instalment_version_mean,total_instalment_number_mean,PAY_ONTIME_<lambda>,ENOUGH_PAY_<lambda>,MEAN_AMT_BALANCE_mean,MEAN_AMT_CREDIT_LIMIT_ACTUAL_mean,MEAN_AMT_DRAWINGS_OTHER_CURRENT_mean,MEAN_AMT_DRAWINGS_POS_CURRENT_mean,MEAN_SK_DPD_mean,MEAN_SK_DPD_DEF_mean
0,100002,1,0.0,0,0,0,0,202500.0,406597.5,24700.5,...,2.0,19.0,2.0,1.0,,,,,,
1,100003,0,0.0,1,0,1,0,270000.0,1293502.5,35698.5,...,1.333333,8.333333,2.0,1.0,,,,,,
2,100004,0,0.97574,0,1,0,0,67500.0,135000.0,6750.0,...,2.0,3.0,2.0,1.0,,,,,,
3,100006,0,0.0,1,0,0,0,135000.0,312682.5,29686.5,...,1.666667,5.333333,2.0,1.0,0.0,270000.0,0.0,0.0,0.0,0.0
4,100007,0,0.0,0,0,0,0,121500.0,513000.0,21865.5,...,1.2,12.6,2.0,1.0,,,,,,


In [27]:
print(cat_cols_cc.dtype)

object


In [28]:
merged_clean_df= merged_df.copy()  
req.reduce_memory_usage(merged_clean_df)
req.dropna_over65(merged_clean_df)

cat_cols_merged = pd.Index([]).append(cat_cols_app).append(cat_cols_prev).append(cat_cols_ip).append(cat_cols_cc)

req.redundant_data(merged_clean_df)
req.imputing_na(merged_clean_df)
merged_final_df = req.remove_duplicates(merged_clean_df)

Memory usage of dataframe is 174.49 MB
Memory usage after optimization is: 65.11 MB
Decreased by 62.7%
There are: 12 columns missing data over 65%
MEAN_SK_DPD_DEF_mean                        74.656516
MEDIAN_SK_DPD_median                        74.656516
MEAN_SK_DPD_mean                            74.656516
MEAN_AMT_DRAWINGS_POS_CURRENT_mean          74.656516
MEAN_AMT_DRAWINGS_OTHER_CURRENT_mean        74.656516
MEAN_AMT_CREDIT_LIMIT_ACTUAL_mean           74.656516
MEAN_AMT_BALANCE_mean                       74.656516
MEDIAN_AMT_BALANCE_median                   74.656516
MEDIAN_AMT_CREDIT_LIMIT_ACTUAL_median       74.656516
MEDIAN_AMT_DRAWINGS_OTHER_CURRENT_median    74.656516
MEDIAN_AMT_DRAWINGS_POS_CURRENT_median      74.656516
MEDIAN_SK_DPD_DEF_median                    74.656516
dtype: float64


Shape of the df after removing missing data over 65% : (307511, 86)
Dropped 0 redundant columns
The columns names that were dropped are :[]


Shape of the dataset after removing multicolin

In [29]:
merged_final_df.shape

(307511, 76)

In [30]:
merged_final_df.to_csv("merged_final.csv", index=False)

In [31]:
merged_final_df = pd.read_csv("merged_final.csv")

In [32]:
## Splitting train data

In [33]:
#Split the data into training and test sets (80% train, 20% test)
# Drop the target variable (TARGET)
X = merged_final_df.drop(columns=['TARGET'])
y = merged_final_df['TARGET']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Verify the shape of the splits
print(f"Training set shape: {X_train.shape}")
print(f"Test set shape: {X_test.shape}")



Training set shape: (246008, 75)
Test set shape: (61503, 75)


In [34]:
X_train, y_train = req.under_sample(X_train, y_train)

Class Imbalance Fixed


In [53]:
X.select_dtypes(exclude=[np.number]).head()

Unnamed: 0,NAME_CONTRACT_TYPE_<lambda>,NAME_CASH_LOAN_PURPOSE_<lambda>,NAME_PAYMENT_TYPE_<lambda>
0,Consumer loans,XAP,XNA
1,Consumer loans,XAP,Cash through the bank
2,Consumer loans,XAP,Cash through the bank
3,Cash loans,XNA,XNA
4,Cash loans,XNA,Cash through the bank


In [60]:
scaler = StandardScaler()
req.scale_data(X_train, scaler=scaler, is_train=True, exclude_columns = cat_cols_merged)

Columns to standardize: ['CNT_CHILDREN', 'AMT_INCOME_TOTAL', 'AMT_CREDIT', 'AMT_ANNUITY', 'REGION_POPULATION_RELATIVE', 'DAYS_BIRTH', 'DAYS_EMPLOYED', 'DAYS_REGISTRATION', 'DAYS_ID_PUBLISH', 'FLAG_MOBIL', 'FLAG_WORK_PHONE', 'FLAG_CONT_MOBILE', 'FLAG_PHONE', 'FLAG_EMAIL', 'REGION_RATING_CLIENT', 'HOUR_APPR_PROCESS_START', 'REG_REGION_NOT_LIVE_REGION', 'REG_REGION_NOT_WORK_REGION', 'REG_CITY_NOT_LIVE_CITY', 'REG_CITY_NOT_WORK_CITY', 'YEARS_BEGINEXPLUATATION_AVG', 'OBS_30_CNT_SOCIAL_CIRCLE', 'DAYS_LAST_PHONE_CHANGE', 'FLAG_DOCUMENT_2', 'FLAG_DOCUMENT_3', 'FLAG_DOCUMENT_4', 'FLAG_DOCUMENT_5', 'FLAG_DOCUMENT_6', 'FLAG_DOCUMENT_7', 'FLAG_DOCUMENT_8', 'FLAG_DOCUMENT_9', 'FLAG_DOCUMENT_10', 'FLAG_DOCUMENT_11', 'FLAG_DOCUMENT_12', 'FLAG_DOCUMENT_13', 'FLAG_DOCUMENT_14', 'FLAG_DOCUMENT_15', 'FLAG_DOCUMENT_16', 'FLAG_DOCUMENT_17', 'FLAG_DOCUMENT_18', 'FLAG_DOCUMENT_19', 'FLAG_DOCUMENT_20', 'FLAG_DOCUMENT_21', 'AMT_REQ_CREDIT_BUREAU_MON', 'AMT_REQ_CREDIT_BUREAU_QRT', 'AMT_REQ_CREDIT_BUREAU_YEAR', 

ValueError: could not convert string to float: 'Consumer loans'

In [55]:
X_train_log,X_test_log = req.imp_features_log(X_train.copy().drop(columns = ['SK_ID_CURR']), y_train.copy(),X_test.copy()) 

ValueError: could not convert string to float: 'Consumer loans'

In [None]:
X_train_rfc, X_test_rfc = req.imp_features_rfc(X_train.copy().drop(columns =['SK_ID_CURR']), y_train.copy(), X_test.copy())

## MODELLING

In [None]:
#GridSearchCV for KNN took too much processing power, therefore we will use RandomizedSearchCV

from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import RandomizedSearchCV

# Define the model
knn_model = KNeighborsClassifier()

# Define hyperparameter grid
param_dist = {
    'n_neighbors': list(range(1, 31)),  # Testing different values for k
    'weights': ['uniform', 'distance'],  # Weighting methods
    'metric': ['euclidean', 'manhattan', 'minkowski']  # Distance metrics
}
X_train_log['SK_ID_CURR']= X_train['SK_ID_CURR']
X_test_log['SK_ID_CURR']= X_test['SK_ID_CURR']
# Apply RandomizedSearchCV with cross-validation
random_search_knn = RandomizedSearchCV(knn_model, param_distributions=param_dist, 
                                       n_iter=5, cv=5, scoring='accuracy', n_jobs=-1, random_state=42)
random_search_knn.fit(X_train_log, y_train)  # Corrected train variables

# Best parameters and model
best_knn = random_search_knn.best_estimator_
print(f"Best parameters: {random_search_knn.best_params_}")

# Predict on the test set
y_pred_knn = best_knn.predict(X_test_log)  # Corrected test variable

# Evaluate the KNN model
accuracy_knn = accuracy_score(y_test, y_pred_knn)
print(f"KNN Accuracy: {accuracy_knn:.4f}")
print("KNN Classification Report:")
print(classification_report(y_test, y_pred_knn))

# Confusion Matrix
cm = confusion_matrix(y_test, y_pred_knn)

# Plot Confusion Matrix
plt.figure(figsize=(6, 4))
sns.heatmap(cm, annot=True, fmt="d", cmap="Blues", xticklabels=["Class 0", "Class 1"], yticklabels=["Class 0", "Class 1"])
plt.xlabel("Predicted Label")
plt.ylabel("True Label")
plt.title("KNN Confusion Matrix")
plt.show()

In [None]:
#GridSearchCV for KNN took too much processing power, therefore we will use RandomizedSearchCV

from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import RandomizedSearchCV

# Define the model
knn_model = KNeighborsClassifier()

# Define hyperparameter grid
param_dist = {
    'n_neighbors': list(range(1, 31)),  # Testing different values for k
    'weights': ['uniform', 'distance'],  # Weighting methods
    'metric': ['euclidean', 'manhattan', 'minkowski']  # Distance metrics
}
X_train_rfc['SK_ID_CURR']= X_train['SK_ID_CURR']
X_test_rfc['SK_ID_CURR']= X_test['SK_ID_CURR']
# Apply RandomizedSearchCV with cross-validation
random_search_knn = RandomizedSearchCV(knn_model, param_distributions=param_dist, 
                                       n_iter=5, cv=5, scoring='accuracy', n_jobs=-1, random_state=42)
random_search_knn.fit(X_train_rfc, y_train)  # Corrected train variables

# Best parameters and model
best_knn = random_search_knn.best_estimator_
print(f"Best parameters: {random_search_knn.best_params_}")

# Predict on the test set
y_pred_knn = best_knn.predict(X_test_rfc)  # Corrected test variable

# Evaluate the KNN model
accuracy_knn = accuracy_score(y_test, y_pred_knn)
print(f"KNN Accuracy: {accuracy_knn:.4f}")
print("KNN Classification Report:")
print(classification_report(y_test, y_pred_knn))

# Confusion Matrix
cm = confusion_matrix(y_test, y_pred_knn)

# Plot Confusion Matrix
plt.figure(figsize=(6, 4))
sns.heatmap(cm, annot=True, fmt="d", cmap="Blues", xticklabels=["Class 0", "Class 1"], yticklabels=["Class 0", "Class 1"])
plt.xlabel("Predicted Label")
plt.ylabel("True Label")
plt.title("KNN Confusion Matrix")
plt.show()

In [None]:
## random forest model

In [None]:
## logistic reg model

In [None]:
## roc curve

## Cleaning Test Data

In [None]:
# application_test_clean = application_test_df.copy()
# eq.reduce_memory_usage(application_test_clean)
# req.factorize_EXT_SOURCE(application_test_clean)
# req.bin_CAR_AGE(application_test_clean)
# req.fill_OCCUPATION_col(application_test_clean)
# req.dropna_over65(application_test_clean)

# cat_cols = req.factorize_cat_cols(application_test_clean)

# req.redundant_data(application_test_clean)
# req.imputing_na(application_test_clean)
# req.standardize_children_col(application_test_clean)

# req.cap_outliers_sd(application_test_df, exclude_columns = ['CNT_CHILDREN', 'TARGET'])
#  application_test_clean = application_test_final

In [None]:
# scaler = StandardScaler()
# req.scale_data(X_train, scaler=scaler, is_train=False, exclude_columns = cat_cols_merged)