In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
pd.set_option('display.max_rows', 125)
pd.set_option('display.max_columns', 125)

from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder
from sklearn.metrics import roc_auc_score
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV

from sklearn.model_selection import train_test_split
from imblearn.under_sampling import RandomUnderSampler

pd.options.mode.chained_assignment = None

In [2]:
def check(data, idx=0) :
    col = data.columns.to_list()
    print(col[idx])
    print(data[col[idx]].value_counts())
    print('=============================')
    print(data[col[idx]].describe())

# Application Dataset

In [3]:
df_train = pd.read_csv("data/application_train.csv", index_col="SK_ID_CURR")
df_train.shape

(307511, 121)

In [4]:
df_train.head()

Unnamed: 0_level_0,TARGET,NAME_CONTRACT_TYPE,CODE_GENDER,FLAG_OWN_CAR,FLAG_OWN_REALTY,CNT_CHILDREN,AMT_INCOME_TOTAL,AMT_CREDIT,AMT_ANNUITY,AMT_GOODS_PRICE,NAME_TYPE_SUITE,NAME_INCOME_TYPE,NAME_EDUCATION_TYPE,NAME_FAMILY_STATUS,NAME_HOUSING_TYPE,REGION_POPULATION_RELATIVE,DAYS_BIRTH,DAYS_EMPLOYED,DAYS_REGISTRATION,DAYS_ID_PUBLISH,OWN_CAR_AGE,FLAG_MOBIL,FLAG_EMP_PHONE,FLAG_WORK_PHONE,FLAG_CONT_MOBILE,FLAG_PHONE,FLAG_EMAIL,OCCUPATION_TYPE,CNT_FAM_MEMBERS,REGION_RATING_CLIENT,REGION_RATING_CLIENT_W_CITY,WEEKDAY_APPR_PROCESS_START,HOUR_APPR_PROCESS_START,REG_REGION_NOT_LIVE_REGION,REG_REGION_NOT_WORK_REGION,LIVE_REGION_NOT_WORK_REGION,REG_CITY_NOT_LIVE_CITY,REG_CITY_NOT_WORK_CITY,LIVE_CITY_NOT_WORK_CITY,ORGANIZATION_TYPE,EXT_SOURCE_1,EXT_SOURCE_2,EXT_SOURCE_3,APARTMENTS_AVG,BASEMENTAREA_AVG,YEARS_BEGINEXPLUATATION_AVG,YEARS_BUILD_AVG,COMMONAREA_AVG,ELEVATORS_AVG,ENTRANCES_AVG,FLOORSMAX_AVG,FLOORSMIN_AVG,LANDAREA_AVG,LIVINGAPARTMENTS_AVG,LIVINGAREA_AVG,NONLIVINGAPARTMENTS_AVG,NONLIVINGAREA_AVG,APARTMENTS_MODE,BASEMENTAREA_MODE,YEARS_BEGINEXPLUATATION_MODE,YEARS_BUILD_MODE,COMMONAREA_MODE,ELEVATORS_MODE,ENTRANCES_MODE,FLOORSMAX_MODE,FLOORSMIN_MODE,LANDAREA_MODE,LIVINGAPARTMENTS_MODE,LIVINGAREA_MODE,NONLIVINGAPARTMENTS_MODE,NONLIVINGAREA_MODE,APARTMENTS_MEDI,BASEMENTAREA_MEDI,YEARS_BEGINEXPLUATATION_MEDI,YEARS_BUILD_MEDI,COMMONAREA_MEDI,ELEVATORS_MEDI,ENTRANCES_MEDI,FLOORSMAX_MEDI,FLOORSMIN_MEDI,LANDAREA_MEDI,LIVINGAPARTMENTS_MEDI,LIVINGAREA_MEDI,NONLIVINGAPARTMENTS_MEDI,NONLIVINGAREA_MEDI,FONDKAPREMONT_MODE,HOUSETYPE_MODE,TOTALAREA_MODE,WALLSMATERIAL_MODE,EMERGENCYSTATE_MODE,OBS_30_CNT_SOCIAL_CIRCLE,DEF_30_CNT_SOCIAL_CIRCLE,OBS_60_CNT_SOCIAL_CIRCLE,DEF_60_CNT_SOCIAL_CIRCLE,DAYS_LAST_PHONE_CHANGE,FLAG_DOCUMENT_2,FLAG_DOCUMENT_3,FLAG_DOCUMENT_4,FLAG_DOCUMENT_5,FLAG_DOCUMENT_6,FLAG_DOCUMENT_7,FLAG_DOCUMENT_8,FLAG_DOCUMENT_9,FLAG_DOCUMENT_10,FLAG_DOCUMENT_11,FLAG_DOCUMENT_12,FLAG_DOCUMENT_13,FLAG_DOCUMENT_14,FLAG_DOCUMENT_15,FLAG_DOCUMENT_16,FLAG_DOCUMENT_17,FLAG_DOCUMENT_18,FLAG_DOCUMENT_19,FLAG_DOCUMENT_20,FLAG_DOCUMENT_21,AMT_REQ_CREDIT_BUREAU_HOUR,AMT_REQ_CREDIT_BUREAU_DAY,AMT_REQ_CREDIT_BUREAU_WEEK,AMT_REQ_CREDIT_BUREAU_MON,AMT_REQ_CREDIT_BUREAU_QRT,AMT_REQ_CREDIT_BUREAU_YEAR
SK_ID_CURR,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1,Unnamed: 52_level_1,Unnamed: 53_level_1,Unnamed: 54_level_1,Unnamed: 55_level_1,Unnamed: 56_level_1,Unnamed: 57_level_1,Unnamed: 58_level_1,Unnamed: 59_level_1,Unnamed: 60_level_1,Unnamed: 61_level_1,Unnamed: 62_level_1,Unnamed: 63_level_1,Unnamed: 64_level_1,Unnamed: 65_level_1,Unnamed: 66_level_1,Unnamed: 67_level_1,Unnamed: 68_level_1,Unnamed: 69_level_1,Unnamed: 70_level_1,Unnamed: 71_level_1,Unnamed: 72_level_1,Unnamed: 73_level_1,Unnamed: 74_level_1,Unnamed: 75_level_1,Unnamed: 76_level_1,Unnamed: 77_level_1,Unnamed: 78_level_1,Unnamed: 79_level_1,Unnamed: 80_level_1,Unnamed: 81_level_1,Unnamed: 82_level_1,Unnamed: 83_level_1,Unnamed: 84_level_1,Unnamed: 85_level_1,Unnamed: 86_level_1,Unnamed: 87_level_1,Unnamed: 88_level_1,Unnamed: 89_level_1,Unnamed: 90_level_1,Unnamed: 91_level_1,Unnamed: 92_level_1,Unnamed: 93_level_1,Unnamed: 94_level_1,Unnamed: 95_level_1,Unnamed: 96_level_1,Unnamed: 97_level_1,Unnamed: 98_level_1,Unnamed: 99_level_1,Unnamed: 100_level_1,Unnamed: 101_level_1,Unnamed: 102_level_1,Unnamed: 103_level_1,Unnamed: 104_level_1,Unnamed: 105_level_1,Unnamed: 106_level_1,Unnamed: 107_level_1,Unnamed: 108_level_1,Unnamed: 109_level_1,Unnamed: 110_level_1,Unnamed: 111_level_1,Unnamed: 112_level_1,Unnamed: 113_level_1,Unnamed: 114_level_1,Unnamed: 115_level_1,Unnamed: 116_level_1,Unnamed: 117_level_1,Unnamed: 118_level_1,Unnamed: 119_level_1,Unnamed: 120_level_1,Unnamed: 121_level_1
100002,1,Cash loans,M,N,Y,0,202500.0,406597.5,24700.5,351000.0,Unaccompanied,Working,Secondary / secondary special,Single / not married,House / apartment,0.018801,-9461,-637,-3648.0,-2120,,1,1,0,1,1,0,Laborers,1.0,2,2,WEDNESDAY,10,0,0,0,0,0,0,Business Entity Type 3,0.083037,0.262949,0.139376,0.0247,0.0369,0.9722,0.6192,0.0143,0.0,0.069,0.0833,0.125,0.0369,0.0202,0.019,0.0,0.0,0.0252,0.0383,0.9722,0.6341,0.0144,0.0,0.069,0.0833,0.125,0.0377,0.022,0.0198,0.0,0.0,0.025,0.0369,0.9722,0.6243,0.0144,0.0,0.069,0.0833,0.125,0.0375,0.0205,0.0193,0.0,0.0,reg oper account,block of flats,0.0149,"Stone, brick",No,2.0,2.0,2.0,2.0,-1134.0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0.0,0.0,1.0
100003,0,Cash loans,F,N,N,0,270000.0,1293502.5,35698.5,1129500.0,Family,State servant,Higher education,Married,House / apartment,0.003541,-16765,-1188,-1186.0,-291,,1,1,0,1,1,0,Core staff,2.0,1,1,MONDAY,11,0,0,0,0,0,0,School,0.311267,0.622246,,0.0959,0.0529,0.9851,0.796,0.0605,0.08,0.0345,0.2917,0.3333,0.013,0.0773,0.0549,0.0039,0.0098,0.0924,0.0538,0.9851,0.804,0.0497,0.0806,0.0345,0.2917,0.3333,0.0128,0.079,0.0554,0.0,0.0,0.0968,0.0529,0.9851,0.7987,0.0608,0.08,0.0345,0.2917,0.3333,0.0132,0.0787,0.0558,0.0039,0.01,reg oper account,block of flats,0.0714,Block,No,1.0,0.0,1.0,0.0,-828.0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0
100004,0,Revolving loans,M,Y,Y,0,67500.0,135000.0,6750.0,135000.0,Unaccompanied,Working,Secondary / secondary special,Single / not married,House / apartment,0.010032,-19046,-225,-4260.0,-2531,26.0,1,1,1,1,1,0,Laborers,1.0,2,2,MONDAY,9,0,0,0,0,0,0,Government,,0.555912,0.729567,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0.0,0.0,0.0,0.0,-815.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0
100006,0,Cash loans,F,N,Y,0,135000.0,312682.5,29686.5,297000.0,Unaccompanied,Working,Secondary / secondary special,Civil marriage,House / apartment,0.008019,-19005,-3039,-9833.0,-2437,,1,1,0,1,0,0,Laborers,2.0,2,2,WEDNESDAY,17,0,0,0,0,0,0,Business Entity Type 3,,0.650442,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,2.0,0.0,2.0,0.0,-617.0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,,,,,,
100007,0,Cash loans,M,N,Y,0,121500.0,513000.0,21865.5,513000.0,Unaccompanied,Working,Secondary / secondary special,Single / not married,House / apartment,0.028663,-19932,-3038,-4311.0,-3458,,1,1,0,1,0,0,Core staff,1.0,2,2,THURSDAY,11,0,0,0,0,1,1,Religion,,0.322738,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0.0,0.0,0.0,0.0,-1106.0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0


In [5]:
df_train.TARGET.value_counts()

0    282686
1     24825
Name: TARGET, dtype: int64

In [6]:
# Select feature with high importance and expetation
data = df_train[['TARGET', 'EXT_SOURCE_1', 'EXT_SOURCE_2', 'EXT_SOURCE_3', 'FLAG_DOCUMENT_4', 'NAME_INCOME_TYPE', 
                 'NAME_EDUCATION_TYPE','DAYS_LAST_PHONE_CHANGE', 'AMT_REQ_CREDIT_BUREAU_MON', 'AMT_REQ_CREDIT_BUREAU_YEAR', 
                 'DAYS_EMPLOYED', 'OWN_CAR_AGE', 'REGION_RATING_CLIENT', 'REGION_RATING_CLIENT_W_CITY', 
                 'OBS_30_CNT_SOCIAL_CIRCLE', 'DEF_30_CNT_SOCIAL_CIRCLE', 'OBS_60_CNT_SOCIAL_CIRCLE','DEF_60_CNT_SOCIAL_CIRCLE', 
                 'DAYS_REGISTRATION', 'AMT_INCOME_TOTAL', 'AMT_CREDIT', 'AMT_ANNUITY', 'AMT_GOODS_PRICE', 'DAYS_BIRTH', 
                 'FLAG_CONT_MOBILE','FLAG_PHONE','FLAG_EMAIL', 'OCCUPATION_TYPE', 'CNT_FAM_MEMBERS', 
                 'AMT_REQ_CREDIT_BUREAU_HOUR', 'AMT_REQ_CREDIT_BUREAU_DAY', 'AMT_REQ_CREDIT_BUREAU_WEEK', 
                 'AMT_REQ_CREDIT_BUREAU_QRT']]

In [7]:
# Cleaning
data.drop(columns=['FLAG_DOCUMENT_4', 'FLAG_CONT_MOBILE', 'AMT_REQ_CREDIT_BUREAU_HOUR', 'AMT_REQ_CREDIT_BUREAU_DAY',
                  'AMT_REQ_CREDIT_BUREAU_WEEK', ], inplace=True)

In [8]:
# change to year
data['DAYS_LAST_PHONE_CHANGE'] = abs(data.DAYS_LAST_PHONE_CHANGE/365)
data['DAYS_EMPLOYED'] = abs(data.DAYS_EMPLOYED/365)
data['DAYS_REGISTRATION'] = abs(data.DAYS_REGISTRATION/365)
data['DAYS_BIRTH'] = abs(data.DAYS_BIRTH/365)

In [9]:
# remove outlier
idx = data.loc[data.DAYS_EMPLOYED >= 16].index.to_list()
data.drop(index=idx, inplace=True)

idx = data.loc[data.OWN_CAR_AGE >= 30].index.to_list()
data.drop(index=idx, inplace=True)

idx = data.loc[data.OBS_30_CNT_SOCIAL_CIRCLE >= 5].index.to_list()
data.drop(index=idx, inplace=True)


idx = data.loc[data.DAYS_REGISTRATION >= 40].index.to_list()
data.drop(index=idx, inplace=True)

idx = data.loc[data.AMT_INCOME_TOTAL >= 5000000].index.to_list()
data.drop(index=idx, inplace=True)

idx = data.loc[data.CNT_FAM_MEMBERS >= 10].index.to_list()
data.drop(index=idx, inplace=True)

idx = data.loc[data.AMT_REQ_CREDIT_BUREAU_QRT >= 7].index.to_list()
data.drop(index=idx, inplace=True)

In [10]:
data.shape

(205154, 28)

In [11]:
data.describe()

Unnamed: 0,TARGET,EXT_SOURCE_1,EXT_SOURCE_2,EXT_SOURCE_3,DAYS_LAST_PHONE_CHANGE,AMT_REQ_CREDIT_BUREAU_MON,AMT_REQ_CREDIT_BUREAU_YEAR,DAYS_EMPLOYED,OWN_CAR_AGE,REGION_RATING_CLIENT,REGION_RATING_CLIENT_W_CITY,OBS_30_CNT_SOCIAL_CIRCLE,DEF_30_CNT_SOCIAL_CIRCLE,OBS_60_CNT_SOCIAL_CIRCLE,DEF_60_CNT_SOCIAL_CIRCLE,DAYS_REGISTRATION,AMT_INCOME_TOTAL,AMT_CREDIT,AMT_ANNUITY,AMT_GOODS_PRICE,DAYS_BIRTH,FLAG_PHONE,FLAG_EMAIL,CNT_FAM_MEMBERS,AMT_REQ_CREDIT_BUREAU_QRT
count,205154.0,100427.0,204736.0,163003.0,205153.0,176538.0,176538.0,205154.0,75235.0,205154.0,205154.0,204391.0,204391.0,204391.0,204391.0,205154.0,205154.0,205154.0,205142.0,204936.0,205154.0,205154.0,205154.0,205152.0,176538.0
mean,0.089659,0.474011,0.5141484,0.495828,2.603724,0.276858,1.852491,5.045269,9.789074,2.038805,2.018547,0.822306,0.113063,0.811004,0.082748,12.306181,175953.6,605078.2,27778.364457,543690.0,39.706851,0.276612,0.065444,2.259193,0.260397
std,0.285694,0.20493,0.1907782,0.196403,2.24499,0.933864,1.823853,3.85692,6.53684,0.518614,0.512584,1.192876,0.367674,1.181971,0.308383,8.628387,103546.9,404288.5,14677.643337,371170.5,9.853474,0.447324,0.247307,0.946033,0.604853
min,0.0,0.014568,8.173617e-08,0.000527,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,25650.0,45000.0,1980.0,45000.0,21.021918,0.0,0.0,1.0,0.0
25%,0.0,0.310607,0.3951197,0.35234,0.767123,0.0,0.0,1.931507,4.0,2.0,2.0,0.0,0.0,0.0,0.0,4.852055,112500.0,273636.0,16983.0,243000.0,31.671233,0.0,0.0,2.0,0.0
50%,0.0,0.47068,0.5652522,0.515495,2.010959,0.0,1.0,4.043836,9.0,2.0,2.0,0.0,0.0,0.0,0.0,11.380822,157500.0,518562.0,25789.5,450000.0,38.969863,0.0,0.0,2.0,0.0
75%,0.0,0.635638,0.6628564,0.654529,4.246575,0.0,3.0,7.438356,14.0,2.0,2.0,1.0,0.0,1.0,0.0,18.410959,216000.0,814041.0,35581.5,688500.0,47.106849,1.0,0.0,3.0,0.0
max,1.0,0.94442,0.8549997,0.893976,11.758904,27.0,23.0,15.99726,29.0,3.0,3.0,4.0,4.0,4.0,4.0,39.989041,4500000.0,4050000.0,258025.5,4050000.0,68.953425,1.0,1.0,9.0,6.0


## Missing value

In [12]:
# Persentasi missing value pada data train
total = data.isnull().sum().sort_values(ascending=False)
percent = (data.isnull().sum()/data.isnull().count()).sort_values(ascending=False)
missing = pd.concat([total,percent], axis=1, keys=['Total', 'Percent'])
missing

Unnamed: 0,Total,Percent
OWN_CAR_AGE,129919,0.633275
EXT_SOURCE_1,104727,0.51048
EXT_SOURCE_3,42151,0.20546
OCCUPATION_TYPE,33452,0.163058
AMT_REQ_CREDIT_BUREAU_QRT,28616,0.139485
AMT_REQ_CREDIT_BUREAU_MON,28616,0.139485
AMT_REQ_CREDIT_BUREAU_YEAR,28616,0.139485
DEF_60_CNT_SOCIAL_CIRCLE,763,0.003719
DEF_30_CNT_SOCIAL_CIRCLE,763,0.003719
OBS_60_CNT_SOCIAL_CIRCLE,763,0.003719


In [13]:
# impute with zero 
data['OWN_CAR_AGE'] = data['OWN_CAR_AGE'].fillna(0) 
data['EXT_SOURCE_1'] = data['EXT_SOURCE_1'].fillna(0) 
data['EXT_SOURCE_2'] = data['EXT_SOURCE_2'].fillna(0) 
data['EXT_SOURCE_3'] = data['EXT_SOURCE_3'].fillna(0) 

# impute with mode
imp_modus = SimpleImputer(missing_values=np.nan, strategy='most_frequent')

data = pd.DataFrame(imp_modus.fit_transform(data), columns=data.columns)
data.isnull().sum()

TARGET                         0
EXT_SOURCE_1                   0
EXT_SOURCE_2                   0
EXT_SOURCE_3                   0
NAME_INCOME_TYPE               0
NAME_EDUCATION_TYPE            0
DAYS_LAST_PHONE_CHANGE         0
AMT_REQ_CREDIT_BUREAU_MON      0
AMT_REQ_CREDIT_BUREAU_YEAR     0
DAYS_EMPLOYED                  0
OWN_CAR_AGE                    0
REGION_RATING_CLIENT           0
REGION_RATING_CLIENT_W_CITY    0
OBS_30_CNT_SOCIAL_CIRCLE       0
DEF_30_CNT_SOCIAL_CIRCLE       0
OBS_60_CNT_SOCIAL_CIRCLE       0
DEF_60_CNT_SOCIAL_CIRCLE       0
DAYS_REGISTRATION              0
AMT_INCOME_TOTAL               0
AMT_CREDIT                     0
AMT_ANNUITY                    0
AMT_GOODS_PRICE                0
DAYS_BIRTH                     0
FLAG_PHONE                     0
FLAG_EMAIL                     0
OCCUPATION_TYPE                0
CNT_FAM_MEMBERS                0
AMT_REQ_CREDIT_BUREAU_QRT      0
dtype: int64

In [14]:
data.shape

(205154, 28)

In [15]:
data.TARGET.value_counts()

0    186760
1     18394
Name: TARGET, dtype: int64

## Feature encoding (Cat to Num)

In [16]:
scale_mapper = {"Lower secondary":1, "Secondary / secondary special":2, "Incomplete higher":3, "Higher education":4,
               "Academic degree":5}

data.NAME_EDUCATION_TYPE = data.NAME_EDUCATION_TYPE.replace(scale_mapper)

In [17]:
data = pd.concat([data, pd.get_dummies(data.OCCUPATION_TYPE)], 1)
data = pd.concat([data, pd.get_dummies(data.NAME_INCOME_TYPE)], 1)

In [18]:
data.drop(columns=['OCCUPATION_TYPE', 'NAME_INCOME_TYPE'], inplace=True)

In [19]:
data = data.astype('float64')

### Scaling

In [20]:
from sklearn.preprocessing import StandardScaler

In [21]:
scaler = StandardScaler()

In [22]:
X = pd.DataFrame(scaler.fit_transform(data.drop(columns=['TARGET'])), columns=data.drop(columns=['TARGET']).columns)
y = data.TARGET

In [23]:
X.shape, y.shape

((205154, 50), (205154,))

In [24]:
y.unique()

array([1., 0.])

## Undersampling

In [25]:
undersample = RandomUnderSampler(sampling_strategy='majority')

In [26]:
X_over, y_over = undersample.fit_resample(X, y)
X_over.shape, y_over.shape

((36788, 50), (36788,))

In [27]:
y_over.value_counts()

1.0    18394
0.0    18394
Name: TARGET, dtype: int64

## Dataset Split

In [43]:
X_train, X_test, y_train, y_test = train_test_split(X_over, y_over, test_size=0.2, stratify=y_over, random_state=42)
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((29430, 50), (7358, 50), (29430,), (7358,))

## Train model

In [44]:
from xgboost import XGBClassifier
model = XGBClassifier()

In [45]:
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

In [46]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

         0.0       0.69      0.66      0.68      3679
         1.0       0.68      0.70      0.69      3679

    accuracy                           0.68      7358
   macro avg       0.68      0.68      0.68      7358
weighted avg       0.68      0.68      0.68      7358



In [47]:
print(accuracy_score(y_test, y_pred))

0.6819787985865724


In [48]:
from sklearn.neighbors import KNeighborsClassifier
model = KNeighborsClassifier()

In [49]:
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

In [50]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

         0.0       0.61      0.56      0.58      3679
         1.0       0.59      0.65      0.62      3679

    accuracy                           0.60      7358
   macro avg       0.60      0.60      0.60      7358
weighted avg       0.60      0.60      0.60      7358



# Tuning

In [57]:
algo__n_neighbors = [ 1,  3,  5,  7,  9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29]
algo__weights = ['uniform', 'distance']
algo__p = [1, 1.5, 2]

hyperparameters = dict(n_neighbors=algo__n_neighbors, weights=algo__weights, p=algo__p)
hyperparameters

{'n_neighbors': [1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29],
 'weights': ['uniform', 'distance'],
 'p': [1, 1.5, 2]}

In [58]:
knn = KNeighborsClassifier()

clf = RandomizedSearchCV(knn, hyperparameters, cv=5, verbose=2, n_jobs=-1)

In [59]:
#Fitting Model
best_model = clf.fit(X_train,y_train)

Fitting 5 folds for each of 10 candidates, totalling 50 fits


In [60]:
#Nilai hyperparameters terbaik
print('Best Penalty:', best_model.best_estimator_.get_params())

#Prediksi menggunakan model baru
y_pred = best_model.predict(X_test)

#Check performa dari model
print(classification_report(y_test, y_pred))
roc_auc_score(y_test, y_pred)

Best Penalty: {'algorithm': 'auto', 'leaf_size': 30, 'metric': 'minkowski', 'metric_params': None, 'n_jobs': None, 'n_neighbors': 23, 'p': 1.5, 'weights': 'uniform'}
              precision    recall  f1-score   support

         0.0       0.65      0.56      0.60      3679
         1.0       0.61      0.70      0.65      3679

    accuracy                           0.63      7358
   macro avg       0.63      0.63      0.63      7358
weighted avg       0.63      0.63      0.63      7358



0.6295188910029899