## Importing Libraries

In [1]:
import pandas as pd
import numpy as np

## Importing Datasets

In [2]:
# Read both raw CSV tables (paths are relative to the working directory).
APPLICATION_CSV = 'application_record.csv'
CREDIT_CSV = 'credit_record.csv'
application_record, credit_record = (
    pd.read_csv(csv_path) for csv_path in (APPLICATION_CSV, CREDIT_CSV)
)

In [3]:
# Preview the applicant table (one row per application ID; pandas truncates the middle rows).
application_record

Unnamed: 0,ID,CODE_GENDER,FLAG_OWN_CAR,FLAG_OWN_REALTY,CNT_CHILDREN,AMT_INCOME_TOTAL,NAME_INCOME_TYPE,NAME_EDUCATION_TYPE,NAME_FAMILY_STATUS,NAME_HOUSING_TYPE,DAYS_BIRTH,DAYS_EMPLOYED,FLAG_MOBIL,FLAG_WORK_PHONE,FLAG_PHONE,FLAG_EMAIL,OCCUPATION_TYPE,CNT_FAM_MEMBERS
0,5008804,M,Y,Y,0,427500.0,Working,Higher education,Civil marriage,Rented apartment,-12005.0,-4542.0,1.0,1.0,0.0,0.0,,2.0
1,5008805,M,Y,Y,0,427500.0,Working,Higher education,Civil marriage,Rented apartment,-12005.0,-4542.0,1.0,1.0,0.0,0.0,,2.0
2,5008806,M,Y,Y,0,112500.0,Working,Secondary / secondary special,Married,House / apartment,-21474.0,-1134.0,1.0,0.0,0.0,0.0,Security staff,2.0
3,5008808,F,N,Y,0,270000.0,Commercial associate,Secondary / secondary special,Single / not married,House / apartment,-19110.0,-3051.0,1.0,0.0,1.0,1.0,Sales staff,1.0
4,5008809,F,N,Y,0,270000.0,Commercial associate,Secondary / secondary special,Single / not married,House / apartment,-19110.0,-3051.0,1.0,0.0,1.0,1.0,Sales staff,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
16895,5048267,F,N,Y,0,450000.0,Working,Secondary / secondary special,Married,House / apartment,-21480.0,-4660.0,1.0,1.0,0.0,0.0,,2.0
16896,5048268,F,N,Y,0,450000.0,Working,Secondary / secondary special,Married,House / apartment,-21480.0,-4660.0,1.0,1.0,0.0,0.0,,2.0
16897,5048269,F,N,Y,0,450000.0,Working,Secondary / secondary special,Married,House / apartment,-21480.0,-4660.0,1.0,1.0,0.0,0.0,,2.0
16898,5685812,F,N,Y,0,450000.0,Working,Secondary / secondary special,Married,House / apartment,-21480.0,-4660.0,1.0,1.0,0.0,0.0,,2.0


In [4]:
# Preview the credit history table: several rows per ID, one per MONTHS_BALANCE value.
credit_record

Unnamed: 0,ID,MONTHS_BALANCE,STATUS
0,5001711,0,X
1,5001711,-1,0
2,5001711,-2,0
3,5001711,-3,0
4,5001712,0,C
...,...,...,...
143189,5011572,-1,X
143190,5011572,-2,X
143191,5011572,-3,X
143192,5011572,-4,X


## Creating The Target Column

In [5]:
# 'X' and 'C' are considered good-standing statuses, so both are mapped to 0.
GOOD_STATUS_CODES = ['X', 'C']
credit_record.replace(to_replace=GOOD_STATUS_CODES, value=0, inplace=True)

In [6]:
# Cast STATUS to a numeric dtype so it can be compared against integers below.
credit_record['STATUS'] = pd.to_numeric(credit_record['STATUS'])

In [7]:
# Collect the ID of every credit record whose STATUS is non-zero (a late month).
# The list holds one entry per late record, so an ID repeats when a client was
# late in several months.
# A boolean mask replaces the row-by-row loop: it does not depend on the index
# being 0..n-1 (as `credit_record.STATUS[i]` did) and avoids a Python-level
# lookup for every row.
late_mask = credit_record['STATUS'] != 0
drop_ls = credit_record.loc[late_mask, 'ID'].tolist()

In [8]:
# Number of late records collected; IDs repeat in drop_ls, so this is not a count of distinct clients.
len(drop_ls)

1391

In [9]:
# Mark every record of a client that has at least one late month with STATUS = 1,
# so the label describes the client rather than a single month.
# `.loc` with a boolean mask writes into credit_record itself; the previous
# chained form `credit_record.STATUS[i] = 1` triggers SettingWithCopyWarning and
# does not update the frame under pandas copy-on-write. `isin` also replaces a
# linear `in drop_ls` list scan per row.
has_late_month = credit_record['ID'].isin(drop_ls)
credit_record.loc[has_late_month, 'STATUS'] = 1

In [10]:
# Count rows (monthly records, not clients) per STATUS value after the relabelling.
credit_record.STATUS.value_counts()

0    124726
1     18468
Name: STATUS, dtype: int64

In [11]:
# Discard rows that are exact duplicates across all columns, then show the table.
credit_record = credit_record.drop_duplicates()
credit_record

Unnamed: 0,ID,MONTHS_BALANCE,STATUS
0,5001711,0,0
1,5001711,-1,0
2,5001711,-2,0
3,5001711,-3,0
4,5001712,0,0
...,...,...,...
143189,5011572,-1,0
143190,5011572,-2,0
143191,5011572,-3,0
143192,5011572,-4,0


## Merging Datasets

In [12]:
# NOTE: len(<frame>.ID) is the number of rows, not the number of distinct IDs;
# credit_record holds several rows per ID (one per MONTHS_BALANCE value).
print(f'No. of IDs in application_record = {len(application_record.ID)} No. of IDs in credit_record = {len(credit_record.ID)}')

No. of IDs in application_record = 16900 No. of IDs in credit_record = 143194


In [13]:
# Inner join on ID: only IDs present in both tables are kept, and each applicant
# row is repeated once for every matching credit_record row.
dataset = pd.merge(application_record, credit_record, how='inner', on='ID')

In [14]:
# ID was only needed as the join key; remove it so it is not used as a feature.
dataset = dataset.drop(columns=['ID'])

In [15]:
# Count rows that exactly repeat an earlier row across all remaining columns.
# With ID already dropped, records of different IDs with identical values also count as duplicates.
dataset.duplicated().sum()

21337

In [16]:
# Keep the first occurrence of each distinct row; the original index labels are retained.
dataset = dataset.drop_duplicates()

In [17]:
# Display the merged, de-duplicated frame (the index is not reset, so labels have gaps).
dataset

Unnamed: 0,CODE_GENDER,FLAG_OWN_CAR,FLAG_OWN_REALTY,CNT_CHILDREN,AMT_INCOME_TOTAL,NAME_INCOME_TYPE,NAME_EDUCATION_TYPE,NAME_FAMILY_STATUS,NAME_HOUSING_TYPE,DAYS_BIRTH,DAYS_EMPLOYED,FLAG_MOBIL,FLAG_WORK_PHONE,FLAG_PHONE,FLAG_EMAIL,OCCUPATION_TYPE,CNT_FAM_MEMBERS,MONTHS_BALANCE,STATUS
0,M,Y,Y,0,427500.0,Working,Higher education,Civil marriage,Rented apartment,-12005.0,-4542.0,1.0,1.0,0.0,0.0,,2.0,0,1
1,M,Y,Y,0,427500.0,Working,Higher education,Civil marriage,Rented apartment,-12005.0,-4542.0,1.0,1.0,0.0,0.0,,2.0,-1,1
2,M,Y,Y,0,427500.0,Working,Higher education,Civil marriage,Rented apartment,-12005.0,-4542.0,1.0,1.0,0.0,0.0,,2.0,-2,1
3,M,Y,Y,0,427500.0,Working,Higher education,Civil marriage,Rented apartment,-12005.0,-4542.0,1.0,1.0,0.0,0.0,,2.0,-3,1
4,M,Y,Y,0,427500.0,Working,Higher education,Civil marriage,Rented apartment,-12005.0,-4542.0,1.0,1.0,0.0,0.0,,2.0,-4,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
37509,F,N,Y,0,315000.0,Pensioner,Secondary / secondary special,Married,Municipal apartment,-20298.0,365243.0,1.0,0.0,0.0,0.0,,2.0,-41,0
37604,F,N,Y,0,315000.0,Pensioner,Secondary / secondary special,Married,Municipal apartment,-20298.0,365243.0,1.0,0.0,0.0,0.0,,2.0,-42,0
37605,F,N,Y,0,315000.0,Pensioner,Secondary / secondary special,Married,Municipal apartment,-20298.0,365243.0,1.0,0.0,0.0,0.0,,2.0,-43,0
37606,F,N,Y,0,315000.0,Pensioner,Secondary / secondary special,Married,Municipal apartment,-20298.0,365243.0,1.0,0.0,0.0,0.0,,2.0,-44,0


## Getting Data's Information and Description

In [18]:
# Column dtypes and non-null counts; a non-null count below the row total marks missing data.
dataset.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 16271 entries, 0 to 37607
Data columns (total 19 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   CODE_GENDER          16271 non-null  object 
 1   FLAG_OWN_CAR         16271 non-null  object 
 2   FLAG_OWN_REALTY      16271 non-null  object 
 3   CNT_CHILDREN         16271 non-null  int64  
 4   AMT_INCOME_TOTAL     16271 non-null  float64
 5   NAME_INCOME_TYPE     16271 non-null  object 
 6   NAME_EDUCATION_TYPE  16271 non-null  object 
 7   NAME_FAMILY_STATUS   16271 non-null  object 
 8   NAME_HOUSING_TYPE    16271 non-null  object 
 9   DAYS_BIRTH           16271 non-null  float64
 10  DAYS_EMPLOYED        16271 non-null  float64
 11  FLAG_MOBIL           16271 non-null  float64
 12  FLAG_WORK_PHONE      16271 non-null  float64
 13  FLAG_PHONE           16271 non-null  float64
 14  FLAG_EMAIL           16271 non-null  float64
 15  OCCUPATION_TYPE      11707 non-null 

In [19]:
# Summary statistics for the numeric columns.
# NOTE(review): DAYS_EMPLOYED has a maximum of 365243 while its quartiles are negative —
# this looks like a placeholder value rather than a real duration; confirm before modelling.
dataset.describe()

Unnamed: 0,CNT_CHILDREN,AMT_INCOME_TOTAL,DAYS_BIRTH,DAYS_EMPLOYED,FLAG_MOBIL,FLAG_WORK_PHONE,FLAG_PHONE,FLAG_EMAIL,CNT_FAM_MEMBERS,MONTHS_BALANCE,STATUS
count,16271.0,16271.0,16271.0,16271.0,16271.0,16271.0,16271.0,16271.0,16271.0,16271.0,16271.0
mean,0.364022,195110.6,-15980.98316,54429.178539,1.0,0.240182,0.333907,0.201524,2.180874,-21.733083,0.189171
std,0.675814,120932.9,4127.718476,133451.636059,0.0,0.427207,0.471621,0.401151,0.830825,14.971417,0.391656
min,0.0,40500.0,-23768.0,-10936.0,1.0,0.0,0.0,0.0,1.0,-60.0,0.0
25%,0.0,121500.0,-19692.0,-3386.0,1.0,0.0,0.0,0.0,2.0,-33.0,0.0
50%,0.0,157500.0,-15519.0,-1773.0,1.0,0.0,0.0,0.0,2.0,-20.0,0.0
75%,1.0,238500.0,-12500.0,-556.0,1.0,0.0,1.0,0.0,2.0,-9.0,0.0
max,3.0,1350000.0,-7489.0,365243.0,1.0,1.0,1.0,1.0,5.0,0.0,1.0


In [20]:
# Number of missing values in each column.
dataset.isna().sum()

CODE_GENDER               0
FLAG_OWN_CAR              0
FLAG_OWN_REALTY           0
CNT_CHILDREN              0
AMT_INCOME_TOTAL          0
NAME_INCOME_TYPE          0
NAME_EDUCATION_TYPE       0
NAME_FAMILY_STATUS        0
NAME_HOUSING_TYPE         0
DAYS_BIRTH                0
DAYS_EMPLOYED             0
FLAG_MOBIL                0
FLAG_WORK_PHONE           0
FLAG_PHONE                0
FLAG_EMAIL                0
OCCUPATION_TYPE        4564
CNT_FAM_MEMBERS           0
MONTHS_BALANCE            0
STATUS                    0
dtype: int64

In [21]:
# Total number of missing cells across the whole frame.
dataset.isna().sum().sum()

4564

### Displaying Column "OCCUPATION_TYPE" with NULL Values

In [22]:
# Inspect the raw OCCUPATION_TYPE column, the only column with missing values.
dataset.OCCUPATION_TYPE

0        NaN
1        NaN
2        NaN
3        NaN
4        NaN
        ... 
37509    NaN
37604    NaN
37605    NaN
37606    NaN
37607    NaN
Name: OCCUPATION_TYPE, Length: 16271, dtype: object

In [23]:
# Frequency of each occupation; NaN entries are excluded from these counts.
dataset.OCCUPATION_TYPE.value_counts()

Laborers                 2380
Sales staff              2009
Managers                 1577
Core staff               1490
Drivers                  1368
Accountants               577
High skill tech staff     518
Medicine staff            442
Security staff            415
Cleaning staff            275
Cooking staff             268
Secretaries               154
Low-skill Laborers         99
Waiters/barmen staff       69
HR staff                   47
Private service staff      19
Name: OCCUPATION_TYPE, dtype: int64

### Replacing Null Values in 'OCCUPATION_TYPE' with 'Other'

In [24]:
# Fill missing occupations with the explicit category 'Other'.
# The result is assigned back to the column instead of calling an inplace method
# on `dataset.OCCUPATION_TYPE`: the chained-inplace form may act on a temporary
# copy (pandas warns about it and it is a no-op under copy-on-write), whereas
# this assignment always updates `dataset` itself.
dataset['OCCUPATION_TYPE'] = dataset['OCCUPATION_TYPE'].fillna('Other')

In [25]:
# Re-check the frequencies: the formerly missing entries should now appear under 'Other'.
dataset.OCCUPATION_TYPE.value_counts()

Other                    4564
Laborers                 2380
Sales staff              2009
Managers                 1577
Core staff               1490
Drivers                  1368
Accountants               577
High skill tech staff     518
Medicine staff            442
Security staff            415
Cleaning staff            275
Cooking staff             268
Secretaries               154
Low-skill Laborers         99
Waiters/barmen staff       69
HR staff                   47
Private service staff      19
Name: OCCUPATION_TYPE, dtype: int64

## Label Encoding

In [26]:
# List column dtypes; the object (string) columns are the ones that need encoding.
dataset.dtypes

CODE_GENDER             object
FLAG_OWN_CAR            object
FLAG_OWN_REALTY         object
CNT_CHILDREN             int64
AMT_INCOME_TOTAL       float64
NAME_INCOME_TYPE        object
NAME_EDUCATION_TYPE     object
NAME_FAMILY_STATUS      object
NAME_HOUSING_TYPE       object
DAYS_BIRTH             float64
DAYS_EMPLOYED          float64
FLAG_MOBIL             float64
FLAG_WORK_PHONE        float64
FLAG_PHONE             float64
FLAG_EMAIL             float64
OCCUPATION_TYPE         object
CNT_FAM_MEMBERS        float64
MONTHS_BALANCE           int64
STATUS                   int64
dtype: object

In [27]:
# Confirm that no column has missing values left before encoding.
dataset.isnull().sum()

CODE_GENDER            0
FLAG_OWN_CAR           0
FLAG_OWN_REALTY        0
CNT_CHILDREN           0
AMT_INCOME_TOTAL       0
NAME_INCOME_TYPE       0
NAME_EDUCATION_TYPE    0
NAME_FAMILY_STATUS     0
NAME_HOUSING_TYPE      0
DAYS_BIRTH             0
DAYS_EMPLOYED          0
FLAG_MOBIL             0
FLAG_WORK_PHONE        0
FLAG_PHONE             0
FLAG_EMAIL             0
OCCUPATION_TYPE        0
CNT_FAM_MEMBERS        0
MONTHS_BALANCE         0
STATUS                 0
dtype: int64

In [32]:
# First rows before encoding, for comparison with the encoded frame.
dataset.head()

Unnamed: 0,CODE_GENDER,FLAG_OWN_CAR,FLAG_OWN_REALTY,CNT_CHILDREN,AMT_INCOME_TOTAL,NAME_INCOME_TYPE,NAME_EDUCATION_TYPE,NAME_FAMILY_STATUS,NAME_HOUSING_TYPE,DAYS_BIRTH,DAYS_EMPLOYED,FLAG_MOBIL,FLAG_WORK_PHONE,FLAG_PHONE,FLAG_EMAIL,OCCUPATION_TYPE,CNT_FAM_MEMBERS,MONTHS_BALANCE,STATUS
0,M,Y,Y,0,427500.0,Working,Higher education,Civil marriage,Rented apartment,-12005.0,-4542.0,1.0,1.0,0.0,0.0,Other,2.0,0,1
1,M,Y,Y,0,427500.0,Working,Higher education,Civil marriage,Rented apartment,-12005.0,-4542.0,1.0,1.0,0.0,0.0,Other,2.0,-1,1
2,M,Y,Y,0,427500.0,Working,Higher education,Civil marriage,Rented apartment,-12005.0,-4542.0,1.0,1.0,0.0,0.0,Other,2.0,-2,1
3,M,Y,Y,0,427500.0,Working,Higher education,Civil marriage,Rented apartment,-12005.0,-4542.0,1.0,1.0,0.0,0.0,Other,2.0,-3,1
4,M,Y,Y,0,427500.0,Working,Higher education,Civil marriage,Rented apartment,-12005.0,-4542.0,1.0,1.0,0.0,0.0,Other,2.0,-4,1


In [33]:
from sklearn.preprocessing import LabelEncoder
# Encoder that maps each distinct label of a column to an integer code.
le = LabelEncoder()

In [34]:
# Integer-encode every string column.
# A fresh LabelEncoder is fitted per column and stored in `label_encoders`, so
# each column's label <-> code mapping stays available afterwards (for example
# via `label_encoders[col].classes_` or `.inverse_transform`). Re-fitting one
# shared encoder kept only the mapping of the last column.
# `le` is still bound to the OCCUPATION_TYPE encoder after the loop, as before.
# NOTE(review): the integer codes impose an arbitrary order on nominal
# categories; distance-based models will treat that order as meaningful.
CATEGORICAL_COLUMNS = [
    'CODE_GENDER',
    'FLAG_OWN_CAR',
    'FLAG_OWN_REALTY',
    'NAME_INCOME_TYPE',
    'NAME_EDUCATION_TYPE',
    'NAME_FAMILY_STATUS',
    'NAME_HOUSING_TYPE',
    'OCCUPATION_TYPE',
]
label_encoders = {}
for column in CATEGORICAL_COLUMNS:
    le = LabelEncoder()
    dataset[column] = le.fit_transform(dataset[column])
    label_encoders[column] = le

In [35]:
# First rows after encoding: the former string columns now hold integer codes.
dataset.head()

Unnamed: 0,CODE_GENDER,FLAG_OWN_CAR,FLAG_OWN_REALTY,CNT_CHILDREN,AMT_INCOME_TOTAL,NAME_INCOME_TYPE,NAME_EDUCATION_TYPE,NAME_FAMILY_STATUS,NAME_HOUSING_TYPE,DAYS_BIRTH,DAYS_EMPLOYED,FLAG_MOBIL,FLAG_WORK_PHONE,FLAG_PHONE,FLAG_EMAIL,OCCUPATION_TYPE,CNT_FAM_MEMBERS,MONTHS_BALANCE,STATUS
0,1,1,1,0,427500.0,4,1,0,4,-12005.0,-4542.0,1.0,1.0,0.0,0.0,11,2.0,0,1
1,1,1,1,0,427500.0,4,1,0,4,-12005.0,-4542.0,1.0,1.0,0.0,0.0,11,2.0,-1,1
2,1,1,1,0,427500.0,4,1,0,4,-12005.0,-4542.0,1.0,1.0,0.0,0.0,11,2.0,-2,1
3,1,1,1,0,427500.0,4,1,0,4,-12005.0,-4542.0,1.0,1.0,0.0,0.0,11,2.0,-3,1
4,1,1,1,0,427500.0,4,1,0,4,-12005.0,-4542.0,1.0,1.0,0.0,0.0,11,2.0,-4,1


## Splitting the dataset

In [36]:
# Import necessary libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, confusion_matrix

# Features are every column except the target; both are converted to NumPy arrays.
TARGET = 'STATUS'
X = dataset.drop(columns=TARGET).to_numpy()
y = dataset[TARGET].to_numpy()

# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable.
# NOTE(review): the frame holds several rows per client (one per MONTHS_BALANCE)
# that share the same STATUS, so a row-level random split can place the same
# client in both train and test — confirm whether a per-client split is needed.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

In [38]:
# Fit a k-nearest-neighbours classifier on the training split.
# NOTE(review): the features are not scaled (AMT_INCOME_TOTAL is in the 1e5 range
# while the flag columns are 0/1), so distances are dominated by the
# large-magnitude columns — consider standardizing before KNN.
N_NEIGHBORS = 5
knn = KNeighborsClassifier(n_neighbors=N_NEIGHBORS)
knn.fit(X_train, y_train)

In [39]:
# Predict STATUS labels for the held-out test rows with the fitted KNN model
y_pred = knn.predict(X_test)

In [40]:
# Share of test rows whose predicted STATUS matches the true STATUS.
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print("Accuracy: ", accuracy)

Accuracy:  0.780952380952381


In [41]:
# Confusion matrix: rows are the true classes, columns the predicted classes.
# NOTE(review): in the recorded run 2601 of the 3255 test rows are class 0, so
# always predicting 0 would score about 0.799 — above the reported accuracy.
# Accuracy alone is a weak metric for this imbalanced target.
confusion_mat = confusion_matrix(y_true=y_test, y_pred=y_pred)
print("Confusion Matrix: \n", confusion_mat)

Confusion Matrix: 
 [[2246  355]
 [ 358  296]]


In [42]:
from sklearn.model_selection import GridSearchCV

In [48]:
# Search space for the grid search: odd neighbour counts from 5 to 15 and two
# distance metrics (6 x 2 = 12 candidate combinations).
grid_params = {
    'n_neighbors': list(range(5, 16, 2)),
    'metric': ['euclidean', 'manhattan'],
}


In [49]:
# Exhaustive search over grid_params with 3-fold cross-validation, using all CPU cores.
gs = GridSearchCV(
    estimator=KNeighborsClassifier(),
    param_grid=grid_params,
    cv=3,
    n_jobs=-1,
    verbose=1,
)

In [50]:
# Run the grid search on the training split only; the test split is not used here
g_res = gs.fit(X_train, y_train)

Fitting 3 folds for each of 12 candidates, totalling 36 fits


In [51]:
# Mean cross-validated score of the best parameter combination.
# This is measured on folds of the training split, so it is not the same quantity as test accuracy.
g_res.best_score_

0.8359710869281023

In [52]:
# Hyperparameter combination from grid_params that achieved the best cross-validated score
g_res.best_params_

{'metric': 'euclidean', 'n_neighbors': 13}