In [1]:
%matplotlib inline

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.naive_bayes import GaussianNB
from scipy import stats

In [2]:
col_desc = pd.read_csv("../src/columns_description.csv",sep=",", encoding='cp1252', index_col=0)
col_desc.head()

Unnamed: 0,Table,Row,Description,Special
1,application_data,SK_ID_CURR,ID of loan in our sample,
2,application_data,TARGET,Target variable (1 - client with payment diffi...,
5,application_data,NAME_CONTRACT_TYPE,Identification if loan is cash or revolving,
6,application_data,CODE_GENDER,Gender of the client,
7,application_data,FLAG_OWN_CAR,Flag if the client owns a car,


# Column description of application data

In [3]:
col_desc_app = col_desc[col_desc['Table'] == "application_data"]
col_desc_app = col_desc_app.drop("Table", axis=1)
col_desc_app.set_index("Row", inplace=True)
col_desc_app

Unnamed: 0_level_0,Description,Special
Row,Unnamed: 1_level_1,Unnamed: 2_level_1
SK_ID_CURR,ID of loan in our sample,
TARGET,Target variable (1 - client with payment diffi...,
NAME_CONTRACT_TYPE,Identification if loan is cash or revolving,
CODE_GENDER,Gender of the client,
FLAG_OWN_CAR,Flag if the client owns a car,
...,...,...
AMT_REQ_CREDIT_BUREAU_DAY,Number of enquiries to Credit Bureau about the...,
AMT_REQ_CREDIT_BUREAU_WEEK,Number of enquiries to Credit Bureau about the...,
AMT_REQ_CREDIT_BUREAU_MON,Number of enquiries to Credit Bureau about the...,
AMT_REQ_CREDIT_BUREAU_QRT,Number of enquiries to Credit Bureau about the...,


In [4]:
# use to get full description
col_desc_app.loc['AMT_REQ_CREDIT_BUREAU_HOUR']['Description']

'Number of enquiries to Credit Bureau about the client one hour before application'

# Application Data

In [5]:
app_data = pd.read_csv("../src/application_data.csv",sep=",", encoding='cp1252', index_col=0)
app_data

Unnamed: 0_level_0,TARGET,NAME_CONTRACT_TYPE,CODE_GENDER,FLAG_OWN_CAR,FLAG_OWN_REALTY,CNT_CHILDREN,AMT_INCOME_TOTAL,AMT_CREDIT,AMT_ANNUITY,AMT_GOODS_PRICE,...,FLAG_DOCUMENT_18,FLAG_DOCUMENT_19,FLAG_DOCUMENT_20,FLAG_DOCUMENT_21,AMT_REQ_CREDIT_BUREAU_HOUR,AMT_REQ_CREDIT_BUREAU_DAY,AMT_REQ_CREDIT_BUREAU_WEEK,AMT_REQ_CREDIT_BUREAU_MON,AMT_REQ_CREDIT_BUREAU_QRT,AMT_REQ_CREDIT_BUREAU_YEAR
SK_ID_CURR,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
100002,1,Cash loans,M,N,Y,0,202500.0,406597.5,24700.5,351000.0,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,1.0
100003,0,Cash loans,F,N,N,0,270000.0,1293502.5,35698.5,1129500.0,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0
100004,0,Revolving loans,M,Y,Y,0,67500.0,135000.0,6750.0,135000.0,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0
100006,0,Cash loans,F,N,Y,0,135000.0,312682.5,29686.5,297000.0,...,0,0,0,0,,,,,,
100007,0,Cash loans,M,N,Y,0,121500.0,513000.0,21865.5,513000.0,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
456251,0,Cash loans,M,N,N,0,157500.0,254700.0,27558.0,225000.0,...,0,0,0,0,,,,,,
456252,0,Cash loans,F,N,Y,0,72000.0,269550.0,12001.5,225000.0,...,0,0,0,0,,,,,,
456253,0,Cash loans,F,N,Y,0,153000.0,677664.0,29979.0,585000.0,...,0,0,0,0,1.0,0.0,0.0,1.0,0.0,1.0
456254,1,Cash loans,F,N,Y,0,171000.0,370107.0,20205.0,319500.0,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0


### Columns NaNs where they might be an actual value
* AMT_ANNUITY (12) might be 0
* OWN_CAR_AGE (202929) might not own car
* CNT_FAM_MEMBERS (2) might be 0
* APARTMENTS_AVG (156061) might not own apt
* BASEMENTAREA_AVG (179943) might not own basement
* YEARS_BEGINEXPLUATATION_AVG (150007) might not own apt
* YEARS_BUILD_AVG (204488) might not own apt
* COMMONAREA_AVG 214865 might not have commonarea
* ELEVATORS_AVG 163891 might be homeless
* ENTRANCES_AVG 154828 might be homeless
* FLOORSMAX_AVG 153020
* FLOORSMIN_AVG 208642
* LANDAREA_AVG 182590
* LIVINGAPARTMENTS_AVG 210199
* LIVINGAREA_AVG 154350
* NONLIVINGAPARTMENTS_AVG 213514
* NONLIVINGAREA_AVG 169682
* APARTMENTS_MODE 156061
* BASEMENTAREA_MODE 179943
* YEARS_BEGINEXPLUATATION_MODE 150007
* YEARS_BUILD_MODE 204488
* COMMONAREA_MODE 214865
* ELEVATORS_MODE 163891
* ENTRANCES_MODE 154828
* FLOORSMAX_MODE 153020
* FLOORSMIN_MODE 208642
* LANDAREA_MODE 182590
* LIVINGAPARTMENTS_MODE 210199
* LIVINGAREA_MODE 154350
* NONLIVINGAPARTMENTS_MODE 213514
* NONLIVINGAREA_MODE 169682
* APARTMENTS_MEDI 156061
* BASEMENTAREA_MEDI 179943
* YEARS_BEGINEXPLUATATION_MEDI 150007
* YEARS_BUILD_MEDI 204488
* COMMONAREA_MEDI 214865
* ELEVATORS_MEDI 163891
* ENTRANCES_MEDI 154828
* FLOORSMAX_MEDI 153020
* FLOORSMIN_MEDI 208642
* LANDAREA_MEDI 182590
* LIVINGAPARTMENTS_MEDI 210199
* LIVINGAREA_MEDI 154350
* NONLIVINGAPARTMENTS_MEDI 213514
* NONLIVINGAREA_MEDI 169682
* FONDKAPREMONT_MODE 210295
* HOUSETYPE_MODE 154297
* TOTALAREA_MODE 148431
* WALLSMATERIAL_MODE 156341
* EMERGENCYSTATE_MODE 145755
* OBS_30_CNT_SOCIAL_CIRCLE 1021 ?
* DEF_30_CNT_SOCIAL_CIRCLE 1021 ?
* OBS_60_CNT_SOCIAL_CIRCLE 1021 ?
* DEF_60_CNT_SOCIAL_CIRCLE 1021 ?
* DAYS_LAST_PHONE_CHANGE 1 ?
* AMT_REQ_CREDIT_BUREAU_HOUR 41519 ?
* AMT_REQ_CREDIT_BUREAU_DAY 41519 ?
* AMT_REQ_CREDIT_BUREAU_WEEK 41519 ?
* AMT_REQ_CREDIT_BUREAU_MON 41519 ?
* AMT_REQ_CREDIT_BUREAU_QRT 41519 ?
* AMT_REQ_CREDIT_BUREAU_YEAR 41519 ?

### Useless columns
* EXT_SOURCE_1 unknown what it means
* EXT_SOURCE_2 -
* EXT_SOURCE_3 -

In [None]:
# Misschien kunnen we dit gebruiken, ook om een simpele basecase te vinden
columns = app_data.columns.tolist()[1:]
sumErr = 0
for i in range(len(columns)):
    col = columns[i]

    bubble_data = app_data.groupby(['TARGET', col]).size().reset_index(name='Frequency')
    bubble_size = bubble_data['Frequency'] * 0.2

    plt.figure(figsize=(10, 6))
    plt.scatter(bubble_data['TARGET'], bubble_data[col], s=bubble_size, alpha=0.5)

    plt.xlabel('TARGET')
    plt.ylabel(col)
    plt.title(f'Bubble Chart of TARGET vs {col} (Frequency)')
    plt.show()

# Column Description of Previous_data

In [6]:
col_desc_prev = col_desc[col_desc['Table'] == "application_data"]
col_desc_prev = col_desc_prev.drop("Table", axis=1)
col_desc_prev.set_index("Row", inplace=True)
col_desc_prev

Unnamed: 0_level_0,Description,Special
Row,Unnamed: 1_level_1,Unnamed: 2_level_1
SK_ID_CURR,ID of loan in our sample,
TARGET,Target variable (1 - client with payment diffi...,
NAME_CONTRACT_TYPE,Identification if loan is cash or revolving,
CODE_GENDER,Gender of the client,
FLAG_OWN_CAR,Flag if the client owns a car,
...,...,...
AMT_REQ_CREDIT_BUREAU_DAY,Number of enquiries to Credit Bureau about the...,
AMT_REQ_CREDIT_BUREAU_WEEK,Number of enquiries to Credit Bureau about the...,
AMT_REQ_CREDIT_BUREAU_MON,Number of enquiries to Credit Bureau about the...,
AMT_REQ_CREDIT_BUREAU_QRT,Number of enquiries to Credit Bureau about the...,


# Columns That may be usefull.
1. TARGET | Overduidelijk het doel om te bepalen. of iemand schulden heeft of niet.
2. CODE_GENDER | Om Mischien een patroon te zien dat een bepaald gender vaker schulden 
3. FLAG_OWN_REALTY | Om Te kijken of er mischien een relatie met mensen in de schulden die al een woning bezitten.
4. CNT_CHILDREN | OM te kijken of Meer kinderen ook kan betekenen dat mensen sneller schulden / acterstallige betalingen hebben.
5. AMT_INCOME_TOTAL | Ja totale inkomen spreekt een beetje voorzich.
6. AMT_CREDIT | Hoeveel heid van lening. Mischien hoe groter de lening hoe meer kans op achterstand of juist niet.
7. AMT_ANNUITY | Mainer van terug betaling. Met cash meer problemen of per rekening ? 
8. NAME_EDUCATION_TYPE | Zou educatie invloed hebben op hoe mensen met geld omgaan.
9. NAME_FAMILY_STATUS | De staauts van familie. Singel/Stel/Getrouwed/ Daadwerkelijk gezin.
10. NAME_HOUSING_TYPE | De manier van wonen koop/Huur/Thuis wonend Wat voor invloed dat zou kunnen hebben
11. DAYS_BIRTH | Leeftijd van de persoon. Jongen mensen verkeerd met geld ? Oud verstandiger ?
12. FLAG_MOBIL | Is geen mobiel nummer hogere kans op problemen ?
13. FLAG_WORK_PHONE | Zelfde als voor de thuis telefoon 
14. FLAG_CONT_MOBILE | Werkte het mobiele nummer ook echt ? 
15. REG_REGION_NOT_LIVE_REGION | Kijken of iemand ook echt woont waar hij zegt dat hij woont regio level
16. REG_CITY_NOT_LIVE_CITY | Zelfde als boven op city level.

## KNN Model

In [7]:
test_model = app_data

### Test data opschonen

In [18]:
# Vervang genders met floats/ints
from sklearn.preprocessing import LabelEncoder

# De gender naar floats/ints omzetten zodat die gebruikt kunnen worden
# encoder = LabelEncoder()
# test_model['CODE_GENDER'] = encoder.fit_transform(test_model['CODE_GENDER'])

# Normaliseer de data.
columns = ['CODE_GENDER','AMT_INCOME_TOTAL','AMT_CREDIT']
for col in columns:
    test_model[f'{col}_NORM'] = (test_model[col]-test_model[col].mean())/test_model[col].std()
test_model.describe()



Unnamed: 0,TARGET,CODE_GENDER,CNT_CHILDREN,AMT_INCOME_TOTAL,AMT_CREDIT,AMT_ANNUITY,AMT_GOODS_PRICE,REGION_POPULATION_RELATIVE,DAYS_BIRTH,DAYS_EMPLOYED,...,AMT_REQ_CREDIT_BUREAU_WEEK,AMT_REQ_CREDIT_BUREAU_MON,AMT_REQ_CREDIT_BUREAU_QRT,AMT_REQ_CREDIT_BUREAU_YEAR,CODE_GENDER norm,AMT_INCOME_TOTAL norm,AMT_CREDIT norm,CODE_GENDER_NORM,AMT_INCOME_TOTAL_NORM,AMT_CREDIT_NORM
count,307511.0,307511.0,307511.0,307511.0,307511.0,307499.0,307233.0,307511.0,307511.0,307511.0,...,265992.0,265992.0,265992.0,265992.0,307511.0,307511.0,307511.0,307511.0,307511.0,307511.0
mean,0.080729,0.341669,0.417052,168797.9,599026.0,27108.573909,538396.2,0.020868,-16036.995067,63815.045904,...,0.034362,0.267395,0.265474,1.899974,-2.2805870000000002e-17,-2.2574810000000002e-17,-3.9892950000000005e-17,-2.2805870000000002e-17,-2.2574810000000002e-17,-3.9892950000000005e-17
std,0.272419,0.474297,0.722121,237123.1,402490.8,14493.737315,369446.5,0.013831,4363.988632,141275.766519,...,0.204685,0.916002,0.794056,1.869295,1.0,1.0,1.0,1.0,1.0,1.0
min,0.0,0.0,0.0,25650.0,45000.0,1615.5,40500.0,0.00029,-25229.0,-17912.0,...,0.0,0.0,0.0,0.0,-0.7203688,-0.603686,-1.376494,-0.7203688,-0.603686,-1.376494
25%,0.0,0.0,0.0,112500.0,270000.0,16524.0,238500.0,0.010006,-19682.0,-2760.0,...,0.0,0.0,0.0,0.0,-0.7203688,-0.2374206,-0.8174746,-0.7203688,-0.2374206,-0.8174746
50%,0.0,0.0,0.0,147150.0,513531.0,24903.0,450000.0,0.01885,-15750.0,-1213.0,...,0.0,0.0,0.0,1.0,-0.7203688,-0.09129399,-0.2124148,-0.7203688,-0.09129399,-0.2124148
75%,0.0,1.0,1.0,202500.0,808650.0,34596.0,679500.0,0.028663,-12413.0,-289.0,...,0.0,0.0,0.0,3.0,1.388013,0.142129,0.5208169,1.388013,0.142129,0.5208169
max,1.0,2.0,19.0,117000000.0,4050000.0,258025.5,4050000.0,0.072508,-7489.0,365243.0,...,8.0,27.0,261.0,25.0,3.496394,492.7026,8.574045,3.496394,492.7026,8.574045


In [19]:
knn = KNeighborsClassifier()


In [20]:
X = test_model[['CODE_GENDER_NORM','AMT_INCOME_TOTAL_NORM','AMT_CREDIT_NORM']]
y = test_model['TARGET']

test_model.describe()

Unnamed: 0,TARGET,CODE_GENDER,CNT_CHILDREN,AMT_INCOME_TOTAL,AMT_CREDIT,AMT_ANNUITY,AMT_GOODS_PRICE,REGION_POPULATION_RELATIVE,DAYS_BIRTH,DAYS_EMPLOYED,...,AMT_REQ_CREDIT_BUREAU_WEEK,AMT_REQ_CREDIT_BUREAU_MON,AMT_REQ_CREDIT_BUREAU_QRT,AMT_REQ_CREDIT_BUREAU_YEAR,CODE_GENDER norm,AMT_INCOME_TOTAL norm,AMT_CREDIT norm,CODE_GENDER_NORM,AMT_INCOME_TOTAL_NORM,AMT_CREDIT_NORM
count,307511.0,307511.0,307511.0,307511.0,307511.0,307499.0,307233.0,307511.0,307511.0,307511.0,...,265992.0,265992.0,265992.0,265992.0,307511.0,307511.0,307511.0,307511.0,307511.0,307511.0
mean,0.080729,0.341669,0.417052,168797.9,599026.0,27108.573909,538396.2,0.020868,-16036.995067,63815.045904,...,0.034362,0.267395,0.265474,1.899974,-2.2805870000000002e-17,-2.2574810000000002e-17,-3.9892950000000005e-17,-2.2805870000000002e-17,-2.2574810000000002e-17,-3.9892950000000005e-17
std,0.272419,0.474297,0.722121,237123.1,402490.8,14493.737315,369446.5,0.013831,4363.988632,141275.766519,...,0.204685,0.916002,0.794056,1.869295,1.0,1.0,1.0,1.0,1.0,1.0
min,0.0,0.0,0.0,25650.0,45000.0,1615.5,40500.0,0.00029,-25229.0,-17912.0,...,0.0,0.0,0.0,0.0,-0.7203688,-0.603686,-1.376494,-0.7203688,-0.603686,-1.376494
25%,0.0,0.0,0.0,112500.0,270000.0,16524.0,238500.0,0.010006,-19682.0,-2760.0,...,0.0,0.0,0.0,0.0,-0.7203688,-0.2374206,-0.8174746,-0.7203688,-0.2374206,-0.8174746
50%,0.0,0.0,0.0,147150.0,513531.0,24903.0,450000.0,0.01885,-15750.0,-1213.0,...,0.0,0.0,0.0,1.0,-0.7203688,-0.09129399,-0.2124148,-0.7203688,-0.09129399,-0.2124148
75%,0.0,1.0,1.0,202500.0,808650.0,34596.0,679500.0,0.028663,-12413.0,-289.0,...,0.0,0.0,0.0,3.0,1.388013,0.142129,0.5208169,1.388013,0.142129,0.5208169
max,1.0,2.0,19.0,117000000.0,4050000.0,258025.5,4050000.0,0.072508,-7489.0,365243.0,...,8.0,27.0,261.0,25.0,3.496394,492.7026,8.574045,3.496394,492.7026,8.574045


In [21]:
# splits in train en test set
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

In [35]:
# Set hyper parameters
knn.set_params(n_neighbors = 4, weights = 'distance')

In [36]:
# Fit het model
knn.fit(X_train,y_train)

## Eerste Predictie


In [37]:
# Ons eerste predictie
y_pred = knn.predict(X_test)
accuracy_score(y_test, y_pred)

0.9087645360181066

## BASELINE Model

In [25]:
# Baseline
y_naive = np.full(len(y_pred), 0)
print(f"Resultaatvector bij 'dom' voorspellen: {y_naive}\nBaseline score: {accuracy_score(y_test, y_naive)}")

Resultaatvector bij 'dom' voorspellen: [0 0 0 ... 0 0 0]
Baseline score: 0.9207705715549311


### Trainset score

In [16]:
y_train_pred = knn.predict(X_train)
print(f"Score bij voorspellen op de trainset: {accuracy_score(y_train, y_train_pred)}")

Score bij voorspellen op de trainset: 0.9349962928115231
