In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/bank-credit-scoring/bank.csv
/kaggle/input/bank-credit-scoring/scoring.jpg


# Preprocessing

First, let's look at the data and preprocess it

In [2]:
df = pd.read_csv("/kaggle/input/bank-credit-scoring/bank.csv", sep=";")
df

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,y
0,30,unemployed,married,primary,no,1787,no,no,cellular,19,oct,79,1,-1,0,unknown,no
1,33,services,married,secondary,no,4789,yes,yes,cellular,11,may,220,1,339,4,failure,no
2,35,management,single,tertiary,no,1350,yes,no,cellular,16,apr,185,1,330,1,failure,no
3,30,management,married,tertiary,no,1476,yes,yes,unknown,3,jun,199,4,-1,0,unknown,no
4,59,blue-collar,married,secondary,no,0,yes,no,unknown,5,may,226,1,-1,0,unknown,no
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4516,33,services,married,secondary,no,-333,yes,no,cellular,30,jul,329,5,-1,0,unknown,no
4517,57,self-employed,married,tertiary,yes,-3313,yes,yes,unknown,9,may,153,1,-1,0,unknown,no
4518,57,technician,married,secondary,no,295,no,no,cellular,19,aug,151,11,-1,0,unknown,no
4519,28,blue-collar,married,secondary,no,1137,no,no,cellular,6,feb,129,4,211,3,other,no


Let's see what categorial and numerical features we have and how much their values vary

In [3]:
cat_features = df.columns[df.dtypes == 'object']
num_features = df.columns[df.dtypes != 'object']
cat_features

Index(['job', 'marital', 'education', 'default', 'housing', 'loan', 'contact',
       'month', 'poutcome', 'y'],
      dtype='object')

In [4]:
data = [(feature, "categorial", df.loc[:, feature].nunique()) for feature in cat_features] + \
       [(feature, "numerical", f"({df.loc[:, feature].min()}; {df.loc[:, feature].max()})") for feature in num_features]

values_df = pd.DataFrame(
    data=data,
    columns=["feature", "type", "uniqie values / range"]
).set_index("feature").reindex(df.columns)

values_df

Unnamed: 0,type,uniqie values / range
age,numerical,(19; 87)
job,categorial,12
marital,categorial,3
education,categorial,4
default,categorial,2
balance,numerical,(-3313; 71188)
housing,categorial,2
loan,categorial,2
contact,categorial,3
day,numerical,(1; 31)


We see that there is not much variance in categorial data so we can use one hot encoding. The numerical values look normal, but we had better normilize the data.

In [5]:
from sklearn.preprocessing import MinMaxScaler


df2 = pd.get_dummies(
    data=df,
    columns=cat_features,
    drop_first=True,
    dtype="int")

scaler = MinMaxScaler()
df2[num_features] = scaler.fit_transform(df2[num_features])
df2

Unnamed: 0,age,balance,day,duration,campaign,pdays,previous,job_blue-collar,job_entrepreneur,job_housemaid,...,month_jun,month_mar,month_may,month_nov,month_oct,month_sep,poutcome_other,poutcome_success,poutcome_unknown,y_yes
0,0.161765,0.068455,0.600000,0.024826,0.000000,0.000000,0.00,0,0,0,...,0,0,0,0,1,0,0,0,1,0
1,0.205882,0.108750,0.333333,0.071500,0.000000,0.389908,0.16,0,0,0,...,0,0,1,0,0,0,0,0,0,0
2,0.235294,0.062590,0.500000,0.059914,0.000000,0.379587,0.04,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0.161765,0.064281,0.066667,0.064548,0.061224,0.000000,0.00,0,0,0,...,1,0,0,0,0,0,0,0,1,0
4,0.588235,0.044469,0.133333,0.073486,0.000000,0.000000,0.00,1,0,0,...,0,0,1,0,0,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4516,0.205882,0.039999,0.966667,0.107580,0.081633,0.000000,0.00,0,0,0,...,0,0,0,0,0,0,0,0,1,0
4517,0.558824,0.000000,0.266667,0.049321,0.000000,0.000000,0.00,0,0,0,...,0,0,1,0,0,0,0,0,1,0
4518,0.558824,0.048429,0.600000,0.048659,0.204082,0.000000,0.00,0,0,0,...,0,0,0,0,0,0,0,0,1,0
4519,0.132353,0.059731,0.166667,0.041377,0.061224,0.243119,0.12,1,0,0,...,0,0,0,0,0,0,1,0,0,0


Also let's split the data into train and test

In [6]:
from sklearn.model_selection import train_test_split


X_train, X_test, y_train, y_test = train_test_split(
    df2.iloc[:, :-1].to_numpy(), df2.iloc[:, -1].to_numpy(), test_size=0.2)

# Classic ML models 

We will try to solve the task using KNN, Logistic Regression, Random Forest, Gradient Boosting.

## Logistic Regression

In [7]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score

lr = LogisticRegression()
metrics = [accuracy_score, precision_score, recall_score]

lr.fit(X_train, y_train)
y_pred = lr.predict(X_test)
y_pred_train = lr.predict(X_train)

data = [["Train"] + [m(y_train, y_pred_train) for m in metrics]] + \
       [["Test"] + [m(y_test, y_pred) for m in metrics]]
lr_results = pd.DataFrame(
    data=data,
    columns=["train / test", "accuracy", "precision", "recall"]
)

lr_results

Unnamed: 0,train / test,accuracy,precision,recall
0,Train,0.899889,0.677419,0.251799
1,Test,0.900552,0.634615,0.317308
