In [21]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.preprocessing import LabelEncoder
import pickle

In [2]:
train = pd.read_csv(r'data/train.csv', sep=';')
test = pd.read_csv(r'data/train.csv', sep=';')
train = pd.concat([train, test])

In [3]:
train.head()

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,y
0,58,management,married,tertiary,no,2143,yes,no,unknown,5,may,261,1,-1,0,unknown,no
1,44,technician,single,secondary,no,29,yes,no,unknown,5,may,151,1,-1,0,unknown,no
2,33,entrepreneur,married,secondary,no,2,yes,yes,unknown,5,may,76,1,-1,0,unknown,no
3,47,blue-collar,married,unknown,no,1506,yes,no,unknown,5,may,92,1,-1,0,unknown,no
4,33,unknown,single,unknown,no,1,no,no,unknown,5,may,198,1,-1,0,unknown,no


In [4]:
train.isnull().any()

age          False
job          False
marital      False
education    False
default      False
balance      False
housing      False
loan         False
contact      False
day          False
month        False
duration     False
campaign     False
pdays        False
previous     False
poutcome     False
y            False
dtype: bool

In [5]:
# EDA

In [6]:
numerical_cols = train.select_dtypes(include=['number']).columns.tolist()
print(numerical_cols)


['age', 'balance', 'day', 'duration', 'campaign', 'pdays', 'previous']


In [7]:
for col in numerical_cols:
    print(f'{col}\t\tmin: {train[col].min()}\t\tmax: {train[col].max()}')

age		min: 18		max: 95
balance		min: -8019		max: 102127
day		min: 1		max: 31
duration		min: 0		max: 4918
campaign		min: 1		max: 63
pdays		min: -1		max: 871
previous		min: 0		max: 275


In [8]:
non_numerical_cols = train.select_dtypes(exclude=['number']).columns.tolist()
print(non_numerical_cols)

['job', 'marital', 'education', 'default', 'housing', 'loan', 'contact', 'month', 'poutcome', 'y']


In [9]:
lab_enc = LabelEncoder()
for col in non_numerical_cols:
    lab_enc.fit(train[col])
    print(col)
    print(lab_enc.transform(lab_enc.classes_))
    print(lab_enc.classes_)
    print()

job
[ 0  1  2  3  4  5  6  7  8  9 10 11]
['admin.' 'blue-collar' 'entrepreneur' 'housemaid' 'management' 'retired'
 'self-employed' 'services' 'student' 'technician' 'unemployed' 'unknown']

marital
[0 1 2]
['divorced' 'married' 'single']

education
[0 1 2 3]
['primary' 'secondary' 'tertiary' 'unknown']

default
[0 1]
['no' 'yes']

housing
[0 1]
['no' 'yes']

loan
[0 1]
['no' 'yes']

contact
[0 1 2]
['cellular' 'telephone' 'unknown']

month
[ 0  1  2  3  4  5  6  7  8  9 10 11]
['apr' 'aug' 'dec' 'feb' 'jan' 'jul' 'jun' 'mar' 'may' 'nov' 'oct' 'sep']

poutcome
[0 1 2 3]
['failure' 'other' 'success' 'unknown']

y
[0 1]
['no' 'yes']



In [10]:
for col in non_numerical_cols:
    train[col] = lab_enc.fit_transform(train[col])

In [11]:
train.head()

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,y
0,58,4,1,2,0,2143,1,0,2,5,8,261,1,-1,0,3,0
1,44,9,2,1,0,29,1,0,2,5,8,151,1,-1,0,3,0
2,33,2,1,1,0,2,1,1,2,5,8,76,1,-1,0,3,0
3,47,1,1,3,0,1506,1,0,2,5,8,92,1,-1,0,3,0
4,33,11,2,3,0,1,0,0,2,5,8,198,1,-1,0,3,0


In [12]:
X = train.drop(['y'], axis=1)
y = train['y']

In [13]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.10, random_state=101)

In [16]:
def model(X_train, y_train):
  forest = RandomForestClassifier(n_estimators=10, random_state=0)
  forest.fit(X_train, y_train)
  print(f'Las: {forest.score(X_train, y_train)}')

  lreg = LogisticRegression(max_iter=15000)
  lreg.fit(X_train, y_train)
  print(f'Regresja logistyczna: {lreg.score(X_train, y_train)}')

  tree = DecisionTreeClassifier()
  tree.fit(X_train, y_train)
  print(f'Drzewa decyzyjne: {tree.score(X_train, y_train)}')

  return forest, lreg, tree

forest, lreg, tree = model(X_train, y_train)

Las: 0.9991029626807899
Regresja logistyczna: 0.8916428071124
Drzewa decyzyjne: 1.0


In [17]:
y1_predict = forest.predict(X_test)
print(f'Random Forest {accuracy_score(y_test, y1_predict)}')

y2_predict = lreg.predict(X_test)
print(f'Logistic Regresion {accuracy_score(y_test, y2_predict)}')

y3_predict = tree.predict(X_test)
print(f'Decision Tree {accuracy_score(y_test, y3_predict)}')

Random Forest 0.9826385049209333
Logistic Regresion 0.8858785801172178
Decision Tree 0.9865089019130819


In [18]:
print('Ocena modelu 1. Las')
print(classification_report(y_test, y1_predict))

Ocena modelu 1. Las
              precision    recall  f1-score   support

           0       0.98      1.00      0.99      7930
           1       0.98      0.88      0.93      1113

    accuracy                           0.98      9043
   macro avg       0.98      0.94      0.96      9043
weighted avg       0.98      0.98      0.98      9043



In [19]:
print('Ocena modelu 2. Regresja logistyczna')
print(classification_report(y_test, y2_predict))

Ocena modelu 2. Regresja logistyczna
              precision    recall  f1-score   support

           0       0.90      0.98      0.94      7930
           1       0.61      0.21      0.31      1113

    accuracy                           0.89      9043
   macro avg       0.75      0.59      0.62      9043
weighted avg       0.86      0.89      0.86      9043



In [20]:
print('Ocena modelu 3. Drzewa decyzyjne')
print(classification_report(y_test, y3_predict))

Ocena modelu 3. Drzewa decyzyjne
              precision    recall  f1-score   support

           0       0.99      0.99      0.99      7930
           1       0.94      0.95      0.95      1113

    accuracy                           0.99      9043
   macro avg       0.97      0.97      0.97      9043
weighted avg       0.99      0.99      0.99      9043



In [22]:
filename = r'../ml_models/banking_model.h5'
pickle.dump(tree, open(filename, 'wb'))