---
[Moro et al., 2011] S. Moro, R. Laureano and P. Cortez. Using Data Mining for Bank Direct Marketing: An Application of the CRISP-DM Methodology.
  In P. Novais et al. (Eds.), Proceedings of the European Simulation and Modelling Conference - ESM'2011, pp. 117-121, Guimarães, Portugal, October, 2011. EUROSIS.
  
  ---

In [None]:
!pip install wget

In [5]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import zipfile
import wget

# скачаем и распакуем содержимое архива
url = 'https://archive.ics.uci.edu/static/public/222/bank+marketing.zip'
filename = wget.download(url)

with zipfile.ZipFile(filename, 'r') as outer_zip_ref:
    outer_zip_ref.extractall()
    for inner_archive_filename in outer_zip_ref.namelist():
        with zipfile.ZipFile(inner_archive_filename, 'r') as inner_zip_ref:
            inner_zip_ref.extractall()

In [6]:
df = pd.read_csv('/content/bank-additional/bank-additional-full.csv', sep = ';') # наиболее полный датасет с 41188 позиций и 21 параметром
df2 = pd.read_csv('/content/bank-additional/bank-additional.csv', sep = ';') # 10% of df

df3 = pd.read_csv('bank-full.csv', sep = ';') # 45211 позиций и 17 параметров
df4 = pd.read_csv('bank.csv', sep = ';') # 10% of df3

In [10]:
df.sample(5)

Unnamed: 0,age,job,marital,education,default,housing,loan,contact,month,day_of_week,...,campaign,pdays,previous,poutcome,emp.var.rate,cons.price.idx,cons.conf.idx,euribor3m,nr.employed,y
10510,47,entrepreneur,married,professional.course,unknown,yes,no,telephone,jun,tue,...,1,999,0,nonexistent,1.4,94.465,-41.8,4.961,5228.1,no
6558,39,blue-collar,married,basic.6y,unknown,no,no,telephone,may,wed,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
11999,42,services,single,unknown,no,yes,no,telephone,jun,thu,...,2,999,0,nonexistent,1.4,94.465,-41.8,4.955,5228.1,no
35706,46,admin.,single,university.degree,no,yes,yes,cellular,may,mon,...,5,999,1,failure,-1.8,92.893,-46.2,1.244,5099.1,no
15989,37,management,married,unknown,no,yes,no,cellular,jul,tue,...,9,999,0,nonexistent,1.4,93.918,-42.7,4.961,5228.1,no


# bank client data:
1 - age (numeric)\
2 - job : type of job (categorical: "admin.","blue-collar","entrepreneur","housemaid","management","retired","self-employed","services","student","technician","unemployed","unknown")\
3 - marital : marital status (categorical: "divorced","married","single","unknown"; note: "divorced" means divorced or widowed)\
4 - education (categorical: "basic.4y","basic.6y","basic.9y","high.school","illiterate","professional.course","university.degree","unknown")\
5 - default: has credit in default? (categorical: "no","yes","unknown")\
6 - housing: has housing loan? (categorical: "no","yes","unknown")\
7 - loan: has personal loan? (categorical: "no","yes","unknown")
# related with the last contact of the current campaign:
8 - contact: contact communication type (categorical: "cellular","telephone") \
9 - month: last contact month of year (categorical: "jan", "feb", "mar", ..., "nov", "dec")\
10 - day_of_week: last contact day of the week (categorical: "mon","tue","wed","thu","fri")\
11 - duration: last contact duration, in seconds (numeric). Important note:  this attribute highly affects the output target (e.g., if duration=0 then y="no"). Yet, the duration is not known before a call is performed. Also, after the end of the call y is obviously known. Thus, this input should only be included for benchmark purposes and should be discarded if the intention is to have a realistic predictive model.
# other attributes:
12 - campaign: number of contacts performed during this campaign and for this client (numeric, includes last contact)\
13 - pdays: number of days that passed by after the client was last contacted from a previous campaign (numeric; 999 means client was not previously contacted)\
14 - previous: number of contacts performed before this campaign and for this client (numeric)\
15 - poutcome: outcome of the previous marketing campaign (categorical: "failure","nonexistent","success")
# social and economic context attributes
16 - emp.var.rate: employment variation rate - quarterly indicator (numeric)\
17 - cons.price.idx: consumer price index - monthly indicator (numeric)     
18 - cons.conf.idx: consumer confidence index - monthly indicator (numeric)     
19 - euribor3m: euribor 3 month rate - daily indicator (numeric)\
20 - nr.employed: number of employees - quarterly indicator (numeric)

In [16]:
# избавимся от возможных дубликатов
df = df.drop_duplicates()

#EDA

In [27]:
print(df.info())
print(df.describe())

<class 'pandas.core.frame.DataFrame'>
Int64Index: 41176 entries, 0 to 41187
Data columns (total 21 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   age             41176 non-null  int64  
 1   job             41176 non-null  object 
 2   marital         41176 non-null  object 
 3   education       41176 non-null  object 
 4   default         41176 non-null  object 
 5   housing         41176 non-null  object 
 6   loan            41176 non-null  object 
 7   contact         41176 non-null  object 
 8   month           41176 non-null  object 
 9   day_of_week     41176 non-null  object 
 10  duration        41176 non-null  int64  
 11  campaign        41176 non-null  int64  
 12  pdays           41176 non-null  int64  
 13  previous        41176 non-null  int64  
 14  poutcome        41176 non-null  object 
 15  emp.var.rate    41176 non-null  float64
 16  cons.price.idx  41176 non-null  float64
 17  cons.conf.idx   41176 non-null 

In [28]:
print(df['y'].value_counts())

no     36537
yes     4639
Name: y, dtype: int64


##Рассмотрим основное распределение параметров

In [46]:
import plotly.express as px

fig = px.histogram(df, x='age', title='Age Distribution')
fig.update_traces(marker_color='blue', marker_line_color='black', marker_line_width=1)
fig.update_xaxes(title_text='Age')
fig.update_yaxes(title_text='Count')
fig.show()

<Figure size 1000x500 with 0 Axes>

In [49]:
fig = px.bar(df['job'].value_counts().reset_index(), x='index', y='job', color='job', title='Job Distribution')
fig.update_traces(marker=dict(line=dict(color='black', width=1)))
fig.update_xaxes(title_text='Job')
fig.update_yaxes(title_text='Count')
fig.update_layout(xaxis_categoryorder='total descending')
fig.show()

In [51]:
fig = px.bar(df['marital'].value_counts().reset_index(), x='index', y='marital', color='marital', title='Marital Status Distribution')
fig.update_traces(marker=dict(line=dict(color='black', width=1)))
fig.update_xaxes(title_text='Marital Status')
fig.update_yaxes(title_text='Count')
fig.update_layout(xaxis_categoryorder='total descending')
fig.show()

In [54]:
fig = px.bar(df['education'].value_counts().reset_index(), x='index', y='education', color='education', title='Education Level Distribution')
fig.update_traces(marker=dict(line=dict(color='black', width=1)))
fig.update_xaxes(title_text='Education Level')
fig.update_yaxes(title_text='Count')
fig.update_layout(xaxis_categoryorder='total descending')
fig.show()

##Рассмотрим корреляцию числовых параметров

In [58]:
corr_df = df[['age', 'duration', 'campaign', 'pdays', 'previous', 'emp.var.rate', 'cons.price.idx', 'cons.conf.idx', 'euribor3m', 'nr.employed']]

fig = px.imshow(corr_df.corr(), title='Correlation Heatmap')
fig.update_xaxes(tickangle=-45)
fig.show()

Наиболее зависимыми вышли emp.var.rate по отношению к euribo3m и nr.employed а так же euribo3m c nr.employed

In [24]:
from sklearn.model_selection import train_test_split

In [64]:
# преобрызуем и разделим выборку
train_size = 0.7

df['y'] = df['y'].map({"no":0, "yes":1})
df_knn = df[['age', 'duration', 'campaign', 'pdays', 'previous', 'emp.var.rate', 'cons.price.idx', 'cons.conf.idx', 'euribor3m', 'nr.employed', 'y']]

X = df_knn.drop('y', axis=1)
y = df_knn['y']

X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=train_size)

In [65]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import mean_squared_error, r2_score, accuracy_score

knn = KNeighborsClassifier(n_neighbors=5)

knn.fit(X_train, y_train)
y_pred = knn.predict(X_test)
from sklearn.metrics import accuracy_score
print('Accuracy:', accuracy_score(y_test, y_pred), mean_squared_error(y_test, y_pred), r2_score(y_test, y_pred))

Accuracy: 0.9034242694082409 0.09657573059175908 0.03593263558938964


In [69]:
import plotly.graph_objs as go

accuracy = []
for k in range(1, 30):
    knn = KNeighborsClassifier(n_neighbors=k)
    knn.fit(X_train, y_train)
    y_pred = knn.predict(X_test)
    accuracy.append(accuracy_score(y_test, y_pred))

[0.8869100623330365, 0.90002428559864, 0.8997004776167733, 0.9031004614263741, 0.9034242694082409, 0.9053671172994414, 0.9056099732858415, 0.9084432931271755, 0.9080385331498422, 0.9075528211770421, 0.9078766291589088, 0.9078766291589088, 0.9084432931271755, 0.9085242451226423, 0.9095766210637092, 0.909900429045576, 0.9107099490002428, 0.9110337569821096, 0.9101432850319761, 0.9111147089775763, 0.9117623249413098, 0.9108718529911762, 0.9117623249413098, 0.9119242289322432, 0.9106289970047762, 0.9113575649639764, 0.9116004209503764, 0.911438516959443, 0.9111956609730429]


ValueError: ignored

In [71]:
trace = go.Scatter(x=[i for i in range(1,30)], y=accuracy, mode='lines', marker=dict(color='blue'))
layout = go.Layout(title='Accuracy vs Range', xaxis=dict(title='Range'), yaxis=dict(title='Accuracy'))
fig = go.Figure(data=[trace], layout=layout)
fig.show()

In [72]:
from sklearn.model_selection import GridSearchCV

param_grid = {'n_neighbors': range(1, 30)}
knn = KNeighborsClassifier()

grid_search = GridSearchCV(knn, param_grid, cv=5, scoring='accuracy')

grid_search.fit(X_train, y_train)

best_k = grid_search.best_params_['n_neighbors']
best_accuracy = grid_search.best_score_

print(f"Best k: {best_k}")
print(f"Best accuracy: {best_accuracy}")


best_knn = grid_search.best_estimator_
y_pred = best_knn.predict(X_test)
accuracy_on_test = accuracy_score(y_test, y_pred)
print(f"Accuracy on test set with best k: {accuracy_on_test}")

Best k: 23
Best accuracy: 0.9100371116473154
Accuracy on test set with best k: 0.9117623249413098
