In [7]:
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score
from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.feature_selection import SelectFromModel
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeClassifier

import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

import graphviz
import pydotplus
from sklearn import tree
from IPython.display import Image

%matplotlib inline


In [78]:
df = pd.read_csv("data/bank-additional.csv", sep=";")

## 只做枚举类型分析

In [77]:
columns = ['job' ,'marital', 'education', 'default', 'housing', 'loan', 'contact', 'month', 'poutcome', 'y']

In [54]:
df = df[columns]

In [55]:
df.head()

Unnamed: 0,job,marital,education,default,housing,loan,contact,month,poutcome,y
0,blue-collar,married,basic.9y,no,yes,no,cellular,may,nonexistent,no
1,services,single,high.school,no,no,no,telephone,may,nonexistent,no
2,services,married,high.school,no,yes,no,telephone,jun,nonexistent,no
3,services,married,basic.9y,no,unknown,unknown,telephone,jun,nonexistent,no
4,admin.,married,university.degree,no,yes,no,cellular,nov,nonexistent,no


In [56]:
df.shape

(4119, 10)

In [57]:
labelencoder = LabelEncoder()
for col in df.columns:
    df[col] = labelencoder.fit_transform(df[col])
df.head()

Unnamed: 0,job,marital,education,default,housing,loan,contact,month,poutcome,y
0,1,1,2,0,2,0,0,6,1,0
1,7,2,3,0,0,0,1,6,1,0
2,7,1,3,0,2,0,1,4,1,0
3,7,1,2,0,1,1,1,4,1,0
4,0,1,6,0,2,0,0,7,1,0


In [58]:
y = df['y']
X = df.drop('y', axis=1)

In [59]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0, train_size=0.8)

In [60]:
columns = X_train.columns

In [61]:
columns

Index(['job', 'marital', 'education', 'default', 'housing', 'loan', 'contact',
       'month', 'poutcome'],
      dtype='object')

## 标准化数据

In [62]:
ss_X = StandardScaler()
ss_y = StandardScaler()
x_train = ss_X.fit_transform(X_train)
x_test = ss_X.transform(X_test)

## 构建模型


In [63]:
model_tree = DecisionTreeClassifier()
model_tree.fit(x_train, y_train)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
                       max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort=False,
                       random_state=None, splitter='best')

In [64]:
score = cross_val_score(model_tree, X, y, cv=5, scoring='accuracy')

In [65]:
score.mean()

0.858221666940137

In [69]:
model_tree.predict(x_test)

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0,
       0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0,
       0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0,

In [71]:
X_test.head()

Unnamed: 0,job,marital,education,default,housing,loan,contact,month,poutcome
3754,1,1,0,1,0,2,1,4,1
45,4,2,6,1,0,0,0,6,1
2774,10,2,6,0,0,0,1,4,1
1170,0,2,3,1,2,0,0,3,1
4042,1,2,2,0,1,1,0,7,1


In [81]:
df.iloc[3754]

job          blue-collar
marital          married
education       basic.4y
default          unknown
housing               no
loan                 yes
contact        telephone
month                jun
poutcome     nonexistent
y                     no
Name: 3754, dtype: object

In [79]:
df = df[columns]

In [80]:
df.head()

Unnamed: 0,job,marital,education,default,housing,loan,contact,month,poutcome,y
0,blue-collar,married,basic.9y,no,yes,no,cellular,may,nonexistent,no
1,services,single,high.school,no,no,no,telephone,may,nonexistent,no
2,services,married,high.school,no,yes,no,telephone,jun,nonexistent,no
3,services,married,basic.9y,no,unknown,unknown,telephone,jun,nonexistent,no
4,admin.,married,university.degree,no,yes,no,cellular,nov,nonexistent,no


## 模型评估

In [82]:
y_pred = model_tree.predict(x_test)

In [83]:
from sklearn.metrics import confusion_matrix

In [84]:
confusion_matrix = confusion_matrix(y_test, y_pred)

In [85]:
confusion_matrix

array([[703,  45],
       [ 63,  13]])

In [86]:
from sklearn.metrics import roc_auc_score, roc_curve

In [87]:
decition_roc_auc = roc_auc_score(y_test, model_tree.predict(x_test))
fpr, tpr, thresholds = roc_curve(y_test, model_tree.predict_proba(x_test)[:,1])


## 绘图

In [24]:
df.head()

Unnamed: 0,job,marital,education,default,housing,loan,contact,month,poutcome,y
0,1,1,2,0,2,0,0,6,1,0
1,7,2,3,0,0,0,1,6,1,0
2,7,1,3,0,2,0,1,4,1,0
3,7,1,2,0,1,1,1,4,1,0
4,0,1,6,0,2,0,0,7,1,0


In [25]:
df.shape

(4119, 10)

In [29]:
# 可视化树图
columns = ['job' ,'marital', 'education', 'default', 'housing', 'loan', 'contact', 'month', 'poutcome', 'y']
data_ = pd.read_csv("data/bank-additional.csv",sep=";")
data_ = data_[columns]
data_feature_name = data_.columns[1:]

In [30]:
data_.head()

Unnamed: 0,job,marital,education,default,housing,loan,contact,month,poutcome,y
0,blue-collar,married,basic.9y,no,yes,no,cellular,may,nonexistent,no
1,services,single,high.school,no,no,no,telephone,may,nonexistent,no
2,services,married,high.school,no,yes,no,telephone,jun,nonexistent,no
3,services,married,basic.9y,no,unknown,unknown,telephone,jun,nonexistent,no
4,admin.,married,university.degree,no,yes,no,cellular,nov,nonexistent,no


In [31]:
data_target_name = np.unique(data_["y"])

In [32]:
data_target_name

array(['no', 'yes'], dtype=object)

In [35]:
dot_tree = tree.export_graphviz(model_tree,out_file=None,feature_names=data_feature_name,class_names=data_target_name,filled=True, rounded=True,special_characters=True)
graph = pydotplus.graph_from_dot_data(dot_tree)
img = Image(graph.create_png())
graph.write_png("out.png")

dot: graph is too large for cairo-renderer bitmaps. Scaling by 0.986928 to fit

dot: graph is too large for cairo-renderer bitmaps. Scaling by 0.986928 to fit



True

In [89]:
#5.将生成的决策树保存
with open("jueceshu.dot", 'w') as f:
    f = tree.export_graphviz(model_tree,out_file=f,feature_names=data_feature_name,class_names=data_target_name,filled=True, rounded=True,special_characters=True)


## 数据导出

In [36]:
X.head()

Unnamed: 0,job,marital,education,default,housing,loan,contact,month,poutcome
0,1,1,2,0,2,0,0,6,1
1,7,2,3,0,0,0,1,6,1
2,7,1,3,0,2,0,1,4,1
3,7,1,2,0,1,1,1,4,1
4,0,1,6,0,2,0,0,7,1


In [37]:
X['y'] = y

In [38]:
X.head()

Unnamed: 0,job,marital,education,default,housing,loan,contact,month,poutcome,y
0,1,1,2,0,2,0,0,6,1,0
1,7,2,3,0,0,0,1,6,1,0
2,7,1,3,0,2,0,1,4,1,0
3,7,1,2,0,1,1,1,4,1,0
4,0,1,6,0,2,0,0,7,1,0


In [40]:
X.to_csv("数字化之后的数据集.csv", index=None)

In [41]:
df.head()

Unnamed: 0,job,marital,education,default,housing,loan,contact,month,poutcome,y
0,1,1,2,0,2,0,0,6,1,0
1,7,2,3,0,0,0,1,6,1,0
2,7,1,3,0,2,0,1,4,1,0
3,7,1,2,0,1,1,1,4,1,0
4,0,1,6,0,2,0,0,7,1,0


In [42]:
X['poutcome'].unique()

array([1, 0, 2])


## 绘制分析图

In [90]:
df.head()

Unnamed: 0,job,marital,education,default,housing,loan,contact,month,poutcome,y
0,blue-collar,married,basic.9y,no,yes,no,cellular,may,nonexistent,no
1,services,single,high.school,no,no,no,telephone,may,nonexistent,no
2,services,married,high.school,no,yes,no,telephone,jun,nonexistent,no
3,services,married,basic.9y,no,unknown,unknown,telephone,jun,nonexistent,no
4,admin.,married,university.degree,no,yes,no,cellular,nov,nonexistent,no


In [93]:
X.head()

Unnamed: 0,job,marital,education,default,housing,loan,contact,month,poutcome
0,1,1,2,0,2,0,0,6,1
1,7,2,3,0,0,0,1,6,1
2,7,1,3,0,2,0,1,4,1
3,7,1,2,0,1,1,1,4,1
4,0,1,6,0,2,0,0,7,1


In [96]:
XX = X.copy()

In [98]:
XX['y'] = y

In [101]:
df['ok'] = y

In [103]:
df.head()

Unnamed: 0,job,marital,education,default,housing,loan,contact,month,poutcome,y,ok
0,blue-collar,married,basic.9y,no,yes,no,cellular,may,nonexistent,no,0
1,services,single,high.school,no,no,no,telephone,may,nonexistent,no,0
2,services,married,high.school,no,yes,no,telephone,jun,nonexistent,no,0
3,services,married,basic.9y,no,unknown,unknown,telephone,jun,nonexistent,no,0
4,admin.,married,university.degree,no,yes,no,cellular,nov,nonexistent,no,0


In [108]:
aa = df.groupby(['month'])

In [110]:
aa['y'].count()

month
apr     215
aug     636
dec      22
jul     711
jun     530
mar      48
may    1378
nov     446
oct      69
sep      64
Name: y, dtype: int64