In [26]:
import pandas as pd  
import numpy as np 
from collections import Counter
import patsy 
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix

# 加载数据

In [27]:
# Read the semicolon-delimited bank dataset into a DataFrame.
data = pd.read_csv('bank.csv', sep=';')

# Columns whose distinct values are tallied below.
columns = [
    'job', 'marital', 'education', 'default', 'housing',
    'loan', 'contact', 'poutcome', 'response',
]

# Print one value -> count tally per listed column.
for name in columns:
    print(Counter(data[name]))
    


Counter({'management': 969, 'blue-collar': 946, 'technician': 768, 'admin.': 478, 'services': 417, 'retired': 230, 'self-employed': 183, 'entrepreneur': 168, 'unemployed': 128, 'housemaid': 112, 'student': 84, 'unknown': 38})
Counter({'married': 2797, 'single': 1196, 'divorced': 528})
Counter({'secondary': 2306, 'tertiary': 1350, 'primary': 678, 'unknown': 187})
Counter({'no': 4445, 'yes': 76})
Counter({'yes': 2559, 'no': 1962})
Counter({'no': 3830, 'yes': 691})
Counter({'cellular': 2896, 'unknown': 1324, 'telephone': 301})
Counter({'unknown': 3705, 'failure': 490, 'other': 197, 'success': 129})
Counter({'no': 4000, 'yes': 521})


# 数据处理

In [28]:

## Collapse the detailed `job` values into a coarse `jobtype` category.
job_to_jobtype = {
    'admin.': 'White Collar',
    'entrepreneur': 'White Collar',
    'management': 'White Collar',
    'self-employed': 'White Collar',
    'blue-collar': 'Blue Collar',
    'services': 'Blue Collar',
    'technician': 'Blue Collar',
}
# Jobs missing from the mapping come back from map() as NaN and are gathered
# into a single catch-all level.
data['jobtype'] = data['job'].map(job_to_jobtype).fillna('Other/Unknown')


## Encode `response` as an integer: 'no' -> 0, 'yes' -> 1.
noyes_to_binary = {'no': 0, 'yes': 1}
# Values that are not keys of the mapping (in particular 0/1 left by an earlier
# run of this cell) pass through unchanged, so re-running the cell is safe.
response_encoded = data['response'].map(
    lambda value: noyes_to_binary.get(value, value)
)
# Anything still non-numeric or missing is treated as a non-response (0).
# The previous fillna('No') put a string into this otherwise numeric column.
data['response'] = (
    pd.to_numeric(response_encoded, errors='coerce').fillna(0).astype(int)
)





### 生成模型训练数据

In [29]:

# Patsy formula: `response` is the outcome, the right-hand side lists the predictors.
bank_spec = 'response ~ age + jobtype + education + marital + default + balance + housing + loan'

# Keep only the rows whose `pdays` equals -1.
# NOTE(review): -1 is presumably the "not previously contacted" sentinel --
# confirm against the dataset's data dictionary.
# (Named `keep_rows` rather than `filter` so the Python builtin is not shadowed.)
keep_rows = data['pdays'] == -1

# Columns referenced by the formula. Selecting with .loc raises on a misspelled
# name instead of silently creating an all-NaN column.
model_columns = ['response', 'age', 'jobtype', 'education', 'marital',
                 'default', 'balance', 'housing', 'loan']
bankwork = data.loc[keep_rows, model_columns].copy()

# Build the outcome and design matrices as DataFrames.
y_train, x_train = patsy.dmatrices(bank_spec, bankwork, return_type='dataframe')

x_train.columns

Index(['Intercept', 'jobtype[T.Other/Unknown]', 'jobtype[T.White Collar]',
       'education[T.secondary]', 'education[T.tertiary]',
       'education[T.unknown]', 'marital[T.married]', 'marital[T.single]',
       'default[T.yes]', 'housing[T.yes]', 'loan[T.yes]', 'age', 'balance'],
      dtype='object')

# 逻辑回归模型进行训练

In [30]:
# Fit a logistic regression with scikit-learn's default settings.
# NOTE(review): x_train already carries patsy's 'Intercept' column while the
# estimator also fits its own intercept by default -- confirm this duplication
# is intended.
log_reg = LogisticRegression()
# y_train is a one-column DataFrame; flatten it to 1-D so scikit-learn does not
# emit its column-vector conversion warning.
log_reg.fit(x_train, np.ravel(y_train))


  y = column_or_1d(y, warn=True)


LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

### 逻辑回归结果

In [31]:
# Fitted coefficients, one per column of x_train.
coefficients = log_reg.coef_
print(coefficients)

# Mean accuracy measured on the same data the model was trained on.
training_accuracy = log_reg.score(x_train, y_train)
print(training_accuracy)

[[-1.04636425e+00  5.58031699e-01  1.39057349e-01  2.89881285e-02
   2.25533953e-01 -3.56268264e-01 -5.89451153e-01 -1.15366260e-01
   1.94892399e-01 -4.09685577e-01 -6.41175431e-01  5.40429206e-03
   4.86112800e-06]]
0.9090418353576248


In [32]:
## Confusion matrix on the training data (rows: actual class, columns: predicted class)
# NOTE(review): in the recorded output the second column is all zeros, i.e. the
# model never predicted class 1 in that run.
y_predict = log_reg.predict(x_train)
confusion_matrix(y_true=y_train, y_pred=y_predict)

array([[3368,    0],
       [ 337,    0]], dtype=int64)