## 载入函数库

In [1]:
from pathlib import Path

import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler

## 读取原始数据 

In [2]:
# Build the dataset location from the current user's home directory instead of
# a machine-specific absolute path; change DATA_DIR if the data lives elsewhere.
DATA_DIR = Path.home() / 'Desktop' / '机器学习分享' / 'data'
data = pd.read_csv(DATA_DIR / 'rrd_data.csv')
# Preview the first five rows of the loaded frame.
data.head()

Unnamed: 0,status,age,salary,gender,education,graduated.years,marriageStatus,hasChild,hasHouse,houseLoan,carLoan,hasCar,workYears,officeScale,months,borrowType,credit
0,1,28,35000,0,8,8,0,0,0,0,0,0,2,55,18,7,1
1,1,28,7500,0,8,10,0,0,0,0,0,0,4,500,6,7,1
2,1,27,3500,0,6,5,1,1,1,0,1,1,4,55,9,7,1
3,1,32,35000,0,3,18,1,1,0,0,0,0,2,55,6,7,1
4,1,33,35000,0,3,18,1,1,1,1,0,0,1,10,12,7,0


In [7]:
# Dimensions of the loaded frame as (rows, columns).
data.shape

(742, 17)

In [38]:
# Split the frame by column position: the first column becomes the label and
# all remaining columns become the features.
# `.iloc` is used because `.ix` was deprecated in pandas 0.20 and removed in
# pandas 1.0; with string column labels the two select the same columns.
X_data = data.iloc[:, 1:]
Y_data = data.iloc[:, 0]
print(X_data.shape, Y_data.shape)

(742, 16) (742,)


## 划分训练集和测试集

In [25]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
split_parts = train_test_split(X_data, Y_data, test_size=0.2, random_state=0)
X_train, X_test, Y_train, Y_test = split_parts
# Report the shape of each partition in the order train/test features, train/test labels.
print(*(part.shape for part in split_parts))

(593, 16) (149, 16) (593,) (149,)


## 标准化处理 

In [29]:
# Fit the scaler on the training rows only, then apply that same fitted
# transformation to both the training and the test partition.
scaler = StandardScaler()
scaler.fit(X_train)
X_train_std = scaler.transform(X_train)
X_test_std = scaler.transform(X_test)
# Display the standardised training matrix.
X_train_std

array([[-1.14913159,  2.12894108,  2.6081855 , ..., -0.00613265,
         0.41941018, -0.38917388],
       [-0.99025189, -0.57311047, -0.38340831, ..., -0.73346467,
        -2.78286751, -0.38917388],
       [ 0.12190601,  1.17527583,  2.6081855 , ..., -0.00613265,
         0.41941018, -0.38917388],
       ..., 
       [-0.83137219, -0.8274212 ,  2.6081855 , ...,  2.90319543,
        -2.24915457, -0.38917388],
       [-0.35473309, -0.8274212 , -0.38340831, ..., -0.73346467,
        -2.24915457, -0.38917388],
       [ 1.07518421,  1.17527583, -0.38340831, ..., -0.73346467,
         0.41941018, -0.38917388]])

## 训练逻辑回归模型 

In [39]:
# L2-penalised logistic regression trained on the standardised training set.
lr = LogisticRegression(
    penalty='l2',
    tol=0.0001,
    C=1,
    solver='liblinear',
    random_state=0,
)
lr.fit(X_train_std, Y_train)
# Show the learned coefficients followed by the intercept.
for fitted_param in (lr.coef_, lr.intercept_):
    print(fitted_param)

[[-0.03800108 -0.05589281 -0.25568543 -0.41524309 -0.05242314 -0.27681768
   0.30732284  0.051064   -0.3567775   0.21934692 -0.12005562 -0.01605826
  -0.26420597  0.58770857 -0.1225445  -0.40844449]]
[-2.10782841]


In [43]:
# Score the whole test set, then show class probabilities and hard
# predictions for the first ten rows only.
lr_proba = lr.predict_proba(X_test_std)
lr_pred = lr.predict(X_test_std)
print(lr_proba[:10])
print(lr_pred[:10])

[[ 0.72692534  0.27307466]
 [ 0.97783374  0.02216626]
 [ 0.18953303  0.81046697]
 [ 0.55655315  0.44344685]
 [ 0.87660071  0.12339929]
 [ 0.84304024  0.15695976]
 [ 0.93685386  0.06314614]
 [ 0.95105107  0.04894893]
 [ 0.82269751  0.17730249]
 [ 0.40525258  0.59474742]]
[0 0 1 0 0 0 0 0 0 1]


## 训练k近邻模型

In [53]:
# 3-nearest-neighbour classifier with uniformly weighted votes and p=2.
knn = KNeighborsClassifier(
    n_neighbors=3,
    weights='uniform',
    algorithm='auto',
    p=2,
)
# fit() stays the final expression so the cell displays the fitted estimator.
knn.fit(X_train_std, Y_train)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=3, p=2,
           weights='uniform')

In [54]:
# Score the whole test set, then show class probabilities and hard
# predictions for the first ten rows only.
knn_proba = knn.predict_proba(X_test_std)
knn_pred = knn.predict(X_test_std)
print(knn_proba[:10])
print(knn_pred[:10])

[[ 0.33333333  0.66666667]
 [ 1.          0.        ]
 [ 0.          1.        ]
 [ 0.66666667  0.33333333]
 [ 1.          0.        ]
 [ 1.          0.        ]
 [ 1.          0.        ]
 [ 1.          0.        ]
 [ 1.          0.        ]
 [ 0.66666667  0.33333333]]
[1 0 1 0 0 0 0 0 0 0]


## 训练朴素贝叶斯模型 

In [58]:
# Gaussian naive Bayes with no user-supplied class priors.
nb = GaussianNB(
    priors=None,
)
# fit() stays the final expression so the cell displays the fitted estimator.
nb.fit(X_train_std, Y_train)

GaussianNB(priors=None)

In [59]:
# Score the whole test set, then show class probabilities and hard
# predictions for the first ten rows only.
nb_proba = nb.predict_proba(X_test_std)
nb_pred = nb.predict(X_test_std)
print(nb_proba[:10])
print(nb_pred[:10])

[[  4.62764718e-01   5.37235282e-01]
 [  9.99946945e-01   5.30547581e-05]
 [  6.36988625e-05   9.99936301e-01]
 [  1.06590877e-01   8.93409123e-01]
 [  8.81338052e-01   1.18661948e-01]
 [  8.50235522e-01   1.49764478e-01]
 [  9.97628413e-01   2.37158699e-03]
 [  9.06535547e-01   9.34644531e-02]
 [  8.04542584e-01   1.95457416e-01]
 [  8.80634599e-02   9.11936540e-01]]
[1 0 1 1 0 0 0 0 0 1]
