In [17]:
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.cross_validation import train_test_split, cross_val_score
from sklearn.grid_search import GridSearchCV
from sklearn.metrics import accuracy_score, roc_auc_score, roc_curve
from sklearn.preprocessing import binarize
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.linear_model import LogisticRegression
import numpy as np
%matplotlib inline

In [87]:
colnames = ['pregnant', 'glucose', 'bp', 'skin', 'insulin', 'bmi', 'pedigree', 'age', 'label']
df = pd.read_csv('Diabetes data.csv',names=colnames,skiprows=1)
print df.count()
df.head()

pregnant    768
glucose     768
bp          768
skin        768
insulin     768
bmi         768
pedigree    768
age         768
label       768
dtype: int64


Unnamed: 0,pregnant,glucose,bp,skin,insulin,bmi,pedigree,age,label
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [3]:
label = df['label']
X, y = df.drop('label', axis=1), label

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

In [22]:
#gboost
gradient_boost_eval = GradientBoostingClassifier(random_state=0)

params = {
    'learning_rate': [0.05, 0.1, 0.5],
    'max_features': [0.5, 1],
    'max_depth': [3, 4, 5]
}

grid_search = GridSearchCV(gradient_boost_eval, params, cv=10, scoring='roc_auc',n_jobs=6)
grid_search.fit(X_train, y_train)

print(grid_search.best_params_)
print(grid_search.best_score_)

{'max_features': 0.5, 'learning_rate': 0.05, 'max_depth': 3}
0.825697572573


In [23]:
gradient_boost = GradientBoostingClassifier(
    learning_rate=0.05, max_depth=3, max_features=0.5, random_state=0)
cross_val_score(gradient_boost, X_train, y_train, cv=10, scoring='roc_auc').mean()

0.82558558558558559

In [24]:
gradient_boost_imp = GradientBoostingClassifier(
    learning_rate=0.05, max_depth=3, max_features=0.5, random_state=0)

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

gradient_boost_imp.fit(X_train, y_train)
y_pred_prob = gradient_boost_imp.predict_proba(X_test)[:, 1]
print roc_auc_score(y_test, y_pred_prob)

0.852481389578


In [21]:
#svc
svc_eval = SVC(random_state=0)

params = {
    'kernel': ['rbf', 'linear','sigmoid'],
    'C': [1,5,10]}

grid_search = GridSearchCV(svc_eval, params, cv=5, scoring='roc_auc',verbose=1,n_jobs=6)
grid_search.fit(X_train, y_train)
print(grid_search.best_params_)
print(grid_search.best_score_)

Fitting 5 folds for each of 9 candidates, totalling 45 fits


[Parallel(n_jobs=6)]: Done  45 out of  45 | elapsed:  2.0min finished


{'kernel': 'linear', 'C': 10}
0.824917578729


In [16]:
svc_class_final = SVC(kernel='linear',C=10,probability=True)
print cross_val_score(svc_class_final, X_train, y_train, cv=5, scoring='roc_auc').mean()
svc_class_final.fit(X_train, y_train)
y_pred_prob = svc_class_final.predict_proba(X_test)[:, 1]
print roc_auc_score(y_test, y_pred_prob)

0.824919170041
0.86017369727


In [19]:
#log regression 
logregression = LogisticRegression(random_state=0)
params = {
    'penalty': ['l1','l2'],
    'C': [1,5,10]}
grid_search = GridSearchCV(logregression, params, cv=5, scoring='roc_auc',verbose=1,n_jobs=6)
grid_search.fit(X_train, y_train)
print(grid_search.best_params_)
print(grid_search.best_score_)

Fitting 5 folds for each of 6 candidates, totalling 30 fits
{'penalty': 'l1', 'C': 10}
0.823276483147


[Parallel(n_jobs=6)]: Done  30 out of  30 | elapsed:    0.2s finished


In [20]:
logregression_final = LogisticRegression(penalty='l1',C=10,random_state=0)
print cross_val_score(logregression_final, X_train, y_train, cv=5, scoring='roc_auc').mean()
logregression_final.fit(X_train, y_train)
y_pred_prob = logregression_final.predict_proba(X_test)[:, 1]
print roc_auc_score(y_test, y_pred_prob)

0.823277458643
0.859925558313


In [91]:
betas = logregression_final.coef_
intercept = logregression_final.intercept_
probs = 1.0/ (1 + np.exp(-(np.dot(X_test,np.transpose(betas))+ intercept)))
print roc_auc_score(y_test, probs)
#colnames.insert(0,'intercept')
pd.DataFrame(np.concatenate([betas.flatten(),intercept.flatten()]),columns=['params'],index=colnames[:-1])

0.859925558313


Unnamed: 0,params
intercept,0.086606
pregnant,0.032987
glucose,-0.011309
bp,0.005894
skin,-0.001015
insulin,0.087034
bmi,0.875562
pedigree,0.021815
age,-8.337658


In [90]:
colnames

['intercept',
 'pregnant',
 'glucose',
 'bp',
 'skin',
 'insulin',
 'bmi',
 'pedigree',
 'age',
 'label']