In [2]:
import pandas as pd
import numpy as np

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import LinearSVC

In [35]:
# Create classifiers
lr = LogisticRegression()
svc = LinearSVC(C=1.0)
rfc = RandomForestClassifier(n_estimators=100)

In [36]:
pima = pd.read_csv("https://raw.githubusercontent.com/PyDataWorkshop/datasets/master/pima.csv")

In [37]:
pima.tail(5)

Unnamed: 0,Preg,Gluc,Dias,Tric,2hSer,BM1,Diab,Age,Diab.1
763,10,101,76,48,180,32.9,0.171,63,0
764,2,122,70,27,0,36.8,0.34,27,0
765,5,121,72,23,112,26.2,0.245,30,0
766,1,126,60,0,0,30.1,0.349,47,1
767,1,93,70,31,0,30.4,0.315,23,0


In [38]:
pima.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 768 entries, 0 to 767
Data columns (total 9 columns):
Preg      768 non-null int64
Gluc      768 non-null int64
Dias      768 non-null int64
Tric      768 non-null int64
2hSer     768 non-null int64
BM1       768 non-null float64
Diab      768 non-null float64
Age       768 non-null int64
Diab.1    768 non-null int64
dtypes: float64(2), int64(7)
memory usage: 54.1 KB


In [39]:
pima.shape

(768, 9)

In [40]:
from sklearn.model_selection import train_test_split

train, test = train_test_split(pima, test_size=0.2)

In [41]:
train.head(3)

Unnamed: 0,Preg,Gluc,Dias,Tric,2hSer,BM1,Diab,Age,Diab.1
408,8,197,74,0,0,25.9,1.191,39,1
441,2,83,66,23,50,32.2,0.497,22,0
746,1,147,94,41,0,49.3,0.358,27,1


In [42]:
train.shape

(614, 9)

In [43]:
test.shape

(154, 9)

In [44]:
train.iloc[:,:8].head(5)

Unnamed: 0,Preg,Gluc,Dias,Tric,2hSer,BM1,Diab,Age
408,8,197,74,0,0,25.9,1.191,39
441,2,83,66,23,50,32.2,0.497,22
746,1,147,94,41,0,49.3,0.358,27
418,1,83,68,0,0,18.2,0.624,27
214,9,112,82,32,175,34.2,0.26,36


In [45]:
train_feat = train.iloc[:,:8]
train_targ = train["Diab.1"]

In [46]:
train_feat.shape

(614, 8)

In [47]:
type(train_targ)

pandas.core.series.Series

In [48]:
#train_targ.info()

type(train_targ)

pandas.core.series.Series

In [49]:
type(train[["Diab.1"]])

pandas.core.frame.DataFrame

In [91]:
train[["Diab.1"]].info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 614 entries, 408 to 307
Data columns (total 1 columns):
Diab.1    614 non-null int64
dtypes: int64(1)
memory usage: 29.6 KB


### Set up Testing Data

In [92]:
test_feat = test.iloc[:,:8]
test_targ = test["Diab.1"]

### 1. Logistic Regression

In [142]:
lr.fit(train_feat, train_targ)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [144]:
lr.score(train_feat, train_targ)

0.76872964169381108

In [145]:
lr.score(test_feat, test_targ)

0.74675324675324672

In [147]:
## log-Odds / Logits / Odds Ratio

In [148]:
lr.coef_

array([[  1.39038596e-01,   2.97083673e-02,  -1.88504815e-02,
         -7.07639767e-04,  -3.17537218e-04,   4.87959450e-02,
          6.52255796e-01,   6.14822195e-03]])

In [149]:
np.transpose(lr.coef_)

array([[  1.39038596e-01],
       [  2.97083673e-02],
       [ -1.88504815e-02],
       [ -7.07639767e-04],
       [ -3.17537218e-04],
       [  4.87959450e-02],
       [  6.52255796e-01],
       [  6.14822195e-03]])

In [150]:
lr.predict(test_feat)

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0,
       0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 1,
       0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0,
       0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 1,
       0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0,
       0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0])

In [151]:
preds = lr.predict(test_feat)

In [152]:
pd.crosstab(preds,test_targ)

Diab.1,0,1
row_0,Unnamed: 1_level_1,Unnamed: 2_level_1
0,93,23
1,16,22


In [154]:
from sklearn.metrics import confusion_matrix

In [156]:
confusion_matrix(lr.predict(train_feat),train_targ)

array([[345,  96],
       [ 46, 127]])

In [157]:
confusion_matrix(lr.predict(test_feat),test_targ)

array([[93, 23],
       [16, 22]])

### 2. Random Forest Classifier

In [164]:
rfc.fit(train_feat, train_targ)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [169]:
rfc.score(train_feat, train_targ)

1.0

In [170]:
rfc.score(test_feat, test_targ)

0.74675324675324672

In [109]:
rfc.predict(train_feat)[:10]

array([1, 0, 1, 0, 1, 0, 1, 1, 0, 0])

In [171]:
confusion_matrix(rfc.predict(train_feat),train_targ)

array([[391,   0],
       [  0, 223]])

In [172]:
confusion_matrix(rfc.predict(test_feat),test_targ)

array([[89, 19],
       [20, 26]])

### 3. Linear SVC Model

In [173]:
svc.fit(train_feat, train_targ)

LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,
     intercept_scaling=1, loss='squared_hinge', max_iter=1000,
     multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
     verbose=0)

In [176]:
svc.score(train_feat, train_targ)

0.64169381107491852

In [177]:
svc.score(test_feat,test_targ)

0.72077922077922074

In [178]:
svc.coef_

array([[ 0.05006139,  0.00652587, -0.01092286, -0.0049177 ,  0.00081938,
         0.00947821,  0.2188775 , -0.00120588]])

In [179]:
np.transpose(svc.coef_)

array([[ 0.05006139],
       [ 0.00652587],
       [-0.01092286],
       [-0.0049177 ],
       [ 0.00081938],
       [ 0.00947821],
       [ 0.2188775 ],
       [-0.00120588]])

In [125]:
svc.predict(train_feat)

array([1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0,
       0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0,
       1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0,
       0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0,

In [126]:
preds = svc.predict(train_feat)

In [127]:
pd.crosstab(preds,train_targ)

Diab.1,0,1
row_0,Unnamed: 1_level_1,Unnamed: 2_level_1
0,381,178
1,10,45


In [128]:
np.sum(np.diag(pd.crosstab(preds,train_targ)))

426

In [129]:
from sklearn.metrics import confusion_matrix

In [180]:
confusion_matrix(svc.predict(train_feat),train_targ)

array([[389, 218],
       [  2,   5]])

In [181]:
confusion_matrix(svc.predict(test_feat),test_targ)

array([[109,  43],
       [  0,   2]])