In [1]:
import pandas as pd

In [2]:
# Location of the Pima Indians diabetes data file in the UCI ML archive.
url = (
    'http://archive.ics.uci.edu/ml/machine-learning-databases/'
    'pima-indians-diabetes/pima-indians-diabetes.data'
)

In [3]:
# Column labels supplied by hand: eight measurements followed by the target.
col_names = 'pregnant glucose bp skin insulin bmi pedigree age label'.split()

In [4]:
# header=None: treat every line of the file as data and take the column
# labels from col_names instead of from the first row.
pima = pd.read_csv(url, names=col_names, header=None)

In [6]:
# Preview the first five rows to check that the labels line up with the data.
pima.head(n=5)

Unnamed: 0,pregnant,glucose,bp,skin,insulin,bmi,pedigree,age,label
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


### Select features 

In [7]:
# Feature matrix: a hand-picked subset of the columns.
feature_cols = ['pregnant', 'insulin', 'bmi', 'age']
X = pima.loc[:, feature_cols]
# Response vector: the 0/1 'label' column.
y = pima.loc[:, 'label']

In [8]:
# sklearn.cross_validation was deprecated in scikit-learn 0.18 and removed in
# 0.20; train_test_split now lives in sklearn.model_selection. Fall back to the
# old module only on installations that predate the move.
try:
    from sklearn.model_selection import train_test_split
except ImportError:
    from sklearn.cross_validation import train_test_split

In [10]:
# Hold out 25% of the rows (the library default, spelled out here) for testing;
# random_state pins the shuffle so the reported numbers are reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=0
)

In [11]:
from sklearn.linear_model import LogisticRegression

In [12]:
# Pin the solver explicitly: the recorded run used liblinear (see the fitted
# estimator's repr below), but newer scikit-learn releases default to a
# different solver, which would change the coefficients and every metric
# computed from them.
logreg = LogisticRegression(solver='liblinear')

In [13]:
# Learn the coefficients from the training split only.
logreg.fit(X=X_train, y=y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [14]:
# Class predictions for the held-out rows.
y_pred = logreg.predict(X=X_test)

In [15]:
from sklearn import metrics

In [16]:
# accuracy_score expects the true labels first, then the predictions;
# keyword arguments make that order impossible to get wrong.
print(metrics.accuracy_score(y_true=y_test, y_pred=y_pred))

0.692708333333


### Null accuracy: the accuracy achieved by always predicting the most frequent class

In [17]:
# Class distribution of the test set: number of rows per label value.
y_test.value_counts()

0    130
1     62
Name: label, dtype: int64

In [18]:
# The labels are 0/1, so their mean is the fraction of ones in the test set.
# This shortcut only works for binary 0/1 labels.
y_test.mean()

0.32291666666666669

#### General way of computing null accuracy (works for any number of classes)

In [24]:
# value_counts() orders classes by frequency, so the first row is the majority
# class; its share of the test set is the null accuracy.
y_test.value_counts().iloc[:1] / y_test.shape[0]

0    0.677083
Name: label, dtype: float64

In [29]:
# As with the other metrics: true labels first, predictions second.
confusion_mat = metrics.confusion_matrix(y_true=y_test, y_pred=y_pred)

In [30]:
# Layout: rows are the actual classes, columns are the predicted classes,
# i.e. [[TN, FP], [FN, TP]].
print(confusion_mat)

[[118  12]
 [ 47  15]]


In [31]:
# Flatten the 2x2 matrix row by row: [[TN, FP], [FN, TP]] -> TN, FP, FN, TP.
TN, FP, FN, TP = confusion_mat.ravel()

print(TN, TP, FP, FN)

118 15 12 47


In [28]:
# Echo the true-negative count taken from the confusion matrix.
TN

118

# Metrics computed from a confusion matrix 

### Classification Accuracy 

In [32]:
# Accuracy = correct predictions / all predictions.
# The hand-computed value should match scikit-learn's.
print(float(TN + TP) / (TN + FP + FN + TP))
print(metrics.accuracy_score(y_true=y_test, y_pred=y_pred))

0.692708333333
0.692708333333


### Classification error 

In [33]:
# Error rate = wrong predictions / all predictions, i.e. 1 - accuracy.
print(float(FN + FP) / (TN + FP + FN + TP))
print(1.0 - metrics.accuracy_score(y_true=y_test, y_pred=y_pred))

0.307291666667
0.307291666667


### Sensitivity: when the actual value is positive, how often is the prediction correct?

#### How sensitive is the classifier in detecting positive instances? (Also called recall or true positive rate.)

In [34]:
# Sensitivity = TP / (TP + FN): the share of actual positives that were caught.
print(float(TP) / (FN + TP))

0.241935483871


In [35]:
# scikit-learn's recall is the same quantity as the hand-computed sensitivity.
print(metrics.recall_score(y_true=y_test, y_pred=y_pred))

0.241935483871


### Specificity: when the actual value is negative, how often is the prediction correct?

#### How specific (or selective) is the classifier in predicting positive instances?

In [36]:
# Specificity = TN / (TN + FP): the share of actual negatives predicted as negative.
print(float(TN) / (FP + TN))

0.907692307692


### False positive rate: When the actual value is negative how often is the prediction incorrect?

In [39]:
# False positive rate = FP / (FP + TN): the complement of specificity.
print(float(FP) / (TN + FP))

0.0923076923077


### Precision: when a positive value is predicted, how often is the prediction correct?

In [38]:
# Precision = TP / (TP + FP): the share of positive predictions that were right.
print(float(TP) / (FP + TP))

0.555555555556


In [41]:
# Cross-check the hand-computed precision against scikit-learn.
print(metrics.precision_score(y_true=y_test, y_pred=y_pred))

0.555555555556


# Adjusting the classification threshold

In [42]:
# Predicted class labels for the first 10 test rows.
logreg.predict(X_test)[:10]

array([0, 0, 0, 0, 0, 0, 0, 1, 0, 1])

In [43]:
# Predicted class probabilities for the same 10 rows, one column per class
# (columns follow logreg.classes_; here class 0, then class 1).
logreg.predict_proba(X_test)[:10]

array([[ 0.63247571,  0.36752429],
       [ 0.71643656,  0.28356344],
       [ 0.71104114,  0.28895886],
       [ 0.5858938 ,  0.4141062 ],
       [ 0.84103973,  0.15896027],
       [ 0.82934844,  0.17065156],
       [ 0.50110974,  0.49889026],
       [ 0.48658459,  0.51341541],
       [ 0.72321388,  0.27678612],
       [ 0.32810562,  0.67189438]])