In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

  return f(*args, **kwds)


In [2]:
df = pd.read_csv('diabetes.csv')

## removing outliers using standard deviation
 - Standard deviation is a metric of variance i.e. how much the individual data points are spread out from the mean.
 - less reliable than IQR because the mean and standard deviation are impacted by the outliers
 - data must follow a Gaussian or normal distribution

## let's remove outliers in the insulin variable
 - remove points that are above (Mean + 2 * SD) and any points below (Mean - 2 * SD)

In [4]:
mean = np.mean(df['Insulin'])
sd = np.std(df['Insulin'])

In [5]:
df2 = df[(df['Insulin'] > mean - 2 * sd) & (df['Insulin'] < mean + 2 * sd)]

## compare models

## baseline logistic regression model

In [9]:
X, y = df.loc[:, df.columns != 'Outcome'], df['Outcome']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
logreg = LogisticRegression()
logreg.fit(X_train, y_train)
y_pred = logreg.predict(X_test)
print('Accuracy of logistic regression classifier on test set: {:.2f}'.format(logreg.score(X_test, y_test)))
pd.crosstab(y_test, y_pred, rownames=['Actual Result'], colnames=['Predicted Result'])

Accuracy of logistic regression classifier on test set: 0.75


Predicted Result,0,1
Actual Result,Unnamed: 1_level_1,Unnamed: 2_level_1
0,90,9
1,29,26


## logistic regression model after removing outliers

In [13]:
X, y = df2.loc[:, df2.columns != 'Outcome'], df2['Outcome']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
logreg = LogisticRegression()
logreg.fit(X_train, y_train)
y_pred = logreg.predict(X_test)
print('Accuracy of logistic regression classifier on test set: {:.2f}'.format(logreg.score(X_test, y_test)))
pd.crosstab(y_test, y_pred, rownames=['Actual Result'], colnames=['Predicted Result'])

Accuracy of logistic regression classifier on test set: 0.80


Predicted Result,0,1
Actual Result,Unnamed: 1_level_1,Unnamed: 2_level_1
0,91,9
1,21,26


## removing outliers using the median absolute deviation
- Robust Z-Score method
- source: https://stackoverflow.com/questions/22354094/pythonic-way-of-detecting-outliers-in-one-dimensional-observation-data?utm_medium=organic&utm_source=google_rich_qa&utm_campaign=google_rich_qa

In [15]:
def mad_based_outlier(points, thresh=3.5):
    if len(points.shape) == 1:
        points = points[:,None]
    median = np.median(points, axis=0)
    diff = np.sum((points - median)**2, axis=-1)
    diff = np.sqrt(diff)
    med_abs_deviation = np.median(diff)

    modified_z_score = 0.6745 * diff / med_abs_deviation

    return modified_z_score > thresh

In [18]:
df2 = df[mad_based_outlier(df['Insulin'])]

In [19]:
df2.shape

(101, 9)

In [20]:
df.shape

(768, 9)

In [26]:
X, y = df2.loc[:, df2.columns != 'Outcome'], df2['Outcome']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
logreg = LogisticRegression()
logreg.fit(X_train, y_train)
y_pred = logreg.predict(X_test)
print('Accuracy of logistic regression classifier on test set: {:.2f}'.format(logreg.score(X_test, y_test)))
pd.crosstab(y_test, y_pred, rownames=['Actual Result'], colnames=['Predicted Result'])

Accuracy of logistic regression classifier on test set: 0.76


Predicted Result,0,1
Actual Result,Unnamed: 1_level_1,Unnamed: 2_level_1
0,6,1
1,4,10
