
This model predicts whether a patient has diabetes.

## Import Libraries and Data

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn import metrics
import seaborn as sns
%matplotlib inline
import warnings
# Suppress every warning for the rest of the session.
# NOTE(review): this also hides warnings raised by the libraries imported
# above, which can mask real problems in later cells.
warnings.filterwarnings('ignore')

In [None]:
# Location of the diabetes CSV. This is a machine-specific absolute path, so it
# has to be adjusted when the notebook is run anywhere else.
DATA_PATH = '/Users/priyankac/Downloads/diabetes.csv'

data = pd.read_csv(DATA_PATH)
# Preview the first rows to confirm the file loaded as expected.
data.head()

In [None]:
# Dimensions of the frame as a (rows, columns) tuple.
data.shape

# Observed result: 768 rows and 9 columns.

In [None]:
# Per-column summary: column names, non-null counts and dtypes.
data.info()

# Observed result: all 9 columns are numerical.

In [None]:
# Summary statistics, with extra percentiles in both tails so that extreme
# values stand out.
tail_percentiles = [0.01, 0.05, 0.10, 0.25, 0.50, 0.75, 0.90, 0.99]
data.describe(percentiles=tail_percentiles)

## Correlation Check

In [None]:
# Pairwise correlation between the columns of the frame; stored in a variable
# and echoed as the cell output.
correlations = data.corr()
correlations

In [None]:
# Draw the correlation matrix as an annotated heatmap, rounded to two decimals.
plt.figure(figsize=(10, 10))
rounded_corr = data.corr().round(2)
sns.heatmap(rounded_corr, annot=True, fmt='0.2f', cmap='YlGnBu')
plt.show()

## Visualizing the data for any Relations

In [None]:
def visualise(data):
    """Scatter-plot the column at position 1 against the column at position 5.

    The columns are selected by position, not by name. The axis labels are
    fixed to plasma glucose concentration (x) and body mass index (y), so
    `data` is presumably expected to hold those measurements at positions
    1 and 5 -- verify against the loaded CSV.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame with at least six columns.
    """
    figure, axes = plt.subplots()
    x_values = data.iloc[:, 1].values
    y_values = data.iloc[:, 5].values
    axes.scatter(x_values, y_values)
    axes.set(
        title='Highly Correlated Features',
        xlabel='Plasma glucose concentration',
        ylabel='Body mass index',
    )


visualise(data)

## Replacing the Zeros with Null values

In [None]:
# Zeros in Glucose and BMI are treated as missing values: they are replaced by
# NaN, and every row that contains a NaN is then dropped.
# `np.nan` is used because the `np.NaN` alias was removed in NumPy 2.0 and
# raises AttributeError there.
data[['Glucose', 'BMI']] = data[['Glucose', 'BMI']].replace(0, np.nan)
# Reassign instead of mutating in place; downstream cells keep using `data`.
data = data.dropna()

In [None]:
# Same scatter plot as before, now drawn from the frame after the rows with
# zero-valued Glucose/BMI were removed.
visualise(data)

## Feature Selection

In [None]:
# Feature matrix: the eight predictor columns as a 2-D NumPy array.
feature_columns = ['Glucose', 'BMI', 'Pregnancies', 'BloodPressure', 'SkinThickness',
                   'Insulin', 'DiabetesPedigreeFunction', 'Age']
X = data[feature_columns].values

# Target as a 1-D array of shape (n_samples,). Selecting with a list
# (data[['Outcome']]) produced an (n_samples, 1) column vector, which
# scikit-learn estimators only accept with a DataConversionWarning.
y = data['Outcome'].values

In [None]:
# Shape of the feature matrix as (samples, features).
X.shape

In [None]:
# Mean of the target; assuming Outcome is coded 0/1, this is the share of
# positive cases in the cleaned data.
y.mean()

## Standardization & Scaling of Features

In [None]:
# Standardise every feature to zero mean and unit variance.
# NOTE(review): the scaler is fitted on the full feature matrix before the
# train/test split further down, so test-set statistics influence the scaling.
# Consider fitting it on the training split only.
sc = StandardScaler()
sc.fit(X)
X = sc.transform(X)

In [None]:
# Check that the first two features have mean ~0 and standard deviation ~1
# after standardisation.
# The values are printed with two decimals: the '%d' format truncates floats
# towards zero, so a standard deviation of 0.9999... was reported as 0.
mean = np.mean(X, axis=0)
print('Mean: (%.2f, %.2f)' % (mean[0], mean[1]))
standard_deviation = np.std(X, axis=0)
print('Standard deviation: (%.2f, %.2f)' % (standard_deviation[0], standard_deviation[1]))

## Train-Test Split

In [None]:
# Hold out 20% of the rows for testing; the fixed random_state keeps the split
# reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=20,
)

In [None]:
# Report the size of each split and the mean of the target in each one.
split_summary = [
    ('Shape of Training Data ', X_train.shape),
    ('Shape of Testing Data', X_test.shape),
    ('Response rate in Training Data', y_train.mean()),
    ('Response rate in Testing Data', y_test.mean()),
]
for label, value in split_summary:
    print(label, value)
# The response rates of the two splits are almost the same, which suggests
# both are representative of the full data.

## Logistic Regression Model

In [None]:
# Instantiate the classifier with scikit-learn's default hyperparameters.
logreg = LogisticRegression()
# Fit on the training split; the fitted estimator is echoed as the cell output.
logreg.fit(X_train,y_train)

## Predictions

In [None]:
# Predicted class labels for the held-out test rows.
y_pred=logreg.predict(X_test)

In [None]:
# Echo the predicted labels for a quick visual check.
y_pred

## Performance & Accuracy

In [None]:
# Confusion matrix of actual vs. predicted labels on the test split
# (scikit-learn puts the actual classes on rows and the predicted classes on
# columns).
cnf_matrix = metrics.confusion_matrix(y_test, y_pred)
cnf_matrix

In [None]:
class_names = [0, 1]  # class labels shown on both axes
fig, ax = plt.subplots()
# Draw on the explicit Axes and hand the tick labels to seaborn. Ticks set via
# plt.xticks/plt.yticks before the heatmap call are overwritten by the heatmap
# and therefore had no effect.
sns.heatmap(
    pd.DataFrame(cnf_matrix),
    annot=True,
    cmap="YlGnBu",
    fmt='g',
    xticklabels=class_names,
    yticklabels=class_names,
    ax=ax,
)
# Put the x-axis label above the matrix, next to the title.
ax.xaxis.set_label_position("top")
ax.set_title('Confusion matrix', y=1.1)
ax.set_ylabel('Actual label')
ax.set_xlabel('Predicted label')
# Lay out after all labels exist so that none of them is clipped.
fig.tight_layout()
plt.show()

In [None]:
# Headline classification metrics on the test split, printed one per line.
metric_scorers = [
    ("Accuracy:", metrics.accuracy_score),
    ("Precision:", metrics.precision_score),
    ("Recall:", metrics.recall_score),
    ("f1-score:", metrics.f1_score),
]
for metric_name, scorer in metric_scorers:
    print(metric_name, scorer(y_test, y_pred))

In [None]:
# Predicted probabilities from the second column of predict_proba
# (presumably the positive class, given the 0/1 labels -- confirm via
# logreg.classes_).
y_pred_proba = logreg.predict_proba(X_test)[:, 1]
# The thresholds are not needed; a named throwaway avoids rebinding `_`, which
# IPython uses for the last cell output.
fpr, tpr, _thresholds = metrics.roc_curve(y_test, y_pred_proba)
auc = metrics.roc_auc_score(y_test, y_pred_proba)

fig, ax = plt.subplots()
ax.plot(fpr, tpr, label="data 1, auc="+str(auc))
# Label the axes so the figure can be read on its own.
ax.set_xlabel('False positive rate')
ax.set_ylabel('True positive rate')
ax.set_title('ROC curve')
ax.legend(loc=4)
plt.show()

## Conclusion

Although the accuracy of the model is 72% and the AUC is 80%, the f1 score is not great at 58%.