### The data was obtained from the National Institute of Diabetes and Digestive and Kidney Diseases (NIDDK), a United States institute; it is the Pima Indians Diabetes dataset.

# 1. Importing relevant libraries and dataset

In [None]:
import pandas as pd
%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from scipy.stats import chi2_contingency
from sklearn.impute import SimpleImputer
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix
from sklearn.metrics import plot_confusion_matrix
from sklearn.ensemble import RandomForestClassifier
from sklearn import metrics
from sklearn.feature_selection import f_classif
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC  
from sklearn.naive_bayes import GaussianNB




In [None]:
dataset = pd.read_csv("diabetes.csv")

In [None]:
dataset.head()

# 2. Data Exploration

In [None]:
dataset.shape

There are 9 columns and 768 rows

In [None]:
dataset.iloc[:,:-1].info()

None of the independent variables is categorical. Hence, none needs to be one-hot encoded; all are usable in their current float/integer form.

In [None]:
dataset.describe()

In [None]:
dataset.isna().sum()

### There are no explicit null (NaN) values. Hence, no rows need to be filled or dropped at this stage.

In [None]:
discrete_feature = ['Pregnancies','Age','Outcome']

print("Discrete Variables Count: {}".format(len(discrete_feature)))

In [None]:
overall_diabetes_rate = dataset['Outcome'].mean()
overall_diabetes_rate

In [None]:
# Diabetes rate for each pregnancy count. The string alias 'mean' selects
# pandas' built-in group-wise mean; passing the np.mean callable gives the same
# numbers but emits a FutureWarning on pandas >= 2.1.
group_by_pregnancy = dataset.groupby("Pregnancies").agg({'Outcome': 'mean'})
group_by_pregnancy

In [None]:
# Compare the diabetes rate at each pregnancy count with the overall rate.
axes = plt.axes()
axes.axhline(overall_diabetes_rate, color = 'red')
group_by_pregnancy.plot(marker='x', legend= False, ax = axes)
axes.set_ylabel('Proportion of diabetic people')
# Labels are listed in the order the lines were drawn: the horizontal
# reference line first, then the per-pregnancy-count curve.
axes.legend(['Entire dataset', 'Number of pregnancies'])

In [None]:
feature = 'Age'

fig, (ax1,ax2) = plt.subplots(1,2, figsize = (20,5),constrained_layout=True)

# Two-year-wide bins derived from the observed age range. A hard-coded
# range(25, 80, 2) makes the histogram silently drop every row whose age falls
# outside 25-79; the "+ 3" guarantees the last edge lies above the maximum age.
age_min = int(dataset[feature].min())
age_max = int(dataset[feature].max())
bin_x = range(age_min, age_max + 3, 2)

# Left panel: age distribution of the whole dataset.
ax1.hist(dataset[feature],bins=bin_x,rwidth=0.9)
ax1.set_xticks(bin_x)
ax1.set_xlabel('Age',fontsize=15)
ax1.set_ylabel('Count',fontsize=15)
ax1.set_title('Age Distribution',fontsize=20)

# Right panel: the two outcome groups overlaid on the same bins; the narrower
# bars of the negative group keep the positive bars visible behind them.
ax2.hist(dataset[dataset['Outcome']==1][feature], label = 'Positive',bins=bin_x,rwidth=0.9)
ax2.hist(dataset[dataset['Outcome']==0][feature], label = 'Negative',bins=bin_x,rwidth=0.5)
ax2.legend()
ax2.set_xticks(bin_x)
ax2.set_xlabel('Age',fontsize=15)
ax2.set_ylabel('Count',fontsize=15)
ax2.set_title('Diabetes: Positive vs Negative',fontsize=20)

plt.show()

#### Groups of people with a higher number of pregnancies tend to have a higher diabetes rate.

#### Younger cohorts have a lower diabetes rate than older cohorts.

In [None]:
continuous_feature=[feature for feature in dataset.columns if feature not in discrete_feature]
print("Continuous feature Count {}".format(len(continuous_feature)))

In [None]:
for feature in continuous_feature:
    dataset[feature].hist(bins=25)
    plt.xlabel(feature)
    plt.ylabel("Count")
    plt.title(feature)
    plt.show()

#### Some instances are observed to have Glucose/ Skin Thickness/ BMI/ Blood Pressure equal to 0. This is biologically impossible for a living human. The simple answer would be to drop the rows with any of these features equal to 0. However, this can lead to a major loss of information or bias. As such, we must devise a way to deal with the missing values whilst abstaining from dropping the respective rows. We also avoid removing columns with a high number of null values as they are all crucial to our analysis


#### I advocate imputing the missing data rather than removing it.

In [None]:
columns_incorrect = ["Glucose","BloodPressure","SkinThickness","Insulin","BMI"]

def replace_(x):
    """Return NaN when ``x`` equals 0; otherwise return ``x`` unchanged.

    Used to turn zero readings into missing values.
    """
    return np.nan if x == 0 else x

for column in columns_incorrect:
    dataset[column] = dataset[column].map(replace_).values

In [None]:
dataset.isna().sum()

In [None]:
dataset.isnull().sum(axis=1).value_counts()
#Number of rows by number of missing values

#### The $\chi^2$ test can be used as a goodness-of-fit test, a homogeneity test, or an independence test. The latter is used here to determine whether the missingness of data in each column with missing values depends on the outcome variable.

#### The test starts by stating a first hypothesis (the null hypothesis) and computes a measure of closeness between the observed data and the data expected if the null hypothesis holds.

#### The null hypothesis in this case is the following: there is no association between the missingness in a given column and the dependent variable.

#### Alpha = 0.05

In [None]:
def check_mcar(data, incorrect_columns, dependent_categorical_variable):
    """Chi-square test of independence between missingness and the outcome.

    For every column in ``incorrect_columns`` a boolean indicator column named
    ``"<column>_missing"`` is added to ``data`` IN PLACE (True where the value
    is null). The indicators are left on ``data``; the caller is responsible
    for dropping them afterwards. Each indicator is then cross-tabulated
    against ``dependent_categorical_variable`` and tested with
    ``chi2_contingency`` (with continuity correction). A significant
    relationship would mean the missingness can bias the results.

    Parameters
    ----------
    data : DataFrame holding ``incorrect_columns`` and the dependent variable.
    incorrect_columns : iterable of column names whose nulls are examined.
    dependent_categorical_variable : name of the categorical outcome column.

    Returns
    -------
    dict mapping each column name to the p-value of its test, or to NaN when
    the contingency table is degenerate and the test cannot be run.
    """
    p_values = {}

    for column in incorrect_columns:
        flag = column + "_missing"
        data[flag] = data[column].isnull()

        # Outcome classes as rows, missing / not-missing as columns. Cells with
        # no observations become 0 instead of raising KeyError, which is what
        # a positional lookup on the grouped counts does for an absent class.
        table = data.groupby([dependent_categorical_variable, flag]).size().unstack(fill_value=0)

        # With a single row or column (e.g. the column has no missing values
        # at all) independence cannot be tested.
        if table.shape[0] < 2 or table.shape[1] < 2:
            print("The chi-square test between", column + " and " + dependent_categorical_variable,
                  "was skipped: the contingency table has fewer than two rows or columns")
            p_values[column] = float("nan")
            continue

        _, p, _, _ = chi2_contingency(table.values, correction=True)
        p_values[column] = p
        print("The p-value of chi-square test between", column +" and "+dependent_categorical_variable,  "is equal to {}".format(p))

    return p_values


In [None]:
check_mcar(dataset, columns_incorrect, "Outcome")

In [None]:
dataset.shape

In [None]:
# Remove the temporary "<column>_missing" indicator columns, reporting the
# frame's shape before and after.
print("Shape before dropping", dataset.shape)
missing_flag_columns = [name + "_missing" for name in columns_incorrect]
dataset.drop(columns=missing_flag_columns, inplace=True)
print("Shape after dropping", dataset.shape)

In [None]:
dataset.head()

#### Since all p-values are greater than 0.05, we do not reject the null hypothesis: there is no evidence of a relationship between the missingness of any feature and whether the person is diabetic. Although it is hard to tell with certainty whether the data is missing at random, the test above gives no evidence that the missingness depends on the outcome of interest.

#### Hence, assuming that the data is missing at random, we can impute the missing values using the mean or the median.

#### Since Glucose and BloodPressure follow an approximately normal distribution without outliers, missing data for these variables can be imputed using the mean. Since the other three columns – SkinThickness, BMI and Insulin – contain outliers, their missing values will be imputed with the median, which is robust to outliers.

In [None]:
# One imputer per column: mean for Glucose and BloodPressure, median for
# SkinThickness, Insulin and BMI.
imputer1, imputer2 = (SimpleImputer(strategy="mean") for _ in range(2))
imputer3, imputer4, imputer5 = (SimpleImputer(strategy="median") for _ in range(3))

column_imputers = [
    ("Glucose", imputer1),
    ("BloodPressure", imputer2),
    ("SkinThickness", imputer3),
    ("Insulin", imputer4),
    ("BMI", imputer5),
]

# SimpleImputer expects a 2-D input, hence the reshape to a single-column array.
for column_name, column_imputer in column_imputers:
    column_values = dataset[column_name].values.reshape(-1, 1)
    dataset[column_name] = column_imputer.fit_transform(column_values).copy()



In [None]:
dataset.isna().sum()

In [None]:
for feature in continuous_feature:
    dataset[feature].hist(bins=25)
    plt.xlabel(feature)
    plt.ylabel("Count")
    plt.title(feature)
    plt.show()

In [None]:
sns.pairplot(dataset, hue="Outcome")

#### Skin thickness and BMI seem to be correlated. No other pair of independent variables seems to be highly correlated

In [None]:
fig=plt.figure(figsize=(10,7))
backgroundcolor='#f6f5f7'
fig.patch.set_facecolor(backgroundcolor)
sns.heatmap(data=dataset.corr(),annot=True,cmap='OrRd')

In [None]:
def get_redundant_pairs(df):
    """Return the (column, column) label pairs that are redundant in a correlation matrix.

    The result is a set holding every pair ``(cols[i], cols[j])`` with
    ``j <= i``: the diagonal plus one triangle of the matrix.
    """
    names = list(df.columns)
    return {
        (later, earlier)
        for position, later in enumerate(names)
        for earlier in names[: position + 1]
    }

def get_top_abs_correlations(df, n=5):
    """Return the ``n`` largest absolute pairwise correlations of ``df``.

    The correlation matrix is flattened into a Series indexed by column pairs;
    the pairs reported by ``get_redundant_pairs`` are dropped so each pair of
    distinct columns appears once, and the rest is sorted in descending order.
    """
    redundant = get_redundant_pairs(df)
    pairwise = df.corr().abs().unstack()
    ranked = pairwise.drop(labels=redundant).sort_values(ascending=False)
    return ranked.iloc[:n]

print("Top Absolute Correlations")
print(get_top_abs_correlations(dataset, 20))

#### Using a threshold of 0.7, we conclude that there is no multicollinearity between any independent variables. As such, we do not drop any features to resolve this. I will get back to this later during feature selection.

In [None]:
sns.set(font_scale=1)
LABELS = ["Negative", "Positive"]
# Series.value_counts replaces the top-level pd.value_counts helper, which is
# deprecated in recent pandas releases.
count_classes = dataset['Outcome'].value_counts(sort=True)
# Name each wedge after its class (Outcome 0 -> "Negative", 1 -> "Positive").
# The lookup goes through the index, so the names stay correct whichever class
# is more frequent; x-ticks cannot label the wedges of a pie chart.
slice_labels = [LABELS[int(outcome)] for outcome in count_classes.index]
count_classes.plot(kind='pie', labels=slice_labels, autopct='%1.1f%%')
plt.title("Visualization of Value of Label")
plt.ylabel("Frequency")

#### Positive outcomes are not rare in this dataset (well above a 1:10 positive-to-negative ratio, the threshold used here for calling a class rare). Hence, balancing the dataset is not necessary.

In [None]:
dataset.groupby('Outcome').mean()

#### We notice that, on average, people with diabetes tend to have a higher number of pregnancies, higher glucose levels, higher blood pressure, thicker skin, a higher score on the insulin test, higher BMI levels, and a higher age.

### Train-test split

In [None]:
# Separate the target from the predictors.
y = dataset['Outcome']
# NOTE: this removes 'Outcome' from `dataset` itself, so X and dataset are the
# same object. The F-test cell below reads dataset.columns for the feature
# names and relies on 'Outcome' being gone. Re-running this cell raises
# KeyError because the column no longer exists.
dataset.drop(columns=['Outcome'],inplace=True)
X= dataset


In [None]:
X_train,X_test,y_train,y_test= train_test_split(X,y,test_size=0.2,random_state=54,stratify=y)

# 3. Scaling the features

In [None]:
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# 4. Modelling

### F-test

In [None]:
features_response = dataset.columns.tolist()

In [None]:
[f_stat,f_p_value] = f_classif(X_train, y_train)

In [None]:
f_test_df = pd.DataFrame({'Feature': features_response,'F-statistic': f_stat, 'p-value': f_p_value})
f_test_df.sort_values('p-value')


#### Glucose and BMI seem to be the most useful determiners for predicting diabetes. All predictors seem to be related with the response variable, and thus will be useful in our model 

## Logistic Regression

In [None]:
lr = LogisticRegression(random_state=42)

In [None]:
# Search space as a list of sub-grids, so that only solver/penalty pairs that
# LogisticRegression accepts are tried. A single Cartesian grid also generates
# lbfgs/newton-cg + 'l1', liblinear + 'none' and every 'elasticnet' candidate
# (no solver listed here supports it); each of those fits fails and is scored NaN.
c_values = [1e-5, 1e-4, 1e-3, 1e-2, 1e-1, 1, 10, 100,1e+3]
param_lr = [
    # Unpenalised model first, so that score ties resolve in its favour as they
    # did with the single grid. C has no effect without a penalty, so it is
    # not searched here.
    {'solver': ['newton-cg', 'lbfgs'], 'penalty': ['none']},
    # newton-cg and lbfgs support only the L2 penalty (or none).
    {'solver': ['newton-cg', 'lbfgs'], 'penalty': ['l2'], 'C': c_values},
    # liblinear supports L1 and L2, but not the unpenalised model.
    {'solver': ['liblinear'], 'penalty': ['l1', 'l2'], 'C': c_values},
]

In [None]:
lr_search = GridSearchCV(lr, param_lr, scoring='accuracy', n_jobs=-1, cv=5)
lr_search.fit(X_train, y_train)

In [None]:
lr_search.best_params_


In [None]:
lr = LogisticRegression(penalty = "none", solver= "newton-cg",C=1e-05,random_state=42)
lr.fit(X_train, y_train)

In [None]:
y_pred = lr.predict(X_test)
accuracy = accuracy_score(y_test,y_pred)

In [None]:
accuracy

In [None]:
sns.set(font_scale=1.5)
# confusion_matrix takes (y_true, y_pred): rows are the actual classes and
# columns the predicted ones. Passing them swapped transposes the matrix.
cm = confusion_matrix(y_test, y_pred)
cm_ax = sns.heatmap(cm, annot=True, fmt='g')
cm_ax.set_xlabel('Predicted label')
cm_ax.set_ylabel('True label')
plt.show()

## K-nearest Neighbours

In [None]:
knn = KNeighborsClassifier()
param_knn = {'n_neighbors':np.arange(2, 50)}  
grid_knn = GridSearchCV(knn, param_grid=param_knn,scoring='accuracy', cv=5)

grid_knn.fit(X_train, y_train)



In [None]:
grid_knn.best_params_

In [None]:
knn = KNeighborsClassifier(n_neighbors= 13)
knn.fit(X_train, y_train)
y_pred = knn.predict(X_test)
accuracy = accuracy_score(y_test,y_pred)

In [None]:
accuracy

In [None]:
sns.set(font_scale=1.5)
# confusion_matrix takes (y_true, y_pred): rows are the actual classes and
# columns the predicted ones. Passing them swapped transposes the matrix.
cm = confusion_matrix(y_test, y_pred)
cm_ax = sns.heatmap(cm, annot=True, fmt='g')
cm_ax.set_xlabel('Predicted label')
cm_ax.set_ylabel('True label')
plt.show()

## Decision Trees

In [None]:
# Seed the tree like the other seeded models in this notebook (random_state=42).
# Without a seed, ties between equally good splits are broken randomly, so
# best_params_ can change from run to run.
dt = DecisionTreeClassifier(random_state=42)
param_dt = {'criterion':['gini','entropy'],'max_depth':np.arange(1, 50), 'min_samples_leaf':[1,2,4,5,10,20,30,40,80,100]}
grid_dt = GridSearchCV(dt, param_grid=param_dt, cv=5)
grid_dt.fit(X_train, y_train)

In [None]:
grid_dt.best_params_

In [None]:
# Final tree with the selected hyper-parameters. The seed makes the reported
# accuracy reproducible and matches the other seeded models in this notebook.
dt = DecisionTreeClassifier(criterion= 'entropy', max_depth= 5, min_samples_leaf= 20, random_state=42)
dt.fit(X_train, y_train)
y_pred = dt.predict(X_test)
accuracy = accuracy_score(y_test,y_pred)
accuracy


In [None]:
sns.set(font_scale=1.5)
# confusion_matrix takes (y_true, y_pred): rows are the actual classes and
# columns the predicted ones. Passing them swapped transposes the matrix.
cm = confusion_matrix(y_test, y_pred)
cm_ax = sns.heatmap(cm, annot=True, fmt='g')
cm_ax.set_xlabel('Predicted label')
cm_ax.set_ylabel('True label')
plt.show()

## Random Forest Classifier

In [None]:
rf = RandomForestClassifier(random_state=42)
param_rf = {'n_estimators':[100, 350, 500], 'min_samples_leaf':[2, 10, 30]}
grid_rf = GridSearchCV(rf, param_grid=param_rf, cv=5)
grid_rf.fit(X_train, y_train)

In [None]:
grid_rf.best_params_

In [None]:
rf = RandomForestClassifier(min_samples_leaf= 10, n_estimators= 100,random_state=42)
rf.fit(X_train, y_train)
y_pred = rf.predict(X_test)
accuracy = accuracy_score(y_test,y_pred)
accuracy

In [None]:
sns.set(font_scale=1.5)
# confusion_matrix takes (y_true, y_pred): rows are the actual classes and
# columns the predicted ones. Passing them swapped transposes the matrix.
cm = confusion_matrix(y_test, y_pred)
cm_ax = sns.heatmap(cm, annot=True, fmt='g')
cm_ax.set_xlabel('Predicted label')
cm_ax.set_ylabel('True label')
plt.show()

## SVM

In [None]:
svc = SVC(random_state=42)

In [None]:
param_grid = {'C': [0.1,1, 10, 100], 'gamma': [1,0.1,0.01,0.001],'kernel': ['rbf', 'poly', 'sigmoid']}

In [None]:
grid_svc = GridSearchCV(svc, param_grid=param_grid, cv=5)
grid_svc.fit(X_train, y_train)

In [None]:
grid_svc.best_params_

In [None]:
svc = SVC(random_state=42,C= 100, gamma= 0.01, kernel= 'sigmoid')

In [None]:
svc.fit(X_train, y_train)
y_pred = svc.predict(X_test)
accuracy = accuracy_score(y_test,y_pred)
accuracy

In [None]:
sns.set(font_scale=1.5)
# confusion_matrix takes (y_true, y_pred): rows are the actual classes and
# columns the predicted ones. Passing them swapped transposes the matrix.
cm = confusion_matrix(y_test, y_pred)
cm_ax = sns.heatmap(cm, annot=True, fmt='g')
cm_ax.set_xlabel('Predicted label')
cm_ax.set_ylabel('True label')
plt.show()

## Naive Bayes

In [None]:
from sklearn.naive_bayes import GaussianNB


In [None]:
gnb = GaussianNB()
gnb.fit(X_train, y_train)
y_pred = gnb.predict(X_test)
accuracy_score(y_test,y_pred)

In [None]:
sns.set(font_scale=1.5)
# confusion_matrix takes (y_true, y_pred): rows are the actual classes and
# columns the predicted ones. Passing them swapped transposes the matrix.
cm = confusion_matrix(y_test, y_pred)
cm_ax = sns.heatmap(cm, annot=True, fmt='g')
cm_ax.set_xlabel('Predicted label')
cm_ax.set_ylabel('True label')
plt.show()

# SVM is the best model with an accuracy of 78%