In [None]:
from mlxtend.plotting import plot_decision_regions
import numpy as np
import pandas as pd
from pandas.plotting import scatter_matrix
import missingno as msno
import matplotlib.pyplot as plt
from IPython.display import Image, display
import seaborn as sns
from sklearn.model_selection import validation_curve
from sklearn.model_selection import train_test_split
sns.set()
import warnings
from sklearn import model_selection
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
warnings.filterwarnings('ignore')
%matplotlib inline
#plt.style.use('ggplot')
# 'ggplot' is a matplotlib style sheet that mimics the look of ggplot2, the R visualisation package that provides graphics at a higher level of abstraction

In [None]:
# Load the diabetes dataset from the Kaggle input directory.
# NOTE: the path below is specific to the Kaggle environment; adjust it when running elsewhere.


diabetes_data = pd.read_csv('/kaggle/input/pima-indians-diabetes-database/diabetes.csv')

# Display the first 5 rows of the dataframe.
diabetes_data.head()

In [None]:
# Basic summary statistics of the raw data (only numerical columns are shown
# unless the parameter include="all" is passed).
diabetes_data.describe()

In [None]:

# Clean the data set on a deep copy so the raw `diabetes_data` frame stays untouched.
# In the columns listed below a 0 is treated as a missing reading: replacing it
# with NaN makes the gaps easy to count and lets them be imputed afterwards.
diabetes_data_copy = diabetes_data.copy(deep = True)
zero_as_missing_columns = ['Glucose','BloodPressure','SkinThickness','Insulin','BMI']
# `np.nan` is the canonical spelling; the `np.NaN` alias was removed in NumPy 2.0.
diabetes_data_copy[zero_as_missing_columns] = diabetes_data_copy[zero_as_missing_columns].replace(0, np.nan)

## Show the count of NaNs per column
print(diabetes_data_copy.isnull().sum())
# Several columns now contain NaN values.
# To fill them sensibly, the distribution of each column needs to be understood first.


In [None]:
# Histograms of every column of the raw frame (missing readings are still encoded as 0 here).
diabetes_data.hist(figsize = (10,10))

In [None]:
## Impute the NaN values of each column in accordance with its distribution:
## the mean for Glucose and BloodPressure, the median for SkinThickness, Insulin and BMI.
# The filled column is assigned back explicitly. Calling `fillna(..., inplace=True)`
# on a column selection is chained assignment: under pandas Copy-on-Write (the
# default from pandas 3.0) it only changes a temporary object and leaves the
# DataFrame unchanged.
imputation_strategy = {
    'Glucose': 'mean',
    'BloodPressure': 'mean',
    'SkinThickness': 'median',
    'Insulin': 'median',
    'BMI': 'median',
}
for column, statistic in imputation_strategy.items():
    fill_value = diabetes_data_copy[column].agg(statistic)
    diabetes_data_copy[column] = diabetes_data_copy[column].fillna(fill_value)

# Remaining NaN count per column
diabetes_data_copy.isna().sum()


In [None]:
# Histograms of every column after cleaning (zeros replaced by NaN, then imputed).
paftercleaning = diabetes_data_copy.hist(figsize = (10,10))


In [None]:
# Build DataFrame objects from the raw and the cleaned data, then print both
# shapes as a sanity check that cleaning changed neither row nor column count.
df=pd.DataFrame(diabetes_data)
dfupdated=pd.DataFrame(diabetes_data_copy)
print(diabetes_data.shape,diabetes_data_copy.shape)

In [None]:
## Data type analysis: count how many columns share each dtype
column_dtype_names = diabetes_data.dtypes.map(str)
sns.countplot(y=column_dtype_names, data=diabetes_data)
plt.xlabel("count of each data type")
plt.ylabel("data types")
plt.show()

# Per-column dtype and non-null summary of the raw frame
diabetes_data.info(verbose=True)

In [None]:
## Null count analysis: missingno bar chart of the raw frame
## (note: in the raw frame missing readings are still encoded as 0, not NaN)

p=msno.bar(diabetes_data)

In [None]:
## Check the balance of the data by plotting the count of each Outcome value;
## this gives an overview of which class the majority of the records belong to.
outcome_counts = diabetes_data["Outcome"].value_counts()
outcome_counts.plot(kind="pie")
print(
    "The below graph shows that the data is biased towards datapoints having outcome value as 0 where it means that diabetes was not present actually. \nThe number of non-diabetics is almost twice the number of diabetic patients\n",
    outcome_counts,
)

In [None]:
# Scatter matrix of the uncleaned data

p=scatter_matrix(diabetes_data,figsize=(25, 25)) 

# This pairs plot builds on two basic figures, the histogram and the scatter plot.
# The histograms on the diagonal show the distribution of a single variable, while the
# scatter plots in the upper and lower triangles show the relationship (or lack thereof)
# between two variables.

In [None]:
# Pair plot of the cleaned data, with points coloured by the Outcome column

p=sns.pairplot(diabetes_data_copy, hue = 'Outcome')

In [None]:
# Pearson's correlation coefficient measures the strength of the linear association
# between two variables. It ranges from -1 to +1: values close to +1 or -1 indicate a
# strong positive or negative correlation, and 0 means no linear correlation.

# A heat map is a two-dimensional representation of information with the help of
# colors; it makes a correlation matrix easy to read at a glance.

# Heatmap for unclean data
raw_correlations = diabetes_data.corr()
plt.figure(figsize=(18, 16))  # figure size of 18 by 16 inches
p = sns.heatmap(raw_correlations, annot=True, cmap='RdYlGn')  # annotate every cell with its coefficient

In [None]:
# Heatmap for clean data: pairwise correlations of the cleaned frame,
# with every cell annotated with its coefficient.
clean_correlations = diabetes_data_copy.corr()
plt.figure(figsize=(18, 16))
p = sns.heatmap(clean_correlations, annot=True, cmap='RdYlGn')


In [None]:
# Scaling the data: every feature is rescaled such that mean = 0 and standard
# deviation = 1, using z = (x - mean) / std.
# Standardization shifts the distribution of each attribute to have a mean of zero and a
# standard deviation of one (unit variance). It is useful for models that rely on the
# distribution of attributes, and data rescaling in general is an important part of data
# preparation before applying machine learning algorithms.
#
# StandardScaler is already imported in the imports cell at the top of the notebook.
# NOTE(review): the scaler is fitted on the full dataset, before the train/test split,
# so the held-out rows contribute to the scaling statistics.

sc_X = StandardScaler()
features = diabetes_data_copy.drop(columns=["Outcome"])
# Column names and index are taken from the feature frame itself, so they cannot drift
# out of sync with the data and X stays aligned with the label column.
X = pd.DataFrame(sc_X.fit_transform(features), columns=features.columns, index=features.index)

X.head()

In [None]:
# Target vector: the Outcome column of the cleaned frame.
y = diabetes_data_copy.Outcome  # label column

In [None]:
# Why scale the data for KNN?
# It is always advisable to bring all features to the same scale before applying distance-based algorithms such as KNN.
# Otherwise a feature with a larger range can overshadow or completely diminish features with smaller ranges, which hurts the performance of every distance-based model because more weight is given to variables with a higher magnitude.

In [None]:
# Train/test split and cross-validation
#
# Train/test split: keeps unseen data points aside for testing, rather than testing on
# the same points the model was trained on. This captures model performance much better.
#
# Cross-validation: with a single split, a specific type of data point may end up
# entirely in the training or entirely in the testing portion, which distorts the
# estimate. Cross-validation averages over several splits to reduce that risk.
#
# Stratify: makes the split so that the proportion of values in each produced sample
# matches the proportion of values passed to the `stratify` parameter. For example, if y
# is binary with 25% zeros and 75% ones, stratify=y ensures that each part of the random
# split has 25% zeros and 75% ones.
#
# The Outcome classes are imbalanced, so the split is stratified on y as described above.

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=40, stratify=y
)

X_train.shape, X_test.shape, y_train.shape, y_test.shape

In [None]:
# Candidate classifiers. The names are padded to equal width so that the printed score
# lines align. Stochastic estimators get a fixed random_state so re-runs are reproducible.
models = []
models.append(('Logistic Regression              ', LogisticRegression()))
models.append(('Linear Discriminant Analysis     ', LinearDiscriminantAnalysis()))
models.append(('Random Forest Classifier         ', RandomForestClassifier(random_state=40)))
models.append(('KNeighbors Classifier            ', KNeighborsClassifier()))
models.append(('Decision Tree Classifier         ', DecisionTreeClassifier(random_state=40)))
models.append(('Gaussian Naive Bayes             ', GaussianNB()))
models.append(('Support vector machine Classifier', SVC()))

# evaluate each model in turn
results = []
names = []
scoring = 'accuracy'
# The fold definition does not depend on the model, so it is created once.
kfold = model_selection.KFold(n_splits=10)
for name, model in models:
    # Build the pipeline around the model under evaluation. Previously a
    # RandomForestClassifier was hard-coded here, so every entry scored the same estimator.
    pipeline = make_pipeline(StandardScaler(), model)
    # cross_val_score fits its own clones of the pipeline, so no separate fit is needed.
    # Only the training split is used, which keeps X_test / y_test held out for the
    # final evaluation.
    cv_results = model_selection.cross_val_score(pipeline, X_train, y_train, cv=kfold, scoring=scoring)
    results.append(cv_results)
    names.append(name)
    msg = "%s: - (mean.)=%f - (stdev.)=(%f)" % (name, cv_results.mean(), cv_results.std())
    print(msg,"\n")

# boxplot algorithm comparison
fig = plt.figure()
fig.suptitle('Algorithm Comparison')
ax = fig.add_subplot(111)
ax.boxplot(results)
# Strip the alignment padding and rotate the labels so they do not overlap.
ax.set_xticklabels([label.strip() for label in names], rotation=45, ha='right')
ax.set_ylabel(scoring)
plt.show()

In [None]:
from sklearn.metrics import accuracy_score, classification_report

# `plot_confusion_matrix` was removed in scikit-learn 1.2, so importing it
# unconditionally raises ImportError on current versions. Fall back to a thin wrapper
# around its replacement so that any code calling `plot_confusion_matrix` keeps working.
try:
    from sklearn.metrics import plot_confusion_matrix
except ImportError:
    from sklearn.metrics import ConfusionMatrixDisplay

    def plot_confusion_matrix(estimator, X, y_true, **kwargs):
        """Plot a confusion matrix via ConfusionMatrixDisplay.from_estimator.

        Accepts the fitted estimator, the feature matrix and the true labels;
        extra keyword arguments are forwarded unchanged. Returns the
        ConfusionMatrixDisplay produced by from_estimator.
        """
        return ConfusionMatrixDisplay.from_estimator(estimator, X, y_true, **kwargs)

# Fit the scaler + random forest pipeline on the training split and score it on the
# held-out test split. The fixed random_state makes the reported accuracy reproducible.
pipeline = make_pipeline(StandardScaler(), RandomForestClassifier(random_state=40))
pipeline.fit(X_train, y_train)
prediction = pipeline.predict(X_test)

print(f"Accuracy Score : {round(accuracy_score(y_test, prediction) * 100, 2)}%")