In [None]:
from mlxtend.plotting import plot_decision_regions
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
sns.set()
import warnings
warnings.filterwarnings('ignore')
%matplotlib inline
import os
#plt.style.use('ggplot')
#ggplot is R based visualisation package that provides better graphics with higher level of abstraction

## Basic Data Science and ML Pipeline

In [None]:
# Read the CSV file into a DataFrame.
# NOTE(review): the file name contains a space before '.csv' — confirm it matches the file on disk.
csv_path = '/kaggle/input/dataset-dental-panoramic/dataset_dental_panoramic .csv'
diabetes_data = pd.read_csv(csv_path)

# Preview the first 5 rows (head() defaults to 5).
diabetes_data.head()

## Basic EDA and statistical analysis


In [None]:
## Summary of the raw frame: data types, columns, null value counts, memory usage etc.
## verbose=True asks for the full per-column listing.
## function reference : https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.info.html
diabetes_data.info(verbose=True)

In [None]:
## Basic descriptive statistics of the raw frame (only numerical columns are shown unless include="all" is passed)
## for reference: https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.describe.html
diabetes_data.describe()

## Also see :
## to return columns of a specific dtype: https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.select_dtypes.html

In [None]:
# Same summary, transposed: one row per column of the data, one column per statistic.
diabetes_data.describe().T

#### Zeros in the feature columns are treated as missing readings: replacing them with NaN makes them easy to count, and they can then be imputed with suitable values

In [None]:
# Work on a deep copy so the raw frame (diabetes_data) stays untouched.
diabetes_data_copy = diabetes_data.copy(deep=True)

# Columns in which a literal 0 is treated as "missing":
# six feature families, each with the suffixes 0, 45, 90 and 135.
zero_as_missing_cols = [
    '{}_{}'.format(family, suffix)
    for family in ('dissimilarity', 'correlation', 'homogeneity', 'contrast', 'ASM', 'energy')
    for suffix in (0, 45, 90, 135)
]

# np.nan is the supported spelling; the np.NaN alias was removed in NumPy 2.0.
diabetes_data_copy[zero_as_missing_cols] = (
    diabetes_data_copy[zero_as_missing_cols].replace(0, np.nan)
)

## showing the count of Nans
print(diabetes_data_copy.isnull().sum())

#### To fill these Nan values the data distribution needs to be understood

In [None]:
# Histograms of the raw frame (diabetes_data, not the NaN-substituted copy) on a 20x20 figure.
p = diabetes_data.hist(figsize = (20,20))

### Aiming to impute nan values for the columns in accordance with their distribution

In [None]:
# Impute the NaNs introduced above, column by column:
#   * columns suffixed _0 and _45  -> filled with that column's mean
#   * columns suffixed _90 and _135 -> filled with that column's median
#
# The result is assigned back to the frame instead of calling
# `frame[col].fillna(..., inplace=True)`: that form operates on a selected
# column, which raises a chained-assignment warning on recent pandas and
# leaves the frame unchanged when Copy-on-Write is enabled.
feature_families = ('dissimilarity', 'correlation', 'homogeneity', 'contrast', 'ASM', 'energy')
mean_filled_cols = ['{}_{}'.format(family, suffix)
                    for family in feature_families for suffix in (0, 45)]
median_filled_cols = ['{}_{}'.format(family, suffix)
                      for family in feature_families for suffix in (90, 135)]

# fillna with a Series fills each column with the value stored under its own name.
diabetes_data_copy[mean_filled_cols] = diabetes_data_copy[mean_filled_cols].fillna(
    diabetes_data_copy[mean_filled_cols].mean()
)
diabetes_data_copy[median_filled_cols] = diabetes_data_copy[median_filled_cols].fillna(
    diabetes_data_copy[median_filled_cols].median()
)


## Plotting after Nan removal 

In [None]:
# Histograms of the imputed copy, for comparison with the histograms of the raw frame.
p = diabetes_data_copy.hist(figsize = (20,20))

In [None]:
## observing the shape of the raw data as a (rows, columns) tuple
diabetes_data.shape

In [None]:
## data type analysis
# Count how many columns share each dtype. The dtypes are converted to their
# string names so seaborn receives plain categorical labels, and no `data=`
# frame is passed because `y` is already a vector (one entry per column, not
# per row of the frame).
dtype_names = diabetes_data.dtypes.astype(str)
sns.countplot(y=dtype_names)
plt.xlabel("count of each data type")
plt.ylabel("data types")
plt.show()

In [None]:
## null count analysis
# Bar chart of the raw frame drawn with missingno.
# Note: this is the raw frame, in which zeros have not been converted to NaN.
import missingno as msno
p=msno.bar(diabetes_data)


In [None]:
## Class balance: count the rows per outcome value, print the counts and plot them as bars
# Colour lookup keyed by (outcome + 1). `colors` is not passed to the plot call in this cell.
color_wheel = {
    1: "#0392cf",
    2: "#7bc043",
}
colors = diabetes_data["outcome"].map(lambda label: color_wheel.get(label + 1))

outcome_counts = diabetes_data.outcome.value_counts()
print(outcome_counts)
p = outcome_counts.plot(kind="bar")


In [None]:
# scatter_matrix lives in pandas.plotting; the old pandas.tools.plotting
# module no longer exists in current pandas releases.
from pandas.plotting import scatter_matrix

# Pairwise scatter plots of the columns of the raw frame on a 25x25 figure.
p = scatter_matrix(diabetes_data, figsize=(25, 25))

In [None]:
# Pairwise plots of the imputed copy, coloured by the 'outcome' column.
p=sns.pairplot(diabetes_data_copy, hue = 'outcome')

In [None]:
# Correlation heatmap of the raw frame on a 12x10 figure, with each coefficient annotated.
fig, ax = plt.subplots(figsize=(12, 10))
raw_corr = diabetes_data.corr()
p = sns.heatmap(raw_corr, annot=True, cmap='RdYlGn', ax=ax)

In [None]:
# Correlation heatmap of the imputed copy on a 12x10 figure, with each coefficient annotated.
fig, ax = plt.subplots(figsize=(12, 10))
imputed_corr = diabetes_data_copy.corr()
p = sns.heatmap(imputed_corr, annot=True, cmap='RdYlGn', ax=ax)

## Scaling the data 
Each feature is standardised so that it has mean μ = 0 and standard deviation σ = 1, using the formula $z = \frac{x - \mu}{\sigma}$:
![](https://cdn-images-1.medium.com/max/800/0*PXGPVYIxyI_IEHP7.)


#### to learn more about scaling techniques
https://medium.com/@rrfd/standardize-or-normalize-examples-in-python-e3f174b65dfc
https://machinelearningmastery.com/rescaling-data-for-machine-learning-in-python-with-scikit-learn/

In [None]:
from sklearn.preprocessing import StandardScaler

# Standardise every column except the target ("outcome").
# The column labels are taken from the frame itself rather than from a
# hand-written list, so they cannot drift out of sync with the data.
# NOTE(review): the scaler is fitted on all rows, including those that the
# later train/test split holds out for testing — consider fitting it on the
# training rows only.
sc_X = StandardScaler()
feature_frame = diabetes_data_copy.drop(["outcome"], axis=1)
X = pd.DataFrame(sc_X.fit_transform(feature_frame), columns=feature_frame.columns)

In [None]:
# Preview the first rows of the scaled feature frame.
X.head()

In [None]:
# Target vector: the 'outcome' column of the imputed copy.
y = diabetes_data_copy["outcome"]

## Test Train Split and Cross Validation methods



***Train Test Split*** : Holding out data points that the model has never seen lets us test the model on them, rather than on the same points it was trained with. This gives a much better picture of the model's performance.

![](https://cdn-images-1.medium.com/max/1600/1*-8_kogvwmL1H6ooN1A1tsQ.png)

***Cross Validation***: When the data is split once into training and testing portions, it is possible that a specific type of data point ends up entirely in either the training or the testing portion, which would lead the model to perform poorly. Cross-validation repeats the split over several folds, which helps detect over-fitting and under-fitting problems.

![](https://cdn-images-1.medium.com/max/1600/1*4G__SV580CxFj78o9yUXuQ.png)


***About Stratify*** : Stratify parameter makes a split so that the proportion of values in the sample produced will be the same as the proportion of values provided to parameter stratify.

For example, if variable y is a binary categorical variable with values 0 and 1 and there are 25% of zeros and 75% of ones, stratify=y will make sure that your random split has 25% of 0's and 75% of 1's.

For Reference : https://towardsdatascience.com/train-test-split-and-cross-validation-in-python-80b61beca4b6

In [None]:
from sklearn.model_selection import train_test_split

# Hold out one third of the rows for testing. The split is stratified on y
# and uses a fixed random_state so it is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=1/3,
    random_state=42,
    stratify=y,
)

In [None]:
from sklearn.neighbors import KNeighborsClassifier

# Score a KNN classifier on the training and the test split for every k from 1 to 14.
# Position 0 of each list corresponds to k = 1.
train_scores = []
test_scores = []

for i in range(1, 15):
    knn = KNeighborsClassifier(n_neighbors=i).fit(X_train, y_train)
    train_scores.append(knn.score(X_train, y_train))
    test_scores.append(knn.score(X_test, y_test))

In [None]:
## Best score obtained on the same datapoints that were used for training,
## together with every k that reaches it
max_train_score = max(train_scores)
train_scores_ind = [pos for pos, score in enumerate(train_scores) if score == max_train_score]
best_train_k = [pos + 1 for pos in train_scores_ind]  # list position 0 holds k = 1
print('Max train score {} % and k = {}'.format(max_train_score*100, best_train_k))

In [None]:
## Best score obtained on the datapoints that were set aside for testing only,
## together with every k that reaches it
max_test_score = max(test_scores)
test_scores_ind = [pos for pos, score in enumerate(test_scores) if score == max_test_score]
best_test_k = [pos + 1 for pos in test_scores_ind]  # list position 0 holds k = 1
print('Max test score {} % and k = {}'.format(max_test_score*100, best_test_k))

## Result Visualisation

In [None]:
# Train and test scores against k (1..14).
# x and y are passed by keyword: recent seaborn releases accept only `data`
# positionally, so the positional form raises a TypeError there.
plt.figure(figsize=(12,5))
k_values = list(range(1, 15))
p = sns.lineplot(x=k_values, y=train_scores, marker='*', label='Train Score')
p = sns.lineplot(x=k_values, y=test_scores, marker='o', label='Test Score')
p.set_xlabel('k (number of neighbours)')
p.set_ylabel('score')
plt.show()

#### The best result is captured at k = 11 hence 11 is used for the final model

In [None]:
# Final model: KNN with k = 11, fitted on the training split;
# the cell's output is its score on the test split.
knn = KNeighborsClassifier(n_neighbors=11)
knn.fit(X_train, y_train)

knn.score(X_test, y_test)

In [None]:
## trying to plot decision boundary 

In [None]:
# Decision regions over the first two features (indices 0 and 1). Every other
# feature (indices 2..23) gets the same filler value and filler range.
# NOTE(review): X was produced by StandardScaler, so a filler value of 20000
# is far from the scaled feature values — confirm this is intended.
value = 20000
width = 20000
filler_idx = range(2, 24)

plot_decision_regions(
    X.values,
    y.values,
    clf=knn,
    legend=2,
    filler_feature_values={idx: value for idx in filler_idx},
    filler_feature_ranges={idx: width for idx in filler_idx},
    X_highlight=X_test.values,  # highlight the test rows
)

plt.title('KNN with Diabetes Data')
plt.show()

# Model Performance Analysis

## 1. Confusion Matrix

The confusion matrix is a technique for summarizing the performance of a classification algorithm: it tabulates the actual classes against the predicted classes. The example below uses binary outputs.
![](https://cdn-images-1.medium.com/max/1600/0*-GAP6jhtJvt7Bqiv.png)



### ***In the famous cancer example***:


###### Cases in which the doctor predicted YES (they have the disease), and they do have the disease will be termed as TRUE POSITIVES (TP). The doctor has correctly predicted that the patient has the disease.

###### Cases in which the doctor predicted NO (they do not have the disease), and they don’t have the disease will be termed as TRUE NEGATIVES (TN). The doctor has correctly predicted that the patient does not have the disease.

###### Cases in which the doctor predicted YES, and they do not have the disease will be termed as FALSE POSITIVES (FP). Also known as “Type I error”.

###### Cases in which the doctor predicted NO, and they have the disease will be termed as FALSE NEGATIVES (FN). Also known as “Type II error”.

![](https://cdn-images-1.medium.com/max/1600/0*9r99oJ2PTRi4gYF_.jpg)

For Reference: https://medium.com/@djocz/confusion-matrix-aint-that-confusing-d29e18403327

In [None]:
from sklearn.metrics import confusion_matrix

# Predictions for the test split from the classifier fitted above.
y_pred = knn.predict(X_test)

# Not the last expression of the cell, so this matrix is computed but not displayed.
confusion_matrix(y_test, y_pred)

# True vs predicted labels as a table; margins=True adds row and column totals.
pd.crosstab(
    index=y_test,
    columns=y_pred,
    rownames=['True'],
    colnames=['Predicted'],
    margins=True,
)

In [None]:
from sklearn import metrics

# Confusion matrix of the test-split predictions, drawn as an annotated heatmap.
y_pred = knn.predict(X_test)
cnf_matrix = metrics.confusion_matrix(y_test, y_pred)
cnf_frame = pd.DataFrame(cnf_matrix)

p = sns.heatmap(cnf_frame, annot=True, cmap="YlGnBu", fmt='g')
plt.ylabel('Actual label')
plt.title('Confusion matrix', y=1.1)
plt.xlabel('Predicted label')

## 2. Classification Report

Report which includes Precision, Recall and F1-Score.


#### Precision Score
        TP – True Positives
        FP – False Positives

        Precision – Accuracy of positive predictions.
        Precision = TP/(TP + FP)
        
   
#### Recall Score
        FN – False Negatives

        Recall(sensitivity or true positive rate): Fraction of positives that were correctly identified.
        Recall = TP/(TP+FN)
        
#### F1 Score
        F1 Score (aka F-Score or F-Measure) – A helpful metric for comparing two classifiers.
        F1 Score takes into account precision and the recall. 
        It is created by finding the harmonic mean of precision and recall.

        F1 = 2 x (precision x recall)/(precision + recall)
        
        
        
> > ***Precision*** - Precision is the ratio of correctly predicted positive observations to the total predicted positive observations. The question that this metric answers is: of all passengers that were labeled as survived, how many actually survived? High precision relates to a low false positive rate. We have got a precision of 0.788, which is pretty good.
> > 
> > Precision = TP/(TP+FP)
> > 
> > ***Recall (Sensitivity)*** - Recall is the ratio of correctly predicted positive observations to all observations in the actual positive ("yes") class. The question recall answers is: of all the passengers that truly survived, how many did we label as survived? A recall greater than 0.5 is good.
> > 
> > Recall = TP/(TP+FN)
> > 
> > ***F1 score*** - F1 Score is the harmonic mean of Precision and Recall. Therefore, this score takes both false positives and false negatives into account. Intuitively it is not as easy to understand as accuracy, but F1 is usually more useful than accuracy, especially if you have an uneven class distribution. Accuracy works best if false positives and false negatives have similar cost. If the costs of false positives and false negatives are very different, it’s better to look at both Precision and Recall. 
> > 
> > F1 Score = 2*(Recall * Precision) / (Recall + Precision)
        
> ****

In [None]:
from sklearn.metrics import classification_report

# Text report for the test-split predictions.
report_text = classification_report(y_test, y_pred)
print(report_text)

In [None]:
from sklearn.metrics import roc_curve

# Column 1 of predict_proba is used as the score for the ROC curve.
class_proba = knn.predict_proba(X_test)
y_pred_proba = class_proba[:, 1]
fpr, tpr, thresholds = roc_curve(y_test, y_pred_proba)

In [None]:
# ROC curve of the classifier, with the dashed diagonal as a reference line.
fig, ax = plt.subplots()
ax.plot([0, 1], [0, 1], 'k--')
ax.plot(fpr, tpr, label='Knn')
ax.set_xlabel('fpr')
ax.set_ylabel('tpr')
ax.set_title('Knn(n_neighbors=11) ROC curve')
plt.show()

In [None]:
from sklearn.metrics import roc_auc_score

# Area under the ROC curve for the test split.
roc_auc_score(y_true=y_test, y_score=y_pred_proba)

In [None]:
from sklearn.model_selection import GridSearchCV

# For KNN the hyper-parameter being tuned is n_neighbors: try every value from 1 to 49
# with 5-fold cross-validation.
# NOTE(review): this rebinds `knn`, replacing the fitted k = 11 model from the earlier cell,
# and the search is fitted on all of X and y, including the rows held out as the test split.
param_grid = {'n_neighbors': np.arange(1, 50)}
knn = KNeighborsClassifier()
knn_cv = GridSearchCV(estimator=knn, param_grid=param_grid, cv=5)
knn_cv.fit(X, y)

print("Best Score:" + str(knn_cv.best_score_))
print("Best Parameters: " + str(knn_cv.best_params_))