In [1]:
#IMPORTING REQUIRED LIBRARIES
from sklearn.model_selection  import train_test_split
from sklearn.neighbors        import KNeighborsClassifier
from sklearn.preprocessing    import LabelEncoder, MinMaxScaler, OneHotEncoder 
from sklearn.model_selection  import KFold
from sklearn.model_selection  import cross_val_score
from sklearn.model_selection  import cross_val_predict
from sklearn                  import metrics
from sklearn.metrics          import precision_score , recall_score , f1_score , confusion_matrix
from sklearn.metrics          import precision_recall_curve , PrecisionRecallDisplay 
from sklearn.metrics          import roc_curve , RocCurveDisplay , roc_auc_score
from sklearn.naive_bayes      import GaussianNB
import math
import numpy as np
import random
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

In [7]:
#IMPORTING DATA
df = pd.read_csv("data.csv" , index_col = 0)
#df.head(10)


In [8]:
#Discretizing the data
nLocation = pd.cut(df.location, bins=10 , labels=np.arange(10), right=False)
df['location'] = nLocation

nCountry = pd.cut(df.country, bins=10 , labels=np.arange(10), right=False)
df['country'] = nCountry

nAge = pd.cut(df.age, bins=3, labels=np.arange(3), right=False)
df['age'] = nAge

df.head(5)


Unnamed: 0,location,country,gender,age,vis_wuhan,from_wuhan,symptom1,symptom2,symptom3,symptom4,symptom5,symptom6,diff_sym_hos,result
0,7,2,1,2,1,0,14,31,19,12,3,1,8,1
1,7,2,0,1,0,1,14,31,19,12,3,1,0,0
2,9,2,1,1,0,1,14,31,19,12,3,1,13,0
3,8,2,0,1,1,0,14,31,19,12,3,1,0,0
4,8,2,1,1,0,0,14,31,19,12,3,1,0,0


In [9]:
#One-Hot_encoding

label_encoder = LabelEncoder()
df['location'] = label_encoder.fit_transform(df['location'])
encoder = OneHotEncoder(sparse=False)
df[['location0','location1', 'location2' , 'location3' , 'location4' , 'location5' , 'location6' , 'location7' , 'location8' , 'location9'  ]] = encoder.fit_transform(df['location'].values.reshape(-1,1))

label_encoderC = LabelEncoder()
df['country'] = label_encoderC.fit_transform(df['country'])
encoderC = OneHotEncoder(sparse=False)
df[['Country0','Country1', 'Country2' , 'Country3' , 'Country4' , 'Country5' , 'Country6' , 'Country7' , 'Country8' , 'Country9']] = encoderC.fit_transform(df['country'].values.reshape(-1,1))

label_encoderG = LabelEncoder()
df['gender'] = label_encoderG.fit_transform(df['gender'])
encoderG = OneHotEncoder(sparse=False)
df[['Male','Female', 'They']] = encoderG.fit_transform(df['gender'].values.reshape(-1,1))

df = df.drop("location", axis=1)
df = df.drop("country", axis=1)
df = df.drop("gender", axis=1)

#df.head(10)

KeyError: "None of [Index(['location0', 'location1', 'location2', 'location3', 'location4',\n       'location5', 'location6', 'location7', 'location8', 'location9'],\n      dtype='object')] are in the [columns]"

In [6]:
#Reordering the columns
df = df[['age', 'vis_wuhan', 'from_wuhan' , 'symptom1' , 'symptom2' , 'symptom3' , 'symptom4' , 'symptom5' , 'symptom6' , 'diff_sym_hos' ,'location0','location1', 'location2' , 'location3' , 'location4' , 'location5' , 'location6' , 'location7' , 'location8' , 'location9' , 'Country0','Country1', 'Country2' , 'Country3' , 'Country4' , 'Country5' , 'Country6' , 'Country7' , 'Country8' , 'Country9' ,'Male','Female' ,'They' , 'result']]
X = df.iloc[:,0:31].values
Y = df.iloc[:, 31:32].values
#df.head(10)
#print(Y)
#print(X)

KeyError: "['Country4', 'Country7', 'location8', 'Country0', 'location6', 'Male', 'Country2', 'Country1', 'Country9', 'Country6', 'Country8', 'Country5', 'location2', 'location7', 'location5', 'They', 'location4', 'location0', 'Female', 'location1', 'location3', 'location9', 'Country3'] not in index"

In [None]:
# NORMALIZATION OF DATA
scaler = MinMaxScaler(feature_range=(0, 1))
nX = scaler.fit_transform(X)
nY = scaler.fit_transform(Y)


df.head(10)

In [None]:
print(nX)
print(nY)

In [None]:
#CROSS VALIDATION

In [None]:
#SPLITTING THE DATA IN 10 FOLDS
fold = KFold(n_splits=10, shuffle=False)

for train_index, test_index in fold.split(nX):
    print("TRAIN:", train_index, "TEST:", test_index)
    X_train, x_test = nX[train_index], nX[test_index]
    print("\n")
    y_train, y_test = nY[train_index], nY[test_index]
    print("__________________________________________________________________________________________")

In [None]:
#CALCULATING THE OPTIMUM HYPERPARAMTER K BY MEASURING THE ACCURACY FOR EACH VALUE OF K FROM 1 TO SQUARE ROOT N-EXAMPLES
k_range = range(1,31)
k_scores = []

for k in k_range:
    knn = KNeighborsClassifier(n_neighbors=k)
    scores = cross_val_score(knn, nX, nY.ravel(), cv=10, scoring='accuracy')
    k_scores.append(scores.mean())
    
for i in range(30):
    print(k_scores[i] , "\n")
    

In [None]:
# MAXIMUM OF THE LIST IS WHERE K IS OPTIMUM WITH HIGHEST ACCURACY VALUE
print('Length of list', len(k_scores))
print('Max of list', max(k_scores))

In [None]:
# PLOTTING THE ACCURACY FOR EACH K
# THE POINT AT WHICH THE ACCURACY IS HIGHEST IS THE OPTIMUM K

plt.plot(k_range, k_scores)
plt.xlabel('Value of K')
plt.ylabel('Accuracy')

In [None]:
# We can deduce that the optimum value of K to be used is 16 

In [None]:
#CALCULATING ACCURACY FOR EACH FOLD
knn = KNeighborsClassifier(n_neighbors=16)
scores = cross_val_score(knn, nX, nY.ravel(), cv=10, scoring='accuracy')
for i in range(10):
    print(scores[i], "\n")


print("__________________________________________________________________________________________")
#AVERAGE SCORES FOR THE 10 FOLDS
KNN_Average = scores.mean()
print("Average of the folds is: " ,scores.mean())

In [None]:
knn = KNeighborsClassifier(n_neighbors=16)
knn.fit(nX,nY.ravel())

In [None]:
#                                PREDICTION
#
#                              | Positive Prediction | Negative Prediction
# ACTUAL        Positive Class | True Positive (TP)  | False Negative (FN)
#               Negative Class | False Positive (FP) | True Negative (TN)

In [None]:
#CALCULATING RECALL AND PRECISION
#Precision = Sum c in C TruePositives_c / Sum c in C (TruePositives_c + FalsePositives_c)
#Recall = Sum c in C TruePositives_c / Sum c in C (TruePositives_c + FalseNegatives_c)

In [None]:
y_train_pred = cross_val_predict(knn,nX,nY.ravel(), cv = 10)



In [None]:
Precision_KNN = precision_score(nY, y_train_pred)
Confusion_Matrix_KNN = confusion_matrix(nY, y_train_pred)
print("The confusion Matrix is: \n " , Confusion_Matrix_KNN)
print("_______________________________________________________________")
print("Precision is " ,Precision_KNN)
print("_______________________________________________________________")
Recall_KNN = recall_score(nY, y_train_pred)
print("Recall is: " ,Recall_KNN)
print("_______________________________________________________________")
F1_KNN = f1_score(nY,y_train_pred)
print("F1 score is: " , F1_KNN)
print("_______________________________________________________________")

In [None]:
y_scores = cross_val_predict(knn,nX,nY.ravel(), cv=10)

In [None]:
precisions, recalls, thresholds = precision_recall_curve(nY, y_scores)

In [None]:
display = PrecisionRecallDisplay.from_estimator(knn,nX,nY)
_ = display.ax_.set_title("Precision Vs Recall curve KNN")

In [None]:
fpr, tpr, threshold = roc_curve(nY, y_scores)
plt.figure(figsize = (8,6))
plt.plot(fpr, tpr, 'b')
plt.plot([0, 1], [0, 1],'r--')
plt.title("KNN")
plt.xlabel("False Positive Rate")
plt.ylabel("True Positive Rate")
plt.show()

In [None]:
print(precisions)
print(recalls)
print(thresholds)

In [None]:
AUC_KNN = roc_auc_score(nY, y_scores)
print("Area Under the Curve is: " , AUC_KNN)

In [None]:
# Logistic Regression

In [None]:
from sklearn.linear_model import LogisticRegression

In [None]:
print(nX)
print("__________________________________________________________________________________________")
print(nY)

In [None]:
log_clf = LogisticRegression()
scores_Log = cross_val_score(log_clf, nX, nY.ravel(), cv=10, scoring='accuracy')
Log_Average = scores_Log.mean()
for i in range(10):
    print(scores_Log[i] , "\n")

In [None]:
print("Logistic Regression Average of the folds is: " ,Log_Average)
print("KNN Average of the folds is                : ", KNN_Average)

In [None]:
log_clf.fit(nX,nY.ravel())
y_train_log = cross_val_predict(log_clf,nX,nY.ravel(), cv = 10)
precisions_log, recalls_log, thresholds_log = precision_recall_curve(nY, y_train_log)
display = PrecisionRecallDisplay.from_estimator(log_clf,nX,nY)
_ = display.ax_.set_title("Precision Vs Recall curve Logistic Regression")

In [None]:
fpr_log, tpr_log, threshold_log = roc_curve(nY, y_train_log)
plt.figure(figsize = (8,6))
plt.plot(fpr_log, tpr_log, 'b')
plt.plot([0, 1], [0, 1],'r--')
plt.title("Logistic Regression")
plt.xlabel("False Positive Rate")
plt.ylabel("True Positive Rate")
plt.show()

In [None]:
AUC_log = roc_auc_score(nY, y_train_log)
print("Area Under the Curve is: " , AUC_log)

In [None]:
print(precisions_log)
print(recalls_log)
print(thresholds_log)

In [None]:
# Comparing Logistic Regression vs KNN Classifiers


In [None]:
plt.figure(figsize = (8,6))
plt.plot(fpr_log, tpr_log, 'b')
plt.plot(fpr, tpr, 'g')
plt.plot([0, 1], [0, 1],'r--')
plt.legend(["Log", "KNN" , "threshold"], loc ="lower right")
plt.xlabel("False Positive Rate")
plt.ylabel("True Positive Rate")
plt.show()

In [None]:
print("Area Under the Curve for logistic Regression Classifier is: " , AUC_log)
print("Area Under the Curve for KNN classifier is                : " , AUC_KNN)

In [None]:
# NAIVE BAYES CLASSIFIER

In [None]:
#print(nX)
#print("__________________________________________________________________________________________")
#print(nY)

In [None]:
nb_clf = GaussianNB(priors = None)
scores_NB = cross_val_score(nb_clf, nX, nY.ravel(), cv=10, scoring='accuracy')
NB_Average = scores_NB.mean()
for i in range(10):
    print(scores_NB[i] , "\n")

  

In [None]:
print("KNN Average of the folds is                : ", KNN_Average)
print("Logistic Regression Average of the folds is: " ,Log_Average)
print("Naive Bayes classifier Average of the folds is: " , NB_Average)  

In [None]:
nb_clf.fit(nX,nY.ravel())
y_train_nb = cross_val_predict(nb_clf,nX,nY.ravel(), cv = 10)
precisions_nb, recalls_nb, thresholds_nb = precision_recall_curve(nY, y_train_nb)
display = PrecisionRecallDisplay.from_estimator(nb_clf,nX,nY)
_ = display.ax_.set_title("Precision Vs Recall curve Naive Bayes Classifier")

In [None]:
fpr_nb, tpr_nb, threshold_nb = roc_curve(nY, y_train_nb)
plt.figure(figsize = (8,6))
plt.plot(fpr_nb, tpr_nb, 'b')
plt.plot([0, 1], [0, 1],'r--')
plt.title("Naive Bayes")
plt.xlabel("False Positive Rate")
plt.ylabel("True Positive Rate")
plt.show()

In [None]:
AUC_nb = roc_auc_score(nY, y_train_nb)
print("Area Under the Curve is: " , AUC_nb)

In [None]:
print(precisions_nb)
print(recalls_nb)
print(thresholds_nb)

In [None]:
plt.figure(figsize = (8,6))
plt.plot(fpr_log, tpr_log, 'b')
plt.plot(fpr, tpr, 'g')
plt.plot(fpr_nb, tpr_nb, 'y')
plt.plot([0, 1], [0, 1],'r--')
plt.legend(["Log", "KNN" ,"Naive Bayes" ,  "threshold"], loc ="lower right")
plt.xlabel("False Positive Rate")
plt.ylabel("True Positive Rate")
plt.show()

In [None]:
print("Area Under the Curve for logistic Regression Classifier is: " , AUC_log)
print("Area Under the Curve for KNN classifier is                : " , AUC_KNN)
print("Area Under the Curve for Naive Bayes classifier is        : " , AUC_nb)


In [None]:
# Precision vs Recall Curves 

display = PrecisionRecallDisplay.from_estimator(knn,nX,nY)
_ = display.ax_.set_title("Precision Vs Recall curve KNN")

display = PrecisionRecallDisplay.from_estimator(log_clf,nX,nY)
_ = display.ax_.set_title("Precision Vs Recall curve Logistic Regression")

display = PrecisionRecallDisplay.from_estimator(nb_clf,nX,nY)
_ = display.ax_.set_title("Precision Vs Recall curve Naive Bayes Classifier")