In [None]:
#importing the necessary modules
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE
from sklearn.metrics import f1_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import roc_curve
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler

In [None]:
# Read the stroke data set from the CSV file in the working directory.
data = pd.read_csv(filepath_or_buffer='stroke_data.csv')
# Preview the first five records.
data.head(n=5)


Έχουμε 12 χαρακτηριστικά. Το χαρακτηριστικό id είναι ένας μοναδικός αριθμός που δίνεται σε κάθε ασθενή, οπότε δεν χρειάζεται για την ανάλυσή μας. Για αυτόν τον λόγο διαγράφουμε την συγκεκριμένη στήλη.

In [None]:
# The patient id is only an identifier, so remove that column from `data` in place.
data.drop(labels='id', axis='columns', inplace=True)
# Preview the first five records without the id column.
data.head(n=5)

Για τα 11 χαρακτηριστικά που έμειναν έχουμε:

**Gender**: Το φύλο του ασθενούς Male/Female/Other

**Age**: Η ηλικία του ασθενούς

**Hypertension**: Αν ο ασθενής έχει (1) ή δεν έχει (0) υπέρταση

**Heart Disease**: Αν ο ασθενής έχει (1) ή δεν έχει (0) κάποια ασθένεια της καρδιάς

**Ever Married**: Αν έχει παντρευτεί ποτέ ο ασθενής Yes/No

**Work Type**: Ο τύπος εργασίας του ασθενούς children/Govt_job/Never_worked/Private/Self-employed

**Residence Type**: Περιβάλλον διαβίωσης του ασθενούς Rural/Urban

**Average Glucose Level**: Μέσο επίπεδο γλυκόζης στο αίμα του ασθενούς

**BMI**: Δείκτης μάζας σώματος του ασθενούς

**Smoking Status**: Σχέση του ασθενούς με το κάπνισμα formerly smoked/never smoked/smokes/Unknown

**Stroke**: Ο ασθενής είχε (1) ή δεν είχε (0) πάθει εγκεφαλικό. Πρόκειται για τον δείκτη που θέλουμε να προβλέψουμε


Πριν προχωρήσουμε στην κατασκευή των δεδομένων πρέπει να κοιτάξουμε αν υπάρχουν ελλιπή δεδομένα, που μπορεί να επηρεάσουν την απόδοση των μοντέλων μας. Αυτό κάνουμε στη συνέχεια.

In [None]:
# Names of every column/attribute in the data set (reused by later cells).
columns = data.columns.tolist()
# Report how many missing (NaN) entries each column contains.
for name in columns:
    nans = int(data[name].isna().sum())
    print(f'There are {nans} NaN values in column {name}')

Με βάση το προηγούμενο αποτέλεσμα βλέπουμε ότι NaN τιμές υπάρχουν μόνο στο χαρακτηριστικό που αφορά τον δείκτη μάζας σώματος. Άρα πρέπει να καθαρίσουμε αυτές τις ελλιπείς τιμές. Για να το κάνουμε αυτό αντικαθιστούμε αυτές τις NaN τιμές με την πιο αντιπροσωπευτική τιμή αυτού του χαρακτηριστικού, που είναι η μέση τιμή των υπαρχουσών τιμών.

In [None]:
# Impute the missing BMI values with the mean of the observed BMI values.

# Series holding only the non-NaN BMI values.
not_nan_data = data.loc[data['bmi'].notna(), 'bmi']

# Mean of the observed BMI values.
mean_bmi = not_nan_data.mean()

# Assign the filled column back instead of calling fillna(inplace=True) on
# data['bmi']: the in-place call acts on an intermediate object, which raises a
# chained-assignment warning on recent pandas and leaves `data` unchanged when
# Copy-on-Write is enabled.
data['bmi'] = data['bmi'].fillna(mean_bmi)

# Re-check every column to confirm that no NaN values remain.
for name in columns:
    nans = data.loc[data[name].isna(), name].size
    print(f'There are {nans} NaN values in column {name}')


# **Stroke**

In [None]:
# Class balance of the target attribute `stroke` (the label to be predicted).
stroke_column = data['stroke']
stroked = int((stroke_column == 1).sum())      # rows with stroke == 1
not_stroked = int((stroke_column == 0).sum())  # rows with stroke == 0

# Values, labels and wedge offsets shared by the two charts below.
stroke_values = [not_stroked, stroked]
stroke_labels = ['No Stroke', 'Stroke']
stroke_explode = [0.1, 0.1]

# Pie chart of the class distribution, saved to disk and then displayed.
plt.figure()
plt.pie(
    stroke_values,
    explode=stroke_explode,
    labels=stroke_labels,
    autopct='%1.3f%%',
)
plt.title('Pie chart with the distribution of classes')
plt.xlabel(f'Stroke={stroked}, No stroke={not_stroked}')
plt.savefig(fname='Stroke_pie_chart.png', format='png')
plt.show()

# Bar plot with the number of rows per class label.
plt.figure()
plt.bar(stroke_labels, stroke_values, width=0.6)
plt.title('Number of rows for each class label')
plt.xlabel('0=No stroke, 1=Stroke')
plt.savefig(fname='Stroke_bar_plot.png', format='png')
plt.show()

# **Gender**

In [None]:
# Gender distribution: whole data set, patients without a stroke, patients with a stroke.
gender_labels = ['Male', 'Female', 'Other']  # wedge labels (identical to the stored category values)
explode = [0.075, 0.075, 0.1]                # wedge offsets for the pie charts


def _gender_counts(frame):
    """Return the [male, female, other] row counts found in ``frame``."""
    gender_column = frame['gender']
    return [int((gender_column == label).sum()) for label in gender_labels]


def _gender_pie(sizes, title, fname):
    """Draw one gender pie chart from ``sizes``, save it as ``fname`` and show it."""
    n_male, n_female, n_other = sizes
    plt.figure()
    plt.pie(sizes, explode=explode, labels=gender_labels, autopct='%1.5f%%', shadow=True)
    plt.title(title)
    plt.xlabel(f'Males={n_male}, Females={n_female}, Other={n_other}')
    plt.savefig(fname=fname, format='png')
    plt.show()


# Whole data set.
males, females, other = _gender_counts(data)
pie_gender_sizes = [males, females, other]
_gender_pie(pie_gender_sizes, 'Percentage of genders in the data set', 'genders_pie_all.png')

# Patients with stroke == 0.
males_0, females_0, other_0 = _gender_counts(data.loc[data['stroke'] == 0])
pie_gender_sizes_0 = [males_0, females_0, other_0]
_gender_pie(pie_gender_sizes_0, 'Percentage of genders without a stroke', 'genders_pie_no_stroke.png')

# Patients with stroke == 1.
males_1, females_1, other_1 = _gender_counts(data.loc[data['stroke'] == 1])
pie_gender_sizes_1 = [males_1, females_1, other_1]
_gender_pie(pie_gender_sizes_1, 'Percentage of genders which suffered a stroke', 'genders_pie_stroke.png')

# **Age**


In [None]:
# Age distribution as box plots: all patients, stroke patients, non-stroke patients.
all_data = data['age']
had_stroke = data['stroke'] == 1
age_stroke = all_data[had_stroke]                # ages of patients with stroke == 1
age_no_stroke = all_data[data['stroke'] == 0]    # ages of patients with stroke == 0
age_labels = ['Age', 'Stroke', 'No Stroke']      # one tick label per box

# Draw the three boxes side by side, marking the mean of each group.
plt.figure()
plt.boxplot(
    [all_data, age_stroke, age_no_stroke],
    labels=age_labels,
    patch_artist=True,
    showmeans=True,
)
plt.ylabel('Age')
plt.title('Boxplots for the Age attribute')
plt.grid()
plt.savefig(fname='age_boxplots.png', format='png')
plt.show()

In [None]:
# Median age per outcome group.
stroke_median_age = data.loc[data['stroke'] == 1, 'age'].median()
no_stroke_median_age = data.loc[data['stroke'] == 0, 'age'].median()
print(f'Median age of patients who suffered a stroke: {stroke_median_age}')
print(f'Median age of patients with no stroke: {no_stroke_median_age}')

Παρατηρούμε ότι οι ασθενείς που έχουν πάθει εγκεφαλικό είναι κατά μέσο όρο μεγαλύτερης ηλικίας σε σχέση με αυτούς που δεν έχουν πάθει εγκεφαλικό. Με βάση αυτό καταλαβαίνουμε ότι το χαρακτηριστικό της ηλικίας θα είναι ένας από τους πιο σημαντικούς predictors για τα μοντέλα που θα κατασκευάσουμε. Επίσης αξίζει να σχολιάσουμε ότι στους ασθενείς που έχουν υποστεί εγκεφαλικό υπάρχουν και 2 outliers που αφορούν ασθενείς που έπαθαν το εγκεφαλικό σε νεαρή ηλικία. Στη συνέχεια εντοπίζουμε τις εγγραφές αυτών των outliers.

In [None]:
# Locate the young stroke patients (age < 20) that show up as outliers in the box plot.
# Both comparisons are parenthesised: `&` binds tighter than `==`, so without the
# parentheses the expression is parsed as ((age < 20) & stroke) == 1.
outlier = data.loc[(data['age'] < 20) & (data['stroke'] == 1)]
print(outlier)

Κλινικά δεν είναι αναμενόμενο να εμφανίζουν εγκεφαλικό αυτοί οι ασθενείς, οπότε είναι πολύ πιθανό να έχουν γίνει λάθος καταγραφές. Για αυτόν τον λόγο διαγράφουμε αυτές τις εγγραφές.

In [None]:
# Drop the young stroke patients flagged as outliers.
# The rows are selected by the same condition used to find them (age < 20 and
# stroke == 1) rather than by hard-coded index labels, so the cell stays correct
# if it is re-run after the index has been reset.
# NOTE(review): assumes this condition matches exactly the two rows previously
# removed by label (162 and 245) - confirm against the printed outliers.
outlier_mask = (data['age'] < 20) & (data['stroke'] == 1)
data.drop(index=data.index[outlier_mask], inplace=True)

In [None]:
# Renumber the rows after the deletion. The previous index is kept as a new
# 'index' column (drop=False is the default behaviour, spelled out here).
data.reset_index(drop=False, inplace=True)

Ξανακάνουμε τα boxplot για να δούμε ότι δεν υπάρχουν outliers πλέον

In [None]:
# Same age box plots as before, redrawn from the current contents of `data`.
all_data = data['age']
age_stroke = all_data[data['stroke'] == 1]      # ages of patients with stroke == 1
age_no_stroke = all_data[data['stroke'] == 0]   # ages of patients with stroke == 0
age_labels = ['Age', 'Stroke', 'No Stroke']     # one tick label per box

# Draw the three boxes side by side, marking the mean of each group.
plt.figure()
plt.boxplot(
    [all_data, age_stroke, age_no_stroke],
    labels=age_labels,
    patch_artist=True,
    showmeans=True,
)
plt.ylabel('Age')
plt.title('Boxplots for the Age attribute')
plt.grid()
plt.savefig(fname='age_boxplot_no_outliers.png', format='png')
plt.show()

# Hypertension

In [None]:
# Hypertension flag: overall distribution and distribution per stroke outcome.
hypertension_labels = ['No Hypertension', 'Hypertension']  # wedge/bar labels
explode = [0.075, 0.075]                                   # wedge offsets for the pie charts


def _hypertension_counts(frame):
    """Return (rows with hypertension == 1, rows with hypertension == 0) for ``frame``."""
    flag = frame['hypertension']
    return int((flag == 1).sum()), int((flag == 0).sum())


def _hypertension_pie(sizes, title, fname):
    """Draw one pie chart from ``sizes`` ([without, with]), save it as ``fname`` and show it."""
    n_without, n_with = sizes
    plt.figure()
    plt.pie(sizes, explode=explode, labels=hypertension_labels, autopct='%1.2f%%', shadow=True)
    plt.title(title)
    plt.xlabel(f'Hypertension={n_with}, No Hypertension={n_without}')
    plt.savefig(fname=fname, format='png')
    plt.show()


# Whole data set.
hypertension, no_hypertension = _hypertension_counts(data)
pie_hypertension_sizes = [no_hypertension, hypertension]
_hypertension_pie(pie_hypertension_sizes, 'Hypertension Percentages', 'hypertension_pie_all.png')

# Patients with stroke == 0.
hypertension_0, no_hypertension_0 = _hypertension_counts(data.loc[data['stroke'] == 0])
pie_hypertension_sizes_0 = [no_hypertension_0, hypertension_0]
_hypertension_pie(
    pie_hypertension_sizes_0,
    'Hypertension Percentages for Patients with no stroke',
    'hypertension_pie_no_stroke.png',
)

# Patients with stroke == 1.
hypertension_1, no_hypertension_1 = _hypertension_counts(data.loc[data['stroke'] == 1])
pie_hypertension_sizes_1 = [no_hypertension_1, hypertension_1]
_hypertension_pie(
    pie_hypertension_sizes_1,
    'Hypertension Percentages for Patients who suffered a stroke',
    'hypertension_pie_stroke.png',
)

# Bar plot of the overall counts.
plt.figure()
plt.bar(hypertension_labels, pie_hypertension_sizes, width=0.6)
plt.xlabel('0=No hypertension, 1=Hypertension')
plt.title('Bar plot for the Hypertension attribute')
plt.savefig(fname='hypertension_bar_plot.png', format='png')
plt.show()

# Heart Disease

In [None]:
# Heart-disease flag: overall distribution and distribution per stroke outcome.
hd_labels = ['No Heart Disease', 'Heart Disease']  # wedge/bar labels
explode = [0.075, 0.075]                           # wedge offsets for the pie charts


def _hd_counts(frame):
    """Return (rows with heart_disease == 1, rows with heart_disease == 0) for ``frame``."""
    flag = frame['heart_disease']
    return int((flag == 1).sum()), int((flag == 0).sum())


def _hd_pie(sizes, title, fname):
    """Draw one pie chart from ``sizes`` ([without, with]), save it as ``fname`` and show it."""
    n_without, n_with = sizes
    plt.figure()
    plt.pie(sizes, explode=explode, labels=hd_labels, autopct='%1.2f%%', shadow=True)
    plt.title(title)
    plt.xlabel(f'Heart Disease={n_with}, No Heart Disease={n_without}')
    plt.savefig(fname=fname, format='png')
    plt.show()


# Whole data set.
hd, no_hd = _hd_counts(data)
pie_hd_sizes = [no_hd, hd]
_hd_pie(pie_hd_sizes, 'Heart Disease Percentages', 'hd_pie_all.png')

# Patients with stroke == 0.
hd_0, no_hd_0 = _hd_counts(data.loc[data['stroke'] == 0])
pie_hd_sizes_0 = [no_hd_0, hd_0]
_hd_pie(pie_hd_sizes_0, 'Heart disease Percentages for Patients with no stroke', 'hd_pie_no_Stroke.png')

# Patients with stroke == 1.
hd_1, no_hd_1 = _hd_counts(data.loc[data['stroke'] == 1])
pie_hd_sizes_1 = [no_hd_1, hd_1]
_hd_pie(pie_hd_sizes_1, 'Heart disease Percentages for Patients who suffered a stroke', 'hd_pie_stroke.png')

# Bar plot of the overall counts.
plt.figure()
plt.bar(hd_labels, pie_hd_sizes, width=0.6)
plt.xlabel('0=No Heart Disease, 1=Heart Disease')
plt.title('Bar plot for the Heart Disease attribute')
plt.savefig(fname='hd_bar_plot.png', format='png')
plt.show()

# Ever Married

In [None]:
# Marital status: overall distribution and distribution per stroke outcome.
pie_labels = ['Never married', 'Married']  # wedge labels
explode = [0.00, 0.00]                     # no wedge offset for these charts


def _marriage_counts(frame):
    """Return (rows with ever_married == 'Yes', rows with ever_married == 'No') for ``frame``."""
    status = frame['ever_married']
    return int((status == 'Yes').sum()), int((status == 'No').sum())


def _marriage_pie(sizes, title, fname):
    """Draw one pie chart from ``sizes`` ([never married, married]), save it and show it."""
    n_single, n_married = sizes
    plt.figure()
    plt.pie(sizes, explode=explode, labels=pie_labels, autopct='%1.2f%%', shadow=True)
    plt.title(title)
    plt.xlabel(f'Married={n_married}, Never Married={n_single}')
    plt.savefig(fname=fname, format='png')
    plt.show()


# Whole data set.
married, single = _marriage_counts(data)
pie_sizes = [single, married]
_marriage_pie(pie_sizes, 'Marriage Percentages', 'married_pie_all.png')

# Patients with stroke == 0.
married_0, single_0 = _marriage_counts(data.loc[data['stroke'] == 0])
pie_sizes_0 = [single_0, married_0]
_marriage_pie(pie_sizes_0, 'Marriage Percentages for the patients with no stroke', 'married_pie_no_Stroke.png')

# Patients with stroke == 1.
married_1, single_1 = _marriage_counts(data.loc[data['stroke'] == 1])
pie_sizes_1 = [single_1, married_1]
_marriage_pie(pie_sizes_1, 'Marriage Percentages for the patients who suffered a stroke', 'married_pie_stroke.png')

# Work Type

In [None]:
# Work type: overall distribution and distribution per stroke outcome.
# The chart labels previously misspelled "Government" as "Goverment"; the
# spelling is corrected here in both the wedge labels and the axis captions.
_work_type_values = ['children', 'Govt_job', 'Never_worked', 'Private', 'Self-employed']  # stored categories
work_labels = ['Children', 'Government', 'Never Worked', 'Private', 'Self-Employed']     # wedge labels
explode = [0, 0.15, 0.55, 0.05, 0]  # pull the small wedges outwards so their labels stay readable


def _work_type_counts(frame):
    """Return the row count of each work type in ``frame``, ordered like ``work_labels``."""
    work_column = frame['work_type']
    return [int((work_column == value).sum()) for value in _work_type_values]


def _work_type_pie(sizes, title, fname):
    """Draw one work-type pie chart from ``sizes``, save it as ``fname`` and show it."""
    n_children, n_govt, n_never, n_private, n_self = sizes
    plt.figure()
    plt.pie(sizes, explode=explode, labels=work_labels, autopct='%1.2f%%', shadow=True)
    plt.title(title, y=1.2)  # raise the title above the exploded wedges
    plt.xlabel(
        f'Children={n_children}, Government={n_govt}, Never Worked={n_never}, '
        f'Private={n_private}, Self-employed={n_self}'
    )
    plt.savefig(fname=fname, format='png')
    plt.show()


# Whole data set. (`self` is kept as a module-level name because the original cell defined it.)
children, govt, never, private, self = _work_type_counts(data)
work_values = [children, govt, never, private, self]
_work_type_pie(work_values, 'Work type percentages', 'work_pie_all.png')

# Patients with stroke == 0.
children_0, govt_0, never_0, private_0, self_0 = _work_type_counts(data.loc[data['stroke'] == 0])
work_values_0 = [children_0, govt_0, never_0, private_0, self_0]
_work_type_pie(work_values_0, 'Work type percentages for patients with no stroke', 'work_pie_no_Stroke.png')

# Patients with stroke == 1.
children_1, govt_1, never_1, private_1, self_1 = _work_type_counts(data.loc[data['stroke'] == 1])
work_values_1 = [children_1, govt_1, never_1, private_1, self_1]
_work_type_pie(work_values_1, 'Work type percentages for patients who suffered stroke', 'work_pie_stroke.png')

# Residence Type

In [None]:
# Residence type: overall distribution and distribution per stroke outcome.
rt_labels = ['Rural', 'Urban']  # wedge labels (identical to the stored category values)
explode = [0, 0]                # no wedge offset for these charts


def _residence_counts(frame):
    """Return the [rural, urban] row counts found in ``frame``."""
    residence = frame['Residence_type']
    return [int((residence == label).sum()) for label in rt_labels]


def _residence_pie(sizes, title, fname):
    """Draw one residence-type pie chart from ``sizes``, save it as ``fname`` and show it."""
    n_rural, n_urban = sizes
    plt.figure()
    plt.pie(sizes, explode=explode, labels=rt_labels, autopct='%1.2f%%', shadow=True)
    plt.xlabel(f'Rural={n_rural}, Urban={n_urban}')
    plt.title(title)
    plt.savefig(fname=fname, format='png')
    plt.show()


# Whole data set.
rural, urban = _residence_counts(data)
rt_values = [rural, urban]
_residence_pie(rt_values, 'Residence type of the patients', 'residence_pie_all.png')

# Patients with stroke == 0.
rural_0, urban_0 = _residence_counts(data.loc[data['stroke'] == 0])
rt_values_0 = [rural_0, urban_0]
_residence_pie(rt_values_0, 'Residence Type of patients with no stroke', 'residence_pie_no_stroke.png')

# Patients with stroke == 1.
rural_1, urban_1 = _residence_counts(data.loc[data['stroke'] == 1])
rt_values_1 = [rural_1, urban_1]
_residence_pie(rt_values_1, 'Residence Type of patients who suffered a stroke', 'residence_pie_stroke.png')

# Smoking status

In [None]:
# Smoking status: overall distribution and distribution per stroke outcome.
# The axis captions previously misspelled "Unknown" as "Unknonw"; corrected here.
_smoking_values = ['formerly smoked', 'never smoked', 'smokes', 'Unknown']  # stored categories
s_labels = ['Formerly Smoked', 'Never Smoked', 'Smokes', 'Unknown']         # wedge labels
explode = [0, 0, 0, 0]                                                      # no wedge offset


def _smoking_counts(frame):
    """Return the row count of each smoking status in ``frame``, ordered like ``s_labels``."""
    status = frame['smoking_status']
    return [int((status == value).sum()) for value in _smoking_values]


def _smoking_pie(sizes, title, fname):
    """Draw one smoking-status pie chart from ``sizes``, save it as ``fname`` and show it."""
    n_former, n_never, n_smokes, n_unknown = sizes
    plt.figure()
    plt.pie(sizes, explode=explode, labels=s_labels, autopct='%1.2f%%', shadow=True)
    plt.xlabel(
        f'Formerly Smoked={n_former}, Never Smoked={n_never}, '
        f'Smokes={n_smokes}, Unknown={n_unknown}'
    )
    plt.title(title)
    plt.savefig(fname=fname, format='png')
    plt.show()


# Whole data set.
fs, ns, s, u = _smoking_counts(data)
s_values = [fs, ns, s, u]
_smoking_pie(s_values, 'Smoking status of the patients', 'smoke_pie_all.png')

# Patients with stroke == 0.
fs_0, ns_0, s_0, u_0 = _smoking_counts(data.loc[data['stroke'] == 0])
s_values_0 = [fs_0, ns_0, s_0, u_0]
_smoking_pie(s_values_0, 'Smoking status of the patients with no stroke', 'smoke_pie_no_Stroke.png')

# Patients with stroke == 1.
fs_1, ns_1, s_1, u_1 = _smoking_counts(data.loc[data['stroke'] == 1])
s_values_1 = [fs_1, ns_1, s_1, u_1]
_smoking_pie(s_values_1, 'Smoking status of the patients who suffered a stroke', 'smoke_pie_stroke.png')

# BMI

In [None]:
# BMI weight classes: bar plot and pie chart for the whole data set and per stroke outcome.
# Fix: the bar plots previously used `plt.figure` without parentheses, which only
# references the function and never opens a figure; it is now called explicitly.

# Upper (inclusive) BMI bound of every class except the last, open-ended one.
_bmi_upper_bounds = [18.5, 24.9, 29.9, 34.9, 39.9]
labels = ['Under\nweight', 'Normal', 'Over\nweight', 'Obese', 'Severely\nObese', 'Morbidly\nObese']


def _bmi_class_counts(frame):
    """Return the row count of each weight class in ``frame``, ordered like ``labels``.

    Class k holds the rows with lower_k < bmi <= upper_k: the first class has no
    lower bound and the last class has no upper bound.
    """
    bmi = frame['bmi']
    lower_bounds = [-np.inf] + _bmi_upper_bounds
    upper_bounds = _bmi_upper_bounds + [np.inf]
    return [int(((bmi > low) & (bmi <= high)).sum()) for low, high in zip(lower_bounds, upper_bounds)]


def _bmi_bar(counts, title, fname):
    """Draw one weight-class bar plot from ``counts``, save it as ``fname`` and show it."""
    plt.figure()
    plt.bar(labels, counts, width=1, edgecolor='black')
    plt.xlabel('Weight Classes')
    plt.title(title)
    plt.savefig(fname=fname, format='png')
    plt.show()


def _bmi_pie(counts, wedge_offsets, title, fname):
    """Draw one weight-class pie chart from ``counts``, save it as ``fname`` and show it."""
    n_under, n_normal, n_over, n_obese, n_severe, n_morbid = counts
    plt.figure()
    plt.pie(counts, explode=wedge_offsets, labels=labels, autopct='%1.2f%%', shadow=True)
    plt.xlabel(
        f'Underweight={n_under}, Normal={n_normal}, Overweight={n_over}, '
        f'Obese={n_obese}, Severely Obese={n_severe},Morbidly Obese={n_morbid}'
    )
    plt.title(title)
    plt.savefig(fname=fname, format='png')
    plt.show()


# Whole data set.
classes = _bmi_class_counts(data)
underweight, normal, overweight, obese, severely_obese, morbidly_obese = classes
_bmi_bar(classes, 'Weight Classes of the Patients', 'bmi_barplot_all.png')

explode = [0, 0, 0, 0, 0, 0]
_bmi_pie(classes, explode, 'Pie chart of the weight classes of the Patients', 'bmi_pie_all.png')

# Patients with stroke == 0.
classes_0 = _bmi_class_counts(data.loc[data['stroke'] == 0])
underweight_0, normal_0, overweight_0, obese_0, severely_obese_0, morbidly_obese_0 = classes_0
_bmi_bar(classes_0, 'Weight Classes of the Patients with no stroke', 'bmi_barplot_no_stroke.png')
_bmi_pie(
    classes_0,
    explode,
    'Pie chart of the weight classes of the Patients with no stroke',
    'bmi_pie_no_Stroke.png',
)

# Patients with stroke == 1.
classes_1 = _bmi_class_counts(data.loc[data['stroke'] == 1])
underweight_1, normal_1, overweight_1, obese_1, severely_obese_1, morbidly_obese_1 = classes_1
_bmi_bar(classes_1, 'Weight Classes of the Patients who suffered a stroke', 'bmi_barplot_stroke.png')

# Offset the first, second and last wedges for this chart.
explode = [0.5, 0.25, 0, 0, 0, 0.25]
_bmi_pie(
    classes_1,
    explode,
    'Pie chart of the weight classes of the Patients who suffered a stroke',
    'bmi_pie_stroke.png',
)

# Average Glucose Level

In [None]:
# Average glucose level: histograms for the whole data set and per stroke outcome.
# The cell used to start with a bare `data['avg_glucose_level'].min()` expression;
# its value was neither stored nor displayed (it was not the last expression of
# the cell), so that dead statement has been removed.


def _glucose_hist(values, title, fname):
    """Draw one histogram of ``values``, save it as ``fname`` and show it."""
    plt.figure()
    plt.hist(values)
    plt.xlabel('Average Glucose (mg/dl)')
    plt.title(title)
    plt.savefig(fname=fname, format='png')
    plt.show()


# Whole data set.
g_levels = data['avg_glucose_level']
_glucose_hist(g_levels, 'Average Glucose Levels of the Patients', 'avg_hist_all.png')

# Patients with stroke == 0.
g_levels_0 = data.loc[data['stroke'] == 0, 'avg_glucose_level']
_glucose_hist(g_levels_0, 'Average Glucose Levels of the Patients with no stroke', 'avg_hist_no_Stroke.png')

# Patients with stroke == 1.
g_levels_1 = data.loc[data['stroke'] == 1, 'avg_glucose_level']
_glucose_hist(g_levels_1, 'Average Glucose Levels of the Patients who suffered a stroke', 'avg_hist_stroke.png')

# Προετοιμασία των δεδομένων

--> Αρχικά πρέπει να εξισορροπήσουμε τα δεδομένα γιατί η μία κλάση έχει πολλά περισσότερα δεδομένα από την άλλη. Για να το κάνουμε αυτό χρησιμοποιούμε μια τεχνική Oversampling της κλάσης μειοψηφίας που ονομάζεται SMOTE.
--> Επίσης τα δεδομένα περιέχουν και κατηγορικά χαρακτηριστικά που δεν τα δέχονται οι αλγόριθμοι που θα χρησιμοποιήσουμε. Εδώ υπάρχουν δύο τρόποι αντιμετώπισης. Ο πρώτος είναι να διώξουμε αυτά τα κατηγορικά χαρακτηριστικά και ο δεύτερος είναι να εφαρμόσουμε μια τεχνική που λέγεται One Hot Encoding και να τα μετατρέψουμε σε αριθμητικά. Θα εφαρμόσουμε και τις δύο μεθόδους ώστε να δούμε πώς επηρεάζεται κάθε φορά η απόδοση του μοντέλου.
--> Τέλος θα κανονικοποιήσουμε τα δεδομένα πριν τα χρησιμοποιήσουμε στους αλγορίθμους ταξινόμησης.

In [None]:
# Remove the 'index' column (the old row labels kept by the earlier index reset) in place.
data.drop(labels='index', axis='columns', inplace=True)

In [None]:
# Turn three categorical columns into 0/1 integers because the classifiers used
# below cannot handle string values. Each column becomes 1 where it equals the
# listed value and 0 for every other value.
for column_name, positive_value in (
    ('gender', 'Male'),
    ('Residence_type', 'Urban'),
    ('ever_married', 'Yes'),
):
    data[column_name] = data[column_name].eq(positive_value).astype('int64')

In [None]:
# Preview the first five records after the 0/1 encoding.
data.head(n=5)

## Περίπτωση δεδομένων όπου έχουμε διαγράψει τις κατηγορικές στήλες.

In [None]:
# Feature matrix without the class label and without the two remaining
# categorical attributes (work type and smoking status).
X = data.drop(columns=['stroke', 'work_type', 'smoking_status'])
# Class labels of the data.
y = data['stroke']

print('Before oversampling')
print(f'Shape of the data set: {X.shape}')
print(f'Shape of the class labels list: {y.shape}')

# Preview the first five rows of the feature matrix.
X.head(n=5)


In [None]:
# Balance the two classes by oversampling the minority class with SMOTE.
# A fixed random_state makes the synthetic samples - and therefore every score
# computed later - reproducible across runs; 0 matches the seed used for the
# train/test split and the classifiers in this notebook.
# NOTE(review): resampling happens before the train/test split, so synthetic
# points generated from rows that end up in the test set can land in the
# training set; consider oversampling the training portion only.
oversample = SMOTE(random_state=0)
X, y = oversample.fit_resample(X, y)
print('After oversampling')
print(f'Shape of the data set: {X.shape}')
print(f'Shape of the class labels list: {y.shape}')
print(f'Number of records with Stroke=0: {len(y.loc[y==0])}')
print(f'Number of records with Stroke=1: {len(y.loc[y==1])}')

# Preview the first five rows of the resampled feature matrix.
X.head()

In [None]:
# Split features and labels into train and test parts (seeded for repeatability).
_numeric_split = train_test_split(X, y, random_state=0)
X_train, X_test, Y_train, Y_test = _numeric_split

print(
    'Sizes: \n'
    f'X_train={X_train.shape},\n'
    f'X_test={X_test.shape},\n'
    f'Y_train={Y_train.shape},\n'
    f'Y_test={Y_test.shape}.'
)

In [None]:
# Standardise the features with a StandardScaler.
sc = StandardScaler()
# Learn the scaling parameters from the training part only, then apply the same
# transformation to both the training and the test part.
sc.fit(X_train)
X_train = sc.transform(X_train)
X_test = sc.transform(X_test)

## Περίπτωση των δεδομένων που εφαρμόζουμε One Hot Encoding

In [None]:
# One-hot encode the two multi-valued categorical attributes
# (smoking status and work type) into indicator columns.
one_hot = pd.get_dummies(data.loc[:, ['smoking_status', 'work_type']])
one_hot.head(n=5)

In [None]:
# Feature matrix for the one-hot variant: remove the class label and the raw
# categorical columns, then attach the indicator columns row by row (left join
# on the index).
_categorical_and_label = ['stroke', 'smoking_status', 'work_type']
Xoh = data.drop(columns=_categorical_and_label).join(one_hot, how='left')
# Class labels of the data.
yoh = data['stroke']

print('Before oversampling')
print(f'Shape of the data set: {Xoh.shape}')
print(f'Shape of the class labels list: {yoh.shape}')

In [None]:
# Balance the two classes of the one-hot encoded data set with SMOTE.
# A fixed random_state keeps the synthetic samples reproducible; 0 matches the
# seed used for the train/test split and the classifiers in this notebook.
# NOTE(review): resampling happens before the train/test split, so synthetic
# points generated from rows that end up in the test set can land in the
# training set; consider oversampling the training portion only.
oversample = SMOTE(random_state=0)
Xoh, yoh = oversample.fit_resample(Xoh, yoh)

print('After oversampling')
print(f'Shape of the data set: {Xoh.shape}')
print(f'Shape of the class labels list: {yoh.shape}')
# Count the classes on `yoh` itself; the previous code indexed the labels of
# the other data set (`y`) with a mask built from `yoh`.
print(f'Number of records with Stroke=0: {len(yoh.loc[yoh==0])}')
print(f'Number of records with Stroke=1: {len(yoh.loc[yoh==1])}')

# Preview the first five rows of the resampled feature matrix.
Xoh.head()

In [None]:
# Split the one-hot encoded features and labels into train and test parts
# (seeded for repeatability).
_one_hot_split = train_test_split(Xoh, yoh, random_state=0)
Xoh_train, Xoh_test, Yoh_train, Yoh_test = _one_hot_split

print(
    'Sizes: \n'
    f'X_train={Xoh_train.shape},\n'
    f'X_test={Xoh_test.shape},\n'
    f'Y_train={Yoh_train.shape},\n'
    f'Y_test={Yoh_test.shape}.'
)

In [None]:
# Standardise the one-hot encoded features with a fresh StandardScaler.
sc = StandardScaler()
# Learn the scaling parameters from the training part only, then apply the same
# transformation to both the training and the test part.
sc.fit(Xoh_train)
Xoh_train = sc.transform(Xoh_train)
Xoh_test = sc.transform(Xoh_test)

Έχουμε έτοιμα τα δεδομένα με τα οποία θα δουλέψουμε, οπότε μπορούμε να αρχίσουμε να στήνουμε τα μοντέλα μας.

# Decision Trees

### Για τα δεδομένα χωρίς τα κατηγορικά χαρακτηριστικά

In [None]:
# Train decision trees over a range of max_depth values, once with the Gini
# criterion and once with entropy, and record accuracy and F1 on the test split
# so that a depth can be chosen.
# Fixes: `plt.figure` was referenced without being called (a no-op), and each
# tree predicted the test set twice (once inside .score(), once explicitly);
# the prediction is now made once and reused for both metrics.
# NOTE(review): the depth is selected using X_test/Y_test, the same split that
# is later used to report the final metrics; consider a separate validation
# split or cross-validation.


def _plot_depth_scores(depths, accuracies, best_acc_idx, f1_scores, best_f1_idx, title, fname):
    """Plot accuracy and F1 against tree depth, mark both maxima, save as ``fname`` and show."""
    plt.figure()
    plt.plot(depths, accuracies, label='Accuracy')
    plt.plot(depths[best_acc_idx], accuracies[best_acc_idx], 'o')
    plt.plot(depths, f1_scores, label='F1 score')
    plt.plot(depths[best_f1_idx], f1_scores[best_f1_idx], 'o')
    plt.xlabel('Max depth of the Tree')
    plt.ylabel('Score')
    plt.title(title)
    plt.legend()
    plt.savefig(fname=fname, format='png')
    plt.show()


depth_list = list(range(1, 31))  # candidate max_depth values 1..30
accuracy_list = []               # accuracy per depth (Gini)
f1_score_list = []               # F1 score per depth (Gini)

# Criterion = Gini
for depth in depth_list:
    dtree = DecisionTreeClassifier(criterion='gini', max_depth=depth, random_state=0)
    dtree.fit(X_train, Y_train)             # training
    y_pred = dtree.predict(X_test)          # predicted labels for the test data
    m_ac = accuracy_score(Y_test, y_pred)   # mean accuracy of the tree
    f1 = f1_score(Y_test, y_pred)           # F1 score of the tree
    f1_score_list.append(f1)
    accuracy_list.append(m_ac)

# Best accuracy / F1 and the list positions where they first occur.
ac_max_value = max(accuracy_list)
ac_list_index = accuracy_list.index(ac_max_value)
max_value_f1 = max(f1_score_list)
list_index_f1 = f1_score_list.index(max_value_f1)

print(f'Gini index: Max accuracy = {ac_max_value}, Max f1 = {max_value_f1}')
# The printed positions are 0-based: the corresponding max depth is index + 1.
print(f'Index: {ac_list_index},{list_index_f1}')

_plot_depth_scores(
    depth_list, accuracy_list, ac_list_index, f1_score_list, list_index_f1,
    'Max Depth vs Score for Gini Index', 'dtree_gini_depth_vs_score.png',
)

accuracy_list_e = []   # accuracy per depth (entropy)
f1_score_list_e = []   # F1 score per depth (entropy)

# Criterion = Entropy
for depth in depth_list:
    dtree2 = DecisionTreeClassifier(criterion='entropy', max_depth=depth, random_state=0)
    dtree2.fit(X_train, Y_train)            # training
    y_pred = dtree2.predict(X_test)         # predicted labels for the test data
    m_ac = accuracy_score(Y_test, y_pred)   # mean accuracy of the tree
    f1 = f1_score(Y_test, y_pred)           # F1 score of the tree
    f1_score_list_e.append(f1)
    accuracy_list_e.append(m_ac)

# Best accuracy / F1 and the list positions where they first occur.
ac_max_value_e = max(accuracy_list_e)
ac_list_index_e = accuracy_list_e.index(ac_max_value_e)
max_value_f1_e = max(f1_score_list_e)
list_index_f1_e = f1_score_list_e.index(max_value_f1_e)

print(f'Entropy: Max accuracy = {ac_max_value_e}, Max f1 = {max_value_f1_e}')
print(f'Index: {ac_list_index_e},{list_index_f1_e}')

_plot_depth_scores(
    depth_list, accuracy_list_e, ac_list_index_e, f1_score_list_e, list_index_f1_e,
    'Max Depth vs Score for Entropy', 'dtree_entropy_depth_vs_score.png',
)

In [None]:
#final decision tree model: criterion = crit (Gini) and max_depth = final_depth
crit = 'gini'
final_depth = 20
final_tree = DecisionTreeClassifier(
    criterion=crit,
    max_depth=final_depth,
    random_state=0,
).fit(X_train, Y_train) #fit hands back the trained estimator
#class labels predicted for the test data
Y_pred_final_dtree = final_tree.predict(X_test)

In [None]:
#evaluation metrics of the final decision tree on the test data
accuracy_value = final_tree.score(X_test, Y_test) #accuracy
precision_value = precision_score(Y_test, Y_pred_final_dtree) #precision score
recall_value = recall_score(Y_test, Y_pred_final_dtree) #recall score
f1_score_value = f1_score(Y_test, Y_pred_final_dtree) #f1 score
#one metric per output line
print(
    f'Accuracy = {accuracy_value}\n'
    f'F1-score = {f1_score_value}\n'
    f'Precision = {precision_value}\n'
    f'Recall = {recall_value}'
)

### Για τα δεδομένα με τα κατηγορικά χαρακτηριστικά και το One Hot Encoding

In [None]:
#setting up trees with different max depth values and using gini and Entropy
depth_list = list(range(1,31)) #the candidate max depth values 1..30
accuracy_list = [] #accuracy values (gini criterion)
f1_score_list = [] #f1 score values (gini criterion)

#training and testing trees with different max depths - Criterion = Gini
for depth in depth_list:
    dtree = DecisionTreeClassifier(criterion='gini',max_depth=depth,random_state=0)
    dtree.fit(Xoh_train,Yoh_train) #training
    y_pred = dtree.predict(Xoh_test) #predicted values for the test data
    m_ac = accuracy_score(Yoh_test,y_pred) #mean accuracy, taken from the predictions above instead of predicting again
    f1 = f1_score(Yoh_test,y_pred) #f1 score of the tree
    f1_score_list.append(f1) #putting the f1 value in the list
    accuracy_list.append(m_ac) #putting the accuracy value in the list


#best accuracy and the list position of its first occurrence
ac_max_value = max(accuracy_list)
ac_list_index = accuracy_list.index(ac_max_value)
#best f1 score and the list position of its first occurrence
max_value_f1 = max(f1_score_list)
list_index_f1 = f1_score_list.index(max_value_f1)

print(f'Gini index: Max accuracy = {ac_max_value}, Max f1 = {max_value_f1}')
print(f'Index: {ac_list_index},{list_index_f1}')

#the printed indices are 0-based list positions; the matching max depth is depth_list[index]

#plotting the result
plt.figure() #start a new figure for this plot
plt.plot(depth_list,accuracy_list,label='Accuracy')
plt.plot(depth_list[ac_list_index],ac_max_value,'o') #marker on the best accuracy
plt.plot(depth_list,f1_score_list,label='F1 score')
plt.plot(depth_list[list_index_f1],max_value_f1,'o') #marker on the best f1 score
plt.xlabel('Max depth of the Tree')
plt.ylabel('Score')
plt.title('Max Depth vs Score for Gini Index')
plt.legend()
plt.savefig(fname='dtree_gini_OH_depth_vs_score.png',format='png') #saved before show() so the file holds the drawn figure
plt.show()

accuracy_list_e = [] #accuracy values (Criterion entropy)
f1_score_list_e = [] #f1 score values (Criterion entropy)

#training and testing trees with different max depths - Criterion = Entropy
for depth in depth_list:
    dtree2 = DecisionTreeClassifier(criterion='entropy',max_depth=depth,random_state=0)
    dtree2.fit(Xoh_train,Yoh_train) #training
    y_pred = dtree2.predict(Xoh_test) #predicted values for the test data
    m_ac = accuracy_score(Yoh_test,y_pred) #mean accuracy, taken from the predictions above instead of predicting again
    f1 = f1_score(Yoh_test,y_pred) #f1 score of the tree
    f1_score_list_e.append(f1) #putting the f1 value in the list
    accuracy_list_e.append(m_ac) #putting the accuracy value in the list

#best accuracy / f1 score and the list positions of their first occurrence
#note: these four names are the ones the gini summary earlier in this cell assigned; they hold the entropy results from here on
ac_max_value = max(accuracy_list_e)
ac_list_index = accuracy_list_e.index(ac_max_value)
max_value_f1 = max(f1_score_list_e)
list_index_f1 = f1_score_list_e.index(max_value_f1)

print(f'Entropy: Max accuracy = {ac_max_value}, Max f1 = {max_value_f1}')
print(f'Index: {ac_list_index},{list_index_f1}')

#plotting the result
plt.figure() #start a new figure for this plot
plt.plot(depth_list,accuracy_list_e,label='Accuracy')
plt.plot(depth_list[ac_list_index],ac_max_value,'o') #marker on the best accuracy
plt.plot(depth_list,f1_score_list_e,label='F1 score')
plt.plot(depth_list[list_index_f1],max_value_f1,'o') #marker on the best f1 score
plt.xlabel('Max depth of the Tree')
plt.ylabel('Score')
plt.title('Max Depth vs Score for Entropy')
plt.legend()
plt.savefig(fname='dtree_entropy_OH_depth_vs_score.png',format='png') #saved before show() so the file holds the drawn figure
plt.show()


In [None]:
#final decision tree for the one hot encoded data: criterion = crit (gini) and max_depth = depth
crit = 'gini'
depth = 18
final_tree_OH = DecisionTreeClassifier(
    criterion=crit,
    max_depth=depth,
    random_state=0,
).fit(Xoh_train, Yoh_train) #fit hands back the trained estimator
#class labels predicted for the test data
Yoh_pred_final_dtree = final_tree_OH.predict(Xoh_test)

#evaluation metrics of the final model on the test data
accuracy_value_OH = final_tree_OH.score(Xoh_test, Yoh_test) #accuracy
precision_value_OH = precision_score(Yoh_test, Yoh_pred_final_dtree) #precision score
recall_value_OH = recall_score(Yoh_test, Yoh_pred_final_dtree) #recall score
f1_score_value_OH = f1_score(Yoh_test, Yoh_pred_final_dtree) #f1 score
#one metric per output line
print(
    f'Accuracy = {accuracy_value_OH}\n'
    f'F1-score = {f1_score_value_OH}\n'
    f'Precision = {precision_value_OH}\n'
    f'Recall = {recall_value_OH}'
)

# Gaussian Naive Bayes

## Για τα δεδομένα χωρίς τα κατηγορικά χαρακτηριστικά

In [None]:
#Gaussian Naive Bayes classifier, created and trained on the train data set in one step
gauss_model = GaussianNB().fit(X_train, Y_train)
#class labels predicted for the test data set
Y_pred_final_GNB = gauss_model.predict(X_test)

In [None]:
#evaluation metrics of the Gaussian Naive Bayes model on the test data
accuracy_value = gauss_model.score(X_test, Y_test) #accuracy
precision_value = precision_score(Y_test, Y_pred_final_GNB) #precision score
recall_value = recall_score(Y_test, Y_pred_final_GNB) #recall score
f1_score_value = f1_score(Y_test, Y_pred_final_GNB) #f1 score
#one metric per output line
print(
    f'Accuracy = {accuracy_value}\n'
    f'F1-score = {f1_score_value}\n'
    f'Precision = {precision_value}\n'
    f'Recall = {recall_value}'
)

## Για τα δεδομένα με τα κατηγορικά χαρακτηριστικά και το One Hot Encoding

In [None]:
#Gaussian Naive Bayes on the one hot encoded data set, created and trained in one step
gauss_model2 = GaussianNB().fit(Xoh_train, Yoh_train)
#class labels predicted for the test data set
Yoh_pred_final_GNB = gauss_model2.predict(Xoh_test)

In [None]:
#evaluation metrics of the Gaussian Naive Bayes model trained on the one hot encoded data
accuracy_value_OH = gauss_model2.score(Xoh_test, Yoh_test) #accuracy
precision_value_OH = precision_score(Yoh_test, Yoh_pred_final_GNB) #precision score
recall_value_OH = recall_score(Yoh_test, Yoh_pred_final_GNB) #recall score
f1_score_value_OH = f1_score(Yoh_test, Yoh_pred_final_GNB) #f1 score
#one metric per output line
print(
    f'Accuracy = {accuracy_value_OH}\n'
    f'F1-score = {f1_score_value_OH}\n'
    f'Precision = {precision_value_OH}\n'
    f'Recall = {recall_value_OH}'
)

Παρατηρούμε ότι η ακρίβεια και το μέτρο f1 μειώθηκαν. Άρχισε να φαίνεται η επίδραση της αύξησης του αριθμού των predictor λόγω του one hot encoding στο αποτέλεσμα. Λεπτομέρειες στην τεχνική έκθεση που συνοδεύει τον κώδικα.

# k-Nearest Neighbors

## Για τα δεδομένα χωρίς τα κατηγορικά χαρακτηριστικά

In [None]:
#comparing two distance metrics for a Knn classifier with k = 5
p_metrics = [1,2] #p=1 Cityblock distance, p=2 Euclidean distance

for p_value in p_metrics:
    knn = KNeighborsClassifier(n_neighbors = 5, metric = 'minkowski', p = p_value) #class instance
    knn.fit(X_train,Y_train) #training the model with the train data set
    y_pred = knn.predict(X_test) #single prediction pass, reused by both metrics below
    acc = accuracy_score(Y_test,y_pred) #accuracy from the existing predictions; knn.score would run the neighbour search again
    f1 = f1_score(Y_test,y_pred) #f1 score of the current model
    if p_value == 1:
        print('For the Cityblock distance:\n')
        print(f'Accuracy = {acc}\nF1-score = {f1}\n')
    else:
        print('For the Euclidean distance:\n')
        print(f'Accuracy = {acc}\nF1-score = {f1}')


In [None]:
#final knn model: k=5 neighbours, Minkowski metric with p=2
knn_final = KNeighborsClassifier(
    n_neighbors=5,
    metric='minkowski',
    p=2,
).fit(X_train, Y_train) #fit hands back the trained estimator
#class labels predicted for the test dataset
Y_pred_final_knn = knn_final.predict(X_test)

In [None]:
#evaluation metrics of the final knn model on the test data
accuracy_value_knn = knn_final.score(X_test, Y_test) #accuracy
precision_value_knn = precision_score(Y_test, Y_pred_final_knn) #precision
recall_value_knn = recall_score(Y_test, Y_pred_final_knn) #recall
f1_score_value_knn = f1_score(Y_test, Y_pred_final_knn) #f1 score

#report: header followed by one metric per line
print('For the knn model:\n')
print(f'Accuracy = {accuracy_value_knn}')
print(f'F1-score = {f1_score_value_knn}')
print(f'Precision = {precision_value_knn}')
print(f'Recall = {recall_value_knn}')

## Για τα δεδομένα με τα κατηγορικά χαρακτηριστικά και το One Hot Encoding

In [None]:
#comparing two distance metrics for a Knn classifier with k = 5 on the one hot encoded data
p_metrics = [1,2] #p=1 Cityblock distance, p=2 Euclidean distance

for p_value in p_metrics:
    knn = KNeighborsClassifier(n_neighbors = 5, metric = 'minkowski', p = p_value) #class instance
    knn.fit(Xoh_train,Yoh_train) #training the model with the train data set
    y_pred = knn.predict(Xoh_test) #single prediction pass, reused by both metrics below
    acc = accuracy_score(Yoh_test,y_pred) #accuracy from the existing predictions; knn.score would run the neighbour search again
    f1 = f1_score(Yoh_test,y_pred) #f1 score of the current model
    if p_value == 1:
        print('For the Cityblock distance:\n')
        print(f'Accuracy = {acc}\nF1-score = {f1}\n')
    else:
        print('For the Euclidean distance:\n')
        print(f'Accuracy = {acc}\nF1-score = {f1}')


In [None]:
#final knn model for the one hot encoded data: k=5 neighbours, Minkowski metric with p=2
knn_final_OH = KNeighborsClassifier(
    n_neighbors=5,
    metric='minkowski',
    p=2,
).fit(Xoh_train, Yoh_train) #fit hands back the trained estimator
#class labels predicted for the test dataset
Y_pred_final_OH_knn = knn_final_OH.predict(Xoh_test)

In [None]:
#evaluation metrics of the final knn model trained on the one hot encoded data
accuracy_value_OH_knn = knn_final_OH.score(Xoh_test, Yoh_test) #accuracy
precision_value_OH_knn = precision_score(Yoh_test, Y_pred_final_OH_knn) #precision
recall_value_OH_knn = recall_score(Yoh_test, Y_pred_final_OH_knn) #recall
f1_score_value_OH_knn = f1_score(Yoh_test, Y_pred_final_OH_knn) #f1 score

#report: header followed by one metric per line
print('For the knn model:\n')
print(f'Accuracy = {accuracy_value_OH_knn}')
print(f'F1-score = {f1_score_value_OH_knn}')
print(f'Precision = {precision_value_OH_knn}')
print(f'Recall = {recall_value_OH_knn}')

# Random Forests 

## Για τα δεδομένα χωρίς τα κατηγορικά χαρακτηριστικά

In [None]:
#final Random Forest model: 200 estimators, gini criterion, max_features=None
rf_final = RandomForestClassifier(
    n_estimators=200,
    criterion='gini',
    max_features=None,
    random_state=0,
).fit(X_train, Y_train) #fit hands back the trained estimator
#class labels predicted for the test data
Y_pred_final_RF = rf_final.predict(X_test)

#evaluation metrics of the model on the test data
accuracy_value_RF = rf_final.score(X_test, Y_test) #accuracy
precision_value_RF = precision_score(Y_test, Y_pred_final_RF) #precision
recall_value_RF = recall_score(Y_test, Y_pred_final_RF) #recall
f1_score_value_RF = f1_score(Y_test, Y_pred_final_RF) #f1 score

#report: header followed by one metric per line
print('For the Random Forest model:')
print(f'Accuracy = {accuracy_value_RF}')
print(f'F1-score = {f1_score_value_RF}')
print(f'Precision = {precision_value_RF}')
print(f'Recall = {recall_value_RF}')

## Για τα δεδομένα με τα κατηγορικά χαρακτηριστικά και το One Hot Encoding

In [None]:
#final Random Forest model for the One Hot data: 200 estimators, gini criterion, max_features=None
rf_final_OH = RandomForestClassifier(
    n_estimators=200,
    criterion='gini',
    max_features=None,
    random_state=0,
).fit(Xoh_train, Yoh_train) #fit hands back the trained estimator
#class labels predicted for the test data
Y_pred_final_RF_OH = rf_final_OH.predict(Xoh_test)

#evaluation metrics of the model on the test data
accuracy_value_RF_OH = rf_final_OH.score(Xoh_test, Yoh_test) #accuracy
precision_value_RF_OH = precision_score(Yoh_test, Y_pred_final_RF_OH) #precision
recall_value_RF_OH = recall_score(Yoh_test, Y_pred_final_RF_OH) #recall
f1_score_value_RF_OH = f1_score(Yoh_test, Y_pred_final_RF_OH) #f1 score

#report: header followed by one metric per line
print('For the Random Forest model:')
print(f'Accuracy = {accuracy_value_RF_OH}')
print(f'F1-score = {f1_score_value_RF_OH}')
print(f'Precision = {precision_value_RF_OH}')
print(f'Recall = {recall_value_RF_OH}')

# Σύγκριση μοντέλων μέσω της καμπύλης ROC

In [None]:
#ROC inputs for every final model: class probabilities on the test data, then the
#false/true positive rates computed from the second probability column ([:, 1]);
#the third value returned by roc_curve is not needed and goes to _

#--- models trained on the data without the categorical features ---
#Decision Tree
dtree_pred = final_tree.predict_proba(X_test)
dtree_fpr, dtree_tpr, _ = roc_curve(Y_test, dtree_pred[:, 1])
#Gaussian Naive Bayes
GNB_pred = gauss_model.predict_proba(X_test)
GNB_fpr, GNB_tpr, _ = roc_curve(Y_test, GNB_pred[:, 1])
#knn
knn_pred = knn_final.predict_proba(X_test)
knn_fpr, knn_tpr, _ = roc_curve(Y_test, knn_pred[:, 1])
#Random Forest
rf_pred = rf_final.predict_proba(X_test)
rf_fpr, rf_tpr, _ = roc_curve(Y_test, rf_pred[:, 1])

#--- models trained on the one hot encoded data ---
#Decision Tree
dtree_OH_pred = final_tree_OH.predict_proba(Xoh_test)
dtree_OH_fpr, dtree_OH_tpr, _ = roc_curve(Yoh_test, dtree_OH_pred[:, 1])
#Gaussian Naive Bayes
GNB_OH_pred = gauss_model2.predict_proba(Xoh_test)
GNB_OH_fpr, GNB_OH_tpr, _ = roc_curve(Yoh_test, GNB_OH_pred[:, 1])
#knn
knn_OH_pred = knn_final_OH.predict_proba(Xoh_test)
knn_OH_fpr, knn_OH_tpr, _ = roc_curve(Yoh_test, knn_OH_pred[:, 1])
#Random Forest
rf_OH_pred = rf_final_OH.predict_proba(Xoh_test)
rf_OH_fpr, rf_OH_tpr, _ = roc_curve(Yoh_test, rf_OH_pred[:, 1])

In [None]:
#ROC curves of all models drawn on one figure
#each entry: (false positive rates, true positive rates, legend label), in drawing order
roc_curves = [
    (dtree_fpr, dtree_tpr, 'Decision Tree'),
    (dtree_OH_fpr, dtree_OH_tpr, 'Decision Tree+OH'),
    (GNB_fpr, GNB_tpr, 'Gaussian Naive Bayes'),
    (GNB_OH_fpr, GNB_OH_tpr, 'Gaussian Naive Bayes+OH'),
    (knn_fpr, knn_tpr, 'kNN'),
    (knn_OH_fpr, knn_OH_tpr, 'kNN+OH'),
    (rf_fpr, rf_tpr, 'Random Forest'),
    (rf_OH_fpr, rf_OH_tpr, 'Random Forest+OH'),
]

plt.figure()
for fpr, tpr, model_label in roc_curves:
    plt.plot(fpr, tpr, label=model_label)
plt.title('ROC curve of the models')
plt.xlabel('FPR')
plt.ylabel('TPR')
plt.legend()
plt.savefig(fname='ROC_curve.png',format='png') #saved before show() so the file holds the drawn figure
plt.show()