<a href="https://colab.research.google.com/github/Varij-Saini/Final-Project-ML/blob/main/FinalProject.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
#@title
import numpy as np
import pandas as pd
from google.colab import files

In [None]:
# Ask Colab for a file upload; the result is indexed by file name when the CSV is loaded.
uploaded = files.upload()

In [None]:
import io

# Wrap the uploaded bytes in a file-like object so pandas can parse them,
# then discard the 'id' column before any further processing.
csv_bytes = uploaded['stroke-data.csv']
dataset = pd.read_csv(io.BytesIO(csv_bytes)).drop(columns='id')

In [None]:
# Print the DataFrame's plain-text representation as a quick look at the loaded data.
print(dataset)

In [None]:
# Impute missing BMI values with the column mean (mean() skips the missing entries).
bmi_column = dataset['bmi']
bmi_mean = bmi_column.mean()
dataset['bmi'] = bmi_column.fillna(bmi_mean)


In [None]:
# The big picture of the dataset: pairwise plots of the columns,
# coloured by the 'stroke' target.
import seaborn as sns
sns.pairplot(dataset, hue='stroke', palette='bright')

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier

In [None]:
# Import label encoder
from sklearn.preprocessing import LabelEncoder

# A single encoder instance is refit on each categorical column in turn, so once
# the loop finishes it only holds the classes of the last column it saw.
label_encoder = LabelEncoder()
categorical_columns = ['gender', 'ever_married', 'work_type', 'Residence_type', 'smoking_status']
for column_name in categorical_columns:
    # Replace the text categories of this column with integer codes.
    dataset[column_name] = label_encoder.fit_transform(dataset[column_name])
print(dataset.head())

In [None]:
# Separate dataset into features and target value.
# The target is selected by name rather than by position so that it cannot
# silently pick up a different column if the column order ever changes, and so
# that it matches the way the features are built (everything except 'stroke').
X = dataset.drop(columns=['stroke'])
Y = dataset['stroke']

In [None]:
# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    random_state=42,
)

In [None]:
from sklearn.preprocessing import MinMaxScaler

# Learn the min/max of every training feature and rescale the training data with them.
# Only X_train is replaced by its scaled version here; X_test is left as it was.
SC = MinMaxScaler()
X_train = SC.fit(X_train).transform(X_train)

In [None]:
# Number of rows per value of the 'stroke' column, printed and then drawn as a bar chart.
stroke_counts = dataset['stroke'].value_counts()
print(stroke_counts)
stroke_counts.sort_index().plot.bar()

The bar chart above shows that the data is imbalanced: records with a stroke (`stroke = 1`) are the minority class.


Deep Neural Network

In [None]:
from tensorflow import keras
from keras.layers import Dense
import numpy as np
import matplotlib.pyplot as plt

# Number of passes over the training half; the plot below adapts to this value.
n_epochs = 100

# Training and validation set: half of the rows each.
x1_train, x1_test, y1_train, y1_test = train_test_split(X, Y, test_size=0.5, random_state=42)

# Fit a dedicated scaler on the training half only and apply the same learned
# min/max to the validation half, so both reach the network on the same scale.
# (Previously the validation features were passed to the network unscaled.)
# A separate instance is used so the shared `SC` scaler keeps its X_train fit.
dnn_scaler = MinMaxScaler()
x1_train = dnn_scaler.fit_transform(x1_train)
x1_test = dnn_scaler.transform(x1_test)

# Create the DNN: two ReLU hidden layers and a single sigmoid output unit.
model = keras.models.Sequential()
model.add(Dense(15, activation='relu'))
model.add(Dense(8, activation='relu'))
model.add(Dense(1, activation='sigmoid'))

# NOTE(review): mean squared error is kept to preserve the existing results; for a
# sigmoid output on a 0/1 target, binary cross-entropy is the usual choice - consider switching.
model.compile(optimizer='Adam',
              loss=keras.losses.MeanSquaredError(),
              metrics=['mae'])

# Fit the model, evaluating on the validation half after every epoch.
history = model.fit(x1_train, y1_train, epochs=n_epochs, batch_size=35, validation_data=(x1_test, y1_test))

# Per-epoch training/validation curves recorded by Keras.
loss = history.history['loss']
mae = history.history['mae']
val_loss = history.history['val_loss']
val_mae = history.history['val_mae']

# Plot all curves; the x-axis is derived from the recorded history so it always
# matches the number of epochs that were actually run.
epochs = range(1, len(loss) + 1)
plt.plot(epochs, loss, 'g', label='Training loss')
plt.plot(epochs, val_loss, 'b', label='validation loss')
plt.plot(epochs, mae, 'r', label='mae')
plt.plot(epochs, val_mae, 'y', label='val_mae')
plt.title('Graph of Loss')
plt.xlabel('Epochs')
plt.ylabel('Loss')
plt.legend()
plt.show()

Visualizations of the DNN

In [None]:
# plot_model is imported from the public Keras utils namespace; the private
# `keras.utils.vis_utils` module path is not available in newer Keras releases.
from tensorflow.keras.utils import plot_model
import visualkeras
from ann_visualizer.visualize import ann_viz

# 3 different visualizations of the DNN:
# 1) ann_visualizer graph, written under the file name 'DNN' and opened in a viewer
ann_viz(model, title="DNN visualiztion", view=True, filename='DNN')
# 2) Keras layer diagram with shapes, layer names and activations, saved to model.png
plot_model(model, to_file='model.png', show_shapes=True, show_layer_names=True, show_layer_activations=True)
# 3) visualkeras layered view; left as the last expression so the notebook displays it
visualkeras.layered_view(model, legend=True)


Polynomial-Kernel SVM Classification

In [None]:
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler

# Scale data.
# The classifier must see test features on the same scale as its training
# features (previously it was trained on scaled data but predicted on the raw X_test).
# X_train has already been replaced by a min-max-scaled array, so the unscaled
# training rows are looked up in X through the row labels that Y_train still carries.
# Standardising those rows yields the same training matrix as standardising the
# min-max-scaled copy, because standardisation is unaffected by a per-feature
# shift and positive rescale - and the fitted scaler can then be applied to X_test.
x_train_raw = X.loc[Y_train.index]
sc = StandardScaler()
x_train_poly = sc.fit_transform(x_train_raw)
x_test_poly = sc.transform(X_test)

# Set the SVC with the polynomial kernel function and fit the data.
classifier = SVC(kernel='poly', random_state=0)
classifier.fit(x_train_poly, Y_train)

# Check accuracy score based on predicted vs actual values of the held-out rows.
y_pred = classifier.predict(x_test_poly)
accuracy_score(Y_test, y_pred)


Random Forest Model (Random Forest Classifier)

In [None]:
from imblearn.over_sampling import RandomOverSampler
from imblearn.over_sampling import SMOTE

X = dataset.drop(['stroke'], axis=1)
Y = dataset['stroke']

# As we saw above the data is imbalanced, so the minority class (stroke = 1) is oversampled.
# Only rows that are NOT in the held-out test split are resampled: fitting the
# samplers on the full dataset would copy/interpolate test rows into the training
# data and make the test accuracy of any model trained on it look better than it is.
# X_test keeps its original row labels, which is what identifies the held-out rows.
X_train_raw = X.drop(index=X_test.index)
Y_train_raw = Y.drop(index=X_test.index)

# Plain random duplication of minority rows (seeded so reruns give the same sample).
oversample = RandomOverSampler(sampling_strategy='minority', random_state=42)
X_oversample, y_oversample = oversample.fit_resample(X_train_raw, Y_train_raw)

# SMOTE is Synthetic Minority Oversampling Technique: it synthesises new minority
# rows instead of duplicating existing ones. Its output becomes the training set
# used by the random forest cells below.
oversample = SMOTE(random_state=42)
X_train, Y_train = oversample.fit_resample(X_train_raw, Y_train_raw)

In [None]:
from sklearn.ensemble import RandomForestClassifier
import numpy as np

# Forest of 100 trees using the entropy split criterion; the seed is fixed for repeatable fits.
random_forest = RandomForestClassifier(
    n_estimators=100,
    criterion='entropy',
    random_state=0,
)
random_forest.fit(X_train, Y_train)

In [None]:
#Accuracy Score
from sklearn.metrics import accuracy_score

# Predictions of the fitted forest on the data it was trained on and on the test split.
Y_train_rf = random_forest.predict(X_train)
y_pred_test_rf = random_forest.predict(X_test)

# Fraction of correct predictions for each of the two sets.
accuracy_train_rf = accuracy_score(Y_train, Y_train_rf)
accuracy_test_rf = accuracy_score(Y_test, y_pred_test_rf)

# Training accuracy is printed first, test accuracy second.
for score in (accuracy_train_rf, accuracy_test_rf):
    print(score)

