# 0. Loading packages

Uncomment the cell below if any of the required packages are not installed yet

In [1]:
# Optional one-time setup: uncomment the lines below to install the notebook's dependencies.
#%pip install numpy
#%pip install matplotlib
#%pip install pandas
#%pip install seaborn
#%pip install scikit-learn
#%pip install missingno
# The `imblearn` module is distributed on PyPI as `imbalanced-learn`.
#%pip install imbalanced-learn

In [2]:
import warnings

# NOTE(review): this silences every warning for the whole notebook, including the
# fit-failure and convergence warnings scikit-learn emits during the grid searches.
warnings.filterwarnings('ignore')

import self_functions as sf
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import missingno as msno
from imblearn.over_sampling import SMOTE
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV, cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier

# 1. Loading data

In [3]:
# Read the train and test splits from the Datasets folder.
train_data, test_data = (
    pd.read_csv(csv_path)
    for csv_path in ('Datasets/train.csv', 'Datasets/test.csv')
)

# Keep the test ids aside: the 'id' column is dropped before modelling but is
# needed again when the prediction files are written.
test_data_ids = test_data['id']

# 2. Inspecting data

## 2.1 Inspecting missing data

In [None]:
# Nullity matrix of the training data; gaps in a column would indicate missing values.
msno.matrix(train_data)

There is no missing data in the train dataset

In [None]:
# Nullity matrix of the test data; gaps in a column would indicate missing values.
msno.matrix(test_data)

There is no missing data in the test dataset

## 2.2 Inspecting individual columns

In [None]:
# One plot per column of the training data, drawn by the project's histplot helper.
for column_name, column_values in train_data.items():
    sf.histplot(column_values)

Only a small number of people are diagnosed with hypertension and/or heart disease.<br>
The distributions of both the average glucose level and BMI are right-skewed.<br>
There are more women than men in the dataset, and nobody in the dataset identifies as other.<br>
The biggest group of people works at private companies.<br>
There is a 50/50 split between people living in a rural area and people living in an urban area.<br>
There is an approximate 50/50 split between people who have ever smoked and people who have never smoked; the people who have ever smoked are in turn split 50/50 between active smokers and former smokers.<br>

Only a small number of people have ever had a stroke (the dataset is imbalanced), which makes it difficult to correctly predict whether someone will have a stroke. Therefore, later on we will oversample the data to make it more usable for machine learning.

## 2.3 Datatypes

In [None]:
# Print column names, non-null counts and dtypes of the training data.
train_data.info()

## 2.4 Relationships between variables

In [None]:
# Project helper from self_functions; presumably draws a correlation plot of all columns (implementation not visible here).
sf.corrplot(train_data)

The correlations between stroke and the other variables are very low. To get rid of the least valuable variables, a threshold of ±0.03 will be used: variables whose correlation with stroke falls between -0.03 and 0.03 will be removed.

## 2.5 Minimum requirements for Machine Learning with Scikit Learn

- No missing values: there are no missing values in the dataset which is required for machine learning with Scikit Learn
- Numeric or Boolean values: all columns are in either numeric or Boolean data types which is required for machine learning with Scikit Learn

## 2.6 Conclusion

# 3. Data preparation 

## 3.1 Column selection

In [9]:
# Columns excluded from modelling: the identifier plus the features removed in
# the column-selection step.
col_to_drop = [
    'id',
    'bmi',
    'gender_Female',
    'gender_Male',
    'gender_Other',
    'ever_married_Yes',
    'work_type_Govt_job',
    'work_type_Never_worked',
    'work_type_Private',
    'Residence_type_Rural',
    'Residence_type_Urban',
    'smoking_status_never smoked',
    'smoking_status_smokes',
]

# Apply the same selection to both splits so their feature columns stay aligned.
train_data = train_data.drop(columns=col_to_drop)
test_data = test_data.drop(columns=col_to_drop)

## 3.2 Creating X and y

In [10]:
# Target first, then every remaining column as the feature matrix.
y_train = train_data['stroke']
X_train = train_data.drop(columns='stroke')

## 3.3 Data sampling

Check how many True/False values there are in the train set

In [None]:
# Class balance of the target before oversampling.
y_train.value_counts()

In [12]:
# Oversample only the minority class; the fixed seed keeps the synthetic rows reproducible.
smote = SMOTE(
    sampling_strategy='minority',
    random_state=0,
)

# The training set is replaced by its resampled version, so re-running this
# cell without re-running the cells above resamples already-resampled data.
X_train, y_train = smote.fit_resample(X_train, y_train)

Check how many True/False values there are in the train set after oversampling

In [None]:
# Class balance of the target after the SMOTE resampling step.
y_train.value_counts()

## 3.4 Standardizing data

The columns containing data of the float type will be standardized using Scikit Learn's StandardScaler. It is important to standardize the data, because this prevents columns with high values from having an unreasonably large impact. The StandardScaler scales each value to its Z-score, using the mean and the standard deviation of the column.

In [14]:
# A single scaler instance is passed to both helper calls below.
sc = StandardScaler()

# sf.sta_sca comes from the local self_functions module (not visible here). Its return
# value is discarded, so it presumably scales the listed columns in place — TODO confirm.
# NOTE(review): if the helper calls fit/fit_transform on every call, the second call
# re-fits the scaler on test_data, so train and test would be scaled with different
# statistics — verify that the test call only transforms.
sf.sta_sca(sc, X_train, ['age', 'avg_glucose_level'])
sf.sta_sca(sc, test_data, ['age', 'avg_glucose_level'])

# 5. Models

## 5.1 K-Nearest Neighbours

### 5.1.1 Hyperparameter tuning

In [17]:
# Base estimator; every hyperparameter of interest is supplied by the grid below.
knn = KNeighborsClassifier()

# Search space: neighbourhood size 1-20, both weighting schemes and every
# neighbour-search algorithm.
param_grid = {
    'n_neighbors': np.arange(1, 21),
    'weights': ['uniform', 'distance'],
    'algorithm': ['auto', 'ball_tree', 'kd_tree', 'brute'],
}

# 5-fold grid search scored on F1, using all available cores.
knn_cv = GridSearchCV(
    estimator=knn,
    param_grid=param_grid,
    cv=5,
    scoring='f1',
    n_jobs=-1,
)

### 5.1.2 Training KNN

In [None]:
# Run the grid search on the resampled training set, then report the winning
# configuration and its mean cross-validated F1 score.
knn_cv.fit(X=X_train, y=y_train)

print("Best parameters: ", knn_cv.best_params_)
print("Best cross-validation score: ", knn_cv.best_score_)

### 5.1.3 Predicting KNN

In [19]:
# Predict with the refit best estimator and pair each prediction with its test id.
knn_pred = knn_cv.predict(X=test_data)
knn_pred_df = test_data_ids.to_frame().assign(stroke=knn_pred)

### 5.1.4 Writing to CSV

In [20]:
# Persist the id/stroke pairs without the DataFrame index.
knn_pred_df.to_csv(path_or_buf='Datasets/Predictions/knn_pred.csv', index=False)

## 5.2 Logistic Regression

### 5.2.1 Hyperparameter tuning

In [21]:
# Fixed seed so the stochastic solvers (liblinear, sag, saga) give reproducible results.
lr = LogisticRegression(random_state=0)

c_grid = np.logspace(-4, 4, 20)
weight_grid = ['balanced', None]

# A plain cartesian product of solvers and penalties contains many combinations
# that scikit-learn rejects (e.g. lbfgs with l1, liblinear without a penalty,
# elasticnet without l1_ratio). Each failed fit costs time and yields a NaN score,
# so the grid is split into sub-grids that only hold supported combinations.
param_grid = [
    # No regularisation: C has no effect, so it is left out of this sub-grid.
    {
        'penalty': [None],
        'solver': ['newton-cg', 'lbfgs', 'sag', 'saga', 'newton-cholesky'],
        'class_weight': weight_grid,
    },
    # L2 is supported by every solver.
    {
        'penalty': ['l2'],
        'solver': ['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga', 'newton-cholesky'],
        'C': c_grid,
        'class_weight': weight_grid,
    },
    # L1 is only supported by liblinear and saga.
    {
        'penalty': ['l1'],
        'solver': ['liblinear', 'saga'],
        'C': c_grid,
        'class_weight': weight_grid,
    },
    # Elastic net is only supported by saga and needs an explicit l1_ratio.
    {
        'penalty': ['elasticnet'],
        'solver': ['saga'],
        'l1_ratio': [0.25, 0.5, 0.75],
        'C': c_grid,
        'class_weight': weight_grid,
    },
]

# 5-fold grid search scored on F1, using all available cores.
lr_cv = GridSearchCV(estimator=lr, param_grid=param_grid, cv=5, scoring='f1', n_jobs=-1)

### 5.2.2 Training LR

In [None]:
# Run the grid search on the resampled training set, then report the winning
# configuration and its mean cross-validated F1 score.
lr_cv.fit(X=X_train, y=y_train)

print("Best parameters: ", lr_cv.best_params_)
print("Best cross-validation score: ", lr_cv.best_score_)

### 5.2.3 Predicting LR

In [23]:
# Predict with the refit best estimator and pair each prediction with its test id.
lr_pred = lr_cv.predict(X=test_data)
lr_pred_df = test_data_ids.to_frame().assign(stroke=lr_pred)

### 5.2.4 Writing to CSV

In [24]:
# Persist the id/stroke pairs without the DataFrame index.
lr_pred_df.to_csv(path_or_buf='Datasets/Predictions/lr_pred.csv', index=False)

## 5.3 SVM

### 5.3.1 Hyperparameter tuning

In [None]:
svc = SVC()

c_grid = np.logspace(-4, 4, 20)
shrinking_grid = [True, False]
gamma_grid = ['scale', 'auto']

# The search space is split per kernel so that each kernel is only combined with
# the parameters it actually uses:
#   * 'precomputed' is left out: it expects a square kernel matrix instead of the
#     feature matrix passed to fit, so every such candidate would fail.
#   * 'degree' only affects the 'poly' kernel; 'gamma' is unused by 'linear'.
#   * 'probability' and 'decision_function_shape' are left at their defaults:
#     they do not change the labels returned by predict for a binary target,
#     and probability=True adds an internal 5-fold calibration to every fit.
param_grid = [
    {
        'kernel': ['linear'],
        'C': c_grid,
        'shrinking': shrinking_grid,
    },
    {
        'kernel': ['rbf', 'sigmoid'],
        'C': c_grid,
        'gamma': gamma_grid,
        'shrinking': shrinking_grid,
    },
    {
        'kernel': ['poly'],
        'C': c_grid,
        'degree': np.arange(1, 21),
        'gamma': gamma_grid,
        'shrinking': shrinking_grid,
    },
]

# 5-fold grid search scored on F1, using all available cores.
svc_cv = GridSearchCV(estimator=svc, param_grid=param_grid, cv=5, scoring='f1', n_jobs=-1)

### 5.3.2 Training SVC

In [None]:
# Run the grid search on the resampled training set, then report the winning
# configuration and its mean cross-validated F1 score.
svc_cv.fit(X=X_train, y=y_train)

print("Best parameters: ", svc_cv.best_params_)
print("Best cross-validation score: ", svc_cv.best_score_)

### 5.3.3 Predicting SVC

In [None]:
# Predict with the refit best estimator and pair each prediction with its test id.
svc_pred = svc_cv.predict(X=test_data)
svc_pred_df = test_data_ids.to_frame().assign(stroke=svc_pred)

### 5.3.4 Writing to CSV

In [None]:
# Persist the id/stroke pairs without the DataFrame index.
svc_pred_df.to_csv(path_or_buf='Datasets/Predictions/svc_pred.csv', index=False)

## 5.4 Decision Tree

### 5.4.1 Hyperparameter tuning

In [25]:
# Fixed seed: the 'random' splitter and the max_features sub-sampling are stochastic,
# so without it the search result changes from run to run.
dt = DecisionTreeClassifier(random_state=0)

param_grid = {
    'criterion': ['gini', 'entropy', 'log_loss'],
    'splitter': ['best', 'random'],
    'max_depth': np.arange(1, 51),
    # An integer min_samples_split must be at least 2; a value of 1 makes every fit fail.
    'min_samples_split': np.arange(2, 21),
    'min_samples_leaf': np.arange(1, 21),
    # 'auto' was an alias of 'sqrt' for classifiers and has been removed from
    # recent scikit-learn releases, so only the two explicit options are searched.
    'max_features': ['sqrt', 'log2'],
}

# 5-fold grid search scored on F1, using all available cores.
# NOTE(review): this is still 228,000 candidates (x5 folds); consider a coarser grid.
dt_cv = GridSearchCV(estimator=dt, param_grid=param_grid, cv=5, scoring='f1', n_jobs=-1)

### 5.4.2 Training DT

In [None]:
# Run the grid search on the resampled training set, then report the winning
# configuration and its mean cross-validated F1 score.
dt_cv.fit(X=X_train, y=y_train)

print("Best parameters: ", dt_cv.best_params_)
print("Best cross-validation score: ", dt_cv.best_score_)

### 5.4.3 Predicting DT

In [None]:
# Predict with the refit best estimator and pair each prediction with its test id.
dt_pred = dt_cv.predict(X=test_data)
dt_pred_df = test_data_ids.to_frame().assign(stroke=dt_pred)

### 5.4.4 Writing to CSV

In [None]:
# Persist the id/stroke pairs without the DataFrame index.
dt_pred_df.to_csv(path_or_buf='Datasets/Predictions/dt_pred.csv', index=False)

## 5.5 Ensembles

### 5.5.1 Random Forest

#### 5.5.1.1 Hyperparameter tuning

In [None]:
# Fixed seed: bootstrapping and feature sub-sampling are stochastic, so without it
# the search result changes from run to run.
rf = RandomForestClassifier(random_state=0)

param_grid = {
    'n_estimators': np.arange(1, 101),
    'criterion': ['gini', 'entropy', 'log_loss'],
    'max_depth': np.arange(1, 51),
    # An integer min_samples_split must be at least 2; a value of 1 makes every fit fail.
    'min_samples_split': np.arange(2, 21),
    'min_samples_leaf': np.arange(1, 21),
    # 'auto' was an alias of 'sqrt' for classifiers and has been removed from
    # recent scikit-learn releases, so only the two explicit options are searched.
    'max_features': ['sqrt', 'log2'],
    'bootstrap': [True, False],
}

# 5-fold grid search scored on F1, using all available cores.
# NOTE(review): the grid still holds roughly 22.8 million candidates (x5 folds), which
# is not feasible to search exhaustively; consider a much coarser grid or a randomized search.
rf_cv = GridSearchCV(estimator=rf, param_grid=param_grid, cv=5, scoring='f1', n_jobs=-1)

#### 5.5.1.2 Training RF

In [None]:
# Run the grid search on the resampled training set, then report the winning
# configuration and its mean cross-validated F1 score.
rf_cv.fit(X=X_train, y=y_train)

print("Best parameters: ", rf_cv.best_params_)
print("Best cross-validation score: ", rf_cv.best_score_)

#### 5.5.1.3 Predicting RF

In [None]:
# Predict with the refit best estimator and pair each prediction with its test id.
rf_pred = rf_cv.predict(X=test_data)
rf_pred_df = test_data_ids.to_frame().assign(stroke=rf_pred)

#### 5.5.1.4 Writing to CSV

In [None]:
# Persist the id/stroke pairs without the DataFrame index.
rf_pred_df.to_csv(path_or_buf='Datasets/Predictions/rf_pred.csv', index=False)

### 5.5.2 Gradient Boosting Classifier

#### 5.5.2.1 Hyperparameter tuning

In [None]:
# Fixed seed: row sub-sampling (subsample < 1) and feature sub-sampling are stochastic,
# so without it the search result changes from run to run.
gb = GradientBoostingClassifier(random_state=0)

param_grid = {
    'learning_rate': np.logspace(-4, 4, 20),
    'loss': ['log_loss', 'exponential'],
    'n_estimators': np.arange(1, 101),
    # subsample is a fraction of the training rows and must lie in (0, 1];
    # values above 1 are rejected and make the fit fail.
    'subsample': np.linspace(0.1, 1.0, 10),
    'min_samples_leaf': np.arange(1, 21),
    # An integer min_samples_split must be at least 2; a value of 1 makes every fit fail.
    'min_samples_split': np.arange(2, 21),
    'criterion': ['friedman_mse', 'squared_error'],
    'max_depth': np.arange(1, 51),
    'max_features': ['sqrt', 'log2'],
}

# 5-fold grid search scored on F1, using all available cores.
# NOTE(review): the grid still holds about 3 billion candidates (x5 folds), which is
# not feasible to search exhaustively; consider a much coarser grid or a randomized search.
gb_cv = GridSearchCV(estimator=gb, param_grid=param_grid, cv=5, scoring='f1', n_jobs=-1)

#### 5.5.2.2 Training GB

In [None]:
# Run the grid search on the resampled training set, then report the winning
# configuration and its mean cross-validated F1 score.
gb_cv.fit(X=X_train, y=y_train)

print("Best parameters: ", gb_cv.best_params_)
print("Best cross-validation score: ", gb_cv.best_score_)

#### 5.5.2.3 Predicting GB

In [None]:
# Predict with the refit best estimator and pair each prediction with its test id.
gb_pred = gb_cv.predict(X=test_data)
gb_pred_df = test_data_ids.to_frame().assign(stroke=gb_pred)

#### 5.5.2.4 Writing to CSV

In [None]:
# Persist the id/stroke pairs without the DataFrame index.
gb_pred_df.to_csv(path_or_buf='Datasets/Predictions/gb_pred.csv', index=False)

### 5.5.3 XGBoost