<a href="https://colab.research.google.com/github/ScInglorion/AGH_ML_Course/blob/main/Projekt_reg.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Features in dataset:
1. Age - age in years
2. Sex - sex (1 male, 0 female)
3. Chest pain type - (1 typical angina, 2 atypical angina, 3 non-anginal pain, 4 asymptomatic)
4. BP	- resting blood pressure (in mm Hg on admission to the hospital)
5. Cholesterol - serum cholesterol in mg/dl
6. FBS over 120	- (fasting blood sugar > 120 mg/dl) (1 = true; 0 = false)
7. EKG results - resting electrocardiographic results

(0 normal,

1 having ST-T wave abnormality (T wave inversions and/or ST elevation or depression of > 0.05 mV),

2 showing probable or definite left ventricular hypertrophy by Estes' criteria)
8. Max HR	- maximum heart rate achieved
9. Exercise angina - exercise induced angina (1 = yes; 0 = no)
10. ST depression	- ST depression induced by exercise relative to rest
11. Slope of ST	- the slope of the peak exercise ST segment (1 upsloping, 2 flat, 3 downsloping)
12. Number of vessels fluro - number of major vessels (0-3) colored by fluoroscopy
13. Thallium - thallium (3 normal, 6 fixed defect, 7 reversible defect)
14. Heart Disease - diagnosis of heart disease (angiographic disease status)
        -- Value 0: < 50% diameter narrowing
        -- Value 1: > 50% diameter narrowing


In [None]:
# Importing as many things i can
import seaborn
import pandas
import numpy
from google.colab import drive
import matplotlib.pyplot
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.neural_network import MLPRegressor
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn import svm
from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn import tree
from sklearn.preprocessing import (MinMaxScaler,StandardScaler)
from sklearn.model_selection import GridSearchCV
import warnings

seaborn.set_theme()  # setting the theme for the charts
drive.mount('/content/drive')
data = pandas.read_csv('/content/drive/MyDrive/Collab_project/Heart_Disease_Prediction.csv')  # importing the data

# drop_duplicates() and dropna() return NEW frames (they do not modify `data` in place),
# so the cleaned result has to be assigned back, otherwise no row is ever removed.
data = data.drop_duplicates().dropna()

# Target: 'Number of vessels fluro' (our regression target).
# Removed from the features: 'index' (carries no information), 'Heart Disease'
# (deliberately left out of the predictors) and the target column itself.
Y = data['Number of vessels fluro']
X = data.drop(['index', 'Heart Disease', 'Number of vessels fluro'], axis=1)

# changing non-binary categorical data to dummy variables
# (done before splitting, so every split ends up with the same dummy columns)
X_after_dummies = pandas.get_dummies(X, columns=['Chest pain type', 'EKG results', 'Slope of ST', 'Thallium'])

# splitting the data set in ratio 3:1:1 (Train:Validation:Test):
# 20% goes to test, then 25% of the remaining 80% (= 20% of the total) goes to validation.
# No random_state is passed on purpose: every run produces a new random split.
X_train, X_test, Y_train, Y_test = train_test_split(X_after_dummies, Y, test_size=0.2)
X_train, X_val, Y_train, Y_val = train_test_split(X_train, Y_train, test_size=0.25)

# removing outliers in the Training set only (validation and test stay untouched);
# the same row labels are dropped from X_train and Y_train so they stay aligned
outlier_rows = X_train[(X_train['Cholesterol'] > 500) | (X_train['ST depression'] >= 4.5)].index
X_train = X_train.drop(outlier_rows, axis=0)
Y_train = Y_train.drop(outlier_rows, axis=0)

# NOTE: every transformer below is FITTED ON THE TRAINING SET ONLY and then merely applied
# to the validation/test sets. Re-fitting on each split (fit_transform everywhere) would
# scale/project every split with different parameters, making the splits incomparable
# and leaking validation/test statistics into the preprocessing.
# Metrics recorded with the earlier per-split fitting are therefore not comparable.

# creating normalized variables (min/max learned from the training set)
minmax_scaler = MinMaxScaler().set_output(transform="pandas")
X_train_normalized = minmax_scaler.fit_transform(X_train)
X_val_normalized = minmax_scaler.transform(X_val)
X_test_normalized = minmax_scaler.transform(X_test)

# creating standardized variables: only the continuous columns are standardized,
# the binary/dummy columns are appended unchanged afterwards
continuous_columns = ['Age', 'BP', 'Cholesterol', 'Max HR', 'ST depression']
standard_scaler = StandardScaler().set_output(transform="pandas")
X_train_standardized = standard_scaler.fit_transform(X_train[continuous_columns])
X_val_standardized = standard_scaler.transform(X_val[continuous_columns])
X_test_standardized = standard_scaler.transform(X_test[continuous_columns])

X_train_standardized = pandas.concat([X_train_standardized, X_train.drop(continuous_columns, axis=1)], axis=1)
X_val_standardized = pandas.concat([X_val_standardized, X_val.drop(continuous_columns, axis=1)], axis=1)
X_test_standardized = pandas.concat([X_test_standardized, X_test.drop(continuous_columns, axis=1)], axis=1)

# creating PCA variables: a single PCA is fitted on the normalized training set so that
# 'pca0'..'pca3' mean the same directions in every split
pca_train = PCA().set_output(transform="pandas").fit(X_train_normalized)
# kept as aliases so that any code still referring to the old per-split names keeps working
pca_val = pca_train
pca_test = pca_train

# the projection is applied to the NORMALIZED frames (the data the PCA was fitted on),
# and only the first four principal components are kept
pca_columns = ['pca0', 'pca1', 'pca2', 'pca3']
X_train_normalized_pca = pca_train.transform(X_train_normalized)[pca_columns]
X_val_normalized_pca = pca_val.transform(X_val_normalized)[pca_columns]
X_test_normalized_pca = pca_test.transform(X_test_normalized)[pca_columns]

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
## Grid search over MLPRegressor hyper-parameters for one chosen feature representation

# RuntimeWarning messages are silenced for the rest of the session.
warnings.simplefilter(action='ignore', category=RuntimeWarning)

# GridSearchCV cross-validates internally, so the training and validation
# targets are pooled into one set; the test targets are kept aside.
Y_train1 = pandas.concat([Y_train, Y_val])
Y_test1 = Y_test

# Exactly one feature representation is active at a time.
# To switch, comment the active pair out and uncomment another one.

# raw features:
#X_train1 = pandas.concat([X_train,X_val])
#X_test1 = X_test

# min-max normalized features:
#X_train1 = pandas.concat([X_train_normalized,X_val_normalized])
#X_test1 = X_test_normalized

# standardized features:
#X_train1 = pandas.concat([X_train_standardized,X_val_standardized])
#X_test1 = X_test_standardized

# first four principal components of the normalized features (active):
X_train1 = pandas.concat([X_train_normalized_pca, X_val_normalized_pca])
X_test1 = X_test_normalized_pca

# Candidate architectures: one, two or three hidden layers with 1-3 neurons each.
# Single-layer candidates are plain ints, multi-layer candidates are tuples.
layer_widths = (1, 2, 3)
one_hidden_layer = list(layer_widths)
two_hidden_layers = [(first, second)
                     for first in layer_widths
                     for second in layer_widths]
three_hidden_layers = [(first, second, third)
                       for first in layer_widths
                       for second in layer_widths
                       for third in layer_widths]

parameters = {
    'hidden_layer_sizes': one_hidden_layer + two_hidden_layers + three_hidden_layers,
    'activation': ('identity', 'logistic', 'tanh', 'relu'),
    'solver': ('lbfgs', 'sgd', 'adam'),
}

# 5-fold cross-validated search scored by negative RMSE.
reg = MLPRegressor(random_state=1, max_iter=10000)
grid_search = GridSearchCV(
    reg,
    parameters,
    cv=5,
    scoring='neg_root_mean_squared_error',
)
grid_search.fit(X_train1, Y_train1)

In [None]:
##Printing best parameters after grid search
# Shows the hyper-parameter combination selected by the fitted `grid_search`
# (the candidate with the best mean cross-validated score for the configured scoring).

print(grid_search.best_params_)

#==========================================================================================================================================
# random split No. 1
# for X_train and Y_train the result is:                {'activation': 'identity', 'hidden_layer_sizes': (2, 3, 3), 'solver': 'lbfgs'}
# for X_train_normalized and Y_train the result is:     {'activation': 'identity', 'hidden_layer_sizes': (1, 3, 1), 'solver': 'lbfgs'}
# for X_train_standardized and Y_train the result is:   {'activation': 'identity', 'hidden_layer_sizes': (2, 1, 2), 'solver': 'adam'}
# for X_train_normalized_pca and Y_train the result is: {'activation': 'identity', 'hidden_layer_sizes': (2, 2, 2), 'solver': 'sgd'}

#==========================================================================================================================================
# random split No. 2
# for X_train and Y_train the result is:                {'activation': 'identity', 'hidden_layer_sizes': 1, 'solver': 'lbfgs'}
# for X_train_normalized and Y_train the result is:     {'activation': 'identity', 'hidden_layer_sizes': (2, 1, 1), 'solver': 'adam'}
# for X_train_standardized and Y_train the result is:   {'activation': 'tanh', 'hidden_layer_sizes': 2, 'solver': 'adam'}
# for X_train_normalized_pca and Y_train the result is: {'activation': 'tanh', 'hidden_layer_sizes': (2, 2), 'solver': 'lbfgs'}

#==========================================================================================================================================
# random split No. 3
# for X_train and Y_train the result is:                {'activation': 'tanh', 'hidden_layer_sizes': (3, 2, 3), 'solver': 'adam'}
# for X_train_normalized and Y_train the result is:     {'activation': 'tanh', 'hidden_layer_sizes': (3, 2, 2), 'solver': 'adam'}
# for X_train_standardized and Y_train the result is:   {'activation': 'relu', 'hidden_layer_sizes': (3, 3), 'solver': 'adam'}
# for X_train_normalized_pca and Y_train the result is: {'activation': 'identity', 'hidden_layer_sizes': (3, 3, 1), 'solver': 'lbfgs'}

#==========================================================================================================================================
# random split No. 4
# for X_train and Y_train the result is:                {'activation': 'relu', 'hidden_layer_sizes': (3, 3, 1), 'solver': 'adam'}
# for X_train_normalized and Y_train the result is:     {'activation': 'tanh', 'hidden_layer_sizes': (2, 3, 3), 'solver': 'adam'}
# for X_train_standardized and Y_train the result is:   {'activation': 'relu', 'hidden_layer_sizes': (3, 3, 1), 'solver': 'adam'}
# for X_train_normalized_pca and Y_train the result is: {'activation': 'logistic', 'hidden_layer_sizes': (3, 2, 2), 'solver': 'lbfgs'}

#==========================================================================================================================================
# random split No. 5
# for X_train and Y_train the result is:                {'activation': 'identity', 'hidden_layer_sizes': (2, 1, 1), 'solver': 'lbfgs'}
# for X_train_normalized and Y_train the result is:     {'activation': 'relu', 'hidden_layer_sizes': (2, 3, 2), 'solver': 'lbfgs'}
# for X_train_standardized and Y_train the result is:   {'activation': 'tanh', 'hidden_layer_sizes': 2, 'solver': 'adam'}
# for X_train_normalized_pca and Y_train the result is: {'activation': 'identity', 'hidden_layer_sizes': 2, 'solver': 'lbfgs'}


{'activation': 'identity', 'hidden_layer_sizes': 2, 'solver': 'lbfgs'}


In [None]:
## Refit the chosen MLPRegressor configuration and report its train/test RMSE


def _root_mean_squared_error(predicted, actual):
    """Return the square root of the mean squared difference between predictions and targets."""
    return numpy.sqrt(numpy.mean((predicted - actual)**2))


# Hyper-parameters are typed in by hand from the grid-search output.
reg = MLPRegressor(
    hidden_layer_sizes=2,
    activation='identity',
    solver='lbfgs',
    random_state=1,
    max_iter=10000,
)
reg.fit(X_train1, Y_train1)

# Predictions on the data the model was fitted on and on the held-out test set.
y_pred_train = reg.predict(X_train1)
y_pred = reg.predict(X_test1)

rmse = _root_mean_squared_error(y_pred_train, Y_train1)
rmse_testing = _root_mean_squared_error(y_pred, Y_test1)
print("rmse:",rmse)
print("rmse_testing:",rmse_testing)

#==========================================================================================================================================
# random test split No. 1
# for X_train and Y_train the results are:
# rmse:         0.8437156423568695
# rmse_testing: 0.7247217799916873
#
# for X_train_normalized and Y_train the results are:
# rmse:         0.8294705258213844
# rmse_testing: 0.7308239496694735
#
# for X_train_standardized and Y_train the results are:
# rmse:         0.8590157493087447
# rmse_testing: 0.6891325214377091
#
# for X_train_normalized_pca and Y_train the results are:
# rmse:         0.9220446085288443
# rmse_testing: 0.8653325743978505

#==========================================================================================================================================
# random test split No. 2
# for X_train and Y_train the results are:
# rmse:         0.8161560423140588
# rmse_testing: 0.7722442462028298
#
# for X_train_normalized and Y_train the results are:
# rmse:         0.8815112406923867
# rmse_testing: 0.8075814924281434
#
# for X_train_standardized and Y_train the results are:
# rmse:         0.8287945582816887
# rmse_testing: 0.8287945582816887
#
# for X_train_normalized_pca and Y_train the results are:
# rmse:         0.8738887359251216
# rmse_testing: 0.8384768089707256

#==========================================================================================================================================
# random test split No. 3
# for X_train and Y_train the results are:
# rmse:         0.8122700745735869
# rmse_testing: 0.9454545403780725
#
# for X_train_normalized and Y_train the results are:
# rmse:         0.8381347571273422
# rmse_testing: 0.8889473437882617
#
# for X_train_standardized and Y_train the results are:
# rmse:         0.7049813477125647
# rmse_testing: 0.9219754811209938
#
# for X_train_normalized_pca and Y_train the results are:
# rmse:         0.8513141289478895
# rmse_testing: 0.8981381122997653

#==========================================================================================================================================
# random test split No. 4
# for X_train and Y_train the results are:
# rmse:         0.8525801241722037
# rmse_testing: 1.216769617099928
#
# for X_train_normalized and Y_train the results are:
# rmse:         0.8098487748307407
# rmse_testing: 0.8535397608337245
#
# for X_train_standardized and Y_train the results are:
# rmse:         0.6673387337019309
# rmse_testing: 0.9323472637368998
#
# for X_train_normalized_pca and Y_train the results are:
# rmse:         0.8205308935067461
# rmse_testing: 0.8719884717963478

#==========================================================================================================================================
# random test split No. 5
# for X_train and Y_train the results are:
# rmse:         0.7889453332281064
# rmse_testing: 0.9669756792721176
#
# for X_train_normalized and Y_train the results are:
# rmse:         0.7047483413401348
# rmse_testing: 0.9439886918941585
#
# for X_train_standardized and Y_train the results are:
# rmse:         0.8075535523522982
# rmse_testing: 0.9014995363905408
#
# for X_train_normalized_pca and Y_train the results are:
# rmse:         0.8363856159846348
# rmse_testing: 0.9692139863849197

rmse: 0.8363856159846348
rmse_testing: 0.9692139863849197
