# Loading the data and exploring its shape and values

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Collect every file path found beneath the Kaggle input directory, then print one per line
input_file_paths = [
    os.path.join(folder, name)
    for folder, _, names in os.walk('/kaggle/input')
    for name in names
]
for input_path in input_file_paths:
    print(input_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Read the four heartbeat CSV files; header=None means the first row is data, not column names
heartbeat_csv_paths = (
    '/kaggle/input/heartbeat/mitbih_test.csv',
    '/kaggle/input/heartbeat/mitbih_train.csv',
    '/kaggle/input/heartbeat/ptbdb_abnormal.csv',
    '/kaggle/input/heartbeat/ptbdb_normal.csv',
)
mit_test, mit_train, ptb_abnormal, ptb_normal = (
    pd.read_csv(csv_path, header=None) for csv_path in heartbeat_csv_paths
)

In [None]:
# Preview the first five rows of the MIT test frame
mit_test.head(n=5)

In [None]:
# Preview the first five rows of the MIT training frame
mit_train.head(n=5)

In [None]:
# Preview the first five rows of the PTB abnormal frame
ptb_abnormal.head(n=5)

In [None]:
# Preview the first five rows of the PTB normal frame
ptb_normal.head(n=5)

In [None]:
# Give the column labelled 187 the readable name "Class" in all four frames, modifying each in place
for heartbeat_frame in (mit_test, mit_train, ptb_abnormal, ptb_normal):
    heartbeat_frame.rename(columns={187: "Class"}, inplace=True)

Looking at how many classes there are in each dataset.
The MIT dataset has 5 classes:
* 0 = N  (Normal Beat)
* 1 = S  (Supraventricular premature beat)
* 2 = V  (Premature ventricular contraction)
* 3 = F  (Fusion of ventricular and normal beat)
* 4 = Q  (Unclassifiable beat)

In comparison, the PTB dataset has only two classes: 1 for abnormal and 0 for normal.


In [None]:
# Report how many rows fall into each class, one dataset at a time
class_count_reports = (
    ("MIT Train classes: \n", mit_train),
    ("\nMIT Test classes: \n", mit_test),
    ("\nPTB Abnormal classes: \n", ptb_abnormal),
    ("\nPTB Normal classes: \n", ptb_normal),
)
for heading, frame in class_count_reports:
    print(heading, frame["Class"].value_counts())

In [None]:
# Lookup tables translating the numeric class codes (float keys) into readable heartbeat names
mit_class_names = (
    'Normal Beat',
    'Supraventricular premature beat',
    'Premature ventricular contraction',
    'Fusion of ventricular and normal beat',
    'Unclassifiable beat',
)
MIT_Outcome = {float(code): label for code, label in enumerate(mit_class_names)}
PTB_Outcome = dict(zip((0., 1.), ('Normal', 'Abnormal')))

# Generating Plots of some of the samples in the dataset

In [None]:
# Draw ten randomly chosen MIT training beats on a 2x5 grid, each titled with its class name
plt.figure(figsize=(25,10))
np_count = np.linspace(0,186,187)      # positions 0..186 of the signal columns
np_time = np.tile(np_count,(10,1))     # one copy of those positions per panel (also used by the PTB plot cell)
rnd = np.random.randint(0,mit_train.shape[0],size=(10,))

# Panels are numbered from 1; each pairs a random row with its row of column positions
for panel, (row, signal_cols) in enumerate(zip(rnd, np_time), start=1):
    ax = plt.subplot(2, 5, panel)
    ax.plot(mit_train.iloc[row, signal_cols])
    ax.set_title(MIT_Outcome[mit_train.loc[row, 'Class']])

plt.show()


In [None]:
# Plot five random normal PTB beats (panels 1-5) followed by five random abnormal ones (panels 6-10)
plt.figure(figsize=(25,10))
rnd = np.random.randint(0,ptb_normal.shape[0],size=(5,))
rnd1 = np.random.randint(0,ptb_abnormal.shape[0], size=(5,))

# np_time comes from the MIT plotting cell; its row count sets the number of panels
for i in range(np_time.shape[0]):
    # the first five panels draw from the normal frame, the remaining ones from the abnormal frame
    source_frame, row = (ptb_normal, rnd[i]) if i < 5 else (ptb_abnormal, rnd1[i-5])
    ax = plt.subplot(2,5,i+1)
    ax.plot(source_frame.iloc[row, np_time[i,:]])
    ax.set_title(PTB_Outcome[source_frame.loc[row,'Class']])

plt.show()

# Experimenting with Classifiers for PTB Dataset

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from sklearn.metrics import confusion_matrix
from sklearn.preprocessing import normalize
from sklearn.svm import SVC 
from sklearn.decomposition import PCA
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report

# Stack the normal and abnormal PTB beats under a fresh 0..n-1 index, then shuffle all rows reproducibly
ptb_full = pd.concat([ptb_normal, ptb_abnormal], axis=0).reset_index(drop=True)
ptb_full = ptb_full.sample(frac=1, random_state=42)

# Features are the first 187 columns, the target is the last column
ptb_signals, ptb_labels = ptb_full.iloc[:, :187], ptb_full.iloc[:, -1]

# Hold out 15% as the test split, then 20% of the remainder as the validation split
learn_ptb, test_ptb, out_learn_ptb, out_test_ptb = train_test_split(
    ptb_signals, ptb_labels, test_size=0.15, random_state=42)
train_ptb, valid_ptb, out_train_ptb, out_valid_ptb = train_test_split(
    learn_ptb, out_learn_ptb, test_size=0.2, random_state=42)

In [None]:
# Show the shape of each PTB split
for split_label, split_features in (
    ("Traing dataset size: ", train_ptb),
    ("Validation dataset size: ", valid_ptb),
    ("Test dataset size: ", test_ptb),
):
    print(split_label, split_features.shape)

In [None]:
# Scaling the training, validation & test data column-wise by the largest absolute value.
# The per-column scale factors are learned from the training split only and then re-used for the
# validation and test splits, so all three splits are expressed on the same scale.
# (Calling normalize(..., axis=0) on each split separately divides every split by its own column
# maxima, so the same raw value can map to different feature values in different splits.)
from sklearn.preprocessing import MaxAbsScaler

ptb_scaler = MaxAbsScaler().fit(train_ptb)
train_ptb = ptb_scaler.transform(train_ptb)
valid_ptb = ptb_scaler.transform(valid_ptb)
test_ptb = ptb_scaler.transform(test_ptb)

In [None]:
# Sanity check: list the distinct labels present in the training targets (both classes should appear)
np.unique(out_train_ptb)

# Running SVM

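We use GridSearchCV to find the best parameters for the SVM model with F1 micro scoring. Micro-averaging computes a single F1 score from the pooled per-sample decisions of all classes; note that for single-label classification this is equal to plain accuracy, so the class imbalance is addressed mainly by `class_weight='balanced'` on the model rather than by the metric itself.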

In [None]:
#Looking at the plots we can see that there are a lot "zero" values which will not likely help our classification.  Eyeballing the data I chose 100 features to keep.
svc = SVC(kernel='rbf', class_weight='balanced')

param_grid = {'C': [1, 5, 10]}
grid_svc = GridSearchCV (svc, param_grid, verbose=2, scoring='f1_micro')

# Train the grid of models. Time this process.
%time grid_svc.fit(train_ptb, out_train_ptb)

# Print the parameters which yield the best model performance
print (grid_svc.best_estimator_)

In [None]:
# Take the winning model from the previous grid search and score it on the PTB validation split
svc = grid_svc.best_estimator_
pred_svc = svc.predict(valid_ptb)

# Class names in the order in which PTB_Outcome lists them
ptb_class_names = list(PTB_Outcome.values())
print(classification_report(out_valid_ptb, pred_svc, target_names=ptb_class_names))

In [None]:
# Sanity check: list the distinct labels the SVM predicted on the validation split (both classes should appear)
np.unique(pred_svc)

## SVM observations:

The SVM model performed quite well, with an F1 score of 0.91 (for Normal) and 0.96 (for Abnormal), considering that the data was imbalanced.  The overall accuracy was 95%, which is also impressive.  I don't believe there is anything that would stop us from using such a model, but let's see if other models can outperform it.

# Running ExtraTreesClassifier

Considering that this is a time-series dataset, where the prior value impacts the current value, our intuition is that this model would perform badly, as "randomly" selecting features and making decisions based on these values would make for an archaic model.

In [None]:
# Extremely-randomised trees with balanced class weights; tree count and depth are tuned by grid search.
# `max_samples` is deliberately not set: ExtraTreesClassifier defaults to bootstrap=False, in which
# case each tree is built from the whole training set and a `max_samples` value cannot take effect.
# Recent scikit-learn releases reject that combination with a ValueError instead of ignoring it.
forest = ExtraTreesClassifier(criterion='entropy', class_weight='balanced', random_state=42)

param_grid = {'n_estimators': [10, 20, 30],
              'max_depth': [5, 10, 15, 20]}
grid_forest = GridSearchCV(forest, param_grid, scoring='f1_micro', verbose=2)

grid_forest.fit(train_ptb, out_train_ptb)

# Show the parameter combination with the best cross-validated score
print(grid_forest.best_params_)

In [None]:
# Retrieve the grid search's best model and evaluate its predictions on the PTB validation split
forest = grid_forest.best_estimator_
pred_forest = forest.predict(valid_ptb)
print(classification_report(out_valid_ptb, pred_forest, target_names=list(PTB_Outcome.values())))

In [None]:
# Sanity check: list the distinct labels the forest predicted on the validation split
np.unique(pred_forest)

## ExtraTreesClassifier Observation

Our intuition was quite off and the model did even better than SVM.  The results are even more impressive, with an F1 score of 0.94 (for Normal) and 0.98 (for Abnormal). This sets the bar quite high, so let's see if any other model can outperform it.

# Running Logistic Regression

Now we run the standard Logistic Regression model. Our intuition is that it would perform well, since it will take the data as is (without randomization) and just try to predict an outcome.  The results should be comparable to SVM.

In [None]:
# Class-weighted logistic regression, fitted once (no grid search) and scored on the PTB validation split
logistic = LogisticRegression(class_weight='balanced', max_iter=1000, random_state=42)
pred_log = logistic.fit(train_ptb, out_train_ptb).predict(valid_ptb)
print(classification_report(out_valid_ptb, pred_log, target_names=list(PTB_Outcome.values())))

### Logistic Regression observation:

The Logistic Regression model has been the poorest-performing model of the 3 evaluated so far.


# Running TPOTClassifier to determine best algorithm
### This part of the code takes roughly 3.5-4 hours to run, so it is kept in a markdown cell (not executed) to avoid the long run times.  
#The results are below; you can convert this cell from markdown to code to run it if desired

#===================    Begin Code here ======================================
#AutoML search with TPOT: 5 generations, population of 40, scored with micro-averaged F1, seeded for repeatability
from tpot import TPOTClassifier

tpot = TPOTClassifier (generations=5, population_size=40, verbosity=2, random_state=42, scoring='f1_micro')
tpot.fit(train_ptb, out_train_ptb)

#evaluate the classifier against the validation set
print(tpot.score(valid_ptb, out_valid_ptb))

#export the best pipeline found as a standalone Python script
tpot.export('PTB_Data_Classifier.py')

#===================   End Code here ==========================================

======================== OUTPUT =============================

Generation 1 - Current best internal CV score: 0.9667508842849925

Generation 2 - Current best internal CV score: 0.9710965133906013

Generation 3 - Current best internal CV score: 0.9751389590702374

Generation 4 - Current best internal CV score: 0.9755432036382011

Generation 5 - Current best internal CV score: 0.9780697321879737


Best pipeline: GradientBoostingClassifier(input_matrix, learning_rate=0.5, max_depth=8, max_features=0.1, min_samples_leaf=4, min_samples_split=15, n_estimators=100, subsample=0.9000000000000001)

0.9773645917542442





================== END OF OUTPUT =============================

########  PTB_Data_Classifier.py File content ########

import numpy as np
import pandas as pd
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import train_test_split

#NOTE: Make sure that the outcome column is labeled 'target' in the data file
tpot_data = pd.read_csv('PATH/TO/DATA/FILE', sep='COLUMN_SEPARATOR', dtype=np.float64)
features = tpot_data.drop('target', axis=1)
training_features, testing_features, training_target, testing_target = train_test_split(features, tpot_data['target'], random_state=42)

#Average CV score on the training set was: 0.9780697321879737
exported_pipeline = GradientBoostingClassifier(learning_rate=0.5, max_depth=8, max_features=0.1, min_samples_leaf=4, min_samples_split=15, n_estimators=100, subsample=0.9000000000000001)

#Fix random state in exported estimator
if hasattr(exported_pipeline, 'random_state'):
    setattr(exported_pipeline, 'random_state', 42)

exported_pipeline.fit(training_features, training_target)

results = exported_pipeline.predict(testing_features)


########  End of File content ########

# Running GradientBoostingClassifier

Finally, let's run the GradientBoostingClassifier that AutoML has recommended for us and see what we observe.  We're expecting this to be our best performing model so far.

In [None]:
from sklearn.ensemble import GradientBoostingClassifier

# Re-create the pipeline recommended by TPOT and score it on the PTB validation split.
# random_state is pinned to 42, as the exported TPOT script does: with subsample and max_features
# both below 1 the fit draws random rows and features, so an unseeded run gives different scores each time.
gradboost = GradientBoostingClassifier(learning_rate=0.5, max_depth=8, max_features=0.1,
                                       min_samples_leaf=4, min_samples_split=15, n_estimators=100,
                                       subsample=0.9000000000000001, random_state=42)
gradboost.fit(train_ptb, out_train_ptb)
pred_gradboost = gradboost.predict(valid_ptb)

print(classification_report(out_valid_ptb, pred_gradboost, target_names=[PTB_Outcome[i] for i in PTB_Outcome]))

### GradientBoostingClassifier:

AutoML has recommended a very high-performing model. It would be very difficult to beat; Deep Learning might be able to produce a better-performing model, but would the minimal gains in results come with an acceptable performance hit?

# Analyzing the MIT Heartbeat Data
Now we repeat the same analysis with a different dataset and tweak accordingly

In [None]:
# Since the MIT dataset already comes as a train set and test set, we just split 20% of the training set for validation
train_mit, valid_mit, out_train_mit, out_valid_mit = train_test_split(mit_train.iloc[:,:187], mit_train.iloc[:,-1], test_size=0.20, random_state=42)

# Separate the test features (first 187 columns) from the test targets (last column)
test_mit, out_test_mit = mit_test.iloc[:,:187], mit_test.iloc[:,-1]

# Scaling the training, validation & test data column-wise by the largest absolute value.
# The scale factors are learned from the MIT training split only and re-used for the other two splits,
# so all three are expressed on the same scale.
# The test features scaled here are the MIT ones (test_mit); previously the PTB test split was
# normalised and stored under the MIT name, which replaced the MIT test features with PTB data.
from sklearn.preprocessing import MaxAbsScaler

mit_scaler = MaxAbsScaler().fit(train_mit)
train_mit = mit_scaler.transform(train_mit)
valid_mit = mit_scaler.transform(valid_mit)
test_mit = mit_scaler.transform(test_mit)

# Running SVM

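We use GridSearchCV to find the best parameters for the SVM model with F1 micro scoring. Micro-averaging computes a single F1 score from the pooled per-sample decisions of all classes; note that for single-label classification this is equal to plain accuracy, so the class imbalance is addressed mainly by `class_weight='balanced'` on the model rather than by the metric itself.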

In [None]:
#Looking at the plots we can see that there are a lot "zero" values which will not likely help our classification.  Eyeballing the data I chose 100 features to keep.
svc = SVC(kernel='rbf', class_weight='balanced')

param_grid = {'C': [1, 5, 10]}
grid_svc = GridSearchCV (svc, param_grid, verbose=2, scoring='f1_micro')

# Train the grid of models. Time this process.
%time grid_svc.fit(train_mit, out_train_mit)

# Print the parameters which yield the best model performance
print (grid_svc.best_estimator_)

In [None]:
# Take the winning model from the previous grid search and score it on the MIT validation split
svc = grid_svc.best_estimator_
pred_svc_mit = svc.predict(valid_mit)

# Class names in the order in which MIT_Outcome lists them
mit_report_names = list(MIT_Outcome.values())
print(classification_report(out_valid_mit, pred_svc_mit, target_names=mit_report_names))

# Running ExtraTreesClassifier

Considering that this is a time-series dataset, where the prior value impacts the current value, our intuition is that this model would perform badly, as "randomly" selecting features and making decisions based on these values would make for an archaic model.

In [None]:
# Extremely-randomised trees with balanced class weights; tree count and depth are tuned by grid search.
# `max_samples` is deliberately not set: ExtraTreesClassifier defaults to bootstrap=False, in which
# case each tree is built from the whole training set and a `max_samples` value cannot take effect.
# Recent scikit-learn releases reject that combination with a ValueError instead of ignoring it.
forest_mit = ExtraTreesClassifier(criterion='entropy', class_weight='balanced', random_state=42)

param_grid = {'n_estimators': [10, 20, 30],
              'max_depth': [5, 10, 15, 20]}
grid_forest_mit = GridSearchCV(forest_mit, param_grid, scoring='f1_micro', verbose=2)

grid_forest_mit.fit(train_mit, out_train_mit)

# Show the parameter combination with the best cross-validated score
print(grid_forest_mit.best_params_)

In [None]:
# Retrieve the grid search's best model and evaluate its predictions on the MIT validation split
forest_mit = grid_forest_mit.best_estimator_
pred_forest_mit = forest_mit.predict(valid_mit)
print(classification_report(out_valid_mit, pred_forest_mit, target_names=list(MIT_Outcome.values())))

# Running Logistic Regression

Now we run the standard Logistic Regression model. Our intuition is that it would perform well, since it will take the data as is (without randomization) and just try to predict an outcome.  The results should be comparable to SVM.

In [None]:
# Class-weighted logistic regression, fitted once (no grid search) and scored on the MIT validation split
logistic_mit = LogisticRegression(class_weight='balanced', max_iter=10000, random_state=42)
pred_log_mit = logistic_mit.fit(train_mit, out_train_mit).predict(valid_mit)
print(classification_report(out_valid_mit, pred_log_mit, target_names=list(MIT_Outcome.values())))

# Running GradientBoostingClassifier

This time we're not going to run AutoML, as it's very time-consuming, especially with 5 classes, so we're just going to reuse the model configuration found for the PTB dataset and see how it fares.

In [None]:
from sklearn.ensemble import GradientBoostingClassifier

# Fit the gradient-boosting configuration to the MIT training split and score it on the MIT validation split.
# random_state is pinned to 42: with subsample and max_features both below 1 the fit draws random
# rows and features, so an unseeded run gives different scores each time.
gradboost_mit = GradientBoostingClassifier(learning_rate=0.5, max_depth=8, max_features=0.1,
                                           min_samples_leaf=4, min_samples_split=15, n_estimators=100,
                                           subsample=0.9000000000000001, random_state=42)
gradboost_mit.fit(train_mit, out_train_mit)
pred_gradboost_mit = gradboost_mit.predict(valid_mit)

print(classification_report(out_valid_mit, pred_gradboost_mit, target_names=[MIT_Outcome[i] for i in MIT_Outcome]))