In [1]:
# import essential libraries
import pandas as pd
import numpy as np
import pickle
import scipy
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.neighbors import KNeighborsClassifier, KNeighborsRegressor
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.linear_model import LogisticRegression, LinearRegression, LogisticRegressionCV, SGDClassifier, RidgeClassifier
from sklearn.metrics import accuracy_score, r2_score, mean_squared_error, mean_absolute_error, f1_score
from sklearn.model_selection import train_test_split, cross_validate, GridSearchCV, RandomizedSearchCV
from sklearn.feature_selection import SelectKBest, f_regression
from sklearn import svm, linear_model
from sklearn import tree, metrics
from sklearn.metrics import classification_report
import warnings
warnings.filterwarnings('ignore')
from sklearn.preprocessing import LabelEncoder
import lightgbm
from bayes_opt import BayesianOptimization
from catboost import CatBoostClassifier, cv, Pool
import gzip

In [2]:
# Load the training table, then build the working frame: rows containing any
# missing value are dropped and the index is renumbered from zero.
df = pd.read_csv('training_13_features.csv')
# Disabled experiment (would run before the index reset):
#   new_df = new_df[new_df['ritmi'] != 2]
new_df = df.dropna().reset_index(drop=True)
# Preview of the raw frame as loaded (not the cleaned new_df).
df.head()

Unnamed: 0,ritmi,age,sex,height,weight,nurse,site,device,heart_axis,validated_by,second_opinion,validated_by_human,pacemaker,strat_fold
0,2,54.0,0,166.796356,69.841845,0.0,0.0,0,3.0,0.0,0,0,0.0,6
1,1,54.0,0,166.796356,69.841845,0.0,0.0,0,3.0,0.0,0,0,0.0,6
2,0,55.0,0,166.796356,69.841845,1.0,2.0,1,1.0,1.0,0,1,0.0,10
3,2,29.0,1,164.0,56.0,7.0,1.0,10,0.0,0.0,0,1,0.0,1
4,2,57.0,0,166.796356,69.841845,0.0,0.0,0,3.0,0.0,0,0,0.0,1


In [3]:
# Convert every column to float64 in a single call.
# DataFrame.astype covers all columns regardless of how many there are, so the
# cell no longer depends on the frame having exactly 14 columns (the previous
# index loop raised IndexError on narrower frames and silently skipped any
# column past the 14th).
# Note: the target column 'ritmi' is cast as well, which is why class labels
# appear as 0.0 / 1.0 / 2.0 in the reports further down.
new_df = new_df.astype('float64')

# get info for columns
new_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6366 entries, 0 to 6365
Data columns (total 14 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   ritmi               6366 non-null   float64
 1   age                 6366 non-null   float64
 2   sex                 6366 non-null   float64
 3   height              6366 non-null   float64
 4   weight              6366 non-null   float64
 5   nurse               6366 non-null   float64
 6   site                6366 non-null   float64
 7   device              6366 non-null   float64
 8   heart_axis          6366 non-null   float64
 9   validated_by        6366 non-null   float64
 10  second_opinion      6366 non-null   float64
 11  validated_by_human  6366 non-null   float64
 12  pacemaker           6366 non-null   float64
 13  strat_fold          6366 non-null   float64
dtypes: float64(14)
memory usage: 696.4 KB


In [4]:
# Hold-out split: 'ritmi' is the target, every remaining column is a feature.
# 20% of the rows go to the test set; the seed keeps the split repeatable.
# NOTE(review): 'strat_fold' stays in X as a feature - confirm that is intended.
y = new_df['ritmi']
X = new_df.drop(columns='ritmi')
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=246,
)

In [5]:
# Random Forest

In [6]:
# Random forest evaluated through GridSearchCV with 7-fold cross-validation.
# The grid contains a single candidate, so the search only scores that one
# configuration and leaves a fitted model in rf_cv for later prediction.
#
# random_state is now set on the estimator: without it the forest's internal
# randomness differs on every run, so the CV score could not be reproduced
# (compare the score noted inline below with the one printed by this cell).
# The seed matches the one used for the train/test split.
rf = RandomForestClassifier(random_state=246)
rf_param_grid = {
    'n_estimators': [600],
    'criterion': ['entropy'],
    'max_depth': [60],
}  # 0.502161524857536 - presumably the best CV score from an earlier, unseeded run
rf_cv = GridSearchCV(rf, rf_param_grid, cv=7, n_jobs=-1)
rf_cv.fit(X_train, y_train)

# Mean cross-validated score of the best (here: only) candidate and its settings.
print("Best Score:" + str(rf_cv.best_score_))
print("Best Parameters: " + str(rf_cv.best_params_))

Best Score:0.5015717481575214
Best Parameters: {'criterion': 'entropy', 'max_depth': 60, 'n_estimators': 600}


In [7]:
# Score the fitted random-forest search object on the held-out rows and
# print per-class precision / recall / F1.
y_pred = rf_cv.predict(X_test)
print(classification_report(y_true=y_test, y_pred=y_pred))

              precision    recall  f1-score   support

         0.0       0.42      0.37      0.39       407
         1.0       0.52      0.49      0.50       318
         2.0       0.53      0.60      0.56       549

    accuracy                           0.50      1274
   macro avg       0.49      0.49      0.49      1274
weighted avg       0.49      0.50      0.49      1274



In [8]:
# K-Neighbors

In [9]:
# K-nearest-neighbours classifier evaluated with a 5-fold, accuracy-scored
# grid search. The grid holds one candidate only.
clfl2 = KNeighborsClassifier()
parameters = {
    'n_neighbors': [140],
    'weights': ['distance'],
    'metric': ['euclidean'],
}  # 0.4830258302583026 - presumably the score from an earlier run

fitmodel = GridSearchCV(
    estimator=clfl2,
    param_grid=parameters,
    scoring="accuracy",
    cv=5,
    refit=True,  # keep a model refitted on all of X_train for predict()
    n_jobs=-1,
    verbose=2,
)
fitmodel.fit(X_train, y_train)
print(fitmodel.best_estimator_, fitmodel.best_params_, fitmodel.best_score_)

Fitting 5 folds for each of 1 candidates, totalling 5 fits
KNeighborsClassifier(metric='euclidean', n_neighbors=140, weights='distance') {'metric': 'euclidean', 'n_neighbors': 140, 'weights': 'distance'} 0.4823254047363357


In [10]:
# Score the fitted KNN search object on the held-out rows and print
# per-class precision / recall / F1. Reuses (overwrites) y_pred.
y_pred = fitmodel.predict(X_test)
print(classification_report(y_true=y_test, y_pred=y_pred))

              precision    recall  f1-score   support

         0.0       0.38      0.24      0.30       407
         1.0       0.55      0.41      0.47       318
         2.0       0.49      0.70      0.58       549

    accuracy                           0.48      1274
   macro avg       0.47      0.45      0.45      1274
weighted avg       0.47      0.48      0.46      1274

