In [29]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split, cross_val_score, KFold
from sklearn.preprocessing import LabelEncoder

import matplotlib.pyplot as plt
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier

from sklearn.metrics import accuracy_score, confusion_matrix
import seaborn as sns

In [30]:
BASE_PATH = "../results/"

In [31]:
df = pd.read_csv(f"{BASE_PATH}ecg.csv")
df = df.sample(frac = 1)

In [32]:
# create an instance of LabelEncoder
le = LabelEncoder()

# fit and transform the "Senior Management" 
# column with LabelEncoder
df['Gender'] = le.fit_transform(df['Gender'])
df.head()

Unnamed: 0,Name,Age,Gender,Mean_RR,STD_RR,RMS_RR,Mean_HR,STD_HR,RMSSD,Status
40,Rammurthy,33,1,755.460808,51.654792,757.224702,79.421724,5.83285,39.456187,1
129,Ragavendra,36,1,605.54937,9.944964,605.631028,99.083581,1.634651,3.58896,1
98,Harini M B,34,0,817.103964,91.868452,822.252213,73.430068,6.642448,99.751847,0
119,Manjula,33,0,822.864547,129.624705,833.01178,72.916011,5.807257,116.692757,0
127,Ramesh,23,1,678.353996,32.847672,679.148815,88.449394,4.16371,13.140582,0


In [33]:
X = df.drop(['Name', 'Age', 'Gender', 'Status'], axis=1).copy()
y = df['Status'].copy()

In [34]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=66)

In [35]:
params = {
    "ccp_alpha": [0.01, 0.05, 0.1, 0.5, 0.9, 1, 5, 9, 10, 50, 100],
    'max_depth': [None, 1, 5, 10, 20, 30, 40, 50, 60, 70, 80, 90, 100],
    'n_estimators': [50, 80, 100, 200, 300, 400, 500],
    # 'criterion': ["gini", "entropy"],
    # 'min_samples_split': [2, 5, 10, 20, 50],
    # 'min_samples_leaf': [1, 5, 10, 20, 50],
    # 'max_leaf_nodes': [None, 10, 20, 50, 100, 200, 500, 1000],
    # 'max_samples': [None, 0.1, 0.5, 1.0],
}

In [None]:
rfc = RandomForestClassifier()
rfcCV = GridSearchCV(estimator=rfc, param_grid=params, scoring='neg_mean_absolute_error', cv=2, return_train_score=True, verbose=1, n_jobs=-1)
rfcCV.fit(X_train, y_train)

In [None]:
opt_alpha_random_forest = rfcCV.best_params_['ccp_alpha']
print(f"Optimum Alpha value: {opt_alpha_random_forest}")

In [39]:
clf = RandomForestClassifier(max_depth=6, random_state=1, ccp_alpha=5)

kf = KFold(n_splits=5, shuffle=True, random_state=42)
cv_scores = cross_val_score(clf, X_train, y_train, cv=kf, scoring='accuracy')

# Print cross-validation scores
print("Cross-Validation Scores:", cv_scores)
print("Mean Accuracy:", np.mean(cv_scores))

clf.fit(X_train, y_train)

y_pred_test = clf.predict(X_test)

Cross-Validation Scores: [0.63157895 0.83333333 0.61111111 0.72222222 0.77777778]
Mean Accuracy: 0.7152046783625731


In [None]:
from sklearn.externals import joblib
joblib.dump(grid_search.best_estimator_, 'trained_model.joblib')

In [None]:
print(f"Accuracy: {int(accuracy_score(y_test, y_pred_test)*100)}%")

In [None]:
confusion_matrix(y_test, y_pred_test)

In [None]:
confusion_matrix(y_test, y_pred_test)
# Get and reshape confusion matrix data
matrix = confusion_matrix(y_test, y_pred_test)
matrix = matrix.astype('float') / matrix.sum(axis=1)[:, np.newaxis]

# Build the plot
plt.figure(figsize=(10,5))
sns.set(font_scale=1)
sns.heatmap(matrix, annot=True, annot_kws={'size':16},
            cmap=plt.cm.Blues, linewidths=0.2)

# Add labels to the plot
class_names = ['Alcoholic', 'Non-Alcoholic']
tick_marks = np.arange(len(class_names))
tick_marks2 = tick_marks + 0.5
plt.xticks(tick_marks, class_names, rotation=10)
plt.yticks(tick_marks2, class_names, rotation=0)
plt.xlabel('Predicted label')
plt.ylabel('True label')
plt.title('Confusion Matrix for Random Forest Model')
plt.show()