# Preprocessing The Data

In [6]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, precision_score, recall_score, f1_score

In [13]:
file_path_y =  r"C:\Users\noorh\OneDrive\Desktop\Grad Project\HMC Dataset\A4C.xlsx"
df_y = pd.read_excel(file_path_y)

# Convert 'non-MI' to 0 and 'MI' to 1 in the SEG columns
for col in ['SEG1', 'SEG2', 'SEG3', 'SEG5', 'SEG6', 'SEG7']:
    df_y[col] = df_y[col].apply(lambda x: 0 if x == 'non-MI' else 1 if x == 'MI' else x)

# Load the motion feature vectors
file_path_x = r"C:\Users\noorh\Echocardiograms\Final Submission\motion_features.xlsx"
df_x = pd.read_excel(file_path_x)

# Remove the "Mask_" prefix from 'Sub-directory Name' to match with ECHO in df_y
#df_x['subdirectory'] = df_x['subdirectory'].str.replace('Mask_', '')


merged_df = pd.merge(df_y, df_x, left_on='ECHO', right_on='subdirectory')

X_columns = ['segment1', 'segment2', 'segment3', 'segment5', 'segment6', 'segment7']
y_columns = ['SEG1', 'SEG2', 'SEG3', 'SEG5', 'SEG6', 'SEG7']

X = merged_df[X_columns]
y = merged_df[y_columns]

# Create a single binary target column
y['target'] = y.max(axis=1)

# Normalize the features to unity
scaler = MinMaxScaler()
X_scaled = scaler.fit_transform(X)

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y['target'], test_size=0.2, random_state=42)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  y['target'] = y.max(axis=1)


In [14]:
def calculate_metrics(y_test, y_pred):
    conf_matrix = confusion_matrix(y_test, y_pred)
    tn, fp, fn, tp = conf_matrix.ravel()

    accuracy = (tp + tn) / (tp + tn + fp + fn)
    sensitivity = tp / (tp + fn)
    specificity = tn / (tn + fp) 
    precision = tp / (tp + fp) 
    f1 = f1_score(y_test, y_pred)

    print("Classification Report:\n", classification_report(y_test, y_pred))
    print("Confusion Matrix:\n", conf_matrix)
    print("Accuracy:", accuracy)
    print("Precision:", precision)
    print("Sensitivity (Recall):", sensitivity)
    print("Specificity:", specificity)
    print("F1 Score:", f1)

# Decision Tree Classifier

In [15]:
# Define the parameter grid for GridSearchCV
param_grid = {
    'criterion': ['gini', 'entropy'],
    'splitter': ['best', 'random'],
    'max_depth': [None, 10, 20, 30, 40, 50],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'max_features': [None, 'sqrt', 'log2'],
    'max_leaf_nodes': [None, 10, 20, 30, 40, 50]
}

clf = DecisionTreeClassifier(random_state=42)

grid_search = GridSearchCV(estimator=clf, param_grid=param_grid, cv=5, scoring='accuracy')
grid_search.fit(X_train, y_train)

# Get the best parameters
best_params = grid_search.best_params_
print("Best parameters found: ", best_params)

best_model = grid_search.best_estimator_
y_pred_best = best_model.predict(X_test)

Best parameters found:  {'criterion': 'gini', 'max_depth': None, 'max_features': None, 'max_leaf_nodes': None, 'min_samples_leaf': 1, 'min_samples_split': 5, 'splitter': 'best'}


In [16]:
calculate_metrics(y_test, y_pred_best)

Classification Report:
               precision    recall  f1-score   support

           0       0.71      0.71      0.71         7
           1       0.86      0.86      0.86        14

    accuracy                           0.81        21
   macro avg       0.79      0.79      0.79        21
weighted avg       0.81      0.81      0.81        21

Confusion Matrix:
 [[ 5  2]
 [ 2 12]]
Accuracy: 0.8095238095238095
Precision: 0.8571428571428571
Sensitivity (Recall): 0.8571428571428571
Specificity: 0.7142857142857143
F1 Score: 0.8571428571428571


# SVM Classifier

In [17]:
# Define the parameter grid for GridSearchCV
param_grid = {
    'C': [0.1, 1, 10, 100, 1000],
    'gamma': ['scale', 'auto', 0.001, 0.01, 0.1, 1],
    'kernel': ['linear', 'poly', 'rbf', 'sigmoid']
}

clf = SVC(random_state=42)

grid_search = GridSearchCV(estimator=clf, param_grid=param_grid, cv=5, scoring='accuracy')
grid_search.fit(X_train, y_train)

# Get the best parameters
best_params = grid_search.best_params_
print("Best parameters found: ", best_params)

best_model = grid_search.best_estimator_
y_pred_best = best_model.predict(X_test)

Best parameters found:  {'C': 1000, 'gamma': 0.1, 'kernel': 'rbf'}


In [18]:
calculate_metrics(y_test, y_pred_best)

Classification Report:
               precision    recall  f1-score   support

           0       0.75      0.86      0.80         7
           1       0.92      0.86      0.89        14

    accuracy                           0.86        21
   macro avg       0.84      0.86      0.84        21
weighted avg       0.87      0.86      0.86        21

Confusion Matrix:
 [[ 6  1]
 [ 2 12]]
Accuracy: 0.8571428571428571
Precision: 0.9230769230769231
Sensitivity (Recall): 0.8571428571428571
Specificity: 0.8571428571428571
F1 Score: 0.888888888888889


# Logistic Regression Classifier

In [19]:
# Define the parameter grid for GridSearchCV
param_grid = {
    'C': [0.1, 1, 10, 100, 1000],
    'penalty': ['l1', 'l2', 'elasticnet', 'none'],
    'solver': ['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga']
}

clf = LogisticRegression(random_state=42, max_iter=10000)

grid_search = GridSearchCV(estimator=clf, param_grid=param_grid, cv=5, scoring='accuracy')
grid_search.fit(X_train, y_train)

# Get the best parameters
best_params = grid_search.best_params_
print("Best parameters found: ", best_params)

best_model = grid_search.best_estimator_
y_pred_best = best_model.predict(X_test)



Best parameters found:  {'C': 0.1, 'penalty': 'none', 'solver': 'newton-cg'}


225 fits failed out of a total of 500.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
25 fits failed with the following error:
Traceback (most recent call last):
  File "C:\Users\noorh\anaconda3\lib\site-packages\sklearn\model_selection\_validation.py", line 680, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\noorh\anaconda3\lib\site-packages\sklearn\linear_model\_logistic.py", line 1461, in fit
    solver = _check_solver(self.solver, self.penalty, self.dual)
  File "C:\Users\noorh\anaconda3\lib\site-packages\sklearn\linear_model\_logistic.py", line 447, in _check_solver
    raise ValueError(
ValueError: Solver newton-cg supports only 'l2' or 'none' penalties, got l1 penalty.

----------------------

In [20]:
calculate_metrics(y_test, y_pred_best)

Classification Report:
               precision    recall  f1-score   support

           0       0.75      0.86      0.80         7
           1       0.92      0.86      0.89        14

    accuracy                           0.86        21
   macro avg       0.84      0.86      0.84        21
weighted avg       0.87      0.86      0.86        21

Confusion Matrix:
 [[ 6  1]
 [ 2 12]]
Accuracy: 0.8571428571428571
Precision: 0.9230769230769231
Sensitivity (Recall): 0.8571428571428571
Specificity: 0.8571428571428571
F1 Score: 0.888888888888889


# KNN Classifier 

In [21]:
# Define the parameter grid for GridSearchCV
param_grid = {
    'n_neighbors': [3, 5, 7, 9, 11, 13, 15],
    'weights': ['uniform', 'distance'],
    'metric': ['euclidean', 'manhattan', 'minkowski']
}

clf = KNeighborsClassifier()

grid_search = GridSearchCV(estimator=clf, param_grid=param_grid, cv=5, scoring='accuracy')
grid_search.fit(X_train, y_train)

# Get the best parameters
best_params = grid_search.best_params_
print("Best parameters found: ", best_params)

best_model = grid_search.best_estimator_
y_pred_best = best_model.predict(X_test)

  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mo

Best parameters found:  {'metric': 'euclidean', 'n_neighbors': 3, 'weights': 'uniform'}


  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mo

In [22]:
calculate_metrics(y_test, y_pred_best)

Classification Report:
               precision    recall  f1-score   support

           0       0.50      0.43      0.46         7
           1       0.73      0.79      0.76        14

    accuracy                           0.67        21
   macro avg       0.62      0.61      0.61        21
weighted avg       0.66      0.67      0.66        21

Confusion Matrix:
 [[ 3  4]
 [ 3 11]]
Accuracy: 0.6666666666666666
Precision: 0.7333333333333333
Sensitivity (Recall): 0.7857142857142857
Specificity: 0.42857142857142855
F1 Score: 0.7586206896551724


# Random Forest Classifier

In [23]:
# Define the parameter grid for GridSearchCV
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_features': ['auto', 'sqrt', 'log2'],
    'max_depth': [None, 10, 20, 30, 40, 50],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'bootstrap': [True, False]
}

clf = RandomForestClassifier(random_state=42)

grid_search = GridSearchCV(estimator=clf, param_grid=param_grid, cv=5, scoring='accuracy')
grid_search.fit(X_train, y_train)

# Get the best parameters
best_params = grid_search.best_params_
print("Best parameters found: ", best_params)

best_model = grid_search.best_estimator_
y_pred_best = best_model.predict(X_test)

Best parameters found:  {'bootstrap': True, 'max_depth': None, 'max_features': 'auto', 'min_samples_leaf': 1, 'min_samples_split': 5, 'n_estimators': 50}


In [24]:
calculate_metrics(y_test, y_pred_best)

Classification Report:
               precision    recall  f1-score   support

           0       0.83      0.71      0.77         7
           1       0.87      0.93      0.90        14

    accuracy                           0.86        21
   macro avg       0.85      0.82      0.83        21
weighted avg       0.86      0.86      0.85        21

Confusion Matrix:
 [[ 5  2]
 [ 1 13]]
Accuracy: 0.8571428571428571
Precision: 0.8666666666666667
Sensitivity (Recall): 0.9285714285714286
Specificity: 0.7142857142857143
F1 Score: 0.896551724137931
