# 2. Model Building and Evaluation

### 0. Imports

In [None]:
import numpy as np
import pandas as pd
import os
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import GradientBoostingClassifier
from sklearn import svm
from sklearn.linear_model import LogisticRegression
from sklearn import metrics
import cv2 as cv
from utils import *
from sklearn.preprocessing import LabelEncoder
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
import xgboost

### 1. Load in Processed Images

In [None]:
# Load every standardized training image as a grayscale array, using the name
# of its parent folder as the class label.
cwd = os.getcwd()
data_path = os.path.join(cwd,'standardized_data/Training')

X = []
y = []

# NOTE: the directory listing order is intentionally left as returned by
# os.listdir so the sample order is not changed relative to earlier runs.
for label in os.listdir(data_path):
    label_dir = os.path.join(data_path, label)
    if not os.path.isdir(label_dir):
        # Stray files next to the class folders (e.g. OS metadata files) are
        # not classes; listing them as directories would raise.
        continue
    for image in os.listdir(label_dir):
        image_path = os.path.join(label_dir, image)
        load_image = cv.imread(image_path, cv.IMREAD_GRAYSCALE)
        if load_image is None:
            # cv.imread signals an unreadable/non-image file by returning None
            # instead of raising; a None entry would break np.array(X) below.
            print("Skipping unreadable image: " + str(image_path))
            continue
        X.append(load_image)
        y.append(label)


# Convert to Numpy Array
X = np.array(X)
y = np.array(y)

# Encode the string labels as integers; `le` is kept so later cells can map
# the integers back to class names via le.classes_ / le.inverse_transform.
le = LabelEncoder()
y = le.fit_transform(y)

print(X.shape)
print(y.shape)

# Peek at the first sample and its encoded label.
print(X[0])
print(y[0])

### 2. Base Model (most common class classifier)

In [None]:
# Baseline: a classifier that always predicts the most frequent class.
# The class distribution is plotted twice: first by encoded integer label,
# then by the original class name.
plt.figure(figsize=(5,5))
y_train_series = pd.Series(y)
y_train_series.value_counts().plot(kind='bar')
plt.title('Distribution of Cancer Classes')
plt.xlabel('Class')
plt.ylabel('Count')
plt.show()
print(le.classes_)
y_transformed = le.inverse_transform(y)

plt.figure(figsize=(5,5))
y_train_series = pd.Series(y_transformed)
y_train_series.value_counts().plot(kind='bar')
plt.title('Distribution of Cancer Classes')
plt.xlabel('Class')
plt.ylabel('Count')
plt.show()


# Generate Model Statistics
# np.bincount(...).argmax() yields a plain integer label (lowest label wins a
# tie), independent of the SciPy version; stats.mode(...)[0] is an array or a
# scalar depending on the release.
most_common_label = np.bincount(y).argmax()
print("Most Common Label: " + str(most_common_label))
# Keep the predictions in the same integer dtype as the true labels.
y_predict = np.full_like(y, most_common_label)
print(y_predict.shape)
print(y.shape)

# Generate Confusion Matrix
confusion_matrix = metrics.confusion_matrix(y, y_predict)
cm_display = metrics.ConfusionMatrixDisplay(confusion_matrix = confusion_matrix, display_labels = le.classes_)
cm_display.plot()
plt.show()

# A constant predictor never predicts the other classes, so their precision is
# undefined; zero_division=0 scores them as 0 (the same value the default
# produces) without emitting a warning for every such class.
accuracy_score = metrics.accuracy_score(y, y_predict)
macro_precision = metrics.precision_score(y,y_predict,average ='macro',zero_division=0)
macro_recall = metrics.recall_score(y,y_predict,average='macro')
macro_f1 = metrics.f1_score(y,y_predict,average='macro',zero_division=0)
micro_precision = metrics.precision_score(y,y_predict,average='micro')
micro_recall = metrics.recall_score(y,y_predict,average='micro')
micro_f1 = metrics.f1_score(y,y_predict,average='micro')

print("================== TRAINING METRICS ===================")
print("Accuracy Score: " + str(accuracy_score))
print("Macro Precision: " + str(macro_precision))
print("Macro Recall: " + str(macro_recall))
print("Macro F1: " + str(macro_f1))
print("Micro Precision: " + str(micro_precision))
print("Micro Recall: " + str(micro_recall))
print("Micro F1: " + str(micro_f1))

### 3. Extract Features (Canny Edges, Difference of Gaussians, Determinant of Hessian, Complex Feature)

In [None]:
# ### TBD
# X_dog_features = get_features(X, feat_name='blob_dog')
# X_doh_features = get_features(X, feat_name='blob_doh')
# X_canny_features = get_features(X, feat_name='canny')
# pca = PCA(n_components=1000, svd_solver="randomized", whiten=True).fit(X_canny_features)
# X_pca_canny_features = pca.transform(X_canny_features)
# # X_complex_features = get_features(X, feat_name='complex')

# np.save('X_train_dog_features.npy', X_dog_features)
# np.save('X_train_doh_features.npy', X_doh_features)
# np.save('X_train_pca_canny_features.npy', X_pca_canny_features)
# # np.save('X_train_complex_features.npy', X_complex_features)
# np.save('Y_train.npy',y)
# y_raw_train = le.inverse_transform(y)
# np.save('Y_raw_train.npy',y_raw_train)
# print(y.shape)
# print(y_raw_train.shape)

### 3a. Load Features

In [None]:
# Read the precomputed training feature matrices from disk, in the order
# dog -> doh -> PCA-reduced canny.
X_train_dog_features, X_train_doh_features, X_train_pca_canny_features = map(
    np.load,
    (
        'X_train_dog_features.npy',
        'X_train_doh_features.npy',
        'X_train_pca_canny_features.npy',
    ),
)

# The "complex" features come back as a bundle object from get_features;
# only its X attribute is used here.
bundle = get_features(
    None,
    "complex",
    joblib_path="complex_feat_training.joblib",
    return_bundle=True,
)
X_train_complex_features = bundle.X

# Training labels saved alongside the features.
y_train = np.load('Y_train.npy')

### 3b. PCA Variance Vectors

In [None]:
# x = plot_PCA([X_dog_features, X_doh_features, X_canny_features, X_complex_features], n_components=[min(X_dog_features.shape), min(X_doh_features.shape), 1000, min(X_complex_features.shape)])


### 4. Shuffle Data

In [None]:
# Shuffle all training feature matrices and the labels with one shared,
# seeded permutation so rows stay aligned across feature sets.
np.random.seed(281)

# Labels are re-read in file order on every run. Shuffling the in-memory
# y_train in place would permute it again each time this cell is re-executed,
# while the feature matrices below are always rebuilt from the unshuffled
# X_train_* arrays -- silently misaligning labels and features.
labels_in_file_order = np.load('Y_train.npy')
n_samples = labels_in_file_order.shape[0]

# The permutation length is taken from the data actually being shuffled, and
# every feature matrix must have exactly one row per label.
unshuffled_features = {
    'dog': X_train_dog_features,
    'doh': X_train_doh_features,
    'canny': X_train_pca_canny_features,
    'complex': X_train_complex_features,
}
for feature_name, feature_matrix in unshuffled_features.items():
    if feature_matrix.shape[0] != n_samples:
        raise ValueError(
            "Feature set '" + str(feature_name) + "' has "
            + str(feature_matrix.shape[0]) + " rows but there are "
            + str(n_samples) + " labels"
        )

# Create a list of indexes that is the length of the number of training examples
indices = list(range(n_samples))
shuffle_indices = np.random.permutation(indices)

# The raw image array X and the encoded labels y are not reordered here;
# only the feature matrices and y_train are.
X_dog_features = X_train_dog_features[shuffle_indices]
X_doh_features = X_train_doh_features[shuffle_indices]
X_canny_features = X_train_pca_canny_features[shuffle_indices]
X_complex_features = X_train_complex_features[shuffle_indices]

y_train = labels_in_file_order[shuffle_indices]

In [None]:
# Print the shape of each shuffled feature matrix (dog, doh, canny, complex).
print(X_dog_features.shape)
print(X_doh_features.shape)
print(X_canny_features.shape)
print(X_complex_features.shape)

### 5. Model Training

In [None]:
# Train one model for every (model type, feature set) pair, collecting the
# per-run metrics in a DataFrame and the fitted models in a dict.
model_types = ['logistic', 'svm', 'rf', 'lda', 'qda']
features = {
    'dog': X_dog_features,
    'doh': X_doh_features,
    'canny': X_canny_features,
    'complex': X_complex_features,
}

result_columns = [
    'feature', 'model_type', 'accuracy_score',
    'macro_precision', 'macro_recall', 'macro_f1',
    'micro_precision', 'micro_recall', 'micro_f1',
    'training_time',
]
all_results = pd.DataFrame(columns=result_columns)
model_store = {}

for model_type in model_types:
    for feature, feature_matrix in features.items():
        model, results = train_model(
            feature_matrix,
            y_train,
            classes=le.classes_,
            model_type=model_type,
            feature=feature,
        )
        # Append the metrics as the next row of the results table.
        next_row = len(all_results)
        all_results.loc[next_row] = results
        # Models are keyed as "<model_type>_<feature>".
        model_store[f"{model_type}_{feature}"] = model
    # Persist everything trained so far once per model type.
    save_models(model_store, "models")

print(all_results)



In [None]:
# Write the collected training metrics to 'training_results.csv' in the
# current working directory (the DataFrame index is written as well, since
# to_csv is called with its defaults).
all_results.to_csv('training_results.csv')

### 5a. Load Models

In [None]:
# Load previously saved models from the "models" directory, one file per
# "<model_type>_<feature>" combination. Loading is best-effort: a model that
# cannot be loaded is reported and skipped.
model_types = ['logistic','svm','rf','lda','qda']
features = {'dog':X_dog_features,
            'doh':X_doh_features,
            'canny':X_canny_features,
            'complex':X_complex_features}

loaded_models = {}

# NOTE(review): `joblib` is not imported explicitly in this notebook;
# presumably it is provided by `from utils import *` -- confirm.
for model_type in model_types:
    for feature in features:
        feature_model = str(model_type) + "_" + str(feature)
        model_file = os.path.join("models", str(feature_model) + ".joblib")
        try:
            loaded_models[feature_model] = joblib.load(model_file)
        except Exception as err:
            # Exception (not a bare except) keeps Ctrl-C / kernel interrupts
            # working, and the reason is reported instead of being hidden.
            print("Could not load " + str(feature_model) + ": " + repr(err))

print(loaded_models.keys())

### 6. Final Test Evaluation

In [None]:
# Evaluate every loaded model on the held-out test features that match the
# feature set it was trained on, and save the metrics to 'test_results.csv'.
X_test_canny = np.load('X_test_canny.npy')
X_test_doh = np.load('X_test_doh.npy')
X_test_dog = np.load('X_test_dog.npy')
bundle = get_features(None, "complex", joblib_path="complex_feat_testing.joblib", return_bundle=True)
X_test_complex = bundle.X
Y_test = np.load('y_test.npy')

# Exact lookup by feature name replaces substring matching on the whole model
# key, so a feature name can never be matched by accident and an unknown
# feature cannot silently reuse the previous iteration's X_test.
test_features = {'canny': X_test_canny,
                 'dog': X_test_dog,
                 'doh': X_test_doh,
                 'complex': X_test_complex}

all_test_results= pd.DataFrame(columns = ['feature', 'model_type', 'accuracy_score', 'macro_precision', 'macro_recall', 'macro_f1', 
                                      'micro_precision', 'micro_recall', 'micro_f1', 'inference_time'])

for model in loaded_models.keys():
    # Keys have the form "<model_type>_<feature>".
    name_parts = model.split('_')
    feature = name_parts[-1]
    model_type = name_parts[0]

    if feature not in test_features:
        print("No test features for " + str(model) + "; skipping")
        continue
    X_test = test_features[feature]

    # Evaluate the model that was loaded from disk (loaded_models), not the
    # in-memory training dict, which does not exist after a kernel restart.
    result_dict = test_model(loaded_models[model], X_test, Y_test, le.classes_, model_type=str(model_type), feature=str(feature))
    all_test_results.loc[len(all_test_results)] = result_dict

all_test_results.to_csv("test_results.csv")
