# Functions File

# In [1]:
# Import required libraries
import numpy as np
import pandas as pd
import pickle as pkl
from numpy import expand_dims
import os
import cv2
import matplotlib.pyplot as plt
import tensorflow as tf
import keras
from sklearn.decomposition import PCA

#tqdm is for progress bar functionality in code, must be installed for code to function
from tqdm import tqdm

#Importing libraries used for SVM classification and model assessment
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import roc_curve, roc_auc_score

#Libraries for CNN model
from keras.models import Sequential
from keras.layers import Dense, Activation, Dropout, Flatten, Conv2D, MaxPooling2D
from keras.layers.normalization import BatchNormalization

# Cell output: Using TensorFlow backend.


# In [5]:
def image_array_resize(file_path,IMG_SIZE):
    """Read every image in a folder as grayscale and resize it to a square.

    Args:
        file_path: Path of the directory that contains the image files.
        IMG_SIZE: Side length, in pixels, of the square output images.

    Returns:
        A numpy array built from the list of resized grayscale images, ordered
        by sorted file name. An empty directory yields an empty array.

    Raises:
        ValueError: If a file in the directory cannot be decoded as an image.
    """
    #Initialises empty list for population through the for loop
    image_data = []

    #os.listdir returns entries in arbitrary order, so the listing is sorted to
    #make the sample order reproducible between runs and machines.
    #tqdm just gives a progress bar for the image extraction process
    for x in tqdm(sorted(os.listdir(file_path))):
        #Creates path to the image for this iteration
        image_path = os.path.join(file_path, x)

        #Sub-directories (for example notebook checkpoint folders) are not images
        if not os.path.isfile(image_path):
            continue

        #Reads the corresponding image as a single-channel grayscale array
        file_array = cv2.imread(image_path, cv2.IMREAD_GRAYSCALE)

        #cv2.imread reports failure by returning None instead of raising, which
        #would otherwise surface as an unhelpful error inside cv2.resize.
        #Skipping silently could misalign the images with their labels, so fail loudly.
        if file_array is None:
            raise ValueError("Could not read image file: " + image_path)

        #Resizes the image to IMG_SIZE x IMG_SIZE and stores it in listing order
        image_data.append(cv2.resize(file_array, (IMG_SIZE, IMG_SIZE)))

    #Converts list into numpy array for easier processing
    image_data_array = np.array(image_data)
    print("Successfully extracted original Images from dataset!")
    return image_data_array

# In [2]:
def load_dataset(data_file_path, label_file_path):
    """Load the pickled input samples and their labels.

    Args:
        data_file_path: Path of the pickle file holding the input samples.
        label_file_path: Path of the pickle file holding the label table.

    Returns:
        A tuple ``(X, Y)`` where ``X`` is the unpickled sample object as-is and
        ``Y`` is the first column of the unpickled label table.
    """
    #NOTE: unpickling can execute arbitrary code, so only load files from a trusted source
    #Labels are read first, then the samples (same order as before)
    label_table = pd.read_pickle(label_file_path)
    X = pd.read_pickle(data_file_path)

    #Only the first column of the label table is used as the target
    Y = label_table.iloc[:, 0]

    #Report the shapes of both results so the caller can confirm the load worked
    print("Datasets successfully loaded with shapes:")
    for heading, loaded in (("Y Shape:", Y), ("X Shape:", X)):
        print(heading)
        print(loaded.shape)

    return X, Y

# In [1]:
def load_dataset_CNN(data_file_path, label_file_path, img_size=50):
    """Load the pickled flattened images and labels in the layout a CNN expects.

    The flattened images are reshaped to ``(samples, img_size, img_size, 1)``
    and divided by 255.

    Args:
        data_file_path: Path of the pickle file holding the flattened images,
            one row of ``img_size * img_size`` pixel values per sample.
        label_file_path: Path of the pickle file holding the label table.
        img_size: Side length of the square images. Defaults to 50, the value
            that was previously hard-coded.

    Returns:
        A tuple ``(X, Y)`` where ``X`` is the 4D array described above and
        ``Y`` is the first column of the unpickled label table.

    Raises:
        ValueError: If the data cannot be reshaped to the requested image size.
    """
    #NOTE: unpickling can execute arbitrary code, so only load files from a trusted source
    #Reading created pkl files for labels and image data using file path inputs
    Target_labels = pd.read_pickle(label_file_path)
    MRI_2D_imgs = pd.read_pickle(data_file_path)

    MRI_2D_img_array = np.array(MRI_2D_imgs)

    #The number of samples is taken from the data itself rather than being fixed
    #at 3000, so datasets of any size can be loaded.
    n_samples = MRI_2D_img_array.shape[0]

    #Restoring the square image layout and adding a trailing channel axis of
    #size 1 (grayscale) for convnet fitting
    MRI_img_array_channel = MRI_2D_img_array.reshape(n_samples, img_size, img_size, 1)

    #Scaling the pixel data per element by dividing by 255
    X = MRI_img_array_channel/255
    #Taking just the first column of the label table as the target Y array
    Y = Target_labels.iloc[:, 0]

    #For display, prints the shape of the resultant arrays for confirmation.
    #X should be 4D: (samples, img_size, img_size, 1)
    print("Datasets successfully loaded with shapes:")

    print("X Shape:")
    print(X.shape)

    print("Y Shape:")
    print(Y.shape)

    return X, Y

    

# In [4]:
def dataset_PCA(n_components, xTrain, xTest):
    """Reduce the training and test sets with a PCA fitted on the training set.

    Args:
        n_components: Value forwarded to ``PCA(n_components=...)``.
        xTrain: Training samples; the PCA is fitted on these and they are
            transformed.
        xTest: Test samples; these are only transformed with the fitted PCA.

    Returns:
        A tuple ``(xTrain_PCA, xTest_transformed)`` holding the transformed
        training and test sets.
    """
    #Initialising PCA with the target number of components from input
    reducer = PCA(n_components=n_components)

    #The projection is learned from the training data only, so nothing about
    #the test data statistics leaks into it; the test set is just transformed
    train_reduced = reducer.fit_transform(xTrain)
    test_reduced = reducer.transform(xTest)

    #Function complete message showing the number of components requested
    print("PCA conducted with ", n_components, " components.", sep="")

    #Cumulative explained variance in percent, printed so the caller can check
    #it against their threshold (the original notes mention a 95% target)
    explained_percent = np.cumsum(reducer.explained_variance_ratio_ * 100)[-1]
    print("The percentage of Explained Variance of the dataset from PCA is: ", explained_percent, sep="")

    return train_reduced, test_reduced

# In [6]:
def Tuned_SVM_train(param_grid, cv, xTrain, yTrain, probability):
    """Fit an SVM with a grid search over the given hyperparameters.

    Prints the best hyperparameters found and the refitted best estimator.

    Args:
        param_grid: Parameter grid containing the hyperparameter values to
            search over.
        cv: Number of cross validation folds (forwarded to ``GridSearchCV``).
        xTrain: X training data.
        yTrain: Y training labels. May be a pandas Series/DataFrame, a numpy
            array or a plain sequence; it is flattened to 1D before fitting.
        probability: Whether to enable probability estimates on the ``SVC``.

    Returns:
        The fitted ``GridSearchCV`` object.
    """
    #Initialises new SVM model which will conduct a gridsearch through the given parameters
    SVM_grid = GridSearchCV(SVC(probability = probability), param_grid, refit = True, verbose = 1, cv = cv)

    #np.asarray accepts pandas objects, numpy arrays and lists alike, whereas
    #the previous ``yTrain.values`` only worked for pandas input
    flat_labels = np.asarray(yTrain).ravel()

    #Fitting model with grid search based on our training dataset
    SVM_grid.fit(xTrain, flat_labels)

    #Confirmation Message
    print("Tuned SVM Model successfully trained and tuned")

    #Display the best parameters after the hyperparameter tuning
    print("The best hyperparameters found by gridsearch are:")
    print(SVM_grid.best_params_)

    #Print the new details of the SVM model after tuning
    print("The new model created after hyperparameter tuning is:")
    print(SVM_grid.best_estimator_)

    return SVM_grid

# In [7]:
def SVM_predictions(model, xTest, yTest):
    """Run a fitted model on the test set and print its performance.

    Prints the classification report followed by the confusion matrix.

    Args:
        model: Fitted model exposing a ``predict`` method.
        xTest: Test samples to predict on.
        yTest: True labels for ``xTest``.

    Returns:
        The predictions returned by ``model.predict(xTest)``.
    """
    predicted = model.predict(xTest)

    #Each heading is printed right before its metric is computed, so the
    #output order matches the sequence report -> confusion matrix
    sections = (
        ("The Results for SVM are:", classification_report),
        ("The confusion matrix is:", confusion_matrix),
    )
    for heading, metric in sections:
        print(heading)
        print(metric(yTest, predicted))

    return predicted
