# Binary Task Notebook

### Contains the code for the binary classification task and the models created for it

In [2]:
# Import required libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import tensorflow as tf
import keras
from sklearn.model_selection import train_test_split
from sklearn.decomposition import PCA

# tqdm provides progress-bar functionality; it must be installed for this notebook to run (TODO: handle the case where tqdm is not installed)
from tqdm import tqdm

Using TensorFlow backend.


# 1. Loading Dataset and doing final preprocessing
### We load the preprocessed data and carry out PCA on the image array for the binary training and test data
### 1.1 Loading Datasets

In [40]:
# Load the preprocessed binary labels and the flattened image pixels.
#
# PCA on the images is done later and separately for the binary and multiclass
# tasks, because the data must be split first: PCA is fitted on the training
# data only (fit + transform) and the test data is only transformed, to prevent bias.
# 400 components are used as they give around 96% explained variance.

# Reading the created pkl files for the binary labels and the image data.
# Forward slashes are used for both paths: they work on Windows and POSIX alike,
# whereas '.\dataset\...' only resolves on Windows and relies on invalid
# string escapes ('\d', '\I').
# NOTE: unpickling executes code stored in the file - only load pickles you created yourself.
Binary_labels = pd.read_pickle('./dataset/Y_Binary_label.pkl')
Flattened_MRI_Array = pd.read_pickle('./dataset/Image_DF_Flat.pkl')

# For display: labels as plain text, image frame as the cell's rich output
print(Binary_labels)
Flattened_MRI_Array


      MRI_Binary_Label
0                  1.0
1                  0.0
2                  1.0
3                  1.0
4                  1.0
...                ...
2995               0.0
2996               1.0
2997               1.0
2998               1.0
2999               1.0

[3000 rows x 1 columns]


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,774,775,776,777,778,779,780,781,782,783
0,1,1,2,2,3,3,3,3,3,2,...,40,67,63,29,44,62,3,3,3,3
1,3,2,2,2,2,2,2,2,2,2,...,1,1,0,1,0,0,0,0,0,0
2,2,0,0,0,0,0,0,0,0,0,...,48,44,46,33,43,38,47,34,1,0
3,0,0,0,0,1,1,1,2,1,2,...,2,2,2,2,3,4,24,26,5,2
4,1,1,1,1,1,0,0,6,1,20,...,157,112,183,133,43,1,0,0,0,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2995,5,1,1,3,1,0,5,6,6,3,...,1,1,1,2,3,4,1,2,0,3
2996,1,1,2,2,2,2,3,2,3,3,...,72,70,42,9,4,4,2,4,2,2
2997,0,3,2,2,2,2,2,2,2,2,...,27,21,27,14,4,3,2,1,3,0
2998,2,2,2,2,3,3,3,3,4,3,...,153,63,86,81,59,71,26,7,5,3


In [35]:
# Feature matrix: the flattened image frame, used as-is
X = Flattened_MRI_Array

# Target: select the label column with a list so the result stays a 2-D DataFrame
label_columns = ['MRI_Binary_Label']
Y = Binary_labels[label_columns]

# Report the target shape first, then the feature shape
for frame in (Y, X):
    print(frame.shape)

(3000, 1)
(3000, 784)


### 1.2 Splitting data into training and testing sets

In [36]:
# Fixed seed so the train/test partition (and every result derived from it)
# is identical on each Restart & Run All.
SPLIT_RANDOM_STATE = 42

# Split the data into training and testing sets (70% training, 30% testing)
xTrain, xTest, yTrain, yTest = train_test_split(
    X, Y, train_size=0.7, random_state=SPLIT_RANDOM_STATE
)

# Rescale the pixel values, which range from 0 to 255, to the range 0 to 1
# before they are passed through the NN and the other models.
xTrain_Scaled = xTrain / 255
xTest_Scaled = xTest / 255

### 1.3 PCA

In [43]:
# Fixed seed for PCA: with the default svd_solver='auto', scikit-learn may pick
# the randomized SVD solver for this data shape / component count, in which case
# the projection is only reproducible when random_state is set.
PCA_RANDOM_STATE = 42

# Initialise PCA with the 400 components determined in the preprocessing notebook
Binary_PCA = PCA(n_components=400, random_state=PCA_RANDOM_STATE)

# Fit on the training data only, and transform it
xTrain_PCA = Binary_PCA.fit_transform(xTrain_Scaled)

# The test data is only transformed (never fitted) so that no test-set
# statistics leak into the model
xTest_transformed = Binary_PCA.transform(xTest_Scaled)

# Print the total percentage of explained variance (last value of the running
# sum) to verify that it is greater than our threshold of 95%
print(np.cumsum(Binary_PCA.explained_variance_ratio_ * 100)[-1])

96.45328975272359


# 2. Model Building