In [1]:
!pip install hiclass

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting hiclass
  Downloading hiclass-4.0.11-py3-none-any.whl (23 kB)
Collecting ray
  Downloading ray-1.13.0-cp37-cp37m-manylinux2014_x86_64.whl (54.5 MB)
[K     |████████████████████████████████| 54.5 MB 216 kB/s 
Collecting grpcio<=1.43.0,>=1.28.1
  Downloading grpcio-1.43.0-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (4.1 MB)
[K     |████████████████████████████████| 4.1 MB 35.7 MB/s 
Collecting virtualenv
  Downloading virtualenv-20.15.0-py2.py3-none-any.whl (10.1 MB)
[K     |████████████████████████████████| 10.1 MB 42.8 MB/s 
Collecting aiosignal
  Downloading aiosignal-1.2.0-py3-none-any.whl (8.2 kB)
Collecting frozenlist
  Downloading frozenlist-1.3.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl (144 kB)
[K     |████████████████████████████████| 144 kB 60.1 MB/s 
Collecting distlib<1,>=0.3.1
  Downloading di

In [2]:
pip install patchify

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting patchify
  Downloading patchify-0.2.3-py3-none-any.whl (6.6 kB)
Installing collected packages: patchify
Successfully installed patchify-0.2.3


In [3]:
import numpy as np 
import matplotlib.pyplot as plt
import glob
import cv2
import os
import seaborn as sns
import pandas as pd
from skimage.filters import sobel
from pathlib import Path
from sklearn import preprocessing
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.neural_network import MLPClassifier
from sklearn import metrics
from sklearn.metrics import confusion_matrix
import random
from hiclass import LocalClassifierPerNode
from hiclass import LocalClassifierPerParentNode
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.pipeline import Pipeline
from sklearn.svm import SVC, LinearSVC
from sklearn.preprocessing import LabelEncoder, StandardScaler, MinMaxScaler
from patchify import patchify
from skimage.measure import shannon_entropy,moments
from scipy.stats import kurtosis, skew
from sklearn.metrics import accuracy_score, auc, roc_curve, roc_auc_score, recall_score, confusion_matrix

In [4]:
from google.colab import drive
drive.mount('/gdrive', force_remount=True)

Mounted at /gdrive


### **A Function to Perform 2D Filter with Gabor Kernel and Return Local Statistics from Patches of Filtered Images**

In [5]:
# Input dim is (n, x, y, c): number of images, x, y, and channels
def gabor_feature_extractor(dataset):
    x_train = dataset
    image_dataset = pd.DataFrame()
    for image in range(x_train.shape[0]): # Iterate through each file 
        df = pd.DataFrame() # Temporary data frame to capture information for each loop.
        img = x_train[image]

        # Bunch of Gabor filter responses
        num = 1  # To count up in order to give gabor features a label in the data frame
        for theta in range(2): 
                theta = theta / 4. * np.pi
                sigma = 1
                lamda = np.pi/4
                gamma = 0.5
                gabor_label = 'Gabor' + str(num) # Label Gabor columns as Gabor1, Gabor2, etc.
                ksize=9
                kernel = cv2.getGaborKernel((ksize, ksize), sigma, theta, lamda, gamma, 0, ktype=cv2.CV_32F)    

                # Filter the image and add values to a new column 
                fimg = cv2.filter2D(img, cv2.CV_8UC3, kernel)
                patches_img = patchify(fimg, (35, 23,3), step=35)
                vec_entropy  = []
                vec_mean  = []
                vec_std  = []
                vec_skew  = []
                vec_kurtosis  = []
                vec_centroidx = []
                vec_centroidy = []
                for i in range(patches_img.shape[0]):
                  for j in range(patches_img.shape[1]):
                    single_patch_img = patches_img[i,j,:,:]
                    entropy_label = 'Entropy' + str(num)
                    mean_label = 'Mean' + str(num)
                    std_label = 'Std' + str(num)
                    skew_label = 'Skew' + str(num)
                    kurtosis_label = 'Kurtosis' + str(num)
                    entropy = shannon_entropy(single_patch_img)
                    vec_entropy.append(entropy)
                    k = kurtosis(single_patch_img, axis=None)
                    s = skew(single_patch_img, axis=None, bias=True)
                    vec_skew.append(s)
                    vec_kurtosis.append(k)
                    mean, std = cv2.meanStdDev(single_patch_img)
                    mean = mean[0][0]
                    std = std[0][0]
                    vec_mean.append(mean)
                    vec_std.append(std)
                
                df[entropy_label] = vec_entropy
                df[mean_label] = vec_mean
                df[std_label] = vec_std
                df[skew_label] = vec_skew
                df[kurtosis_label] = vec_kurtosis
                
                num += 1  # Increment for gabor column label
        
        image_dataset = image_dataset.append(df)
    return image_dataset

### **Load Images**

In [6]:
image_dir = "/gdrive/My Drive/Project/Denoised_CLAHE_Cl3"

In [7]:
def load_data(tag='train'):
  tag_dir = os.path.join(image_dir, tag)
  tag_path = Path(tag_dir)
  data = [] # Images
  cat = [] # Category
  subcat = []
  patient = []
  file_name = []
  for mag_dir in tag_path.iterdir():  
    mag_label = mag_dir.stem
    print("*",mag_label)
    if mag_label == "200X":
      print("Got it!")
      for img_name in mag_dir.glob('*.png'):
        img_label = img_name.stem
        splitted_image_name = img_label.split('_')
        cat_label = splitted_image_name[1]
        remaining_part = splitted_image_name[2].split('-')
        subcat_label = remaining_part[0]
        patient_label = remaining_part[2]
        img = cv2.imread(img_name.as_posix(), cv2.IMREAD_COLOR)
        hi_label = [cat_label, subcat_label]
        data.append(img) # append the image to the data
        cat.append(hi_label) # append the label to the category
        subcat.append(subcat_label)
        patient.append(patient_label)
        file_name.append(img_label)
  return data, cat, subcat, patient, file_name

In [8]:
# train images
vec_train, cat_train, subcat_train, patient_train, image_name_train = load_data('train')
# test images
vec_test, cat_test, subcat_test, patient_test, image_name_test = load_data('test')

* 40X
* 100X
* 200X
Got it!
* 400X
* 40X
* 100X
* 200X
Got it!
* 400X


In [9]:
# Convert images into numpy array
train_images = np.array(vec_train)
test_images = np.array(vec_test)

  


In [10]:
# Assigning data to the meaningful convention 
x_train, x_test = train_images, test_images

In [11]:
# Normalize pixel values to between 0 and 1
x_train, x_test = x_train / 255.0, x_test / 255.0

In [12]:
x_train[0].shape

(460, 700, 3)

### **Get Local Statistics from Patches of Gabor Filtered Images**

In [13]:
#Extract features from training images
train_features = gabor_feature_extractor(x_train)

# Reshape to a vector for ML
train_features = np.expand_dims(train_features, axis=0)
train_for_ML = np.reshape(train_features, (x_train.shape[0], -1))  # Reshape to [#images, #features]

In [14]:
#Extract features from test images and reshape (just like training data)
test_features = gabor_feature_extractor(x_test)
test_features = np.expand_dims(test_features, axis=0)
test_for_ML = np.reshape(test_features, (x_test.shape[0], -1))

In [15]:
train_features.shape

(1, 333840, 10)

In [16]:
train_for_ML.shape

(1284, 2600)

### **Function to Compute Patient Recognition Rate**

In [17]:
def evaluate_recognition_rate(prediction):
  unique_patient_test = list(np.unique(np.array(patient_test)))
  num_images_per_patient = []
  num_correctly_classified = []
  num = 0
  total_patient_score = 0
  num_patient = 0
  for patient in unique_patient_test:
    num_patient = num_patient + 1
    Np = 0
    Nrec = 0
    image_index_counter = -1
    for image_name in image_name_test:
      image_index_counter = image_index_counter + 1
      image_name_splitted = image_name.split('-')
      if patient == image_name_splitted[2]:
        first_part_splitted = image_name_splitted[0].split('_')
        image_class = first_part_splitted[2]
        if image_class == prediction[image_index_counter]:
          Nrec = Nrec + 1
        Np = Np + 1
    patient_score = Nrec/Np
    #print(patient_score)
    num_images_per_patient.append(Np)
    num_correctly_classified.append(Nrec)
    total_patient_score = total_patient_score + patient_score
    num = num + Np    
  recognition_rate = total_patient_score/num_patient
  print("Summation of patient score: ", total_patient_score)
  print("Total Number of Patients: ", num_patient)
  print("Recognition Rate: ", recognition_rate)

### **Function to Identify Predicted Classes and Subclasses**

In [18]:
def return_predicted_categories(predictions):
  pred_column1 = []
  pred_column2 = []
  cat_test_column1 = []
  cat_test_column2 = []

  n = 0
  for row in predictions:
    pred_column1.append(predictions[n][0])
    pred_column2.append(predictions[n][1])
    cat_test_column1.append(cat_test[n][0])
    cat_test_column2.append(cat_test[n][1])
    n = n+1

  return pred_column1, pred_column2, cat_test_column1, cat_test_column2

### **SVM Fit**

In [19]:
pipe1 = Pipeline([
                 ("scale", StandardScaler()),
                 ("classifier", SVC(class_weight='balanced', probability=True, kernel= 'rbf', gamma='auto', C=1)) #radial basis function kernel
                 ])

In [20]:
classifier = LocalClassifierPerParentNode(local_classifier=pipe1)
classifier.fit(train_for_ML, cat_train)

LocalClassifierPerParentNode(local_classifier=Pipeline(steps=[('scale',
                                                               StandardScaler()),
                                                              ('classifier',
                                                               SVC(C=1,
                                                                   class_weight='balanced',
                                                                   gamma='auto',
                                                                   probability=True))]))

**Accuracy**

In [21]:
predictions = classifier.predict(test_for_ML)
pred_column1, pred_column2, cat_test_column1, cat_test_column2 = return_predicted_categories(predictions)
print('Parent Class Prediction accuracy: {:.2%}'.format(accuracy_score(pred_column1, cat_test_column1)))
print('Subclass Prediction accuracy: {:.2%}'.format(accuracy_score(pred_column2, cat_test_column2)))

Parent Class Prediction accuracy: 81.18%
Subclass Prediction accuracy: 34.14%


**Patient Recognition Rate**

In [22]:
evaluate_recognition_rate(pred_column2)

Summation of patient score:  9.24923146533027
Total Number of Patients:  28
Recognition Rate:  0.3303296951903668


### **Random Forest**

In [23]:
pipe2 = Pipeline([
                 ("scale", StandardScaler()),
                 ("classifier", RandomForestClassifier(n_estimators = 50, random_state = 42))
                 ])

In [24]:
classifier = LocalClassifierPerParentNode(local_classifier=pipe2)
classifier.fit(train_for_ML, cat_train)

LocalClassifierPerParentNode(local_classifier=Pipeline(steps=[('scale',
                                                               StandardScaler()),
                                                              ('classifier',
                                                               RandomForestClassifier(n_estimators=50,
                                                                                      random_state=42))]))

**Accuracy**

In [25]:
predictions = classifier.predict(test_for_ML)
pred_column1, pred_column2, cat_test_column1, cat_test_column2 = return_predicted_categories(predictions)
print('Parent Class Prediction accuracy: {:.2%}'.format(accuracy_score(pred_column1, cat_test_column1)))
print('Subclass Prediction accuracy: {:.2%}'.format(accuracy_score(pred_column2, cat_test_column2)))

Parent Class Prediction accuracy: 78.36%
Subclass Prediction accuracy: 44.49%


**Patient Recognition Rate**

In [26]:
evaluate_recognition_rate(pred_column2)

Summation of patient score:  13.013654032610207
Total Number of Patients:  28
Recognition Rate:  0.4647733583075074


### **KNN**

In [27]:
pipe3 = Pipeline([
                 ("scale",StandardScaler()),
                 ("classifier", KNeighborsClassifier(n_neighbors=5))
                 ])

In [28]:
classifier = LocalClassifierPerParentNode(local_classifier=pipe3)
classifier.fit(train_for_ML, cat_train)

LocalClassifierPerParentNode(local_classifier=Pipeline(steps=[('scale',
                                                               StandardScaler()),
                                                              ('classifier',
                                                               KNeighborsClassifier())]))

**Accuracy**

In [29]:
predictions = classifier.predict(test_for_ML)
pred_column1, pred_column2, cat_test_column1, cat_test_column2 = return_predicted_categories(predictions)
print('Parent Class Prediction accuracy: {:.2%}'.format(accuracy_score(pred_column1, cat_test_column1)))
print('Subclass Prediction accuracy: {:.2%}'.format(accuracy_score(pred_column2, cat_test_column2)))

Parent Class Prediction accuracy: 70.16%
Subclass Prediction accuracy: 26.34%


**Patient Recognition Rate**

In [30]:
evaluate_recognition_rate(pred_column2)

Summation of patient score:  6.810084025216937
Total Number of Patients:  28
Recognition Rate:  0.2432172866148906


### **Adaboost**

In [31]:
pipe4 = Pipeline([
                 ("scale",StandardScaler()),
                 ("classifier", AdaBoostClassifier(n_estimators=200, random_state=42))
                 ])

In [32]:
classifier = LocalClassifierPerParentNode(local_classifier=pipe4)
classifier.fit(train_for_ML, cat_train)

LocalClassifierPerParentNode(local_classifier=Pipeline(steps=[('scale',
                                                               StandardScaler()),
                                                              ('classifier',
                                                               AdaBoostClassifier(n_estimators=200,
                                                                                  random_state=42))]))

**Accuracy**

In [33]:
predictions = classifier.predict(test_for_ML)
pred_column1, pred_column2, cat_test_column1, cat_test_column2 = return_predicted_categories(predictions)
print('Parent Class Prediction accuracy: {:.2%}'.format(accuracy_score(pred_column1, cat_test_column1)))
print('Subclass Prediction accuracy: {:.2%}'.format(accuracy_score(pred_column2, cat_test_column2)))

Parent Class Prediction accuracy: 76.88%
Subclass Prediction accuracy: 40.99%


**Patient Recognition Rate**

In [34]:
evaluate_recognition_rate(pred_column2)

Summation of patient score:  11.90019154113201
Total Number of Patients:  28
Recognition Rate:  0.4250068407547146


### **MLP**

In [43]:
mlp = MLPClassifier(alpha=0.1, max_iter=1000)

In [44]:
classifier = LocalClassifierPerParentNode(local_classifier=mlp)
classifier.fit(train_for_ML, cat_train)

LocalClassifierPerParentNode(local_classifier=MLPClassifier(alpha=0.1,
                                                            max_iter=1000))

**Accuracy**

In [45]:
predictions = classifier.predict(test_for_ML)
pred_column1, pred_column2, cat_test_column1, cat_test_column2 = return_predicted_categories(predictions)
print('Parent Class Prediction accuracy: {:.2%}'.format(accuracy_score(pred_column1, cat_test_column1)))
print('Subclass Prediction accuracy: {:.2%}'.format(accuracy_score(pred_column2, cat_test_column2)))

Parent Class Prediction accuracy: 72.31%
Subclass Prediction accuracy: 33.06%


**Patient Recognition Rate**

In [47]:
evaluate_recognition_rate(pred_column2)

Summation of patient score:  9.35466073351742
Total Number of Patients:  28
Recognition Rate:  0.3340950261970507
