# [1] Automatic feature extraction
- https://www.youtube.com/watch?v=s041lMgOlgs
- Use GLCM and find optimal settings for lung nodule feature extraction, then compute the 14 Haralick texture features.
- GLCM feature significance -- https://www.researchgate.net/post/What_is_significance_of_GLCM_features

#### Deep learning stuff
- MNIST For ML Beginners -- https://www.tensorflow.org/versions/r1.0/get_started/mnist/beginners
- How many images are needed for deep learning? -- https://www.quora.com/How-many-images-data-do-I-need-to-start-training-a-deep-neural-network-from-scratch-1

## Instructions:
- Don't run 2b and 6b
- After storing the dataframe at the end, run SVM v2.

In [1]:
import cv2
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import skimage.feature as sk
import mahotas.features as feature
from sklearn import preprocessing

import collections

%matplotlib inline

In [2]:
def glcm_feat_extraction(image, props=('contrast', 'homogeneity', 'ASM', 'energy')):
    """Compute direction-averaged GLCM texture properties for one image.

    A grey-level co-occurrence matrix is built at pixel distance 1 for the
    angles 0 and 3*pi/4. The per-angle matrices are averaged (a cheap form of
    rotational invariance) and every requested property is evaluated on the
    averaged matrix.

    Parameters
    ----------
    image : array
        Greyscale image, handed unchanged to ``sk.greycomatrix``.
        NOTE(review): the notebook passes 8-bit images loaded with
        ``cv2.imread(..., cv2.IMREAD_GRAYSCALE)`` -- confirm for other callers.
    props : sequence of str, optional
        Property names understood by ``sk.greycoprops`` (for example
        'dissimilarity' or 'correlation'). The default reproduces the four
        features this function has always returned, in the same order.

    Returns
    -------
    tuple
        One ``sk.greycoprops`` result per entry of ``props``, in order.
        With a single distance and the angle axis averaged down to length 1,
        each result is expected to be a 1x1 array.

    Notes
    -----
    Only the requested properties are computed; the previous version also
    evaluated 'dissimilarity' and 'correlation' and then discarded them.
    Newer scikit-image releases spell these functions ``graycomatrix`` /
    ``graycoprops``; the names used by the rest of this notebook are kept.
    """
    # Co-occurrence counts for distance 1 at 0 and 135 degrees.
    # (All four directions would be [0, np.pi/4, np.pi/2, 3*np.pi/4].)
    glcm = sk.greycomatrix(image, [1], [0, 3*np.pi/4])

    # Average over the angle axis; keepdims keeps the 4-D layout greycoprops needs.
    glcm_mean = np.mean(glcm, axis=3, keepdims=True)

    return tuple(sk.greycoprops(glcm_mean, prop) for prop in props)

def LBP_feat_extraction(image):
    """Return the energy of the local-binary-pattern histogram of *image*.

    The 'uniform' LBP code image is computed with ``sk.local_binary_pattern``
    (arguments 29 and 7 are passed positionally as its second and third
    parameters), binned into a 7-bin histogram, normalised to probabilities,
    and summarised as the sum of squared probabilities.

    See: http://scikit-image.org/docs/dev/auto_examples/features_detection/plot_local_binary_pattern.html

    Returns
    -------
    list
        Single-element list ``[energy]`` so it can be concatenated with the
        other per-nodule feature lists.
    """
    lbp_codes = sk.local_binary_pattern(image, 29, 7, 'uniform')

    # Histogram counts as floats so the normalisation below is true division.
    counts = np.histogram(lbp_codes, 7)[0].astype(float)
    probabilities = counts / counts.sum()

    # Only the energy is returned; a histogram entropy is not computed here.
    return [(probabilities ** 2).sum()]

def pixel_count(image):
    """Count the pixels of *image* whose value is greater than zero.

    Parameters
    ----------
    image : array-like
        Image data (e.g. the greyscale array returned by ``cv2.imread``).

    Returns
    -------
    list
        Single-element list ``[count]`` with ``count`` a plain ``int``, so it
        can be concatenated with the other per-nodule feature lists.

    Notes
    -----
    The previous implementation ignored its argument and looped over the
    module-level ``nodule_nums`` variable, so it only gave the right answer
    when called with exactly that global. The count now depends solely on
    the ``image`` argument.
    """
    positive_mask = np.asarray(image) > 0
    return [int(np.count_nonzero(positive_mask))]

# by: Josh Lee
# https://stackoverflow.com/questions/2158395/flatten-an-irregular-list-of-lists
def flatten(x):
    """Recursively flatten arbitrarily nested iterables into a flat list.

    Parameters
    ----------
    x : object
        Either a leaf value or an iterable (list, tuple, array, ...) whose
        items are themselves leaves or iterables.

    Returns
    -------
    list
        All leaf values in depth-first, left-to-right order. A non-iterable
        ``x`` yields ``[x]``.

    Notes
    -----
    * ``collections.Iterable`` was removed in Python 3.10; the ABC lives in
      ``collections.abc``.
    * ``str`` and ``bytes`` are treated as leaves. Iterating a one-character
      string yields that same string, so recursing into text never terminates.
    """
    # Function-scope import keeps this helper self-contained regardless of
    # whether the surrounding module has loaded the ``collections.abc`` submodule.
    from collections.abc import Iterable

    if isinstance(x, (str, bytes)) or not isinstance(x, Iterable):
        return [x]
    return [leaf for item in x for leaf in flatten(item)]
    
#def feature_scale(array):
    # Scales each column separately and appends the columns to an array
#    array_scaled = []
    
#    for column in array:
#        column_scaled = preprocessing.scale(column).tolist()
#        array_scaled.append(column_scaled)
        
#    return np.asarray(array_scaled)

In [3]:
# Select path where patients and their subsequent CT scans are
data_dir = 'C:/Users/moogl/Desktop/UChicago/Capstone Project/2. Implementation/Lung CT Data/SPIE-AAPM Lung CT Challenge/'

# Load the nodule label table (one row per nodule) into a DataFrame
coordinates_df = pd.read_csv(data_dir + 'Dataset Info/' + 'Calib and test set labels (with 2 nodule patients).csv')

# Sort by patient. A *stable* sort (mergesort) is required: patients with two
# nodules appear twice, and the default quicksort does not guarantee that the
# two rows keep their CSV order, which would misalign the second row with the
# '_2_cropped' image and its diagnosis.
coordinates_df.sort_values(by=['Patient ID'], kind='mergesort', inplace=True)

# set_index + reset_index moves 'Patient ID' to the first column and
# renumbers the rows 0..n-1 in the sorted order.
coordinates_df.set_index('Patient ID', drop=True, inplace=True)
coordinates_df.reset_index(inplace=True)

# Patient IDs in row order; a repeated ID marks that patient's second nodule
patients = list(coordinates_df['Patient ID'])

coordinates_df.head()

Unnamed: 0,Patient ID,Diagnosis,x,y,Instance Number
0,CT-Training-BE001,benign,405,296,169
1,CT-Training-BE002,benign,184,268,117
2,CT-Training-BE006,benign,449,266,241
3,CT-Training-BE007,benign,385,206,194
4,CT-Training-BE010,benign,120,336,69


# [2] Extract features from nodules (without rotated images)

In [4]:
# One feature row per nodule: [pixel count] + GLCM features + [LBP energy]
feat_array = []

for index, nodule in enumerate(patients):
    # `patients` is sorted, so a Patient ID equal to the previous one marks
    # that patient's second nodule. The `index > 0` guard matters: without it
    # the first row is compared against patients[-1] (the LAST patient).
    is_second_nodule = index > 0 and nodule == patients[index - 1]
    suffix = '_2_cropped.tiff' if is_second_nodule else '_cropped.tiff'
    path = data_dir + '6. Clean Crop/' + nodule + suffix

    # Read nodule image as a 2-D array of grey levels.
    # NOTE(review): the module-level name `nodule_nums` is kept as-is in case
    # other cells read it.
    nodule_nums = cv2.imread(path, cv2.IMREAD_GRAYSCALE)
    if nodule_nums is None:
        # cv2.imread signals a missing/unreadable file by returning None
        # instead of raising; fail here with the offending path.
        raise FileNotFoundError('Could not read nodule image: ' + path)

    # Feature extraction
    pixel_count_feat = pixel_count(nodule_nums)
    glcm_feat = flatten(glcm_feat_extraction(nodule_nums))
    LBP_feat = LBP_feat_extraction(nodule_nums)

    # The Haralick features (feature.haralick(nodule_nums).mean(0).tolist())
    # are not part of the stored feature row, so they are no longer computed
    # on every iteration; append them here if they are needed again.
    feat_array.append(pixel_count_feat + glcm_feat + LBP_feat)

  if np.issubdtype(image.dtype, np.float):


# [3] Save feature array in DataFrame

In [5]:
# Keep both forms of the feature table:
#   feat_array_full - plain 2-D numpy array (input to the scaler later on)
#   feat_df         - labelled DataFrame built from the per-nodule lists
feat_array_full = np.array(feat_array)
feat_df = pd.DataFrame(feat_array)

# Column labels, in the order the features were concatenated per nodule.
# (Fixed: stray leading spaces and the missing space in 'GLCM Homogeneity',
# which made label-based lookups such as feat_df['GLCM ASM'] fail.)
feat_df.columns = ['Number of Pixels', 'GLCM Contrast', 'GLCM Homogeneity', 'GLCM ASM', 'GLCM Energy', 'LBP Energy']

# Print dtypes / non-null counts. `feat_df.info` without parentheses only
# referenced the bound method and did nothing.
feat_df.info()

# write dataframe to .csv file
#feat_df.to_csv(data_dir + 'Features Dataframe.csv')
feat_df.round(1).head()

Unnamed: 0,Number of Pixels,GLCM Contrast,GLCMHomogeneity,GLCM ASM,GLCM Energy,LBP Energy
0,13,151208.0,3983.5,15864298.8,3983.0,1.0
1,21,224667.0,3974.9,15788718.2,3973.5,1.0
2,295,725757.0,3652.2,13257318.2,3641.1,0.9
3,482,1881115.0,3526.5,12163033.8,3487.6,0.9
4,634,860690.0,3384.7,10983307.2,3314.1,0.9


# [4] Data preprocessing (normalizing and scaling)

- Consider normalizing "Scale input vectors individually to unit norm (vector length)"
- http://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.normalize.html
- http://benalexkeen.com/feature-scaling-with-scikit-learn/

In [7]:
# "Standardize features by removing the mean and scaling to unit variance"
# Each feature column is scaled independently by StandardScaler.
scaler = preprocessing.StandardScaler()
feat_scaled = scaler.fit_transform(feat_array_full)

# Reuse the labels of the unscaled table instead of repeating the list by
# hand, so the two tables can never drift apart.
feat_scaled_df = pd.DataFrame(feat_scaled, columns=feat_df.columns)

feat_scaled_df.round(2).head()

Unnamed: 0,Number of Pixels,GLCM Contrast,GLCMHomogeneity,GLCM ASM,GLCM Energy,LBP Energy
0,-1.17,-1.0,1.2,1.33,1.2,1.62
1,-1.14,-0.94,1.17,1.3,1.18,1.54
2,-0.37,-0.53,0.24,0.26,0.29,0.69
3,0.16,0.43,-0.12,-0.19,-0.12,0.2
4,0.58,-0.42,-0.53,-0.67,-0.58,-0.54


# [5] PCA using sklearn
- In Depth: Principal Component Analysis
  - https://jakevdp.github.io/PythonDataScienceHandbook/05.09-principal-component-analysis.html
- SKlearn PCA, SVD Dimensionality Reduction by The SemiColon
  - https://www.youtube.com/watch?v=O3qXimMGn28
- StatQuest: Principal Component Analysis (PCA), Step-by-Step by Josh Starmer
  - https://www.youtube.com/watch?v=FgakZw6K1QQ

In [198]:
from sklearn.decomposition import PCA

In [199]:
# Project the standardized features onto 4 whitened principal components
# (4 was also the setting used when all features were included).
pca = PCA(n_components=4, whiten=True)

# Fit on the scaled table, then project that same table.
# To run without scaling, fit/transform `feat_df` instead of `feat_scaled_df`.
pca.fit(feat_scaled_df)
feat_reduc = pca.transform(feat_scaled_df)

In [200]:
# Display the variance captured by each of the fitted principal components
pca.explained_variance_

array([5.5469172 , 0.43565127, 0.08089857, 0.00795642])

In [201]:
# Wrap the PCA-projected features in a DataFrame (columns are the component
# indices 0..3) and preview the first rows
feat_reduc_df = pd.DataFrame(feat_reduc)
feat_reduc_df.head()

Unnamed: 0,0,1,2,3
0,-1.306392,-0.077998,1.397973,1.067861
1,-1.265583,-0.119974,1.226719,1.059346
2,-0.404961,0.29008,1.318471,-0.849168
3,0.132183,-0.547454,0.997631,-0.567614
4,0.4612,1.245188,0.238197,-1.528097


# [6] Append truth values to dataframe (without rotated images)

In [202]:
# Encode the diagnosis labels in coordinates_df as integers
# (0 = benign, 1 = malignant) and attach them to the PCA features.
# The label spellings below, including the one with a trailing space,
# are matched exactly as written.
coordinates_df.replace(
    {
        'Diagnosis': {
            'benign': 0,
            'Benign nodule': 0,
            'malignant': 1,
            'Primary lung cancer': 1,
            'Primary lung cancer ': 1,
            'Suspicious malignant nodule': 1,
        }
    },
    inplace=True,
)

# Truth values for the non-rotated nodules, one per row of coordinates_df
truth_column_no_rota = coordinates_df['Diagnosis']

# Assignment aligns on the row index of the two frames
feat_reduc_df['Truth'] = truth_column_no_rota

feat_reduc_df.head()

Unnamed: 0,0,1,2,3,Truth
0,-1.306392,-0.077998,1.397973,1.067861,0
1,-1.265583,-0.119974,1.226719,1.059346,0
2,-0.404961,0.29008,1.318471,-0.849168,0
3,0.132183,-0.547454,0.997631,-0.567614,0
4,0.4612,1.245188,0.238197,-1.528097,0


# [7] Store dataframe for use in SVM

In [203]:
# Persist feat_reduc_df with IPython's %store magic so another notebook
# (the SVM step named in the instructions) can restore it with `%store -r`
%store feat_reduc_df

Stored 'feat_reduc_df' (DataFrame)
