In [1]:
import os
import numpy as np
import pandas as pd
import cv2

from sklearn.decomposition import PCA
from sklearn.neighbors import KNeighborsClassifier
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.model_selection import cross_val_score
import matplotlib.pyplot as plt
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import plot_confusion_matrix
from sklearn import metrics

In [2]:
# Define absolute paths to the 6 subdirectories containing the x-ray images.
# The paths are assembled with os.path.join so they are not tied to the Windows "\\" separator.
# The trailing "" component keeps a trailing separator on every path, because other cells
# build file names by concatenating onto these strings.
basepath = os.path.join(os.getcwd(), "chest_xray", "")

testnormal_path = os.path.join(basepath, "test", "NORMAL", "")
trainnormal_path = os.path.join(basepath, "train", "NORMAL", "")
valnormal_path = os.path.join(basepath, "val", "NORMAL", "")

testpneumonia_path = os.path.join(basepath, "test", "PNEUMONIA", "")
trainpneumonia_path = os.path.join(basepath, "train", "PNEUMONIA", "")
valpneumonia_path = os.path.join(basepath, "val", "PNEUMONIA", "")

# Order matters only for iteration: all NORMAL directories first, then all PNEUMONIA ones.
pathlist = [testnormal_path, trainnormal_path, valnormal_path, testpneumonia_path, trainpneumonia_path, valpneumonia_path]

In [3]:
#Get the dataframe holding the compressed image and the class label for each file of a directory
def get_df(path, dtype, label, limit=200):
    """Build a DataFrame of compressed images and labels from one image directory.

    Parameters
    ----------
    path : str
        Directory containing the image files.
    dtype : str
        Name of the data split (the callers pass 'train' or 'test'). It is not used
        by the function; it is kept so existing calls keep working.
    label : str
        Class label stored next to every image.
    limit : int or None, default 200
        Maximum number of directory entries to load; None loads every entry.

    Returns
    -------
    pandas.DataFrame
        One row per loaded file with the columns 'image' (the result of
        get_img(..., resized=True)) and 'label'.
    """
    # os.listdir returns entries in arbitrary order; sorting makes the selected
    # subset the same on every run and every machine.
    dirents = sorted(os.listdir(path))[:limit]
    # os.path.join avoids the doubled / Windows-only separator of path + "\\" + dirent.
    imgs = [(get_img(os.path.join(path, dirent), resized=True), label) for dirent in dirents]
    return pd.DataFrame(imgs, columns=['image', 'label'])

In [45]:
#Return the image matrix as read by opencv python library, either in a compressed form or in its given size.
def get_img(path, resized):
    """Load an image with OpenCV, optionally resized and PCA-compressed.

    Parameters
    ----------
    path : str
        Path of the image file.
    resized : bool
        If falsy, the array returned by cv2.imread is handed back unchanged.
        Otherwise the image is resized to 500x500, each of its three channels
        is reduced with a 30-component PCA, and the result is flattened.

    Returns
    -------
    numpy.ndarray
        The raw image array when ``resized`` is falsy, otherwise a 2-D array
        with a single row (the stacked channel projections reshaped to (1, -1)).

    Raises
    ------
    OSError
        If cv2.imread could not read an image from ``path``.
    """
    img = cv2.imread(path)
    if img is None:
        # cv2.imread reports a missing or undecodable file by returning None
        # instead of raising, which would only surface later as an opaque error.
        raise OSError(f"cv2.imread could not read an image from {path!r}")
    if not resized:
        return img
    blue, green, red = cv2.split(cv2.resize(img, (500, 500), interpolation=cv2.INTER_AREA))
    # NOTE(review): a PCA is fitted separately for every channel of every image, so the
    # components of two different images do not share a basis -- confirm this is intended
    # for the classifiers that consume these features.
    blue, green, red = (PCA(30).fit_transform(channel) for channel in (blue, green, red))
    # Channels are stacked in red, green, blue order before flattening.
    return np.dstack((red, green, blue)).reshape(1, -1)

### Exploration of Dataset
The dataset contains a total of 5856 images across the test, training, and validation sets. All images have labels. 
Across a sample of up to 100 JPEG images from each of the 6 provided directories, the images have an average size of 1126 x 1495 pixels and are loaded by OpenCV with 3 channels (stored in B, G, R order).

In [5]:
# Count every file in the six directories and record the dimensions of up to the
# first 100 files listed in each of them.
# Note: get_img reads files with cv2.imread's default flag, which loads every image as a
# 3-channel array, so the channel count reflects how images are loaded, not how they are stored.
heights = []
widths = []
channels = []
num_images = 0
for path in pathlist:
    dirents = os.listdir(path)
    num_images += len(dirents)
    for dirent in dirents[0:100]:
        # os.path.join avoids the doubled / Windows-only separator of path + "\\" + dirent.
        img_path = os.path.join(path, dirent)
        img = get_img(img_path, resized=False)
        height, width, depth = img.shape
        heights.append(height)
        widths.append(width)
        channels.append(depth)
print(f'Total number of images: {num_images:d}')
print(f'Avg size of images (pixels x pixels x channels): {np.mean(heights):.0f} x {np.mean(widths):.0f} x {np.mean(channels):.0f}')

Total number of images: 5856
Avg size of images (pixels x pixels x channels): 1126 x 1495 x 3


### Pre-processing & Feature Extraction

Using the OpenCV Python library, we resize each JPEG image to 500x500 pixels and then compress every color channel with a 30-component PCA, loading up to 200 images per class.

In [46]:
# Stack the NORMAL and PNEUMONIA training samples into one frame.
# pd.concat replaces DataFrame.append, which was removed in pandas 2.0. ignore_index is
# deliberately not set, so each part keeps its own row labels exactly as append did.
df_train = pd.concat([
    get_df(trainnormal_path, 'train', 'NORMAL'),
    get_df(trainpneumonia_path, 'train', 'PNEUMONIA'),
])

In [47]:
# Labels stay a pandas Series; the features become a single NumPy array.
# Every stored image is a one-row array, so img[0] extracts that row.
y_train = df_train.label
X_train = np.asarray([img[0] for img in df_train.image.values])

In [42]:
# Sanity check: shape of the first stored image and of the stacked training matrix.
# .iloc selects by position, so this works whether or not row labels repeat in df_train
# (label-based [0] followed by .values[0] only works when label 0 occurs more than once).
print(np.shape(df_train['image'].iloc[0]))
print(np.shape(X_train))

(1500, 30)
(400, 1, 90)


In [48]:
# Stack the NORMAL and PNEUMONIA test samples into one frame.
# pd.concat replaces DataFrame.append, which was removed in pandas 2.0. ignore_index is
# deliberately not set, so each part keeps its own row labels exactly as append did.
df_test = pd.concat([
    get_df(testnormal_path, 'test', 'NORMAL'),
    get_df(testpneumonia_path, 'test', 'PNEUMONIA'),
])

# Labels stay a pandas Series; every stored image is a one-row array, so img[0] extracts that row.
y_test = df_test.label
X_test = np.asarray([img[0] for img in df_test.image.values])

### Processing of dataset
1. KNN
    - Using sklearn's KNeighborsClassifier

In [49]:
# Fit one k-nearest-neighbours classifier per candidate k, store its test predictions as a
# column of df_test, and collect (name, accuracy) pairs in acc_list.
acc_list = []
kvals = [1, 3, 5, 10, 20, 50, 100]
for k in kvals:
    model = KNeighborsClassifier(n_neighbors=k).fit(X_train, y_train)
    test_predictions = model.predict(X_test)
    acc = metrics.accuracy_score(y_test, test_predictions)
    df_test[f'k={k}'] = test_predictions
    acc_list.append((f'k={k}', acc))

In [50]:
# Linear support-vector classifier: fit on the training features, predict the test set,
# then record the predictions in df_test and the accuracy in acc_list.
model = SVC(kernel="linear", C=1).fit(X_train, y_train)
test_predictions = model.predict(X_test)
acc = metrics.accuracy_score(y_test, test_predictions)
df_test['SVC'] = test_predictions
acc_list.append(('SVC', acc))

In [51]:
# Gaussian naive Bayes: fit on the training features, predict the test set,
# then record the predictions in df_test and the accuracy in acc_list.
model = GaussianNB().fit(X_train, y_train)
test_predictions = model.predict(X_test)
acc = metrics.accuracy_score(y_test, test_predictions)
df_test['GNB'] = test_predictions
acc_list.append(('GNB', acc))

In [52]:
# Show the per-model prediction columns followed by the list of (model, accuracy) pairs.
print(df_test, acc_list, sep='\n')

                                                 image      label        k=1  \
0    [[921.8022066333933, 921.8022066333933, 921.80...     NORMAL  PNEUMONIA   
1    [[329.20385025021966, 329.20385025021966, 329....     NORMAL  PNEUMONIA   
2    [[-721.6124432160682, -721.6124432160682, -721...     NORMAL  PNEUMONIA   
3    [[-143.56643942432686, -143.56643942432686, -1...     NORMAL  PNEUMONIA   
4    [[-692.8630967867736, -692.8630967867736, -692...     NORMAL  PNEUMONIA   
..                                                 ...        ...        ...   
195  [[-250.58686512955728, -250.58686512955728, -2...  PNEUMONIA  PNEUMONIA   
196  [[1666.251805690097, 1666.251805690097, 1666.2...  PNEUMONIA  PNEUMONIA   
197  [[1138.6491668369342, 1138.6491668369342, 1138...  PNEUMONIA  PNEUMONIA   
198  [[-170.45911781609885, -170.45911781609885, -1...  PNEUMONIA  PNEUMONIA   
199  [[122.3298192455306, 122.3298192455306, 122.32...  PNEUMONIA  PNEUMONIA   

           k=3        k=5       k=10   