[View in Colaboratory](https://colab.research.google.com/github/PGrabinski/XRayPneumonia/blob/master/X_Ray.ipynb)

# Chest X-Ray images dataset
### Pneumonia detection model

Dataset taken from [Kaggle Datasets: Chest X-Ray Images (Pneumonia)](https://www.kaggle.com/paultimothymooney/chest-xray-pneumonia).

The dataset contains 5863 X-Ray images divided into two classes: *normal* and *pneumonia*.
In addition, pneumonia can be caused either by bacteria or by viruses, which change the pictured tissue in two different ways.

![Chest X-Ray examples](https://i.imgur.com/jZqpV51.png)



Originally, this dataset is divided into train, validation and test sets. However, the validation set contains only $8+8$ samples, which is too few for reliable statistics and leads to ineffective validation. To work around the problem, I copied the validation set into the train set and switched to cross-validation.

## Data download
I store the dataset on my Google Drive. To get this data into the VM, I download it via PyDrive into the corresponding folders.

In [0]:
def driveSetUp():
  !pip install -U -q PyDrive
  from pydrive.auth import GoogleAuth
  from pydrive.drive import GoogleDrive
  from google.colab import auth
  from oauth2client.client import GoogleCredentials

  auth.authenticate_user()
  gauth = GoogleAuth()
  gauth.credentials = GoogleCredentials.get_application_default()
  drive = GoogleDrive(gauth)
  return drive
# Authenticate and create the notebook-level PyDrive client.
drive = driveSetUp()

In [0]:
def downloadFilesFromFolderByID(folder_id, folder_name, drive=None):
  """Download every file of a Google Drive folder into class sub-folders.

  Each file is saved as ``<folder_name>/<label>/<label><index>.jpg``, where
  ``label`` is ``virus`` or ``bacteria`` when that word occurs in the file
  title and ``normal`` otherwise, and ``index`` is the file's position in
  the Drive listing.

  Args:
    folder_id: ID of the Google Drive folder whose files are listed.
    folder_name: Local destination directory; it and its three class
      sub-folders are created when missing.
    drive: Optional, already authenticated PyDrive client. When omitted, a
      new client is obtained from ``driveSetUp()`` (the previous behaviour),
      which authenticates again on every call.
  """
  import os

  if drive is None:
    drive = driveSetUp()

  for label in ('normal', 'virus', 'bacteria'):
    os.makedirs(os.path.join(folder_name, label), exist_ok=True)

  query = "'{}' in parents and trashed=false".format(folder_id)
  listed = drive.ListFile({'q': query}).GetList()
  for i, entry in enumerate(listed):
    remote = drive.CreateFile({'id': entry['id']})
    title = remote['title']
    if 'virus' in title:
      label = 'virus'
    elif 'bacteria' in title:
      label = 'bacteria'
    else:
      label = 'normal'
    # Build the target with os.path.join so it always lands inside the
    # directories created above (also for an empty or '/'-terminated
    # folder_name).
    target = os.path.join(folder_name, label, '{}{}.jpg'.format(label, i))
    remote.GetContentFile(target)
  print('Files from the folder of id: {} have been downloaded into {}.'.format(folder_id, folder_name))

In [0]:
# Drive folders to fetch: each entry pairs a Drive folder ID ('id') with the
# local split directory ('name') its files are downloaded into.
folders_to_download = [
    {'id': drive_folder_id, 'name': split_dir}
    for drive_folder_id, split_dir in (
        ('1EbT3bes4ONvGQvfI9oI_d-kmV42vYsU_', 'test'),   # NORMAL
        ('1jerX6X-ugSY8KqNbDO6hDOdarncp-8Di', 'test'),   # PNEUMONIA
        ('1onNxF1RQ6Eex7snIs2NPFuO7qcFLGb_c', 'train'),  # NORMAL
        ('1pXT_pdjWjHNJ5JleOD0dTviafaUWfSGF', 'train'),  # PNEUMONIA
    )
]

In [0]:
# Fetch each configured Drive folder into its local split directory.
for folder in folders_to_download:
  downloadFilesFromFolderByID(
      folder_id=folder['id'],
      folder_name=folder['name'],
  )

Files from the folder of id: 1EbT3bes4ONvGQvfI9oI_d-kmV42vYsU_ have been downloaded into test.


## Data loading
Here, we load the downloaded images and immediately rescale them to a $300\times 210$ resolution. Some of the images have three RGB channels and some have a single greyscale channel. We convert the single-channel images into the RGB format.

In [0]:
import os
import numpy as np
from scipy import misc
from scipy import ndimage
import matplotlib.pylab as pylab
from keras.utils.np_utils import to_categorical

def loadImages(path, new_size=(300, 210)):
  """Load every file in ``path`` as a resized, 3-channel image array.

  Args:
    path: Directory whose entries are all read as images.
    new_size: Target size forwarded unchanged to ``misc.imresize``.

  Returns:
    A NumPy array stacking the resized images along a new first axis, in
    sorted file-name order. For an empty directory (and a tuple/list
    ``new_size``) the result has shape ``(0,) + new_size + (3,)`` so that it
    can still be concatenated with the arrays of non-empty directories.
  """
  # NOTE(review): misc.imread / misc.imresize are deprecated SciPy helpers
  # that newer SciPy releases no longer ship; replacing them requires an
  # image library this file does not import yet.
  images = []
  # os.listdir returns entries in arbitrary order; sort them so the sample
  # order (and any seeded split built on it) is reproducible.
  for file_name in sorted(os.listdir(path)):
    image = misc.imresize(misc.imread(os.path.join(path, file_name)), new_size)
    if image.ndim == 2:
      # Greyscale image: replicate the single channel three times.
      image = np.stack((image,) * 3, axis=-1)
    images.append(image)
  if not images and isinstance(new_size, (tuple, list)):
    # np.array([]) would have shape (0,) and break a later np.concatenate.
    # uint8 presumably matches what imresize returns; confirm if it matters.
    return np.empty((0,) + tuple(new_size) + (3,), dtype=np.uint8)
  return np.array(images)

def loadData():
  """Load the training images together with their class labels.

  Images are read from ``train/normal``, ``train/virus`` and
  ``train/bacteria`` (in that order) and labelled 0, 1 and 2 respectively.
  The test split is not loaded by this function.

  Returns:
    A tuple ``(train, train_targets)``: the concatenated image array and a
    float label vector holding one entry per image.
  """
  class_dirs = ('train/normal', 'train/virus', 'train/bacteria')
  per_class = [loadImages(class_dir) for class_dir in class_dirs]

  # Label i is repeated once for every image of class i: 0.0 = normal,
  # 1.0 = virus, 2.0 = bacteria.
  counts = [len(batch) for batch in per_class]
  train_targets = np.repeat(np.arange(3, dtype=float), counts)

  train = np.concatenate(per_class)
  return train, train_targets

# Load the training images and labels once; the cross-validation cell below
# consumes both arrays. Only the training split is returned by loadData().
train_images, train_targets = loadData()
# Variant for when loadData() also returns the test split:
# (train_images, train_targets, test_images, test_targets) = loadData()

Using TensorFlow backend.
  if issubdtype(ts, int):
  elif issubdtype(type(size), float):


## Model definition

In [0]:
from keras import models
from keras import layers
from keras import optimizers
import numpy as np

def generateModel():
  """Build, summarise and compile the convolutional classifier.

  Architecture: four ``Conv2D(3x3, relu)`` + ``MaxPooling2D(2x2)`` stages
  with 16, 32, 64 and 64 filters on a ``(300, 210, 3)`` input, followed by
  ``Flatten``, ``Dropout(0.5)``, ``Dense(256, relu)`` and a 3-unit softmax
  output. The model summary is printed as a side effect.

  Returns:
    The Sequential model compiled with ``RMSprop(lr=1e-5)``, categorical
    cross-entropy loss and the ``acc`` metric.
  """
  net = models.Sequential()

  for stage, filters in enumerate((16, 32, 64, 64)):
    # Only the first layer of a Sequential model declares the input shape.
    conv_kwargs = {'input_shape': (300, 210, 3)} if stage == 0 else {}
    net.add(layers.Conv2D(filters, (3, 3), activation='relu', **conv_kwargs))
    net.add(layers.MaxPooling2D((2, 2)))

  # Classification head.
  net.add(layers.Flatten())
  net.add(layers.Dropout(0.5))
  net.add(layers.Dense(256, activation='relu'))
  net.add(layers.Dense(3, activation='softmax'))

  net.summary()
  net.compile(
      optimizer=optimizers.RMSprop(lr=1e-5),
      loss='categorical_crossentropy',
      metrics=['acc'],
  )
  return net

## Cross validation and learning

In [8]:
from sklearn.model_selection import StratifiedKFold as SKF
from keras.wrappers.scikit_learn import KerasClassifier
from sklearn.model_selection import cross_val_score

# Expose the Keras model builder as a scikit-learn estimator so it can be
# evaluated with stratified 5-fold cross-validation.
model = KerasClassifier(
    build_fn=generateModel,
    epochs=1,
    batch_size=1,
    verbose=0,
)
skf = SKF(n_splits=5, shuffle=True, random_state=1)
results = cross_val_score(model, X=train_images, y=train_targets, cv=skf)
# Per-fold scores on the first line, their mean on the second.
print(results, results.mean(), sep='\n')

ValueError: ignored