In [1]:
import numpy as np
import pandas as pd
import os
import PIL
import cv2
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.preprocessing.image import ImageDataGenerator
import matplotlib.pyplot as plt
from matplotlib.pyplot import imshow
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
from matplotlib.pyplot import imshow
import shutil
from sklearn.metrics import confusion_matrix, classification_report

In [2]:
# we added the separation and header because the data is not organized, one can run without sep and header
#to understand the difference

train_df = pd.read_csv('train.txt', sep=" ", header=None)

#Columns are added because it was seen that column names were 0,1,2,3, so new column names are added
#which are given in descriptions
train_df.columns=['patient id', 'filename', 'class', 'data source']

# Since we are doing image classification, patient id and data source is of no importance to us, so
#we cn drop them
train_df=train_df.drop(['patient id', 'data source'], axis=1 )

Let's do the same for test set!

I highly encourage beginners to read the data without sep and header parameters to understand the difference!

In [3]:

#same as train
test_df = pd.read_csv('test.txt', sep=" ", header=None)
test_df.columns=['id', 'filename', 'class', 'data source' ]
test_df=test_df.drop(['id', 'data source'], axis=1 )

In [4]:
train_df.head() # see the first 5 rows and columns of train

Unnamed: 0,filename,class
0,ARDSSevere.png,negative
1,acute-respiratory-distress-syndrome-ards-1.jpg,negative
2,acute-respiratory-distress-syndrome-ards.jpg,negative
3,ards-secondary-to-tiger-snake-bite.png,negative
4,pneumocystis-pneumonia-2-PA.png,negative


In [5]:
test_df.head()#see the first 5 columns for test

Unnamed: 0,filename,class
0,MIDRC-RICORD-1C-419639-003251-46647-0.png,positive
1,MIDRC-RICORD-1C-419639-001464-39871-0.png,positive
2,MIDRC-RICORD-1C-419639-000918-78965-0.png,positive
3,MIDRC-RICORD-1C-419639-003318-64285-0.png,positive
4,MIDRC-RICORD-1C-419639-001015-81591-0.png,positive


In [6]:
train_path = 'train/'  #directory path
test_path = 'test/'

In [7]:
train_df['class'].value_counts()

positive    16490
negative    13992
Name: class, dtype: int64

And the negative values are 13793 and positive values are 2158.

We need to balance them, else the model we create will be more biased towards negative and thereby wrong predictions.

In [8]:
# negative  = train_df[train_df['class']=='negative']   #negative values in class column
# positive = train_df[train_df['class']=='positive']  #positive values in class column
# from sklearn.utils import resample
# #majority class that  is negative, we need to downsample/decrease that class so that there is no bias
# #n_samples = 2158 means we want 2158 sample of class negative, since there are 2158 samples of class positive
# df_majority_downsampled = resample(negative, replace = True, n_samples = 2158) 
# #concatenate
# train_df = pd.concat([positive, df_majority_downsampled])

# from sklearn.utils import shuffle
# train_df = shuffle(train_df) # shuffling so that there is particular sequence

In [9]:
train_df['class'].value_counts()

positive    16490
negative    13992
Name: class, dtype: int64

Awesome! Now no imbalanced data! Proceed!

Now we will split the train data into train(for training the model) and valid(for validation) and then after training and validation we will use the model to predict on test set. Simple.

In [10]:
train_df, valid_df = train_test_split(train_df, train_size=0.9, random_state=0)

In [11]:
#Let's see how many images for training and validation and testing

print(f"Negative and positive values of train: {train_df['class'].value_counts()}")
print(f"Negative and positive values of validation: {valid_df['class'].value_counts()}")
print(f"Negative and positive values of test: {test_df['class'].value_counts()}")

Negative and positive values of train: positive    14830
negative    12603
Name: class, dtype: int64
Negative and positive values of validation: positive    1660
negative    1389
Name: class, dtype: int64
Negative and positive values of test: positive    200
negative    200
Name: class, dtype: int64


In [12]:
#Let's start the modelling task
# The ImageDataGenerator for keras is awesome.
#It lets you augment your images in real-time while your model is still training! 
#You can apply any random transformations on each training image as it is passed to the model. 
#This will not only make your model robust but will also save up on the overhead memory!


#We will apply the Image Data Generator on training with various parameters, but we won't apply 
#the same parameters on testin. Why?
# Because we want the test iamges as it is, we don't want biasedness,
#also if we fit it we will be applying
# the model only on these test images only, it can't predict new images if fed into model
#Because new images will not be augmented this way


train_datagen = ImageDataGenerator(rescale = 1./255.,rotation_range = 40, width_shift_range = 0.2, height_shift_range = 0.2, 
                                   shear_range = 0.2, zoom_range = 0.2, horizontal_flip = True, vertical_flip =True)
test_datagen = ImageDataGenerator(rescale = 1.0/255.)

#Now fit the them to get the images from directory (name of the images are given in dataframe) with augmentation


train_gen = train_datagen.flow_from_dataframe(dataframe = train_df, directory=train_path, x_col='filename', 
                                              y_col='class', target_size=(200,200), batch_size=64, 
                                               class_mode='binary')
valid_gen = test_datagen.flow_from_dataframe(dataframe = valid_df, directory=train_path, x_col='filename',
                                             y_col='class', target_size=(200,200), batch_size=64, 
                                            class_mode='binary')
test_gen = test_datagen.flow_from_dataframe(dataframe = test_df, directory=test_path, x_col='filename', 
                                            y_col='class', target_size=(200,200), batch_size=64,
                                             class_mode='binary')
#class mode binary because we want the classifier to predict covid or not
#target size (200,200) means we want the images to resized to 200*200

Found 27433 validated image filenames belonging to 2 classes.
Found 3049 validated image filenames belonging to 2 classes.
Found 400 validated image filenames belonging to 2 classes.


Now start the transfer learning!

In [13]:
import tensorflow as tf
#Our base model is InceptionResNetV2, new readers are encouraged to see the architecture of this particular model

base_model = tf.keras.applications.ResNet50V2(weights='imagenet', input_shape = (200,200,3),
                                                     include_top=False)
for layer in base_model.layers:
    layer.trainable = False

AttributeError: 'str' object has no attribute 'decode'

# Model building

In [14]:
#Now we will add some more layers to the base model for our requirements

model = tf.keras.Sequential([
#     base_model, 
    tf.keras.layers.GlobalAveragePooling2D(), 
    tf.keras.layers.Dense(128, activation='relu'),
    tf.keras.layers.BatchNormalization(), 
    tf.keras.layers.Dropout(0.2), 
    tf.keras.layers.Dense(1, activation='sigmoid')
])

callbacks = [
    tf.keras.callbacks.ModelCheckpoint("covid_classifier_model.h5", save_best_only=True, verbose = 0),
    tf.keras.callbacks.EarlyStopping(patience=3, monitor='val_loss', verbose=1),
    tf.keras.callbacks.ReduceLROnPlateau(monitor='val_loss', factor=0.5, patience=2, verbose=1)
]

model.compile(optimizer = keras.optimizers.Adam(learning_rate=0.001),
              loss = 'binary_crossentropy',
              metrics=['accuracy'])


In [None]:
history = model.fit(train_gen, 
                    validation_data=valid_gen, epochs=20, 
                    callbacks=[callbacks])

Epoch 1/20
Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: unsupported operand type(s) for -: 'NoneType' and 'int'
Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: unsupported operand type(s) for -: 'NoneType' and 'int'


2021-12-01 01:08:58.849475: I tensorflow/compiler/mlir/mlir_graph_optimization_pass.cc:116] None of the MLIR optimization passes are enabled (registered 2)
2021-12-01 01:08:58.850198: W tensorflow/core/platform/profile_utils/cpu_utils.cc:126] Failed to get CPU frequency: 0 Hz


 71/429 [===>..........................] - ETA: 4:43 - loss: 0.6995 - accuracy: 0.5025

# Model evaluation and predictions

In [None]:
model.load_weights('./covid_classifier_model.h5')
model.evaluate(test_gen)

In [None]:
preds = (model.predict(test_gen)>0.5).astype("int32")

preds

## If you like it ot fork it, then upvote it! This gives us motivation to produce more notebooks for the community!