In [7]:
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from keras.preprocessing import image 
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
import os
from tensorflow.image import rgb_to_grayscale

In [None]:
def normal_nonnormal(x):
    """Collapse a category label into one of two class names.

    Returns ``x`` unchanged when it compares equal to ``'Normal'``; any other
    value is mapped to the string ``'Non-Normal'``.
    """
    return x if x == 'Normal' else 'Non-Normal'

# Read the metadata table; a missing Label_1_Virus_category is filled with 'Normal'.
na_fill = {'Label_1_Virus_category': 'Normal'}
df = pd.read_csv('dataset/Chest_xray_Corona_Metadata.csv').fillna(value=na_fill)

# Binary class name per row, then its one-hot columns (prefixed 'type_').
df['BinaryCategory'] = df['Label_1_Virus_category'].map(normal_nonnormal)
one_hot = pd.get_dummies(df.BinaryCategory.values, prefix='type')
df = df.join(one_hot)[['ImagePath', 'BinaryCategory', 'type_Non-Normal', 'Dataset_type']]

# X keeps Dataset_type so it can be used to build the split masks below;
# the target is the single 'type_Non-Normal' indicator column.
X = df[['ImagePath', 'BinaryCategory', 'Dataset_type']]
y = df[['type_Non-Normal']]

# Row masks for the split recorded in the Dataset_type column.
is_train = X.Dataset_type == 'TRAIN'
is_test = X.Dataset_type == 'TEST'

# After discarding Dataset_type and BinaryCategory only ImagePath is left
# as a feature column; targets are filtered with the same masks.
x_train = X.loc[is_train, ['ImagePath']]
y_train = y.loc[is_train]
x_test = X.loc[is_test, ['ImagePath']]
y_test = y.loc[is_test]


def get_image_value(path, target_size=(96, 96)):
    """Load one image file and return it as a scaled pixel array.

    Parameters
    ----------
    path : str
        Location of the image file; passed straight to ``image.load_img``.
    target_size : tuple of int, optional
        ``(height, width)`` the image is resized to while loading.
        Defaults to ``(96, 96)``.

    Returns
    -------
    numpy.ndarray
        The array produced by ``image.img_to_array`` divided by 255.
    """
    # load_img documents target_size as (height, width). The channel count is
    # governed by its color_mode argument, not by a third tuple entry, so the
    # previous (96, 96, 3) only worked because the extra element was ignored.
    img = image.load_img(path, target_size=target_size)
    img = image.img_to_array(img)

    # Divide by 255, presumably to map 8-bit pixel values into [0, 1].
    return img / 255


def get_data(df, dataset_type):
    """Load every image listed in ``df.ImagePath`` into one stacked array.

    Parameters
    ----------
    df : pandas.DataFrame
        Must expose an ``ImagePath`` column holding file names relative to
        the split folder.
    dataset_type : str
        ``'TRAIN'`` reads from the ``train`` folder; any other value reads
        from the ``test`` folder. Also shown in the progress-bar label.

    Returns
    -------
    numpy.ndarray
        ``np.array`` of the per-image arrays returned by ``get_image_value``,
        with the images along the first axis.
    """
    from tqdm import tqdm  # kept local, as in the original cell

    # The folder depends only on dataset_type, so resolve it once up front.
    folder = 'train' if dataset_type == 'TRAIN' else 'test'
    base_dir = f'dataset/Corona-Chest-XRay-Dataset/{folder}'

    img_list = [
        get_image_value(f'{base_dir}/{path}')
        for path in tqdm(df.ImagePath.values, desc = f'Gathering {dataset_type} Image Arrays')
    ]

    # No .squeeze() here: it removes every length-1 axis, so a frame with a
    # single row would lose its leading sample axis and downstream code that
    # unpacks a 4-element shape would fail.
    return np.array(img_list)

# Build the image arrays for each split from the rows tagged with that split.
train_rows = X.loc[X.Dataset_type == 'TRAIN']
x_train_data = get_data(train_rows, 'TRAIN')

test_rows = X.loc[X.Dataset_type == 'TEST']
x_test_data = get_data(test_rows, 'TEST')

# From here on x_train / x_test refer to the loaded image arrays.
x_train, x_test = x_train_data, x_test_data

Gathering TRAIN Image Arrays: 100%|██████████| 5286/5286 [01:39<00:00, 53.32it/s]
Gathering TEST Image Arrays: 100%|██████████| 624/624 [00:10<00:00, 56.77it/s]


In [13]:
import pickle 
from imblearn.over_sampling import SMOTE


# Over-sample the training set with imblearn's SMOTE. The seed is fixed so
# that re-running the cell yields the same resampled data.
smote = SMOTE(random_state=42)
print('Old Shape', x_train.shape)

# Get dimensions
n_samples, h, w, c = x_train.shape

# 1) Flatten images to 2D for SMOTE
x_train_flat = x_train.reshape(n_samples, -1)   # same as (n_samples, h*w*c)

# 2) Apply SMOTE
x_train_resampled_flat, y_train_resampled = smote.fit_resample(x_train_flat, y_train)

# 3) Reshape back to image format
x_train_resampled = x_train_resampled_flat.reshape(-1, h, w, c)

print('New Shape', x_train_resampled.shape)

# 4) Persist the SMOTE-augmented training data and the original splits.
#    The output folder is created if missing, and every file is written
#    inside a `with` block so its handle is closed (and flushed) on exit
#    instead of being left to the garbage collector.
os.makedirs('pickles', exist_ok=True)

pickle_targets = (
    ('pickles/SMOTE_x_train.p', x_train_resampled),
    ('pickles/SMOTE_y_train.p', y_train_resampled),
    ('pickles/Normal_y_test.p', y_test),
    ('pickles/Normal_x_test.p', x_test),
    ('pickles/Normal_x_train.p', x_train),
    ('pickles/Normal_y_train.p', y_train),
)
for pickle_path, payload in pickle_targets:
    with open(pickle_path, 'wb') as fh:
        pickle.dump(payload, fh)


Old Shape (5286, 96, 96, 3)
New Shape (7888, 96, 96, 3)
