In [1]:
import numpy as np 
import pandas as pd 

In [2]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.preprocessing.image import ImageDataGenerator

In [3]:
# Fix the random seed through Keras (value 13) so repeated runs of the notebook
# start from the same random state.
keras.utils.set_random_seed(13)

In [4]:
# train.txt is read as a headerless, space-separated table. The four columns are
# named here, then only the image path and label columns are kept.
train_df = (
    pd.read_csv('train.txt', sep=" ", header=None)
    .set_axis(['patient id', 'file_paths', 'labels', 'data source'], axis=1)
    .drop(columns=['patient id', 'data source'])
)
train_df.head()

Unnamed: 0,file_paths,labels
0,ARDSSevere.png,negative
1,acute-respiratory-distress-syndrome-ards-1.jpg,negative
2,acute-respiratory-distress-syndrome-ards.jpg,negative
3,ards-secondary-to-tiger-snake-bite.png,negative
4,pneumocystis-pneumonia-2-PA.png,negative


In [5]:
# test.txt is read as a headerless, space-separated table. The four columns are
# named here, then only the image path and label columns are kept.
test_df = (
    pd.read_csv('test.txt', sep=" ", header=None)
    .set_axis(['id', 'file_paths', 'labels', 'data source'], axis=1)
    .drop(columns=['id', 'data source'])
)
test_df.head()

Unnamed: 0,file_paths,labels
0,MIDRC-RICORD-1C-419639-003251-46647-0.png,positive
1,MIDRC-RICORD-1C-419639-001464-39871-0.png,positive
2,MIDRC-RICORD-1C-419639-000918-78965-0.png,positive
3,MIDRC-RICORD-1C-419639-003318-64285-0.png,positive
4,MIDRC-RICORD-1C-419639-001015-81591-0.png,positive


In [6]:
# Balance the training index: draw the same number of rows from every label
# (without replacement, fixed seed), then shuffle the combined frame.
file_count = 5000  # rows kept per label
samples = [
    train_df.loc[train_df['labels'] == label].sample(
        file_count, replace=False, random_state=1
    )
    for label in train_df['labels'].unique()
]
train_df = (
    pd.concat(samples, axis=0)
    .sample(frac=1.0, random_state=1)  # frac=1.0 -> full shuffle of all rows
    .reset_index(drop=True)
)
# Sanity check: per-label counts and total row count.
print(train_df['labels'].value_counts())
print(len(train_df))

positive    5000
negative    5000
Name: labels, dtype: int64
10000


In [7]:
# Image resolution and batch size handed to both data generators below.
target_size = (128, 128)
batch_size = 64

In [8]:
# Two separate generators, both built with no arguments (library defaults only).
train_datagen, test_datagen = ImageDataGenerator(), ImageDataGenerator()

In [9]:
# Directories passed as `directory=` when the generators resolve `file_paths`.
train_path, test_path = 'train/', 'test/'

In [10]:
# Options for reading the training images listed in train_df: single-channel
# (grayscale) images resized to target_size, with binary labels.
train_flow_kwargs = dict(
    directory=train_path,
    x_col='file_paths',
    y_col='labels',
    target_size=target_size,
    batch_size=batch_size,
    color_mode="grayscale",
    class_mode="binary",
)
train_gen = train_datagen.flow_from_dataframe(train_df, **train_flow_kwargs)

Found 10000 validated image filenames belonging to 2 classes.


In [11]:
# Options for reading the test images listed in test_df: single-channel
# (grayscale) images resized to target_size, with binary labels.
# NOTE(review): `shuffle` is not passed, so the library default applies -- confirm
# whether the exported test rows need to stay in test_df order.
test_flow_kwargs = dict(
    directory=test_path,
    x_col='file_paths',
    y_col='labels',
    target_size=target_size,
    batch_size=batch_size,
    color_mode="grayscale",
    class_mode="binary",
)
test_gen = test_datagen.flow_from_dataframe(test_df, **test_flow_kwargs)

Found 400 validated image filenames belonging to 2 classes.


In [12]:
# Number of batches in one pass over the test generator (used as the loop bound
# when the batches are collected below).
len(test_gen)

7

In [13]:
from tqdm import tqdm

In [14]:
# Materialise the whole test split as two in-memory arrays by pulling exactly
# one pass (len(test_gen) batches) out of the generator.
# Rewind to batch 0 first: if the generator was already partially consumed
# (e.g. this cell is re-run after an interrupt), the loop would otherwise span
# two passes and collect some images twice while missing others.
test_gen.reset()
X_test, y_test = [], []
for _ in tqdm(range(len(test_gen))):
    # Built-in next() instead of the iterator's .next() method: it works on
    # every Keras version, whereas newer releases no longer expose .next().
    X, y = next(test_gen)
    X_test.append(X)
    y_test.append(y)
# Stack the per-batch arrays along the sample axis and store them compactly as
# uint8 (images and binary labels).
X_test = np.vstack(X_test).astype(np.uint8)
y_test = np.hstack(y_test).astype(np.uint8)

100%|████████████████████████████████████████████████████████████████████████████████████| 7/7 [00:11<00:00,  1.61s/it]


In [15]:
# Show the shapes of the collected test images and labels as a quick check.
X_test.shape, y_test.shape

((400, 128, 128, 1), (400,))

In [16]:
# Materialise the whole (balanced) training split as two in-memory arrays by
# pulling exactly one pass (len(train_gen) batches) out of the generator.
# Rewind to batch 0 first: if the generator was already partially consumed
# (e.g. this cell is re-run after an interrupt), the loop would otherwise span
# two passes and collect some images twice while missing others.
train_gen.reset()
X_train, y_train = [], []
for _ in tqdm(range(len(train_gen))):
    # Built-in next() instead of the iterator's .next() method: it works on
    # every Keras version, whereas newer releases no longer expose .next().
    X, y = next(train_gen)
    X_train.append(X)
    y_train.append(y)
# Stack the per-batch arrays along the sample axis and store them compactly as
# uint8 (images and binary labels).
X_train = np.vstack(X_train).astype(np.uint8)
y_train = np.hstack(y_train).astype(np.uint8)

100%|████████████████████████████████████████████████████████████████████████████████| 157/157 [01:07<00:00,  2.32it/s]


In [17]:
# Show the shapes of the collected training images and labels as a quick check.
X_train.shape, y_train.shape

((10000, 128, 128, 1), (10000,))

In [18]:
# Output archives for the exported arrays (relative to the working directory).
np_train_path, np_test_path = "train.npz", "test.npz"

In [19]:
# Write both test arrays into one .npz archive at np_test_path. They are passed
# positionally, so NumPy stores them under its default keys (arr_0 = images,
# arr_1 = labels); loaders must use those keys.
np.savez(np_test_path, X_test, y_test)

In [20]:
# Write both training arrays into one .npz archive at np_train_path. They are
# passed positionally, so NumPy stores them under its default keys
# (arr_0 = images, arr_1 = labels); loaders must use those keys.
np.savez(np_train_path, X_train, y_train)