In [35]:
import os
import shutil

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import torch
from torch import nn, optim
from torch.utils.data.sampler import SubsetRandomSampler
from torchvision import transforms, datasets, models

# Root directory of the dataset and the names of its two class sub-folders.
images = "./cell_images/"
parasite_folder, unparasite_folder = "Parasitized", "Uninfected"

# Sanity check: show what the dataset root contains.
print(os.listdir(images))

['Parasitized', 'Uninfected']


# Listing the paths
Load the image file names and remove non-image files (e.g. `Thumbs.db`).

In [36]:
# List the image files of each class. The Windows thumbnail cache "Thumbs.db"
# is filtered out rather than removed with list.remove(), which raises
# ValueError when the file is not present. Sorting makes the order independent
# of the file system's listing order.
parasite_images_name = [
    file_name
    for file_name in sorted(os.listdir(images + parasite_folder))
    if file_name != "Thumbs.db"
]
unparasite_images_name = [
    file_name
    for file_name in sorted(os.listdir(images + unparasite_folder))
    if file_name != "Thumbs.db"
]

In [37]:
# Turn both file-name lists into NumPy arrays so `.shape` can be used on them.
parasite_images_name, unparasite_images_name = (
    np.array(parasite_images_name),
    np.array(unparasite_images_name),
)

# The dataset is balanced: we have 13779 positives and 13779 negatives

In [38]:
# Number of image file names found per class (parasitized, uninfected).
parasite_images_name.shape, unparasite_images_name.shape

((13779,), (13779,))

# Constructing the labels for the images

In [39]:
# One float label per image: 1.0 for parasitized cells, 0.0 for uninfected cells.
parasite_labels = np.full(parasite_images_name.shape, 1.0)
unparasite_labels = np.full(unparasite_images_name.shape, 0.0)

In [40]:
# Inspect both label vectors together with their shapes.
parasite_labels, unparasite_labels, parasite_labels.shape, unparasite_labels.shape

(array([1., 1., 1., ..., 1., 1., 1.]),
 array([0., 0., 0., ..., 0., 0., 0.]),
 (13779,),
 (13779,))

In [41]:
# Empty frame that will receive the parasitized image names and labels.
parasite_data = pd.DataFrame()

# Building the full path to each image

In [42]:
# Attach file names and labels, keep a copy that still holds the bare file
# names, then prefix every name with its class folder. The prefix is built
# from the `images` / `parasite_folder` settings instead of a duplicated
# literal, so the paths cannot drift from the configured dataset location.
parasite_data['name'] = parasite_images_name
parasite_data['label'] = parasite_labels
parasite_data_pure_names = parasite_data.copy()
parasite_data['name'] = images + parasite_folder + '/' + parasite_data['name']

In [43]:
# Preview the first rows of the parasitized frame (path and label).
parasite_data.head()

Unnamed: 0,name,label
0,./cell_images/Parasitized/C100P61ThinF_IMG_201...,1.0
1,./cell_images/Parasitized/C100P61ThinF_IMG_201...,1.0
2,./cell_images/Parasitized/C100P61ThinF_IMG_201...,1.0
3,./cell_images/Parasitized/C100P61ThinF_IMG_201...,1.0
4,./cell_images/Parasitized/C100P61ThinF_IMG_201...,1.0


In [44]:
# Empty frame that will receive the uninfected image names and labels.
unparasite_data = pd.DataFrame()

In [45]:
# Attach file names and labels, keep a copy that still holds the bare file
# names, then prefix every name with its class folder. The prefix is built
# from the `images` / `unparasite_folder` settings instead of a duplicated
# literal, so the paths cannot drift from the configured dataset location.
unparasite_data['name'] = unparasite_images_name
unparasite_data['label'] = unparasite_labels
unparasite_data_pure_names = unparasite_data.copy()
unparasite_data['name'] = images + unparasite_folder + '/' + unparasite_data['name']

In [46]:
# Preview the first rows of the uninfected frame (path and label).
unparasite_data.head()

Unnamed: 0,name,label
0,./cell_images/Uninfected/C100P61ThinF_IMG_2015...,0.0
1,./cell_images/Uninfected/C100P61ThinF_IMG_2015...,0.0
2,./cell_images/Uninfected/C100P61ThinF_IMG_2015...,0.0
3,./cell_images/Uninfected/C100P61ThinF_IMG_2015...,0.0
4,./cell_images/Uninfected/C100P61ThinF_IMG_2015...,0.0


# Shuffling the DataSet 

In [47]:
# Shuffle images
# Stack both classes and shuffle the rows. A fixed random_state makes the
# shuffle, and therefore the train/test split derived from it, reproducible
# across kernel restarts.
frames = [parasite_data, unparasite_data]
data = pd.concat(frames)
data = data.sample(frac=1, random_state=42).reset_index(drop=True)

In [48]:
# Raw 2-D array view of the shuffled frame (column 0: name, column 1: label).
data_values = data.values

# Dividing the data into a train/validation set and a test set

In [49]:
# Fraction of all rows held out for the test set.
test_split = 0.1
# NOTE: despite its name, this is the number of rows reserved for the TEST set
# (total rows * test_split); it is subtracted from the total to get the split index.
train_data_size  = int(np.floor(data_values.shape[0] * (test_split)))

In [50]:
# Row index where the train/validation part ends and the test part begins.
split_index = data_values.shape[0] - train_data_size 

In [51]:
# Everything before the split index belongs to the train/validation set.
train_val_values = data_values[:split_index, :]

In [52]:
# Everything from the split index onwards belongs to the test set.
test_values = data_values[split_index:, :]

In [53]:
# Empty frame that will receive the train/validation names and labels.
train_val_df  = pd.DataFrame()

In [54]:
# Column 0 of the raw values holds the image path, column 1 the label.
train_val_df['name'] = train_val_values[:,0]
train_val_df['label'] = train_val_values[:,1]

In [55]:
# Test frame: column 0 of the raw values is the image path, column 1 the label.
test_df = pd.DataFrame({
    'name': test_values[:, 0],
    'label': test_values[:, 1],
})

In [56]:
# Inspect the train/validation frame (full class-folder paths at this point).
train_val_df

Unnamed: 0,name,label
0,./cell_images/Uninfected/C228ThinF_IMG_2015111...,0
1,./cell_images/Parasitized/C184P145ThinF_IMG_20...,1
2,./cell_images/Parasitized/C81P42ThinF_IMG_2015...,1
3,./cell_images/Uninfected/C171P132ThinF_IMG_201...,0
4,./cell_images/Uninfected/C84P45ThinF_IMG_20150...,0
5,./cell_images/Uninfected/C84P45ThinF_IMG_20150...,0
6,./cell_images/Uninfected/C51AP12thinF_IMG_2015...,0
7,./cell_images/Parasitized/C63P24N_ThinF_IMG_20...,1
8,./cell_images/Parasitized/C116P77ThinF_IMG_201...,1
9,./cell_images/Uninfected/C50P11thinF_IMG_20150...,0


In [57]:
# Inspect the test frame (full class-folder paths at this point).
test_df

Unnamed: 0,name,label
0,./cell_images/Parasitized/C189P150ThinF_IMG_20...,1
1,./cell_images/Parasitized/C132P93ThinF_IMG_201...,1
2,./cell_images/Uninfected/C203ThinF_IMG_2015102...,0
3,./cell_images/Parasitized/C68P29N_ThinF_IMG_20...,1
4,./cell_images/Parasitized/C141P102ThinF_IMG_20...,1
5,./cell_images/Uninfected/C78P39ThinF_IMG_20150...,0
6,./cell_images/Parasitized/C59P20thinF_IMG_2015...,1
7,./cell_images/Parasitized/C174P135NThinF_IMG_2...,1
8,./cell_images/Uninfected/C203ThinF_IMG_2015102...,0
9,./cell_images/Uninfected/C213ThinF_IMG_2015110...,0


In [58]:
# Label column of the train/validation set, used to check the class balance.
train_val_shuffle = train_val_df['label']

In [59]:
# Class balance of the train/validation split: total rows, positives (label 1)
# and negatives (label 0).
all_train = len(train_val_shuffle)
positives_train = train_val_shuffle.sum()
negatives_train = all_train - positives_train

all_train, positives_train, negatives_train

(24803, 12396.0, 12407.0)

In [60]:
# Label column of the test set, used to check the class balance.
test_shuffle = test_df['label']

In [61]:
# Class balance of the test split: total rows, positives (label 1) and
# negatives (label 0).
all_test = len(test_shuffle)
positives_test = test_shuffle.sum()
negatives_test = all_test - positives_test

all_test, positives_test, negatives_test

(2755, 1383.0, 1372.0)

In [62]:
# Array of train/validation image paths to iterate over when copying.
train_val_np = train_val_df['name'].values

# Copying the images into the right folder

In [64]:
# Copy every train/validation image into the flat ./cell_images/ folder under
# its bare file name. All images are copied (no early exit): the CSV files
# written later reference every image at its flat location.
for image_path in train_val_np:
    shutil.copy2(image_path, './cell_images/' + image_path.split('/')[-1])

In [65]:
# Array of test image paths to iterate over when copying.
test_np = test_df['name'].values

In [66]:
# Copy every test image into the flat ./cell_images/ folder under its bare
# file name. All images are copied (no early exit): the CSV files written
# later reference every image at its flat location.
for image_path in test_np:
    shutil.copy2(image_path, './cell_images/' + image_path.split('/')[-1])

In [69]:
# Point each train/validation entry at its copy in the flat ./cell_images/ folder.
train_val_df['name'] = './cell_images/' + train_val_df['name'].str.rsplit('/', n=1).str[-1].values

In [70]:
# Point each test entry at its copy in the flat ./cell_images/ folder.
test_df['name'] = './cell_images/' + test_df['name'].str.rsplit('/', n=1).str[-1].values

In [71]:
# Inspect the train/validation frame after switching to the flat-folder paths.
train_val_df

Unnamed: 0,name,label
0,./cell_images/C228ThinF_IMG_20151112_142452_ce...,0
1,./cell_images/C184P145ThinF_IMG_20151203_10272...,1
2,./cell_images/C81P42ThinF_IMG_20150817_121113_...,1
3,./cell_images/C171P132ThinF_IMG_20151119_15342...,0
4,./cell_images/C84P45ThinF_IMG_20150818_101226_...,0
5,./cell_images/C84P45ThinF_IMG_20150818_101412_...,0
6,./cell_images/C51AP12thinF_IMG_20150724_154243...,0
7,./cell_images/C63P24N_ThinF_IMG_20150818_14354...,1
8,./cell_images/C116P77ThinF_IMG_20150930_171844...,1
9,./cell_images/C50P11thinF_IMG_20150724_115141_...,0


In [72]:
# Inspect the test frame after switching to the flat-folder paths.
test_df

Unnamed: 0,name,label
0,./cell_images/C189P150ThinF_IMG_20151203_14130...,1
1,./cell_images/C132P93ThinF_IMG_20151004_151941...,1
2,./cell_images/C203ThinF_IMG_20151029_105443_ce...,0
3,./cell_images/C68P29N_ThinF_IMG_20150819_13323...,1
4,./cell_images/C141P102ThinF_IMG_20151005_21483...,1
5,./cell_images/C78P39ThinF_IMG_20150606_104106_...,0
6,./cell_images/C59P20thinF_IMG_20150803_111333_...,1
7,./cell_images/C174P135NThinF_IMG_20151127_1354...,1
8,./cell_images/C203ThinF_IMG_20151029_105443_ce...,0
9,./cell_images/C213ThinF_IMG_20151106_112252_ce...,0


# Saving the CSV files with the image names

In [74]:
# Persist both splits with the flat-folder image paths.
# NOTE: the frame index is written as an unnamed first column (no index=False).
train_val_df.to_csv('./train_val_with_path.csv')
test_df.to_csv('./test_with_path.csv')

In [76]:
# Reduce the stored paths to bare file names, then save those versions as well.
train_val_df['name'] = train_val_df['name'].str.rsplit('/', n=1).str[-1].values
test_df['name'] = test_df['name'].str.rsplit('/', n=1).str[-1].values
train_val_df.to_csv('./train_val_no_path.csv')
test_df.to_csv('./test_no_path.csv')

In [78]:
# Inspect the train/validation frame, which now holds bare file names.
train_val_df

Unnamed: 0,name,label
0,C228ThinF_IMG_20151112_142452_cell_85.png,0
1,C184P145ThinF_IMG_20151203_102721_cell_1.png,1
2,C81P42ThinF_IMG_20150817_121113_cell_180.png,1
3,C171P132ThinF_IMG_20151119_153425_cell_219.png,0
4,C84P45ThinF_IMG_20150818_101226_cell_45.png,0
5,C84P45ThinF_IMG_20150818_101412_cell_6.png,0
6,C51AP12thinF_IMG_20150724_154243_cell_101.png,0
7,C63P24N_ThinF_IMG_20150818_143544_cell_176.png,1
8,C116P77ThinF_IMG_20150930_171844_cell_101.png,1
9,C50P11thinF_IMG_20150724_115141_cell_27.png,0
