In [1]:
import os
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

In [2]:
def load_data():
    """Read the train/test CSV files and attach image-location columns.

    Both frames gain two columns derived from ``id_code``:
    ``file_path`` (image directory joined with ``<id_code>.png``) and
    ``file_name`` (``<id_code>.png``). The ``diagnosis`` column of the
    training frame is additionally cast to ``str``; the test frame's
    columns are otherwise left as read.

    Returns:
        tuple: ``(train, test)`` DataFrames.
    """
    train_dir = os.path.join('dataset/train_images/')
    test_dir = os.path.join('./', 'dataset/test_images/')

    def _attach_image_columns(frame, image_dir):
        # Adds file_path first, then file_name, so the column order is stable.
        codes = frame['id_code']
        frame['file_path'] = codes.map(
            lambda code: os.path.join(image_dir, '{}.png'.format(code))
        )
        frame['file_name'] = codes.apply(lambda code: code + '.png')
        return frame

    train = _attach_image_columns(pd.read_csv('dataset/train_1.csv'), train_dir)
    test = _attach_image_columns(pd.read_csv('dataset/test.csv'), test_dir)

    train['diagnosis'] = train['diagnosis'].astype(str)

    return train, test

In [3]:
# Build the train/test frames (including file_path / file_name columns) from the CSVs.
df_train, df_test =  load_data()

# Sanity-check the frame sizes before previewing the first rows.
print(df_train.shape, df_test.shape, '\n')

df_train.head()

(2930, 4) (366, 4) 



Unnamed: 0,id_code,diagnosis,file_path,file_name
0,1ae8c165fd53,2,dataset/train_images/1ae8c165fd53.png,1ae8c165fd53.png
1,1b329a127307,1,dataset/train_images/1b329a127307.png,1b329a127307.png
2,1b32e1d775ea,4,dataset/train_images/1b32e1d775ea.png,1b32e1d775ea.png
3,1b3647865779,0,dataset/train_images/1b3647865779.png,1b3647865779.png
4,1b398c0494d1,0,dataset/train_images/1b398c0494d1.png,1b398c0494d1.png


In [4]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the labelled rows for validation. The seed is fixed so that
# re-running the notebook yields the same partition; without it every kernel
# restart produced a different split than the one pickled to disk.
df_train_train, df_train_valid = train_test_split(
    df_train, test_size=0.2, random_state=42
)

df_train_train.shape, df_train_valid.shape

((2344, 4), (586, 4))

### Saving data 

In [6]:
import pickle

# Persist the training split and the test frame as pickles in the working
# directory. Context managers guarantee the handles are closed even if
# pickle.dump raises part-way through.
with open('df_train_train', 'wb') as file:
    pickle.dump(df_train_train, file)

with open('df_test', 'wb') as file:
    pickle.dump(df_test, file)


In [8]:
#Loading data
# NOTE: pickle.load can execute arbitrary code; only load files that this
# notebook itself wrote.

with open('df_train_train', 'rb') as file:
    df_train_train = pickle.load(file)

# The pickled test frame is restored under the name the following cells
# actually read (df_test). Previously it was only bound to df_train_test.
with open('df_test', 'rb') as file:
    df_test = pickle.load(file)

# Kept as an alias so any cell still referring to the old name keeps working.
df_train_test = df_test

In [9]:
# Preview the first rows of the training split.
df_train_train.head()

Unnamed: 0,id_code,diagnosis,file_path,file_name
1326,77ab222bf85c,0,dataset/train_images/77ab222bf85c.png,77ab222bf85c.png
300,2f7fbdcc9a4b,0,dataset/train_images/2f7fbdcc9a4b.png,2f7fbdcc9a4b.png
2831,de778495a1cd,2,dataset/train_images/de778495a1cd.png,de778495a1cd.png
328,31616ff6b53b,3,dataset/train_images/31616ff6b53b.png,31616ff6b53b.png
401,370f575adb23,0,dataset/train_images/370f575adb23.png,370f575adb23.png


In [10]:
# Preview the first rows of the test frame.
df_test.head()

Unnamed: 0,id_code,diagnosis,file_path,file_name
0,e4dcca36ceb4,0,./dataset/test_images/e4dcca36ceb4.png,e4dcca36ceb4.png
1,e4e343eaae2a,2,./dataset/test_images/e4e343eaae2a.png,e4e343eaae2a.png
2,e4f12411fd85,4,./dataset/test_images/e4f12411fd85.png,e4f12411fd85.png
3,e50b0174690d,0,./dataset/test_images/e50b0174690d.png,e50b0174690d.png
4,e5197d77ec68,0,./dataset/test_images/e5197d77ec68.png,e5197d77ec68.png


In [11]:
# Row counts of the training split and the test frame.
len(df_train_train), len(df_test)

(2344, 366)

In [5]:
import multiprocessing
from multiprocessing.pool import ThreadPool
import warnings
import cv2
import numpy as np

In [6]:
# Side length, in pixels, of the square images produced by the resize step.
IMG_SIZE= 512

def crop_image_from_gray(img,tol=7):
    """Crop away the dark border rows/columns of an image.

    A pixel counts as content when its gray value is strictly greater than
    ``tol``. Every row and every column that contains no content pixel is
    dropped (including interior ones, since the selection is a row/column
    mask rather than a bounding box).

    Args:
        img: 2-D grayscale array, or 3-D array that ``cv2.cvtColor`` can
            convert with ``cv2.COLOR_RGB2GRAY``.
        tol: gray-level threshold; pixels ``<= tol`` are treated as background.

    Returns:
        The cropped array (a new array; the channel axis of a 3-D input is
        left untouched). If no pixel exceeds ``tol`` the input is returned
        unchanged, for both 2-D and 3-D inputs, so callers never receive an
        empty image. For any other dimensionality ``None`` is returned, as
        before.
    """
    if img.ndim == 2:
        mask = img > tol
    elif img.ndim == 3:
        mask = cv2.cvtColor(img, cv2.COLOR_RGB2GRAY) > tol
    else:
        # Unsupported dimensionality: keep the historical "no result" outcome.
        return None

    keep_rows = mask.any(axis=1)
    keep_cols = mask.any(axis=0)

    # Image is too dark: cropping would remove everything, so hand back the
    # original instead of an empty array.
    if not keep_rows.any():
        return img

    # np.ix_ builds an open mesh over the first two axes; for a 3-D image the
    # trailing channel axis is carried along, so no per-channel stacking is
    # needed.
    return img[np.ix_(keep_rows, keep_cols)]
    


def circle_crop(img, sigmaX = 30):
    """Crop an image to a centred disc and enhance local contrast.

    Steps: trim the dark border, convert BGR to RGB, zero everything outside
    the largest centred circle that fits, trim the border again, then
    subtract a Gaussian-blurred copy (weights 4 / -4, offset 128).

    Args:
        img: colour image array; treated as BGR by the colour conversion.
        sigmaX: Gaussian sigma passed to ``cv2.GaussianBlur``.

    Returns:
        The processed image array, in the channel order produced by the
        BGR-to-RGB conversion.
    """
    trimmed = crop_image_from_gray(img)
    rgb = cv2.cvtColor(trimmed, cv2.COLOR_BGR2RGB)

    n_rows, n_cols = rgb.shape[0], rgb.shape[1]

    centre_x = int(n_cols / 2)
    centre_y = int(n_rows / 2)
    # Largest radius that still fits inside both dimensions.
    radius = int(min(centre_x, centre_y))

    # Single-channel mask: 1 inside the disc, 0 elsewhere.
    disc = np.zeros((n_rows, n_cols), np.uint8)
    cv2.circle(disc, (centre_x, centre_y), radius, 1, thickness=-1)

    masked = cv2.bitwise_and(rgb, rgb, mask=disc)
    masked = crop_image_from_gray(masked)

    blurred = cv2.GaussianBlur(masked, (0, 0), sigmaX)
    return cv2.addWeighted(masked, 4, blurred, -4, 128)


### Storing preprocessed test images

In [7]:
input_path= "dataset/test_images/"
output_path= "dataset/test_images_resized_preprocessed/"

# cv2.imwrite does not create missing directories (it just reports failure),
# so make sure the destination exists before the loop starts.
os.makedirs(output_path, exist_ok=True)

# NOTE(review): circle_crop converts BGR -> RGB, while cv2.imwrite interprets
# arrays as BGR, so the written PNGs appear to have red/blue swapped relative
# to the source files. Left unchanged here because other image sets may have
# been generated the same way - confirm against the downstream loader.
for root, dirs, files in os.walk(input_path):
    for image_name in files:
        # Match on the extension only; a substring test would also pick up
        # names such as "x.png.bak".
        if not image_name.endswith('.png'):
            continue

        # Read from the directory os.walk is currently visiting so files in
        # sub-folders resolve correctly; output stays flat in output_path.
        filepath = os.path.join(root, image_name)          #Reading path
        outpath = os.path.join(output_path, image_name)    #Writing Path

        img = cv2.imread(filepath)
        if img is None:
            # cv2.imread signals failure by returning None instead of raising.
            raise IOError('Could not read image: {}'.format(filepath))

        img = circle_crop(img)
        img = cv2.resize(img, (IMG_SIZE, IMG_SIZE))

        if not cv2.imwrite(outpath, img):
            raise IOError('Could not write image: {}'.format(outpath))

### Validation images

In [41]:
# Number of rows held out for validation.
len(df_train_valid)

586

In [42]:
# Preview the first rows of the validation split.
df_train_valid.head()

Unnamed: 0,id_code,diagnosis,file_path,file_name
1207,6e0f78e188ff,0,dataset/train_images/6e0f78e188ff.png,6e0f78e188ff.png
2702,d57d1be1bbd1,0,dataset/train_images/d57d1be1bbd1.png,d57d1be1bbd1.png
630,475c7ded0f7a,2,dataset/train_images/475c7ded0f7a.png,475c7ded0f7a.png
2849,e019b3e0f33d,4,dataset/train_images/e019b3e0f33d.png,e019b3e0f33d.png
343,33105f9b3a04,1,dataset/train_images/33105f9b3a04.png,33105f9b3a04.png


In [44]:
# Wrap the validation split in a DataFrame under a short working name and preview it.
df= pd.DataFrame(df_train_valid)
df.head()

Unnamed: 0,id_code,diagnosis,file_path,file_name
1207,6e0f78e188ff,0,dataset/train_images/6e0f78e188ff.png,6e0f78e188ff.png
2702,d57d1be1bbd1,0,dataset/train_images/d57d1be1bbd1.png,d57d1be1bbd1.png
630,475c7ded0f7a,2,dataset/train_images/475c7ded0f7a.png,475c7ded0f7a.png
2849,e019b3e0f33d,4,dataset/train_images/e019b3e0f33d.png,e019b3e0f33d.png
343,33105f9b3a04,1,dataset/train_images/33105f9b3a04.png,33105f9b3a04.png


In [47]:
# Select only the image file-name column of the working frame.
df1 = df['file_name']
df1

1207    6e0f78e188ff.png
2702    d57d1be1bbd1.png
630     475c7ded0f7a.png
2849    e019b3e0f33d.png
343     33105f9b3a04.png
              ...       
50      1dbdc32c17db.png
2789    da9fe02dead3.png
674     4a589edaea60.png
669     4a213b405ee4.png
2198    b13d72ceea26.png
Name: file_name, Length: 586, dtype: object

In [48]:
# Confirm the selected column has one entry per validation row.
df1.shape

(586,)