In [2]:
import os
from pathlib import Path
from PIL import Image
import matplotlib.pyplot as plt
import pandas as pd
import shutil
import numpy as np
df_info = pd.DataFrame(columns=['img','height', 'width', 'mode', 'ndim'])

def get_img_data(path: Path):
    with Image.open(path) as img:
        arr = np.array(img)
        mode = img.mode # Te dice si está representado en RGB, en 1 (negro y blanco) o L (tonalidades de grises de 0 a 255)
        # ndim number of dimensions 2 = L ; 3 = (height, width, channels) -> tiene colores
        height, width = arr.shape if mode == 'L' else (np.nan, np.nan)
        ndim = arr.ndim
    return [height, width, mode, ndim]


imgs = Path('./data_raw/')
for img in imgs.iterdir():
    df_info.loc[len(df_info)] = [img]+get_img_data(img)
df_info

Unnamed: 0,img,height,width,mode,ndim
0,data_raw/1609.jpg,512.0,512.0,L,2
1,data_raw/1984.jpg,,,RGB,3
2,data_raw/4787.jpg,512.0,512.0,L,2
3,data_raw/4319.jpg,512.0,512.0,L,2
4,data_raw/1846.jpg,,,RGB,3
...,...,...,...,...,...
7018,data_raw/3947.jpg,,,RGB,3
7019,data_raw/5217.jpg,,,RGB,3
7020,data_raw/3037.jpg,,,RGB,3
7021,data_raw/840.jpg,512.0,512.0,L,2


In [3]:
df_info['mode'].value_counts()

mode
RGB     3926
L       3093
RGBA       3
P          1
Name: count, dtype: int64

Tenemos muchas img que no son blanco y negro nuestro modelo solo va a ser funcional con imagenes en blanco y negro

In [4]:
def modify_img_to_L(path: Path):
    with Image.open(path) as img:
        gray = img.convert("L")
        return gray

for RGB_img in list(df_info[df_info['mode'] != 'L']['img']):
    new_img = modify_img_to_L(RGB_img)
    new_img.save(RGB_img) 

In [5]:
df_info.dtypes

img        object
height    float64
width     float64
mode       object
ndim        int64
dtype: object

In [6]:
df_info['img'] = df_info['img'].astype(str)
for img in imgs.iterdir():
    indice = df_info[df_info['img'] == str(img)].index
    df_info.loc[indice] = [img]+get_img_data(img)
 
df_info['mode'].value_counts()

mode
L    7023
Name: count, dtype: int64

In [7]:
print(df_info['height'].value_counts())
print(df_info['width'].value_counts())
print('min_height', df_info['height'].min())
print('min_width', df_info['width'].min())

height
512.0     4749
225.0      345
236.0      113
630.0       90
442.0       87
          ... 
277.0        1
456.0        1
1446.0       1
1427.0       1
432.0        1
Name: count, Length: 234, dtype: int64
width
512.0     4742
225.0      335
236.0      209
630.0       93
201.0       93
          ... 
444.0        1
1375.0       1
438.0        1
180.0        1
1275.0       1
Name: count, Length: 203, dtype: int64
min_height 168.0
min_width 150.0


Tenemos que redimensionar las img para que todas tengan 512 x 512 ya que son las dimensiones más comunnes en el dataset

In [8]:
def resize_img(path):
    with Image.open(path) as img:
        resized = img.resize((512, 512), Image.LANCZOS)
    return resized

for size_img in list(df_info[(df_info['height'] != 512) | (df_info['width'] != 512)]['img']):
    new_img = resize_img(size_img)
    new_img.save(size_img) 

df_info['img'] = df_info['img'].astype(str)
for img in imgs.iterdir():
    indice = df_info[df_info['img'] == str(img)].index
    df_info.loc[indice] = [img]+get_img_data(img)
    
print(df_info['height'].value_counts())
print(df_info['width'].value_counts())

height
512.0    7023
Name: count, dtype: int64
width
512.0    7023
Name: count, dtype: int64


Acabamos de cambiar todos las img al formato que queremos para poder entrenar el modelo

In [9]:
df_info['img'] = df_info['img'].astype(str)
df_info['img'] = df_info['img'].apply(lambda row : row.split('/')[1])
df_info

Unnamed: 0,img,height,width,mode,ndim
0,1609.jpg,512.0,512.0,L,2
1,1984.jpg,512.0,512.0,L,2
2,4787.jpg,512.0,512.0,L,2
3,4319.jpg,512.0,512.0,L,2
4,1846.jpg,512.0,512.0,L,2
...,...,...,...,...,...
7018,3947.jpg,512.0,512.0,L,2
7019,5217.jpg,512.0,512.0,L,2
7020,3037.jpg,512.0,512.0,L,2
7021,840.jpg,512.0,512.0,L,2


In [10]:
df = pd.read_csv('data_raw.csv')
df_final = pd.merge(left=df, right=df_info,how='inner', on='img')
df_final

Unnamed: 0,img,type,has_cancer,height,width,mode,ndim
0,0.jpg,glioma,1,512.0,512.0,L,2
1,1.jpg,glioma,1,512.0,512.0,L,2
2,2.jpg,glioma,1,512.0,512.0,L,2
3,3.jpg,glioma,1,512.0,512.0,L,2
4,4.jpg,glioma,1,512.0,512.0,L,2
...,...,...,...,...,...,...,...
7018,7018.jpg,,0,512.0,512.0,L,2
7019,7019.jpg,,0,512.0,512.0,L,2
7020,7020.jpg,,0,512.0,512.0,L,2
7021,7021.jpg,,0,512.0,512.0,L,2


In [11]:
def rotate_img(path):
    with Image.open(path) as img:
        rot90 = img.rotate(90)
        rot180 = img.rotate(180)
        rot270 = img.rotate(190)
    return[rot90, rot180, rot270]

for _, row in df_final.iterrows():
    img, img_type, has_cancer, height, width, mode, ndim = row
    img_rotate90, img_rotate180, img_rotate270, = rotate_img(f"./data_raw/{img}")

    img_rotate90.save(f"./data_raw/{img.split('.')[0]}_rotate90.jpg")
    img_rotate180.save(f"./data_raw/{img.split('.')[0]}_rotate180.jpg")
    img_rotate270.save(f"./data_raw/{img.split('.')[0]}_rotate270.jpg")

    df_final.loc[len(df_final)] = f"{img.split('.')[0]}_rotate90.jpg", img_type, has_cancer, height, width, mode, ndim
    df_final.loc[len(df_final)] = f"{img.split('.')[0]}_rotate180.jpg", img_type, has_cancer, height, width, mode, ndim
    df_final.loc[len(df_final)] = f"{img.split('.')[0]}_rotate270.jpg", img_type, has_cancer, height, width, mode, ndim

df_final

Unnamed: 0,img,type,has_cancer,height,width,mode,ndim
0,0.jpg,glioma,1,512.0,512.0,L,2
1,1.jpg,glioma,1,512.0,512.0,L,2
2,2.jpg,glioma,1,512.0,512.0,L,2
3,3.jpg,glioma,1,512.0,512.0,L,2
4,4.jpg,glioma,1,512.0,512.0,L,2
...,...,...,...,...,...,...,...
28087,7021_rotate180.jpg,,0,512.0,512.0,L,2
28088,7021_rotate270.jpg,,0,512.0,512.0,L,2
28089,7022_rotate90.jpg,,0,512.0,512.0,L,2
28090,7022_rotate180.jpg,,0,512.0,512.0,L,2


In [12]:
df_final['has_cancer'].value_counts()

has_cancer
1    20092
0     8000
Name: count, dtype: int64

Está desbalanceado por lo que habrá que balancearlo, haremos un undersample ya que tenemos bastantes img con cáncer

In [13]:
from sklearn.utils import resample

df_has_cancer = df_final[df_final['has_cancer'] == 1]
df_no_cancer = df_final[df_final['has_cancer'] == 0]

df_undersample_cancer = resample(
    df_has_cancer,
    replace=False,
    n_samples=len(df_no_cancer),
    random_state=137
)
df_undersample = pd.concat([df_undersample_cancer, df_no_cancer]).reset_index(drop=True)
print(df_undersample['has_cancer'].value_counts())
df_undersample

has_cancer
1    8000
0    8000
Name: count, dtype: int64


Unnamed: 0,img,type,has_cancer,height,width,mode,ndim
0,3967.jpg,pituitary,1,512.0,512.0,L,2
1,772.jpg,glioma,1,512.0,512.0,L,2
2,876.jpg,glioma,1,512.0,512.0,L,2
3,1539_rotate270.jpg,glioma,1,512.0,512.0,L,2
4,4759_rotate270.jpg,pituitary,1,512.0,512.0,L,2
...,...,...,...,...,...,...,...
15995,7021_rotate180.jpg,,0,512.0,512.0,L,2
15996,7021_rotate270.jpg,,0,512.0,512.0,L,2
15997,7022_rotate90.jpg,,0,512.0,512.0,L,2
15998,7022_rotate180.jpg,,0,512.0,512.0,L,2


In [14]:
df_undersample.to_csv('df_final.csv', index=False)