In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os
import cv2
import seaborn as sns
import shutil
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix

import tensorflow as tf
from tensorflow.keras.optimizers import RMSprop
from tensorflow.keras.models import load_model
from tensorflow.keras.preprocessing.image import ImageDataGenerator

from tensorflow.keras.layers import Dense, GlobalAveragePooling2D, Dropout
from tensorflow.keras.models import Sequential
from tensorflow.keras.optimizers import Adam

2023-06-16 23:35:55.187717: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
# Leitura da base explicativa das imagens a serem utilizadas no treinamento.
df_train = pd.read_csv("train.csv", sep=",")
df_train

Unnamed: 0,site_id,patient_id,image_id,laterality,view,age,cancer,biopsy,invasive,BIRADS,implant,density,machine_id,difficult_negative_case
0,2,10006,462822612,L,CC,61.0,0,0,0,,0,,29,False
1,2,10006,1459541791,L,MLO,61.0,0,0,0,,0,,29,False
2,2,10006,1864590858,R,MLO,61.0,0,0,0,,0,,29,False
3,2,10006,1874946579,R,CC,61.0,0,0,0,,0,,29,False
4,2,10011,220375232,L,CC,55.0,0,0,0,0.0,0,,21,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
43739,1,9973,1729524723,R,MLO,43.0,0,0,0,1.0,0,C,49,False
43740,1,9989,63473691,L,MLO,60.0,0,0,0,,0,C,216,False
43741,1,9989,1078943060,L,CC,60.0,0,0,0,,0,C,216,False
43742,1,9989,398038886,R,MLO,60.0,0,0,0,0.0,0,C,216,True


In [6]:
# Leitura da base explicativa das imagens a serem utilizadas no teste.
df_test = pd.read_csv("test.csv", sep=",")
df_test

Unnamed: 0,site_id,patient_id,image_id,laterality,view,age,cancer,biopsy,invasive,BIRADS,implant,density,machine_id,difficult_negative_case
0,2,10048,964141995,L,MLO,62.0,0,0,0,,0,,29,False
1,2,10048,1234933874,L,CC,62.0,0,0,0,,0,,29,False
2,2,10048,1577142909,R,MLO,62.0,0,0,0,,0,,29,False
3,2,10048,1842203124,R,CC,62.0,0,0,0,,0,,29,False
4,2,10050,588678397,L,MLO,67.0,0,0,0,,0,,29,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10957,2,9965,1990076391,R,MLO,67.0,0,0,0,,0,,21,False
10958,2,9968,766198919,L,CC,76.0,0,0,0,,0,,48,False
10959,2,9968,2098937312,L,MLO,76.0,0,0,0,,0,,48,False
10960,2,9968,294168046,R,MLO,76.0,0,0,0,,0,,48,False


In [9]:
df_train["img_path"] =  "train_images/" + df_train["patient_id"].astype(str) + "/" + df_train["image_id"].astype(str) + ".png"
df_train

Unnamed: 0,site_id,patient_id,image_id,laterality,view,age,cancer,biopsy,invasive,BIRADS,implant,density,machine_id,difficult_negative_case,img_path
0,2,10006,462822612,L,CC,61.0,0,0,0,,0,,29,False,train_images/10006/462822612.png
1,2,10006,1459541791,L,MLO,61.0,0,0,0,,0,,29,False,train_images/10006/1459541791.png
2,2,10006,1864590858,R,MLO,61.0,0,0,0,,0,,29,False,train_images/10006/1864590858.png
3,2,10006,1874946579,R,CC,61.0,0,0,0,,0,,29,False,train_images/10006/1874946579.png
4,2,10011,220375232,L,CC,55.0,0,0,0,0.0,0,,21,True,train_images/10011/220375232.png
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
43739,1,9973,1729524723,R,MLO,43.0,0,0,0,1.0,0,C,49,False,train_images/9973/1729524723.png
43740,1,9989,63473691,L,MLO,60.0,0,0,0,,0,C,216,False,train_images/9989/63473691.png
43741,1,9989,1078943060,L,CC,60.0,0,0,0,,0,C,216,False,train_images/9989/1078943060.png
43742,1,9989,398038886,R,MLO,60.0,0,0,0,0.0,0,C,216,True,train_images/9989/398038886.png


In [10]:
# Vamos separar os dados em treinamento e validação na proporção 80/20:
df_train_split, df_val_split = train_test_split(df_train, test_size = 0.2, random_state = 123456, stratify = df_train[['cancer']])

print("Número de imagens no conjunto de treinamento: ", df_train_split.shape[0])
print("Distribuição de cancer conjunto de treinamento: ", df_train_split.groupby("cancer").agg(qtde=("site_id","count")).reset_index())
print("Número de imagens no conjunto de validação: ", df_val_split.shape[0])
print("Distribuição de cancer no conjunto de validação: ", df_val_split.groupby("cancer").agg(qtde=("site_id","count")).reset_index())

Número de imagens no conjunto de treinamento:  34995
Distribuição de cancer conjunto de treinamento:     cancer   qtde
0       0  34222
1       1    773
Número de imagens no conjunto de validação:  8749
Distribuição de cancer no conjunto de validação:     cancer  qtde
0       0  8556
1       1   193


In [14]:
# Iremos agora criar 4 diretórios (2 de teste e 2 de validação, um de positivos e outro de negativos para cancer)

dir_path_train = "train_images/baseline/train"
dir_path_val = "train_images/baseline/val"

# Criando os diretórios:
if os.path.exists(dir_path_train):
    shutil.rmtree(dir_path_train)
else:
    os.makedirs(dir_path_train)
    
if os.path.exists(dir_path_val):
    shutil.rmtree(dir_path_val)
else:
    os.makedirs(dir_path_val)
    

# Criando os subdiretórios
os.makedirs(dir_path_train+"/with_cancer")
for path in df_train_split[df_train_split["cancer"]==1]['img_path']:
    shutil.copy2(path, dir_path_train+"/with_cancer")

os.makedirs(dir_path_train+"/wout_cancer")
for path in df_train_split[df_train_split["cancer"]==0]['img_path']:
    shutil.copy2(path, dir_path_train+"/wout_cancer")
    
os.makedirs(dir_path_val+"/with_cancer")
for path in df_val_split[df_val_split["cancer"]==1]['img_path']:
    shutil.copy2(path, dir_path_val+"/with_cancer")
    
os.makedirs(dir_path_val+"/wout_cancer")
for path in df_val_split[df_val_split["cancer"]==0]['img_path']:
    shutil.copy2(path, dir_path_val+"/wout_cancer")