In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os
import cv2
import seaborn as sns
import shutil
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.utils import class_weight

import tensorflow as tf
from tensorflow.keras.optimizers import RMSprop
from tensorflow.keras.models import load_model
from tensorflow.keras.preprocessing.image import ImageDataGenerator

from tensorflow.keras.layers import Dense, GlobalAveragePooling2D, Dropout
from tensorflow.keras.models import Sequential, Model
from tensorflow.keras.optimizers import Adam

In [None]:
# Mount Google Drive on the Colab machine at /content/drive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Copy the zipped dataset from Google Drive into the current working directory of the Colab machine.
# NOTE(review): the source path is relative, so this assumes the working directory contains the
# `drive` mount (presumably /content) — confirm.
! cp drive/MyDrive/inf0619_trabalho_final/baseline.zip .

In [None]:
# Extract the zipped image sets into the current working directory (-qq keeps unzip quiet).
! unzip -qq baseline.zip

In [None]:
# Rename the class directories to numeric labels (wout_cancer -> 0, with_cancer -> 1).
# Done in Python instead of `! mv` so the cell can be re-run safely: `mv src dst` moves
# src *inside* dst when dst already exists, which would leave a nested
# baseline/<split>/0/wout_cancer directory after the archive is extracted again.
def rename_class_dirs(dataset_root = "baseline",
                      splits = ("train", "val", "test"),
                      class_dirs = (("wout_cancer", "0"), ("with_cancer", "1"))):
    """Rename every `<dataset_root>/<split>/<old>` directory to `<dataset_root>/<split>/<new>`.

    Args:
        dataset_root: directory that contains the split folders.
        splits: names of the split folders to process.
        class_dirs: (old_name, new_name) pairs applied inside every split.

    The operation is idempotent: a pair whose source is already gone and whose target
    exists is left alone. When both exist, the target is replaced by the source (the
    source is the freshly extracted copy). When neither exists a message is printed
    and processing continues, mirroring the non-fatal behaviour of `! mv`.
    """
    for split in splits:
        for old_name, new_name in class_dirs:
            src = os.path.join(dataset_root, split, old_name)
            dst = os.path.join(dataset_root, split, new_name)
            if os.path.isdir(src):
                if os.path.isdir(dst):
                    # Stale result of a previous run: drop it instead of nesting src inside it.
                    shutil.rmtree(dst)
                os.rename(src, dst)
            elif not os.path.isdir(dst):
                print(f"Directory not found, nothing renamed: {src}")

rename_class_dirs()


In [None]:
# Read the .csv file that holds the information about the test dataset, then display it.
test_csv_path = '/content/drive/My Drive/inf0619_trabalho_final/test.csv'
df_test = pd.read_csv(filepath_or_buffer = test_csv_path, sep = ",")
df_test

Unnamed: 0,site_id,patient_id,image_id,laterality,view,age,cancer,biopsy,invasive,BIRADS,implant,density,machine_id,difficult_negative_case
0,2,10048,964141995,L,MLO,62.0,0,0,0,,0,,29,False
1,2,10048,1234933874,L,CC,62.0,0,0,0,,0,,29,False
2,2,10048,1577142909,R,MLO,62.0,0,0,0,,0,,29,False
3,2,10048,1842203124,R,CC,62.0,0,0,0,,0,,29,False
4,2,10050,588678397,L,MLO,67.0,0,0,0,,0,,29,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10957,2,9965,1990076391,R,MLO,67.0,0,0,0,,0,,21,False
10958,2,9968,766198919,L,CC,76.0,0,0,0,,0,,48,False
10959,2,9968,2098937312,L,MLO,76.0,0,0,0,,0,,48,False
10960,2,9968,294168046,R,MLO,76.0,0,0,0,,0,,48,False


In [None]:
# Paths to the training, validation and test image sets.

dir_path_train, dir_path_val, dir_path_test = (
    "/content/baseline/train",
    "/content/baseline/val",
    "/content/baseline/test",
)

In [None]:
# Image generators for TensorFlow: one independent generator per split, each applying
# the ConvNeXt preprocess_input function to the images it yields.
batch_size = 128

convnext_preprocess = tf.keras.applications.convnext.preprocess_input
train_datagen, val_datagen, test_datagen = [
    ImageDataGenerator(preprocessing_function = convnext_preprocess) for _ in range(3)
]

In [None]:
# Batches of training images read from the class sub-directories of dir_path_train,
# resized to 512x256 and labelled in binary mode.
train_flow_options = dict(
    target_size = (512, 256),
    batch_size = batch_size,
    class_mode = 'binary',
)
train_generator = train_datagen.flow_from_directory(dir_path_train, **train_flow_options)

Found 35047 images belonging to 2 classes.


In [None]:
# Batches of validation images read from the class sub-directories of dir_path_val,
# resized to 512x256 and labelled in binary mode.
val_flow_options = dict(
    target_size = (512, 256),
    batch_size = batch_size,
    class_mode = 'binary',
)
validation_generator = val_datagen.flow_from_directory(dir_path_val, **val_flow_options)

Found 8749 images belonging to 2 classes.


In [None]:
# Batches of test images read from the class sub-directories of dir_path_test,
# resized to 512x256 and labelled in binary mode. Shuffling is disabled: the test
# feature-extraction cell pairs the batches with test_generator.filenames by position.
test_flow_options = dict(
    target_size = (512, 256),
    batch_size = batch_size,
    class_mode = 'binary',
    shuffle = False,
)
test_generator = test_datagen.flow_from_directory(dir_path_test, **test_flow_options)

Found 10962 images belonging to 2 classes.


In [None]:
# Load ConvNeXtXLarge with its ImageNet pre-trained weights and without the top (classification) layer.
# The class is taken from the tf.keras namespace — the same one that provides the layers/Model used
# below and the preprocess_input used by the generators — instead of a separate mid-notebook import
# from the standalone `keras` package. The name ConvNeXtXLarge stays bound as before.
ConvNeXtXLarge = tf.keras.applications.convnext.ConvNeXtXLarge

model_convnextXlarge = ConvNeXtXLarge(weights = 'imagenet', input_shape = (512, 256, 3), include_top = False)

# Keep every layer of the backbone frozen.
for layer in model_convnextXlarge.layers:
    layer.trainable = False

# Append a global average pooling layer so the model outputs the features of each image.
x = model_convnextXlarge.output
predictions = GlobalAveragePooling2D()(x)
model = Model(inputs = model_convnextXlarge.input, outputs = predictions)

model.summary()

Downloading data from https://storage.googleapis.com/tensorflow/keras-applications/convnext/convnext_xlarge_notop.h5
Model: "model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_1 (InputLayer)           [(None, 512, 256, 3  0           []                               
                                )]                                                                
                                                                                                  
 convnext_xlarge_prestem_normal  (None, 512, 256, 3)  0          ['input_1[0][0]']                
 ization (Normalization)                                                                          
                                                                                                  
 convnext_xlarge_stem (Sequenti  (None, 128, 64, 256  13056      ['convnext_

In [None]:
# Feature extraction with the network: validation set.
# Restart the iterator at its first batch so the loop below takes exactly one full pass,
# even if batches were already drawn from it (e.g. this cell was interrupted and re-run).
validation_generator.reset()
num_batches_val = (validation_generator.n // batch_size) + (1 if (validation_generator.n % batch_size) > 0 else 0)

# Per-batch arrays are collected in lists and assembled once after the loop; concatenating
# DataFrames on every iteration re-copies all previously gathered rows each time.
val_feature_batches = []
val_label_batches = []

for step in range(0, num_batches_val):
  if step % 10 == 0: print(step)
  # The built-in next() drives the iterator protocol rather than the iterator's own .next() method.
  x_data, y_data = next(validation_generator)
  val_label_batches.append(np.asarray(y_data))

  feature_extractor = model.predict(x_data, verbose = 0)
  # Flatten everything after the batch axis so each image contributes one row.
  val_feature_batches.append(feature_extractor.reshape(feature_extractor.shape[0], -1))

val_label = pd.DataFrame(np.concatenate(val_label_batches)).rename(columns={0:"y_label"})
val_features = pd.DataFrame(np.concatenate(val_feature_batches)).add_prefix('f_')

# Rows of both frames are in the same batch order, so the labels are attached by position.
val_features["y_label"] = val_label["y_label"].astype(int).to_numpy()

val_features

0
10
20
30
40
50
60


Unnamed: 0,f_0,f_1,f_2,f_3,f_4,f_5,f_6,f_7,f_8,f_9,...,f_2039,f_2040,f_2041,f_2042,f_2043,f_2044,f_2045,f_2046,f_2047,y_label
0,0.321539,-0.040745,0.428116,-0.059310,-0.133182,0.175785,0.591816,0.306358,-0.328405,0.046211,...,-0.424844,0.040012,0.586366,0.635589,-0.076856,-0.437887,-0.465474,1.808983,0.126085,0
1,0.863328,0.109210,0.181452,-0.085717,-0.551607,0.085090,1.017017,0.784318,-0.169223,-0.233436,...,-0.302629,-0.072603,0.252117,0.107895,-0.716553,-0.464415,-0.210476,0.532940,0.215660,0
2,0.621498,0.046494,0.217638,0.145566,-0.485521,0.327986,0.750339,0.611870,-0.171179,-0.328355,...,-0.366927,-0.123831,0.744125,0.057504,-0.457560,-0.575832,-0.753164,1.621300,0.161117,0
3,0.506821,0.039643,0.293630,0.082756,-0.610796,0.257617,0.502576,0.563442,-0.172584,-0.004929,...,-0.184850,-0.212367,0.742930,0.269302,-0.607481,-0.517907,0.032292,1.026711,-0.079377,0
4,0.343853,0.142178,0.240785,-0.037477,-0.291627,0.154225,0.494200,0.634773,-0.230634,-0.193642,...,-0.209280,-0.270954,-0.044520,-0.142486,-0.645625,-0.579330,0.050568,0.474704,0.272606,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8744,0.611580,0.176106,0.166006,-0.016327,-0.383235,-0.096023,0.574808,0.675598,-0.176140,-0.290138,...,-0.139385,-0.120450,0.237770,0.236707,-0.268464,-0.461777,-0.557336,0.491386,-0.034084,0
8745,0.683683,0.120825,0.093502,0.078464,-0.406492,-0.034391,0.704150,0.753817,-0.197076,-0.182104,...,-0.368558,0.000458,0.417836,0.044448,-0.347991,-0.409663,-0.511226,0.626749,0.208214,0
8746,0.396443,0.082155,0.226763,-0.363815,-0.513668,0.275729,1.005085,0.886076,-0.193303,-0.109343,...,-0.359865,-0.161590,0.858085,0.043154,-0.619210,-0.665731,-0.065258,0.156513,-0.181718,0
8747,0.338843,0.079850,0.308706,0.071187,-0.347819,0.183485,0.395378,0.292172,-0.128342,-0.051909,...,-0.307717,-0.260249,0.455521,0.593497,-0.247874,-0.395693,-0.606585,1.424962,0.210196,0


In [None]:
# Persist the validation features to Google Drive. The target folder is created first so the
# write does not fail, after the costly extraction, just because the folder is missing.
val_features_path = "/content/drive/My Drive/inf0619_trabalho_final/features/convnextXLarge_val.parquet"
os.makedirs(os.path.dirname(val_features_path), exist_ok = True)
val_features.to_parquet(val_features_path)

In [None]:
# Feature extraction with the network: test set.
# File names are paired with the extracted rows purely by position, which is only valid when
# the iterator is not shuffled and starts at its first batch; both conditions are enforced here.
if getattr(test_generator, "shuffle", False):
  raise ValueError("test_generator must be created with shuffle = False to align rows with filenames")
test_generator.reset()
num_batches_test = (test_generator.n // batch_size) + (1 if (test_generator.n % batch_size) > 0 else 0)

# Per-batch arrays are collected in lists and assembled once after the loop; concatenating
# DataFrames on every iteration re-copies all previously gathered rows each time.
test_feature_batches = []
test_label_batches = []

for step in range(0, num_batches_test):
  if step % 10 == 0: print(step)
  # The built-in next() drives the iterator protocol rather than the iterator's own .next() method.
  x_data, y_data = next(test_generator)
  test_label_batches.append(np.asarray(y_data))

  feature_extractor = model.predict(x_data, verbose = 0)
  # Flatten everything after the batch axis so each image contributes one row.
  test_feature_batches.append(feature_extractor.reshape(feature_extractor.shape[0], -1))

test_label = pd.DataFrame(np.concatenate(test_label_batches)).rename(columns={0:"y_label"})
test_features = pd.DataFrame(np.concatenate(test_feature_batches)).add_prefix('f_')
test_filename = pd.DataFrame(list(test_generator.filenames)).rename(columns={0:"image_id"})

# One full unshuffled pass must yield exactly one row per listed file.
if len(test_filename) != len(test_features):
  raise ValueError(f"Extracted {len(test_features)} rows for {len(test_filename)} files")

# Rows of all three frames are in the same order, so the columns are attached by position.
test_features["y_label"] = test_label["y_label"].astype(int).to_numpy()
test_features["image_id"] = test_filename["image_id"].to_numpy()

test_features

0
10
20
30
40
50
60
70
80


Unnamed: 0,f_0,f_1,f_2,f_3,f_4,f_5,f_6,f_7,f_8,f_9,...,f_2040,f_2041,f_2042,f_2043,f_2044,f_2045,f_2046,f_2047,y_label,image_id
0,0.286125,0.077281,0.267690,-0.125403,-0.687328,-0.132657,0.668090,0.402662,-0.041012,0.106530,...,-0.091259,0.424867,0.104299,-0.351052,-0.481592,-0.666510,-0.100360,-0.191770,0,0/1000066573.png
1,-0.305934,0.140496,0.391718,-0.064438,-0.265048,0.048315,0.344041,0.107235,-0.299967,0.104992,...,-0.076161,0.204854,-0.019851,-0.055830,-0.471144,-0.297103,0.373199,0.368893,0,0/1000108611.png
2,0.921715,0.062716,0.244886,-0.112345,-0.397146,0.180339,0.535863,0.552860,-0.193361,-0.221842,...,-0.279156,0.578675,0.097681,-0.606028,-0.477128,-0.356049,0.994484,0.022545,0,0/1000203906.png
3,0.785796,0.152850,0.236910,-0.368685,-0.155804,0.472846,0.628691,0.404735,-0.282790,-0.181130,...,-0.420171,0.404125,0.118087,-0.615040,-0.637877,0.380481,0.518340,-0.085029,0,0/1000258681.png
4,0.035032,0.120664,0.374342,-0.132364,-0.413670,0.310536,0.228358,0.243928,-0.168072,-0.102217,...,-0.323548,0.116086,-0.324835,-0.451384,-0.612161,0.610117,0.328234,0.068671,0,0/1000302588.png
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10957,0.485129,0.181815,0.272408,0.262057,-0.343794,0.205493,0.535985,0.630040,-0.118662,-0.279074,...,-0.228504,0.386979,0.253333,-0.578923,-0.437621,-0.404740,1.090507,0.403114,1,1/958689703.png
10958,0.646605,0.061876,0.346888,0.301016,-0.563852,0.006470,0.413340,0.677135,-0.138933,-0.305872,...,-0.273646,0.450300,0.268438,-0.738518,-0.614721,-0.221428,0.799248,0.344966,1,1/965929379.png
10959,0.558188,0.023168,0.234508,-0.075685,-0.354230,0.318419,0.656652,0.439914,-0.092305,-0.354679,...,0.032902,0.706356,0.380293,-0.527200,-0.385016,-0.746180,0.633622,-0.011195,1,1/967529354.png
10960,0.492668,0.082487,0.502299,0.267491,-0.183774,0.059318,0.544935,0.270806,-0.298761,0.224533,...,-0.183267,0.313364,0.859388,-0.150158,-0.312363,-0.063996,0.987154,0.479303,1,1/970290638.png


In [None]:
# Persist the test features to Google Drive. The target folder is created first so the
# write does not fail, after the costly extraction, just because the folder is missing.
test_features_path = "/content/drive/My Drive/inf0619_trabalho_final/features/convnextXLarge_test_patient.parquet"
os.makedirs(os.path.dirname(test_features_path), exist_ok = True)
test_features.to_parquet(test_features_path)

In [None]:
# Feature extraction with the network: training set.
# Restart the iterator at its first batch so the loop below takes exactly one full pass,
# even if batches were already drawn from it (e.g. this cell was interrupted and re-run).
train_generator.reset()
num_batches_train = (train_generator.n // batch_size) + (1 if (train_generator.n % batch_size) > 0 else 0)

# Per-batch arrays are collected in lists and assembled once after the loop; concatenating
# DataFrames on every iteration re-copies all previously gathered rows each time.
train_feature_batches = []
train_label_batches = []

for step in range(0, num_batches_train):
  if step % 10 == 0: print(step)
  # The built-in next() drives the iterator protocol rather than the iterator's own .next() method.
  x_data, y_data = next(train_generator)
  train_label_batches.append(np.asarray(y_data))

  feature_extractor = model.predict(x_data, verbose = 0)
  # Flatten everything after the batch axis so each image contributes one row.
  train_feature_batches.append(feature_extractor.reshape(feature_extractor.shape[0], -1))

train_label = pd.DataFrame(np.concatenate(train_label_batches)).rename(columns={0:"y_label"})
train_features = pd.DataFrame(np.concatenate(train_feature_batches)).add_prefix('f_')

# Rows of both frames are in the same batch order, so the labels are attached by position.
train_features["y_label"] = train_label["y_label"].astype(int).to_numpy()

train_features

0
10
20
30
40
50
60
70
80
90
100
110
120
130
140
150
160
170
180
190
200
210
220
230
240
250
260
270


Unnamed: 0,f_0,f_1,f_2,f_3,f_4,f_5,f_6,f_7,f_8,f_9,...,f_2039,f_2040,f_2041,f_2042,f_2043,f_2044,f_2045,f_2046,f_2047,y_label
0,0.782490,0.096954,0.262080,0.174282,-0.383177,0.104748,0.793320,0.775638,-0.223562,-0.396620,...,-0.317172,-0.119336,0.818237,0.200412,-0.663302,-0.748892,-0.571000,1.043246,0.003858,0
1,0.467280,0.101803,0.288125,0.040087,-0.410418,-0.237152,0.341802,0.433974,-0.192482,0.037039,...,-0.266931,-0.290033,0.212844,0.351380,-0.399469,-0.499037,-0.221880,0.537133,0.417149,0
2,0.563512,0.124270,0.226611,-0.317395,-0.197211,-0.133061,0.081092,0.485044,-0.144118,0.179420,...,-0.046226,0.087496,0.694079,-0.119628,-0.022707,-0.252247,0.190731,0.298398,-0.074016,0
3,0.383937,0.133545,0.356170,0.253072,-0.400867,0.311865,0.506546,0.474224,-0.158302,-0.186319,...,-0.432675,-0.232704,0.153082,0.503808,-0.302560,-0.505552,-0.094205,0.752288,0.403607,0
4,0.539969,0.056070,0.322341,0.298526,-0.480753,0.181412,0.406155,0.609025,-0.085398,-0.393190,...,-0.245263,-0.200124,0.673855,0.247383,-0.650726,-0.496275,-0.265739,0.853555,0.260939,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
35042,0.722012,0.042699,0.231991,-0.113181,-0.224732,0.285437,0.714974,0.477097,-0.136725,-0.361651,...,-0.293015,0.138114,1.075602,0.371254,-0.496169,-0.480217,-0.472666,1.221435,-0.078261,0
35043,0.346065,0.004064,0.327129,0.046828,-0.418746,-0.046233,0.164071,0.316719,-0.122912,-0.151308,...,-0.263095,-0.154106,0.457960,0.195209,-0.304282,-0.428850,-0.348681,1.173619,0.139302,0
35044,0.625576,0.118291,0.466354,-0.180872,-0.424132,0.363793,0.622168,0.332305,-0.072464,-0.151749,...,-0.207177,-0.180458,0.559305,-0.098789,-0.488502,-0.570178,0.593500,0.914358,-0.163439,0
35045,0.488206,0.091432,0.250394,-0.064005,-0.341453,0.255575,0.530682,0.545281,-0.220475,-0.172525,...,-0.132825,-0.355615,0.414869,0.194000,-0.731557,-0.567873,0.336933,0.360110,0.007725,0


In [None]:
# Persist the training features to Google Drive. The target folder is created first so the
# write does not fail, after the costly extraction, just because the folder is missing.
train_features_path = "/content/drive/My Drive/inf0619_trabalho_final/features/convnextXLarge_train.parquet"
os.makedirs(os.path.dirname(train_features_path), exist_ok = True)
train_features.to_parquet(train_features_path)