Setup

In [None]:
# Mount Google Drive at /content/gdrive; every dataset and model path used
# later in this notebook lives under that mount point.
# force_remount=True asks Colab to remount even if the drive is already mounted.
from google.colab import drive
drive.mount("/content/gdrive", force_remount=True)

In [None]:
!pip install kaggle

In [None]:
import os
os.environ['KAGGLE_CONFIG_DIR'] = "/content/gdrive/MyDrive/BreastCancerDataset"

!kaggle datasets download -d paultimothymooney/breast-histopathology-images

In [None]:
# Make the Drive dataset folder the working directory; later cells read and
# write relative paths such as data.txt and model14.h5.
%cd /content/gdrive/MyDrive/BreastCancerDataset/

In [None]:
# List the current working directory to confirm what is present.
!ls

In [None]:
import zipfile

# tqdm is imported here because this cell runs before the notebook's main
# import cell; without it a fresh kernel fails with NameError.
from tqdm.notebook import tqdm

DATASET_DIR = '/content/gdrive/MyDrive/BreastCancerDataset/'

# Extract member by member so tqdm can show progress. The context manager
# closes the archive afterwards, and the explicit `path` makes the target
# independent of the current working directory.
with zipfile.ZipFile(DATASET_DIR + 'breast-histopathology-images.zip') as zf:
    for member in tqdm(zf.infolist()):
        zf.extract(member, path=DATASET_DIR)

Imports

In [3]:
import numpy as np
import pandas as pd
import os
import itertools
import os, stat, time
from os.path import dirname as up
import shutil
from PIL import Image
from tqdm.notebook import tqdm
import matplotlib.pyplot as plt
import matplotlib.image as mpimg
import cv2

from sklearn.model_selection import train_test_split
from tensorflow.keras.utils import to_categorical

from tensorflow.keras import Sequential
from tensorflow.keras.optimizers import SGD, RMSprop, Adam, Adagrad, Adadelta
from keras.layers import Dense, Dropout, Activation, Flatten, BatchNormalization, Conv2D, MaxPool2D, MaxPooling2D
from tensorflow.keras.callbacks import EarlyStopping

Load Data

In [None]:
# Collect the path of every image stored as <dataset>/<id>/<class>/<file>,
# where <class> is the sub-folder "1" or "0", and persist the list to data.txt.
dataset_root = "/content/gdrive/MyDrive/BreastCancerDataset/"
ids = os.listdir(dataset_root)
data = []
for entry in tqdm(ids):
    try:
        # Both class folders are listed before anything is appended, so an
        # entry that lacks either folder contributes no paths at all.
        files1 = os.listdir(os.path.join(dataset_root, entry, '1'))
        files0 = os.listdir(os.path.join(dataset_root, entry, '0'))
    except (FileNotFoundError, NotADirectoryError):
        # Entries in the dataset folder that are plain files, or folders
        # without the 0/1 layout, are skipped; any other error is surfaced
        # instead of being silently swallowed.
        continue
    data.extend(os.path.join(dataset_root, entry, '1', name) for name in files1)
    data.extend(os.path.join(dataset_root, entry, '0', name) for name in files0)

# One path per line; written relative to the current working directory.
np.savetxt("data.txt", np.array(data), fmt="%s")

# Last expression of the cell so the number of collected paths is displayed.
len(data)

In [None]:
# Read the saved path list back and keep only one chunk of it for this run.
# The context manager closes the file, and blank lines (such as the empty
# string left after the final newline) are dropped so every entry is a path.
with open('data.txt') as txtfile:
    data = [line for line in txtfile.read().split('\n') if line]

# Chunk of paths processed in this run. Earlier runs used consecutive
# 15000-path chunks ([0:15000], [30000:45000], ..., [180000:195000], [195000:])
# plus the 3000-path chunk [15000:18000]; edit the bounds to pick another one.
CHUNK_START = 120000
CHUNK_END = 123000
data = data[CHUNK_START:CHUNK_END]
len(data)

Visualization

In [None]:
# Show the first rows*columns images of the current chunk in a grid.
columns = 10
rows = 10
fig = plt.figure(figsize=(15, 15))
# Subplot positions are 1-based while list indices are 0-based: enumerate from
# 1 over a slice so the first image (data[0]) is included and a chunk shorter
# than the grid does not raise IndexError.
for position, path in enumerate(data[:columns * rows], start=1):
    img = mpimg.imread(path)
    fig.add_subplot(rows, columns, position)
    plt.imshow(img)
plt.show()

Labels and Images creation

In [None]:
import cv2
from tqdm.notebook import tqdm

# Load every .png path of the chunk, resize it to 100x100 and record its label.
# The label is the character right before the ".png" extension (path[-5]).
images = []
labels = []
ctr = 0  # number of files that could not be read or resized
for path in tqdm(data):
    if not path.endswith('.png'):
        continue
    label = int(path[-5])
    img = cv2.imread(path)
    if img is None:
        # cv2.imread signals an unreadable/missing file by returning None
        # rather than raising.
        ctr += 1
        continue
    try:
        img = cv2.resize(img, (100, 100))
    except cv2.error:
        # Only OpenCV failures count as a bad image; anything else propagates.
        ctr += 1
        continue
    images.append(img)
    labels.append(label)
print(ctr, "number of images failed")

In [None]:
# np.savetxt only accepts 1-D or 2-D input, so a list of (100, 100, 3) images
# cannot be written directly. Flatten each image into one row of pixel values;
# after loading, reshape(-1, 100, 100, 3) restores the original layout.
image_rows = np.array([img.ravel() for img in images])
np.savetxt("BreastCancerImagesFinal.txt", image_rows, fmt="%d")
np.savetxt("BreastCancerLabelsFinal.txt", labels, fmt="%s")

In [None]:
# Normalise every label to a plain int (one label per image), then count the
# images whose array shape differs from the expected 100x100 with 3 channels.
for idx in range(len(images)):
    labels[idx] = int(labels[idx])

ctr = sum(1 for picture in images if picture.shape != (100, 100, 3))
print('Number of images with wrong dimensions: ', ctr)

In [None]:
# Sizes before filtering.
print(len(images), len(labels))

# Keep only (image, label) pairs whose image has the expected shape.
# Building new lists avoids popping from a list while iterating over it, which
# skips the element that follows each removal and can remove the wrong entries.
kept_pairs = [
    (img, lbl) for img, lbl in zip(images, labels) if img.shape == (100, 100, 3)
]
images = [img for img, _ in kept_pairs]
labels = [lbl for _, lbl in kept_pairs]

# Sizes after filtering; both lists stay aligned index by index.
print(len(images), len(labels))

Create Train and Test datasets

In [None]:
from sklearn.model_selection import train_test_split
from tensorflow.keras.utils import to_categorical

# Features: all images stacked into one array and divided by 255.
# Targets: the labels as an array.
x = np.stack(images) / 255
y = np.array(labels)

# 70/30 split with a fixed seed so the partition is repeatable.
splits = train_test_split(x, y, test_size=0.3, random_state=0)
x_train, x_test, y_train, y_test = splits
print(*(part.shape for part in splits))

Model

In [None]:
# CNN for 100x100 3-channel inputs: two convolution stages, each followed by
# two consecutive 2x2 max-pool layers and dropout, then a dense head ending in
# a single sigmoid unit.
model = Sequential([
    # Stage 1
    Conv2D(filters=32, kernel_size=(4, 4), input_shape=(100, 100, 3), activation='relu'),
    MaxPool2D(pool_size=(2, 2)),
    MaxPool2D(pool_size=(2, 2)),
    Dropout(0.25),
    # Stage 2
    Conv2D(filters=32, kernel_size=(4, 4), activation='relu'),
    MaxPool2D(pool_size=(2, 2)),
    MaxPool2D(pool_size=(2, 2), strides=(2, 2)),
    Dropout(0.25),
    # Classifier head
    Flatten(),
    Dense(128, activation='relu'),
    Dense(1, activation='sigmoid'),
])

model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy'],
)

model.summary()

In [None]:
import tensorflow as tf

# Train for 10 epochs in batches of 250; the held-out split is passed as
# validation data and one summary line is logged per epoch (verbose=2).
history = model.fit(
    x_train,
    y_train,
    validation_data=(x_test, y_test),
    batch_size=250,
    epochs=10,
    verbose=2,
)

In [27]:
# Save the trained model to model14.h5 in the current working directory.
# NOTE(review): the evaluation section loads *.h5 files from
# /content/gdrive/MyDrive/BreastCancerModels/ - confirm this file ends up there.
model.save('model14.h5')

In [None]:
# Hand the saved model file to the browser as a download (Colab only).
from google.colab import files
files.download('model14.h5')

Prediction and Testing

In [None]:
# Switch to the Drive folder holding the saved models and list its contents.
%cd /content/gdrive/MyDrive/BreastCancerModels/
!ls

In [36]:
from keras.models import load_model
import os

# Load every .h5 model stored in the Drive models folder.
# `names` holds the file names and `models` the loaded models, index-aligned.
MODELS_DIR = '/content/gdrive/MyDrive/BreastCancerModels/'

# Sorted so the evaluation order is stable (os.listdir order is arbitrary).
names = sorted(name for name in os.listdir(MODELS_DIR) if name.endswith('.h5'))

# Join with the folder so loading does not depend on the current working
# directory (i.e. on the %cd cell having been run first).
models = [load_model(os.path.join(MODELS_DIR, name)) for name in names]

In [None]:
# Score every loaded model on the images currently in memory and report the
# accuracy together with the number of correct and wrong predictions.
# NOTE(review): `images` is the same chunk that the train/test split was drawn
# from, so a model trained on this chunk is partly scored on its own training
# data - confirm this is intended.
threshold = 0.5
total = len(images)

if total == 0:
    # Avoids np.stack on an empty list and the division by zero below.
    print('No images loaded; nothing to evaluate.')
else:
    # Scale pixels to [0, 1] exactly as the training input was (x = images/255);
    # feeding raw 0-255 values to a model trained on scaled data skews results.
    eval_x = np.stack(images).astype('float32') / 255
    eval_y = np.asarray(labels)

    # The loop variable is deliberately not called `model`, so the model
    # trained earlier in the notebook is not overwritten.
    for modelID, candidate in enumerate(tqdm(models)):
        # One batched call instead of one predict() call per image; column 0
        # is the single sigmoid output of each sample.
        scores = candidate.predict(eval_x, batch_size=250, verbose=0)[:, 0]
        results = (scores >= threshold).astype(int)
        correct = int(np.sum(results == eval_y))
        wrong = total - correct
        print('For model', names[modelID],' Accuracy:', correct/total,' Correct:', correct, ' Wrong:', wrong)