Importing libraries

In [None]:
import numpy as np
import pandas as pd

# Pre-processing
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from tensorflow.keras.callbacks import EarlyStopping
import cv2
from tensorflow.keras.utils import to_categorical
# Modeling
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, GRU, Dense, Embedding, Dropout, GlobalAveragePooling1D, Flatten, SpatialDropout1D, Bidirectional, Conv2D, MaxPooling2D

import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split


import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')
import re

2024-12-05 10:02:17.320919: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/timkuz/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Importing dataset

In [None]:
data = pd.read_csv('BreaKHis_v1/histology_slides/breast/image_data.csv')

Check the dataset

In [None]:
data.head()

Unnamed: 0,path_to_image,Benign or Malignant,Cancer Type,Magnification
0,BreaKHis_v1/histology_slides/breast/benign/SOB...,Benign,Adenosis,100X
1,BreaKHis_v1/histology_slides/breast/benign/SOB...,Benign,Adenosis,100X
2,BreaKHis_v1/histology_slides/breast/benign/SOB...,Benign,Adenosis,100X
3,BreaKHis_v1/histology_slides/breast/benign/SOB...,Benign,Adenosis,100X
4,BreaKHis_v1/histology_slides/breast/benign/SOB...,Benign,Adenosis,100X


In [6]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7909 entries, 0 to 7908
Data columns (total 4 columns):
 #   Column               Non-Null Count  Dtype 
---  ------               --------------  ----- 
 0   path_to_image        7909 non-null   object
 1   Benign or Malignant  7906 non-null   object
 2   Cancer Type          7905 non-null   object
 3   Magnification        7905 non-null   object
dtypes: object(4)
memory usage: 247.3+ KB


Clean dataset from null values

In [7]:
data_cleaned = data.dropna(subset=['Benign or Malignant', 'Cancer Type'])

In [8]:
data_cleaned.info()

<class 'pandas.core.frame.DataFrame'>
Index: 7905 entries, 0 to 7908
Data columns (total 4 columns):
 #   Column               Non-Null Count  Dtype 
---  ------               --------------  ----- 
 0   path_to_image        7905 non-null   object
 1   Benign or Malignant  7905 non-null   object
 2   Cancer Type          7905 non-null   object
 3   Magnification        7905 non-null   object
dtypes: object(4)
memory usage: 308.8+ KB


Loading and preprocessing images

In [9]:
image_paths = data_cleaned['path_to_image']

In [None]:
def load_and_preprocess_images(image_paths, target_size = (244,244)):
    images = []
    
    for i, path in enumerate(image_paths):
        #print(f"Обрабатывается путь {i + 1}/{len(image_paths)}: {path}")

        # Load image
        image = cv2.imread(path)
        
        # Resize image to target size
        h, w = image.shape[:2]
        scale = min(target_size[0] / h, target_size[1] / w)
        new_size = (int(w * scale), int(h * scale))
        resized_image = cv2.resize(image, new_size)

        # Normalize pixel values to [0, 1]
        resized_image = resized_image / 255.0
        images.append(resized_image)
        #print(f"Путь обработан {i + 1}/{len(image_paths)}: {path}")
    
    return images


In [10]:
data_cleaned['images'] = load_and_preprocess_images(image_paths)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_cleaned['images'] = load_and_preprocess_images(image_paths)


In [11]:
data_cleaned = data_cleaned.drop(columns = ['path_to_image','Magnification'])

In [None]:
data_cleaned['Benign or Malignant'] = data_cleaned['Benign or Malignant'].map({'Benign': 0, 'Malignant': 1})
data_cleaned['Cancer Type'] = data_cleaned['Cancer Type'].astype('category').cat.codes

    Part1 - Binary

In [13]:
# Split for binary classification
X_train_binary, X_val_binary, y_train_binary, y_val_binary = train_test_split(
    data_cleaned['images'], data_cleaned['Benign or Malignant'], test_size=0.2, random_state=42, stratify=data_cleaned['Benign or Malignant'])

Part2 - Multiclass

In [21]:
# Split for multiclass classification
X_train_multiclass, X_val_multiclass, y_train_multiclass, y_val_multiclass = train_test_split(
    data_cleaned['images'], data_cleaned['Cancer Type'], test_size=0.2, random_state=42, stratify=data_cleaned['Cancer Type'])

# Convert labels to categorical
y_train_multiclass = to_categorical(y_train_multiclass, num_classes=8)
y_val_multiclass = to_categorical(y_val_multiclass, num_classes=8)