<h1>A Machine Learning Hybrid Approach for PCOS Detection Using Ovarian Ultrasound Images</h1>

<h3>Installing Kaggle</h3>

In [None]:
! pip install kaggle

<h3> Mount Google Drive to save the models and other variables in this notebook </h3>

In [None]:
# Mount Google Drive at /content/drive; the Kaggle credentials copied further down are read from this mount
from google.colab import drive
drive.mount('/content/drive')

<h3> Several operations were carried out below to download the dataset from Kaggle </h3>

In [None]:
# Making a directory in the colab session that would hold my kaggle API

! mkdir ~/.kaggle

In [None]:
# Copy the Kaggle API credentials file (kaggle.json) from the mounted Google Drive
# into the ~/.kaggle directory

! cp /content/drive/MyDrive/kaggle_API_credentials/kaggle.json ~/.kaggle/kaggle.json

In [None]:
# Restrict kaggle.json to owner read/write only (mode 600) so the API token
# is not readable by other users

! chmod 600 ~/.kaggle/kaggle.json

In [None]:
# Download the pcos-detection-using-ultrasound-images dataset into the Colab session
# using the Kaggle CLI (authenticated through ~/.kaggle/kaggle.json)

! kaggle datasets download -d anaghachoudhari/pcos-detection-using-ultrasound-images

In [None]:
# Unzipping the pcos-detection-using-ultrasound-images dataset because Kaggle datasets are often in zipped format

! unzip /content/pcos-detection-using-ultrasound-images.zip

<h3> Importing the libraries used in this notebook </h3>

In [None]:
import os
import cv2
import numpy as np
from PIL import Image
import seaborn as sns
import tensorflow as tf
from tensorflow import keras
import matplotlib.pyplot as plt
from tensorflow.keras import layers, models
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.models import Sequential
from PIL import Image, UnidentifiedImageError
from sklearn.utils.class_weight import compute_class_weight
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.metrics import ConfusionMatrixDisplay
from tensorflow.keras.layers import Conv2D, MaxPooling2D, Flatten, Dense, Dropout, BatchNormalization
from sklearn.model_selection import cross_validate
from sklearn.model_selection import cross_val_predict
from sklearn.ensemble import RandomForestClassifier, StackingClassifier
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier
from sklearn.svm import SVC
from sklearn.metrics import make_scorer, accuracy_score, precision_score, recall_score, f1_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB

<h2>Data Preprocessing</h2>

<h4>Define the data directory path</h4>

In [None]:
# Root folders of the unzipped dataset:
#   data_dir -> images used for training
#   test_dir -> images used for testing

data_dir = '/content/data/train'
test_dir = '/content/data/test'

<h4>Define the batch size and image size</h4>

In [None]:
# Number of images the generators yield per batch
batch_size = 64

# Height and width (in pixels) that every image is resized to
img_height, img_width = 224, 224

<h4>Load and preprocess the train dataset using ImageDataGenerator</h4>

+ Data augmentation was applied to the train set to enhance learning
+ The augmentations applied can be seen below

In [None]:
# Augmenting generator for the training images.
# NOTE: rotation_range is expressed in DEGREES, not as a fraction. The previous value of 0.2
# rotated images by at most 0.2 degrees, i.e. effectively not at all; 20 gives a visible rotation.
train_generator = ImageDataGenerator(
    rescale = 1.0/255.0,     # Rescale the pixel values to range 0 - 1
    horizontal_flip = True,  # Randomly flips the images horizontally
    rotation_range = 20,     # Randomly rotates the images by up to 20 degrees
    zoom_range = 0.2         # Randomly zooms the images by up to 20%
    )

# shuffle = False keeps the images in a fixed order, so features later predicted from
# train_ds stay aligned with train_ds.labels / train_ds.classes
train_ds = train_generator.flow_from_directory(
    data_dir,
    target_size = (img_height, img_width),
    batch_size = batch_size,
    class_mode="binary",
    shuffle = False
)

<h3> Assigning class weights </h3>

+ The dataset is imbalanced, so class weights are assigned to the classes to ensure that each class contributes equally to the training of the models

In [None]:
# Derive per-class weights to offset the class imbalance in the training set

# Integer class index of every training image, as reported by the generator
labels = train_ds.classes

# Distinct class indices, computed once and reused for the weights and for the dict keys
class_ids = np.unique(labels)

# Map each class index to its "balanced" weight from sklearn's compute_class_weight
class_weights = dict(
    zip(
        class_ids,
        compute_class_weight(class_weight="balanced", classes=class_ids, y=labels),
    )
)

class_weights

<h4>Remove corrupted images from the test directory</h4>

+ Inspecting the images in the test directory of this dataset revealed a number of corrupted images, so the function below was written to remove them

In [None]:
# Function to remove corrupted images from test_dir

def remove_corrupted_images(directory):
  """Delete every file directly inside `directory` that Pillow cannot open and verify.

  Args:
    directory: Path of the folder to scan. Only its direct entries are inspected;
      sub-directories are skipped, not descended into.

  Side effects:
    Prints the path of each file it removes and deletes that file from disk.
  """

  # Start iterating through the specified directory
  for filename in os.listdir(directory):

    # Defining the file path by joining the directory with the filename
    file_path = os.path.join(directory, filename)

    # Only regular files can be images. Skipping everything else prevents a
    # sub-directory from being treated as a corrupted image and passed to os.remove
    if not os.path.isfile(file_path):
      continue

    try:
      # The context manager closes the file handle opened by Image.open,
      # whether or not verification succeeds
      with Image.open(file_path) as img:
        img.verify()  # Additional verification

    # If UnidentifiedImageError or OSError occurs, the file is likely corrupted
    except (UnidentifiedImageError, OSError):
      print(f'Removing corrupted file: {file_path}')
      os.remove(file_path)

In [None]:
# Clean both class folders of the test directory by removing the corrupted images they contain

for class_folder in ('/content/data/test/infected', '/content/data/test/notinfected'):
  remove_corrupted_images(class_folder)

<h4>Preprocess the test dataset using ImageDataGenerator</h4>

In [None]:
# Test images are only rescaled to the 0 - 1 range; no augmentation is applied
test_generator = ImageDataGenerator(rescale = 1.0 /255.0)

# Stream the test images from disk in a fixed (unshuffled) order
test_ds = test_generator.flow_from_directory(
    directory = test_dir,
    target_size = (img_height, img_width),
    class_mode = 'binary',
    batch_size = batch_size,
    shuffle = False,
)

<h2>Hybrid Model</h2>

<h4>Split the dataset into training and testing sets</h4>

In [None]:
# Inputs are the image generators themselves; targets are the class indices they expose
x_train, y_train = train_ds, train_ds.labels
x_test, y_test = test_ds, test_ds.labels

<h4>Load the VGG16 base_model</h4>

In [None]:
%%time
VGG16_base_model = tf.keras.applications.VGG16(weights = 'imagenet', include_top = False, input_shape=(img_height, img_width, 3))

<h4>Define the architecture of the feature_extractor</h4>

In [None]:
# Freeze the layers of the VGG16_base_model so its pre-trained weights are used as-is

VGG16_base_model.trainable = False

# Define the architecture of the feature_extractor:
#   input image -> frozen VGG16 base -> global average pooling
# The input shape reuses img_height / img_width so it cannot drift from the
# shape VGG16_base_model was built with.

inputs = tf.keras.Input(shape = (img_height, img_width, 3))

x = VGG16_base_model(inputs, training = False)   # training = False runs the base in inference mode
outputs = tf.keras.layers.GlobalAveragePooling2D()(x)

# The pooled activations are the model output. No Dense head is attached,
# because the model is only used to produce features, never trained.

feature_extractor = tf.keras.Model(inputs, outputs)

# Print the feature_extractor's summary

feature_extractor.summary()

In [None]:
%%time
# Extract the features from the train set by running the training generator
# through the feature_extractor

train_features = feature_extractor.predict(x_train)

In [None]:
# The extracted training features are the input on which the stacking model is fitted
x_for_stacking = train_features

In [None]:
%%time
# Extract the features from the test set by running the test generator
# through the feature_extractor

test_features = feature_extractor.predict(x_test)

<h3>Stacking Model 2</h3>

<h3> Create a function to define the architecture of the stacking model</h3>

In [None]:
# Function to define the architecture of the stacking model

def get_stacking(random_state = None):
    """Build the (unfitted) stacking ensemble used on top of the extracted features.

    Level 0 holds a random forest and an XGBoost classifier; level 1 is a logistic
    regression fitted on their 5-fold cross-validated predictions.

    Reads the notebook-level ``class_weights`` dict; it is assumed to hold sklearn
    "balanced" weights keyed by the class indices 0 and 1.

    Args:
        random_state: Optional seed passed to both level-0 estimators so that runs can be
            reproduced. The default (None) leaves them unseeded, as before.

    Returns:
        An unfitted StackingClassifier.
    """

    # XGBoost documents scale_pos_weight as sum(negative instances) / sum(positive instances).
    # "Balanced" class weights are inversely proportional to the class counts, so the ratio
    # weight[1] / weight[0] equals count[0] / count[1]. A single weight on its own does not.
    positive_weight = class_weights[1] / class_weights[0]

    level0 = [
        ('Random_Forest', RandomForestClassifier(class_weight = class_weights, random_state = random_state)),
        ('XGBoost', XGBClassifier(scale_pos_weight = positive_weight, random_state = random_state)),
    ]

    level1 = LogisticRegression(class_weight = class_weights)

    model = StackingClassifier(estimators = level0, final_estimator = level1, cv = 5)

    return model

In [None]:
# Calling the get_stacking function to create a new, unfitted stacking model

stacker = get_stacking()

In [None]:
%%time
# Fitting the stacked model on the extracted training features (x_for_stacking)
# and the training labels (y_train)

stacker.fit(x_for_stacking, y_train)

<h3> Create a function to generate classification report and confusion matrix </h3>

In [None]:
# Function to generate classification report and confusion matrix

def generate_classification_report_and_confusion_matrix(y_pred):
  """Print the classification report and plot the confusion matrix for test predictions.

  The true labels are read from the notebook-level ``test_ds`` generator.

  Args:
    y_pred: Predicted class indices, compared element-wise against ``test_ds.labels``.
  """

  # Display names for the classes, used in the report rows and on both heatmap axes
  # (assumed to be listed in the same order as the generator's class indices - TODO confirm)
  class_labels = ['infected', 'notinfected']

  # Ground truth of the test dataset
  y_true = test_ds.labels

  # Text report with four decimal places
  report = classification_report(y_true, y_pred, target_names = class_labels, digits = 4)
  print(report)

  # Confusion matrix drawn as an annotated heatmap
  cnn_cm = confusion_matrix(y_true, y_pred)

  plt.figure(figsize = (10, 8))
  sns.heatmap(
      cnn_cm,
      annot = True,
      fmt = "d",
      cmap = "Blues",
      cbar = True,
      xticklabels = class_labels,
      yticklabels = class_labels,
  )

  plt.title('Confusion Matrix')
  plt.xlabel('Predicted Labels')
  plt.ylabel('True Labels')
  plt.show()

<h4>Get the classification report and confusion matrix</h4>

In [None]:
%%time
# Getting y_pred: the stacked model's predictions for the extracted test features

y_pred = stacker.predict(test_features)

In [None]:
# Report the test-set performance of the stacked model
generate_classification_report_and_confusion_matrix(y_pred)

In [None]:
# Function to generate classification report and confusion matrix for training

def generate_classification_report_and_confusion_matrix_training(train_y_pred, y_true = None):
  """Print the classification report and plot the confusion matrix for training predictions.

  Args:
    train_y_pred: Predicted class indices for the training samples.
    y_true: True class indices to compare against. Defaults to the notebook-level
      ``y_train``, which is what this function always used.
  """

  # Fall back to the training labels; the test labels are never used here
  if y_true is None:
    y_true = y_train

  # Define the class labels
  class_labels = ['infected', 'notinfected']

  # Printing the classification report
  print(classification_report(y_true, train_y_pred, target_names = class_labels, digits = 4))

  # Plotting the confusion matrix
  cnn_cm = confusion_matrix(y_true, train_y_pred)

  plt.figure(figsize = (10, 8))

  sns.heatmap(cnn_cm, annot = True, fmt = "d", cmap = "Blues", cbar = True, xticklabels = class_labels,
              yticklabels = class_labels)

  plt.title('Confusion Matrix')
  plt.xlabel('Predicted Labels')
  plt.ylabel('True Labels')

  plt.show()

<h4>Check for overfitting</h4>

In [None]:
# Getting train_y_pred: the stacked model's predictions for the features it was fitted on

train_y_pred = stacker.predict(train_features)

In [None]:
# Check for overfitting by generating the classification report and confusion matrix
# for the training predictions, to be compared with the test-set report

generate_classification_report_and_confusion_matrix_training(train_y_pred)

<h2>Testing Hybrid Model on a Low Quality Dataset A</h2>

<h4>Preprocess the test dataset using ImageDataGenerator</h4>

In [None]:
# Function to blur images to create a low quality dataset

def blur_images(image):
    """Return a Gaussian-blurred version of `image`.

    A 5 x 5 kernel is used. The sigma argument is 0, which lets OpenCV derive the
    standard deviation from the kernel size.
    """
    return cv2.GaussianBlur(image, (5, 5), 0)

# Generator for the low quality test set: every image is passed through blur_images
# and its pixel values are rescaled to the 0 - 1 range.
# Note that this rebinds test_generator and test_ds to the blurred variants.
test_generator = ImageDataGenerator(
    preprocessing_function = blur_images,
    rescale=1.0 / 255.0,
)

# Stream the test images from disk in a fixed (unshuffled) order
test_ds = test_generator.flow_from_directory(
    directory = test_dir,
    target_size = (img_height, img_width),
    class_mode = 'binary',
    batch_size = batch_size,
    shuffle = False,
)

<h4>Split the dataset into training and testing sets</h4>

In [None]:
# Re-bind the inputs and targets; test_ds now refers to the blurred test generator
x_train, y_train = train_ds, train_ds.labels
x_test, y_test = test_ds, test_ds.labels

In [None]:
%%time
# Extract the features from the train set
# NOTE(review): train_ds applies random augmentation, so this second extraction may not
# reproduce the features from the earlier pass - confirm that re-extracting is intended

train_features = feature_extractor.predict(x_train)

In [None]:
# The re-extracted training features are the input on which the stacking model is fitted
x_for_stacking = train_features

In [None]:
%%time
# Extract the features from the test set; x_test is now the blurred test generator

test_features = feature_extractor.predict(x_test)

In [None]:
# Calling the get_stacking function; this replaces the previously fitted stacker
# with a new, unfitted model

stacker = get_stacking()

In [None]:
%%time
# Fitting the stacked model on the re-extracted training features (x_for_stacking)
# and the training labels (y_train)

stacker.fit(x_for_stacking, y_train)

<h4>Get the classification report and confusion matrix</h4>

In [None]:
%%time
# Getting y_pred: the stacked model's predictions for the blurred test features

y_pred = stacker.predict(test_features)

In [None]:
# Report the stacked model's performance on the low quality (blurred) test set
generate_classification_report_and_confusion_matrix(y_pred)

<h4>Check for overfitting</h4>

In [None]:
# Getting train_y_pred: the stacked model's predictions for the features it was fitted on

train_y_pred = stacker.predict(train_features)

In [None]:
# Check for overfitting by generating the classification report and confusion matrix
# for the training predictions, to be compared with the test-set report

generate_classification_report_and_confusion_matrix_training(train_y_pred)