<h1>An extended machine learning technique for polycystic ovary syndrome detection using ovary ultrasound image</h1>

<h2>Import Necessary Libraries</h2>

In [None]:
# %pip installs into the environment of the running kernel; -q keeps the install log out of the notebook
%pip install -q kaggle

In [None]:
# Mount Google Drive under /content/drive so files stored there can be read from this runtime
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# -p: create the folder only when it is missing, so re-running this cell does not fail
! mkdir -p ~/.kaggle

In [None]:
# Copy the Kaggle API token from the mounted Drive into ~/.kaggle
! cp /content/drive/MyDrive/kaggle_API_credentials/kaggle.json ~/.kaggle/kaggle.json

In [None]:
# Restrict the token file to owner read/write (mode 600)
! chmod 600 ~/.kaggle/kaggle.json

In [None]:
# Download the first ultrasound dataset archive from Kaggle (presumably into /content, where the next cell unzips it)
! kaggle datasets download -d anaghachoudhari/pcos-detection-using-ultrasound-images

In [None]:
# -q: do not print one line per extracted file; -o: overwrite without prompting so the cell can be re-run
! unzip -q -o /content/pcos-detection-using-ultrasound-images.zip

In [None]:
# Download the second ultrasound dataset archive from Kaggle (presumably into /content, where the next cell unzips it)
! kaggle datasets download -d reedah/polycystic-ovary-ultrasound-images-dataset

In [None]:
# -q: do not print one line per extracted file; -o: overwrite without prompting so the cell can be re-run
! unzip -q -o /content/polycystic-ovary-ultrasound-images-dataset.zip

In [None]:
import os
import cv2
import numpy as np
from PIL import Image
import seaborn as sns
import tensorflow as tf
from tensorflow import keras
import matplotlib.pyplot as plt
from tensorflow.keras import layers, models
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.models import Sequential
from PIL import Image, UnidentifiedImageError
from sklearn.utils.class_weight import compute_class_weight
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.metrics import ConfusionMatrixDisplay
from tensorflow.keras.layers import Conv2D, MaxPooling2D, Flatten, Dense, Dropout, BatchNormalization
from sklearn.model_selection import cross_validate
from sklearn.model_selection import cross_val_predict
from sklearn.ensemble import RandomForestClassifier, StackingClassifier
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier
from sklearn.svm import SVC
from sklearn.metrics import make_scorer, accuracy_score, precision_score, recall_score, f1_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB

<h2>Data Preprocessing</h2>

<h4>Define the data directory path</h4>

In [None]:
# Folders holding the training images and the testing images, in that order
data_dir, test_dir = '/content/data/train', '/content/data/test'

<h4>Define the batch size and image size</h4>

In [None]:
# Number of images each generator yields per batch
batch_size = 32

# Height and width (in pixels) that every image is resized to
img_height = img_width = 224

<h4>Load and preprocess the train dataset</h4>

In [None]:
# Generator for the training images: pixel rescaling plus random flip / rotation / zoom.
# NOTE(review): Keras documents rotation_range in degrees, so 0.2 is an almost
# imperceptible rotation - confirm it was not meant to be 20.
train_generator = ImageDataGenerator(
    zoom_range=0.2,
    rotation_range=0.2,
    horizontal_flip=True,
    rescale=1.0 / 255.0,
)

# Read the class sub-folders of data_dir with binary labels; shuffle=False keeps a fixed order
train_ds = train_generator.flow_from_directory(
    data_dir,
    shuffle=False,
    class_mode="binary",
    batch_size=batch_size,
    target_size=(img_height, img_width),
)

In [None]:
# Derive "balanced" per-class weights from the labels of the training generator

labels = train_ds.classes
label_values = np.unique(labels)

class_weights = dict(
    zip(
        label_values,
        compute_class_weight(class_weight="balanced", classes=label_values, y=labels),
    )
)

class_weights

In [None]:
# Independent copy of the weight mapping; keys are the class indices found in `labels`

class_weights_dictionary = dict(class_weights)

<h4>Remove corrupted images from the test directory</h4>

In [None]:
# Function to remove corrupted images from test_dir

def remove_corrupted_images(directory):
    """Delete every file directly inside ``directory`` that Pillow cannot verify.

    Only regular files are examined. Sub-directories are skipped, because
    opening one raises an OSError and ``os.remove`` would then fail on it.

    Args:
        directory: Path of the folder to clean.

    Returns:
        None. Each removed file is reported on stdout.
    """

    for filename in os.listdir(directory):
        file_path = os.path.join(directory, filename)

        # A nested folder is not a corrupted image; leave it alone
        if not os.path.isfile(file_path):
            continue

        try:

            # The context manager closes the file handle before any removal;
            # verify() checks the file without decoding the image data
            with Image.open(file_path) as img:
                img.verify()

        except (UnidentifiedImageError, OSError, SyntaxError):

            # Pillow could not identify or verify the file, so treat it as
            # corrupted (some Pillow decoders report broken files as SyntaxError)

            print(f'Removing corrupted file: {file_path}')
            os.remove(file_path)

In [None]:
# Clean both class folders of the test set before they are read by a generator

for folder_to_clean in ('/content/data/test/infected', '/content/data/test/notinfected'):
    remove_corrupted_images(folder_to_clean)

<h4>Reading in the test dataset using ImageDataGenerator</h4>

In [None]:
# Test images are only multiplied by 1/255; no augmentation is configured for them
test_generator = ImageDataGenerator(rescale=1.0 / 255.0)

# Read the class sub-folders of test_dir with binary labels, in a fixed order
test_ds = test_generator.flow_from_directory(
    test_dir,
    shuffle=False,
    class_mode='binary',
    batch_size=batch_size,
    target_size=(img_height, img_width),
)

<h2>Hybrid Model</h2>

<h4>Split the dataset into training and testing sets</h4>

In [None]:
# Training side: the generator and its per-image labels
x_train = train_ds
y_train = train_ds.labels

# Test side: the generator and its per-image labels
x_test = test_ds
y_test = test_ds.labels

<h4>Load the feature_extractor</h4>

In [None]:
# VGG16 without its classifier head, initialised with ImageNet weights
VGG16_base_model = tf.keras.applications.VGG16(
    include_top=False,
    weights='imagenet',
    input_shape=(img_height, img_width, 3),
)

In [None]:
# Freeze the layers of the VGG16_base_model so its pretrained weights are used unchanged

VGG16_base_model.trainable = False

# Define the feature extractor: image -> frozen VGG16 base -> flattened feature vector.
# The input shape reuses img_height / img_width so it always matches the base model.

inputs = tf.keras.Input(shape = (img_height, img_width, 3))

x = VGG16_base_model(inputs, training = False)
outputs = tf.keras.layers.Flatten()(x)

# The flattened activations are the model output directly; no classification
# head is attached, so there is nothing to strip off afterwards

feature_extractor = tf.keras.Model(inputs, outputs)

# Print the feature_extractor's summary

feature_extractor.summary()

In [None]:
# Run the frozen feature extractor over the training generator.
# NOTE(review): train_ds applies random augmentation, so these features may differ between runs - confirm that is intended.

train_features = feature_extractor.predict(x_train)

In [None]:
# Alias of the training features; this is the matrix the stacking model is fitted on
x_for_stacking = train_features

In [None]:
# Run the same feature extractor over the test generator

test_features = feature_extractor.predict(x_test)

In [None]:
def get_stacking(class_weights = None, random_state = None):
    """Build the unfitted stacking ensemble that classifies the CNN features.

    Level 0 holds logistic regression, SVM, decision tree, KNN and Gaussian
    naive Bayes; their 5-fold cross-validated predictions feed an XGBoost
    meta-learner.

    Args:
        class_weights: Mapping ``{0: w0, 1: w1}`` of per-class weights. When
            omitted, the notebook-level ``class_weights_dictionary`` is read
            at call time, which is what the original function always did.
        random_state: Optional seed passed to the decision tree and to
            XGBoost. ``None`` leaves both at their library defaults.

    Returns:
        A ``StackingClassifier`` ready for ``fit``.
    """

    if class_weights is None:
        class_weights = class_weights_dictionary

    level0 = [
        ('Logistic_Regression', LogisticRegression(class_weight = class_weights, max_iter = 1000)),
        ('SVM', SVC(class_weight = class_weights)),
        ('Decision_Tree', DecisionTreeClassifier(class_weight = class_weights, random_state = random_state)),
        ('KNN', KNeighborsClassifier(weights = 'distance')),
        ('Naive_Bayes', GaussianNB()),
    ]

    # XGBoost expects scale_pos_weight to be (#negative / #positive). With
    # "balanced" weights w_c = n / (2 * n_c), that ratio is w_1 / w_0; passing
    # w_1 alone only matches it when the classes are already balanced.
    level1 = XGBClassifier(
        scale_pos_weight = class_weights[1] / class_weights[0],
        random_state = random_state,
    )

    return StackingClassifier(estimators = level0, final_estimator = level1, cv = 5)

In [None]:
# Build a fresh, unfitted stacking ensemble (get_stacking reads class_weights_dictionary at call time)
stacker = get_stacking()

In [None]:
# Fit the ensemble on the extracted training features and the generator's labels
stacker.fit(x_for_stacking, y_train)

In [None]:
# Predict a label for every test feature vector
y_pred = stacker.predict(test_features)

In [None]:
# Display names used in the reports and plots.
# NOTE(review): the order must correspond to labels 0 and 1 - verify against train_ds.class_indices.
class_labels = ['infected', 'notinfected']

In [None]:
# Per-class precision / recall / F1 on the test set
print(classification_report(y_test, y_pred, target_names = class_labels,  digits = 4))

In [None]:
# Predictions on the very features the ensemble was fitted on
train_y_pred = stacker.predict(train_features)

In [None]:
# Same report on the training set, for comparison with the test report
print(classification_report(y_train, train_y_pred, target_names = class_labels,  digits = 4))

In [None]:
# Confusion matrix of the test predictions, drawn as an annotated heatmap
cnn_cm = confusion_matrix(y_test, y_pred)

fig, ax = plt.subplots(figsize=(10, 8))

sns.heatmap(
    cnn_cm,
    ax=ax,
    annot=True,
    fmt="d",
    cmap="Blues",
    cbar=True,
    xticklabels=class_labels,
    yticklabels=class_labels,
)

ax.set_title('Hybrid Model Confusion Matrix')
ax.set_xlabel('Predicted Labels')
ax.set_ylabel('True Labels')

plt.show()

<h2>Blurred Dataset A</h2>

In [None]:
def blur_images(image):
    """Return a Gaussian-blurred copy of ``image`` (5x5 kernel, sigma argument 0)."""

    kernel_size = (5, 5)
    sigma_x = 0

    return cv2.GaussianBlur(image, kernel_size, sigma_x)

# Training generator that blurs each image in addition to rescaling and random flip / rotation / zoom.
# NOTE(review): Keras documents rotation_range in degrees, so 0.2 is an almost
# imperceptible rotation - confirm it was not meant to be 20.
# NOTE(review): test_generator is built without the blur - confirm the
# train/test preprocessing difference is intended.
train_generator = ImageDataGenerator(
    preprocessing_function=blur_images,
    zoom_range=0.2,
    rotation_range=0.2,
    horizontal_flip=True,
    rescale=1.0 / 255.0,
)

# Read the class sub-folders of data_dir with binary labels; shuffle=False keeps a fixed order
train_ds = train_generator.flow_from_directory(
    data_dir,
    shuffle=False,
    class_mode="binary",
    batch_size=batch_size,
    target_size=(img_height, img_width),
)

In [None]:
# Training side: the blurring generator and its per-image labels
x_train = train_ds
y_train = train_ds.labels

# Test side: the generator and its per-image labels
x_test = test_ds
y_test = test_ds.labels

In [None]:
# Run the frozen feature extractor over the blurred training generator.
# NOTE(review): train_ds applies random augmentation, so these features may differ between runs - confirm that is intended.

train_features = feature_extractor.predict(x_train)

In [None]:
# Alias of the training features; this is the matrix the stacking model is fitted on
x_for_stacking = train_features

In [None]:
# Run the feature extractor over the test generator.
# NOTE(review): test_ds has not been rebuilt in this section, so this presumably reproduces the earlier test features - confirm.

test_features = feature_extractor.predict(x_test)

In [None]:
# Build a fresh, unfitted stacking ensemble (get_stacking reads class_weights_dictionary at call time)
stacker = get_stacking()

In [None]:
# Fit the ensemble on the extracted training features and the generator's labels
stacker.fit(x_for_stacking, y_train)

In [None]:
# Predict a label for every test feature vector
y_pred = stacker.predict(test_features)

In [None]:
# Per-class precision / recall / F1 on the test set
print(classification_report(y_test, y_pred, target_names = class_labels, digits = 4))

In [None]:
# Predictions on the very features the ensemble was fitted on
train_y_pred = stacker.predict(train_features)

In [None]:
# Same report on the training set, for comparison with the test report
print(classification_report(y_train, train_y_pred, target_names = class_labels,  digits = 4))

In [None]:
# Confusion matrix of the test predictions, drawn as an annotated heatmap
cnn_cm = confusion_matrix(y_test, y_pred)

fig, ax = plt.subplots(figsize=(10, 8))

sns.heatmap(
    cnn_cm,
    ax=ax,
    annot=True,
    fmt="d",
    cmap="Blues",
    cbar=True,
    xticklabels=class_labels,
    yticklabels=class_labels,
)

ax.set_title('Hybrid Model Confusion Matrix')
ax.set_xlabel('Predicted Labels')
ax.set_ylabel('True Labels')

plt.show()

<h2>Dataset B</h2>

In [None]:
# Folders holding the training images and the testing images, in that order
data_dir, test_dir = '/content/PCOS dataset/train', '/content/PCOS dataset/test'

In [None]:
# Number of images each generator yields per batch
batch_size = 128

# Height and width (in pixels) that every image is resized to
img_height = img_width = 224

In [None]:
def blur_images(image):
    """Return a Gaussian-blurred copy of ``image`` (5x5 kernel, sigma argument 0)."""

    kernel_size = (5, 5)
    sigma_x = 0

    return cv2.GaussianBlur(image, kernel_size, sigma_x)

# Training generator that blurs each image in addition to rescaling and random flip / rotation / zoom.
# NOTE(review): Keras documents rotation_range in degrees, so 0.2 is an almost
# imperceptible rotation - confirm it was not meant to be 20.
train_generator = ImageDataGenerator(
    preprocessing_function=blur_images,
    zoom_range=0.2,
    rotation_range=0.2,
    horizontal_flip=True,
    rescale=1.0 / 255.0,
)

# Read the class sub-folders of data_dir with binary labels; shuffle=False keeps a fixed order
train_ds = train_generator.flow_from_directory(
    data_dir,
    shuffle=False,
    class_mode='binary',
    batch_size=batch_size,
    target_size=(img_height, img_width),
)

In [None]:
# Derive "balanced" per-class weights from the labels of the training generator

labels = train_ds.classes
label_values = np.unique(labels)

class_weights = dict(
    zip(
        label_values,
        compute_class_weight(class_weight="balanced", classes=label_values, y=labels),
    )
)

class_weights

In [None]:
# Independent copy of the weight mapping; keys are the class indices found in `labels`

class_weights_dictionary = dict(class_weights)

In [None]:
# Function to remove corrupted images from test_dir

def remove_corrupted_images(directory):
    """Delete every file directly inside ``directory`` that Pillow cannot verify.

    Only regular files are examined. Sub-directories are skipped, because
    opening one raises an OSError and ``os.remove`` would then fail on it.

    Args:
        directory: Path of the folder to clean.

    Returns:
        None. Each removed file is reported on stdout.
    """

    for filename in os.listdir(directory):
        file_path = os.path.join(directory, filename)

        # A nested folder is not a corrupted image; leave it alone
        if not os.path.isfile(file_path):
            continue

        try:

            # The context manager closes the file handle before any removal;
            # verify() checks the file without decoding the image data
            with Image.open(file_path) as img:
                img.verify()

        except (UnidentifiedImageError, OSError, SyntaxError):

            # Pillow could not identify or verify the file, so treat it as
            # corrupted (some Pillow decoders report broken files as SyntaxError)

            print(f'Removing corrupted file: {file_path}')
            os.remove(file_path)

In [None]:
# Clean both class folders of the test set before they are read by a generator

for folder_to_clean in ('/content/PCOS dataset/test/infected', '/content/PCOS dataset/test/notinfected'):
    remove_corrupted_images(folder_to_clean)

In [None]:
# Test images are only multiplied by 1/255; no augmentation is configured for them
test_generator = ImageDataGenerator(rescale=1.0 / 255.0)

# Read the class sub-folders of test_dir with binary labels, in a fixed order
test_ds = test_generator.flow_from_directory(
    test_dir,
    shuffle=False,
    class_mode='binary',
    batch_size=batch_size,
    target_size=(img_height, img_width),
)

In [None]:
# Training side: the blurring generator and its per-image labels
x_train = train_ds
y_train = train_ds.labels

# Test side: the generator and its per-image labels
x_test = test_ds
y_test = test_ds.labels

In [None]:
# Run the frozen feature extractor over the training generator.
# NOTE(review): train_ds applies random augmentation, so these features may differ between runs - confirm that is intended.

train_features = feature_extractor.predict(x_train)

In [None]:
# Alias of the training features; this is the matrix the stacking model is fitted on
x_for_stacking = train_features

In [None]:
# Run the same feature extractor over the test generator

test_features = feature_extractor.predict(x_test)

In [None]:
# Build a fresh, unfitted stacking ensemble (get_stacking reads class_weights_dictionary at call time)
stacker = get_stacking()

In [None]:
# Fit the ensemble on the extracted training features and the generator's labels
stacker.fit(x_for_stacking, y_train)

In [None]:
# Predict a label for every test feature vector
y_pred = stacker.predict(test_features)

In [None]:
# Display names used in the reports and plots.
# NOTE(review): the order must correspond to labels 0 and 1 - verify against train_ds.class_indices.
class_labels = ['infected', 'notinfected']

In [None]:
# Per-class precision / recall / F1 on the test set
print(classification_report(y_test, y_pred, target_names = class_labels, digits = 4))

In [None]:
# Predictions on the very features the ensemble was fitted on
train_y_pred = stacker.predict(train_features)

In [None]:
# Same report on the training set, for comparison with the test report
print(classification_report(y_train, train_y_pred, target_names = class_labels,  digits = 4))

In [None]:
# Confusion matrix of the test predictions, drawn as an annotated heatmap
cnn_cm = confusion_matrix(y_test, y_pred)

fig, ax = plt.subplots(figsize=(10, 8))

sns.heatmap(
    cnn_cm,
    ax=ax,
    annot=True,
    fmt="d",
    cmap="Blues",
    cbar=True,
    xticklabels=class_labels,
    yticklabels=class_labels,
)

ax.set_title('Hybrid Model Confusion Matrix')
ax.set_xlabel('Predicted Labels')
ax.set_ylabel('True Labels')

plt.show()

<h2>Dataset C</h2>

In [None]:
# Folders holding the training images and the testing images, in that order
data_dir, test_dir = '/content/data/train', '/content/data/test'

In [None]:
def blur_images(image):
    """Return a Gaussian-blurred copy of ``image`` (5x5 kernel, sigma argument 0)."""

    kernel_size = (5, 5)
    sigma_x = 0

    return cv2.GaussianBlur(image, kernel_size, sigma_x)

# Training generator that blurs each image in addition to rescaling and random flip / rotation / zoom.
# NOTE(review): Keras documents rotation_range in degrees, so 0.2 is an almost
# imperceptible rotation - confirm it was not meant to be 20.
# NOTE(review): batch_size, img_height and img_width are not set in this
# section; they keep whatever values earlier cells assigned - confirm.
train_generator = ImageDataGenerator(
    preprocessing_function=blur_images,
    zoom_range=0.2,
    rotation_range=0.2,
    horizontal_flip=True,
    rescale=1.0 / 255.0,
)

# Read the class sub-folders of data_dir with binary labels; shuffle=False keeps a fixed order
train_ds = train_generator.flow_from_directory(
    data_dir,
    shuffle=False,
    class_mode='binary',
    batch_size=batch_size,
    target_size=(img_height, img_width),
)

In [None]:
# Derive "balanced" per-class weights from the labels of the training generator

labels = train_ds.classes
label_values = np.unique(labels)

class_weights = dict(
    zip(
        label_values,
        compute_class_weight(class_weight="balanced", classes=label_values, y=labels),
    )
)

class_weights

In [None]:
# Independent copy of the weight mapping; keys are the class indices found in `labels`

class_weights_dictionary = dict(class_weights)

In [None]:
# Test images are only multiplied by 1/255; no augmentation is configured for them
test_generator = ImageDataGenerator(rescale=1.0 / 255.0)

# Read the class sub-folders of test_dir with binary labels, in a fixed order
test_ds = test_generator.flow_from_directory(
    test_dir,
    shuffle=False,
    class_mode='binary',
    batch_size=batch_size,
    target_size=(img_height, img_width),
)

In [None]:
# Training side: the blurring generator and its per-image labels
x_train = train_ds
y_train = train_ds.labels

# Test side: the generator and its per-image labels
x_test = test_ds
y_test = test_ds.labels

In [None]:
# Run the frozen feature extractor over the training generator.
# NOTE(review): train_ds applies random augmentation, so these features may differ between runs - confirm that is intended.

train_features = feature_extractor.predict(x_train)

In [None]:
# Alias of the training features; this is the matrix the stacking model is fitted on
x_for_stacking = train_features

In [None]:
# Run the same feature extractor over the test generator

test_features = feature_extractor.predict(x_test)

In [None]:
# Build a fresh, unfitted stacking ensemble (get_stacking reads class_weights_dictionary at call time)
stacker = get_stacking()

In [None]:
# Fit the ensemble on the extracted training features and the generator's labels
stacker.fit(x_for_stacking, y_train)

In [None]:
# Predict a label for every test feature vector
y_pred = stacker.predict(test_features)

In [None]:
# Per-class precision / recall / F1 on the test set
print(classification_report(y_test, y_pred, target_names = class_labels, digits = 4))

In [None]:
# Predictions on the very features the ensemble was fitted on
train_y_pred = stacker.predict(train_features)

In [None]:
# Same report on the training set, for comparison with the test report
print(classification_report(y_train, train_y_pred, target_names = class_labels,  digits = 4))

In [None]:
# Confusion matrix of the test predictions, drawn as an annotated heatmap
cnn_cm = confusion_matrix(y_test, y_pred)

fig, ax = plt.subplots(figsize=(10, 8))

sns.heatmap(
    cnn_cm,
    ax=ax,
    annot=True,
    fmt="d",
    cmap="Blues",
    cbar=True,
    xticklabels=class_labels,
    yticklabels=class_labels,
)

ax.set_title('Hybrid Model Confusion Matrix')
ax.set_xlabel('Predicted Labels')
ax.set_ylabel('True Labels')

plt.show()