<a href="https://colab.research.google.com/github/OlekanmaVictoria/Artificial-intelligence-Data-science-Portfolio/blob/main/AI_Method_for_Distinguishing_Benign_and_Malignant_Colony_Cells.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **AI Method for Distinguishing Benign and Malignant Colony Cells**

## 1. **Introduction**
This notebook provides an implementation of an AI method for distinguishing between benign and malignant colony cells using a convolutional neural network (CNN). The process includes data extraction from PowerPoint files, preprocessing, model training, and evaluation.

## 2. **Setup and Imports**

In [None]:
# Install necessary libraries
!pip install tensorflow opencv-python scikit-learn pandas numpy matplotlib seaborn gdown python-pptx

# Importing required libraries
import os
import cv2
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv2D, MaxPooling2D, Flatten, Dense, Dropout
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from tensorflow.keras.models import load_model
import gdown
from pptx import Presentation
from pptx.enum.shapes import MSO_SHAPE_TYPE
from io import BytesIO
from PIL import Image


Collecting python-pptx
  Downloading python_pptx-0.6.23-py3-none-any.whl.metadata (18 kB)
Collecting XlsxWriter>=0.5.7 (from python-pptx)
  Downloading XlsxWriter-3.2.0-py3-none-any.whl.metadata (2.6 kB)
Downloading python_pptx-0.6.23-py3-none-any.whl (471 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m471.6/471.6 kB[0m [31m7.6 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading XlsxWriter-3.2.0-py3-none-any.whl (159 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m159.9/159.9 kB[0m [31m13.4 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: XlsxWriter, python-pptx
Successfully installed XlsxWriter-3.2.0 python-pptx-0.6.23


## **Explanation:**

Libraries: We install and import libraries required for handling PowerPoint files, image processing, and building the CNN.

## 3. **Data Extraction from PowerPoint Files**

In [None]:
# Mount Google Drive at /content/drive (requires the Colab runtime).
# NOTE(review): the download cell that follows fetches the dataset with gdown
# into ./dataset, so this mount does not appear to be needed for that path —
# confirm before removing.
from google.colab import drive
drive.mount('/content/drive')


In [None]:
# Local folder that receives the downloaded PowerPoint files
dataset_path = "./dataset"

# ID of the shared Google Drive folder, and the folder URL built from it
file_id = "1LMRPoqs9E8FWBK-eoBDrdVlC4tuIUmH6"
folder_url = f"https://drive.google.com/drive/folders/{file_id}"

# Fetch the whole folder into dataset_path (the trailing ';' keeps the cell
# from echoing the call's return value)
gdown.download_folder(folder_url, output=dataset_path, quiet=False, use_cookies=False);


Retrieving folder contents


Processing file 1nz7gHH8UZs5q-A2i25w1IXmzYN58T10I EPC Colony Quiz 2 - unlabeled70.pptx
Processing file 1L1_Uu6pCLSHH4_aOYvgRkzLUYjNYNYOi EPC Colony Quiz_unlabeled.pptx
Processing file 12S33g21hRqqhYpQGiiUBe8N8PLHjiFU8 EPC-CFU Labeled-images.pptx


Retrieving folder contents completed
Building directory structure
Building directory structure completed
Downloading...
From: https://drive.google.com/uc?id=1nz7gHH8UZs5q-A2i25w1IXmzYN58T10I
To: /content/dataset/EPC Colony Quiz 2 - unlabeled70.pptx
100%|██████████| 73.5M/73.5M [00:02<00:00, 30.9MB/s]
Downloading...
From: https://drive.google.com/uc?id=1L1_Uu6pCLSHH4_aOYvgRkzLUYjNYNYOi
To: /content/dataset/EPC Colony Quiz_unlabeled.pptx
100%|██████████| 11.7M/11.7M [00:00<00:00, 21.8MB/s]
Downloading...
From: https://drive.google.com/uc?id=12S33g21hRqqhYpQGiiUBe8N8PLHjiFU8
To: /content/dataset/EPC-CFU Labeled-images.pptx
100%|██████████| 15.4M/15.4M [00:00<00:00, 121MB/s]
Download completed


In [None]:
def extract_images_from_pptx(pptx_path, output_dir):
    """Extract every picture shape from a PowerPoint file and save it as a PNG.

    Files are written to ``output_dir`` as ``slide{S}_image{N}.png`` where
    ``S`` is the zero-based slide index and ``N`` is a counter that runs across
    the whole presentation (it is not reset for each slide).

    Args:
        pptx_path: Path of the .pptx file to read.
        output_dir: Directory the PNG files are written to. It is created
            (including missing parents) when it does not exist yet.
    """
    # Create the destination here so the function does not rely on the caller
    # having prepared it; a missing directory would otherwise make save() fail.
    os.makedirs(output_dir, exist_ok=True)

    prs = Presentation(pptx_path)
    image_counter = 0
    for idx, slide in enumerate(prs.slides):
        for shape in slide.shapes:
            # Only shapes whose type is PICTURE are exported; the rest are skipped.
            if shape.shape_type != MSO_SHAPE_TYPE.PICTURE:
                continue
            # Decode the embedded image bytes with Pillow and re-save them as PNG.
            with Image.open(BytesIO(shape.image.blob)) as img:
                img.save(os.path.join(output_dir, f"slide{idx}_image{image_counter}.png"))
            image_counter += 1

# Source presentation -> sub-folder (inside dataset_path) that receives its images
pptx_to_folder = {
    "EPC Colony Quiz 2 - unlabeled70.pptx": "EPC_Colony_Quiz_2",
    "EPC Colony Quiz_unlabeled.pptx": "EPC_Colony_Quiz",
    "EPC-CFU Labeled-images.pptx": "EPC_CFU_Labeled",
}

# Make sure every destination folder exists before any extraction starts
for folder_name in pptx_to_folder.values():
    os.makedirs(os.path.join(dataset_path, folder_name), exist_ok=True)

# Pull the pictures out of each presentation into its own folder
for pptx_name, folder_name in pptx_to_folder.items():
    extract_images_from_pptx(
        os.path.join(dataset_path, pptx_name),
        os.path.join(dataset_path, folder_name),
    )


**Explanation:**

- **Function `extract_images_from_pptx`:** Extracts the images from a PowerPoint file and saves them to a specified directory.
- **Usage:** The paths to the PowerPoint files are provided, and the images from each file are saved to its corresponding directory.
## 4. **Data Loading and Preprocessing**

In [None]:
def load_data_from_directory(directory_path, label, target_size=(128, 128)):
    """Load every readable image in a directory as a resized grayscale array.

    Entries are visited in sorted name order so the returned lists — and any
    seeded train/test split built from them — come out the same on every run;
    ``os.listdir`` on its own returns entries in arbitrary order.

    Args:
        directory_path: Directory whose entries are read.
        label: Label attached to every image loaded from this directory.
        target_size: Size passed to ``cv2.resize`` for each image.

    Returns:
        A tuple ``(images, labels)`` of two equal-length lists: the resized
        grayscale images and one copy of ``label`` per image.
    """
    images = []
    for image_name in sorted(os.listdir(directory_path)):
        image = cv2.imread(os.path.join(directory_path, image_name), cv2.IMREAD_GRAYSCALE)
        # Entries that could not be read come back as None and are skipped.
        if image is None:
            continue
        images.append(cv2.resize(image, target_size))
    return images, [label] * len(images)

# Load images and labels from all directories with resizing.
# NOTE(review): every image from the two "unlabeled" quiz decks is tagged
# "benign" and every image from the labeled deck is tagged "malignant" —
# confirm that this really matches the ground truth of the slides.
images_1, labels_1 = load_data_from_directory(os.path.join(dataset_path, "EPC_Colony_Quiz_2"), "benign")
images_2, labels_2 = load_data_from_directory(os.path.join(dataset_path, "EPC_Colony_Quiz"), "benign")
images_3, labels_3 = load_data_from_directory(os.path.join(dataset_path, "EPC_CFU_Labeled"), "malignant")

# Combine all images and labels
images = np.array(images_1 + images_2 + images_3)
labels = np.array(labels_1 + labels_2 + labels_3)

# Data preprocessing
images = images / 255.0  # Normalize pixel values to [0, 1]
images = np.expand_dims(images, axis=-1)  # Add channel dimension

# Split the dataset into training and testing sets.
# Stratifying on the labels keeps the benign/malignant ratio the same in both
# sets, so the held-out set cannot end up dominated by (or missing) one class.
X_train, X_test, y_train, y_test = train_test_split(
    images, labels, test_size=0.2, random_state=42, stratify=labels
)


In [None]:
# Random augmentation settings applied to the training images
augmentation_options = dict(
    rotation_range=20,
    width_shift_range=0.2,
    height_shift_range=0.2,
    zoom_range=0.2,
    horizontal_flip=True,
    fill_mode='nearest',
)
datagen = ImageDataGenerator(**augmentation_options)

# Fit the generator on the training images.
# NOTE(review): per the Keras docs, fit() is only required when
# featurewise_center / featurewise_std_normalization / zca_whitening are
# enabled, and none of them is set above — confirm whether this call is needed.
datagen.fit(X_train)


In [None]:
# Convert labels to integers. np.unique yields the distinct label names in
# sorted order, so each name always maps to the same class id.
label_dict = {label: idx for idx, label in enumerate(np.unique(labels))}


def _encode_labels(y, mapping):
    """Translate an array of label names into integer class ids via ``mapping``.

    An array that already holds integers is returned unchanged. y_train and
    y_test are overwritten below, so without this check re-running the cell
    would look up integer ids in ``mapping`` and raise a KeyError.
    """
    y = np.asarray(y)
    if np.issubdtype(y.dtype, np.integer):
        return y
    return np.array([mapping[label] for label in y])


y_train = _encode_labels(y_train, label_dict)
y_test = _encode_labels(y_test, label_dict)

# Model architecture: the input size is taken from the training data itself
image_height, image_width = X_train.shape[1], X_train.shape[2]

model = Sequential([
    # Three convolution + max-pooling stages with 32, 64 and 128 filters
    Conv2D(32, (3, 3), activation='relu', input_shape=(image_height, image_width, 1)),
    MaxPooling2D((2, 2)),
    Conv2D(64, (3, 3), activation='relu'),
    MaxPooling2D((2, 2)),
    Conv2D(128, (3, 3), activation='relu'),
    MaxPooling2D((2, 2)),
    # Dense classifier head with dropout; one softmax output per class
    Flatten(),
    Dense(128, activation='relu'),
    Dropout(0.5),
    Dense(len(label_dict), activation='softmax')
])

# Compile the model (the sparse loss is paired with the integer class ids above)
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])
model.summary()

# Train the model on augmented batches.
# NOTE(review): X_test/y_test double as the validation set here, so the
# reported validation metrics are not an independent test estimate.
history = model.fit(datagen.flow(X_train, y_train, batch_size=32),
                    validation_data=(X_test, y_test),
                    epochs=50)


Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 conv2d (Conv2D)             (None, 126, 126, 32)      320       
                                                                 
 max_pooling2d (MaxPooling2  (None, 63, 63, 32)        0         
 D)                                                              
                                                                 
 conv2d_1 (Conv2D)           (None, 61, 61, 64)        18496     
                                                                 
 max_pooling2d_1 (MaxPoolin  (None, 30, 30, 64)        0         
 g2D)                                                            
                                                                 
 conv2d_2 (Conv2D)           (None, 28, 28, 128)       73856     
                                                                 
 max_pooling2d_2 (MaxPoolin  (None, 14, 14, 128)       0

In [None]:
# File the trained network is written to and then read back from
model_path = "benign_malignant_classifier.h5"

# Persist the trained model to disk
model.save(model_path)

# Read it back; from here on `model` refers to the reloaded copy
model = load_model(model_path)


  saving_api.save_model(


In [None]:
# Run the classifier on the first held-out image as a quick check
example_image = X_test[0]
example_batch = example_image[np.newaxis, ...]  # add a leading batch axis of size 1
prediction = model.predict(example_batch)
predicted_label = prediction.argmax(axis=1)  # index of the highest-scoring class
print(f"Predicted label: {predicted_label}")


Predicted label: [0]


# **Dependencies:**

- **Libraries:** Ensure all required libraries are installed, using the `pip install` commands provided in this notebook.
- **Data paths:** Update `dataset_path` and the other file paths as necessary to match your environment.
- **Model adjustments:** Depending on your dataset and requirements, you may need to tweak the model architecture or parameters.

Feel free to adjust the documentation according to any specific requirements or additional details of your project.

In [None]:
!pip install python-pptx Pillow scikit-image opencv-python xlsxwriter




In [None]:
from google.colab import drive
import os
from pptx import Presentation
from PIL import Image
from io import BytesIO

# Mount Google Drive at /content/drive
drive.mount('/content/drive')

# Define dataset path and output directories.
# NOTE(review): this overwrites the dataset_path set earlier in the notebook
# ("./dataset") with a placeholder; replace 'your_dataset_folder' with the real
# Drive folder before running the extraction below.
dataset_path = '/content/drive/MyDrive/your_dataset_folder'
output_image_dir = '/content/images'

# Create the output directory. exist_ok=True makes this safe to re-run and
# avoids the check-then-create race of testing os.path.exists() first.
os.makedirs(output_image_dir, exist_ok=True)

def extract_images_from_pptx(pptx_path, output_dir):
    """Extract every shape that carries an image from a PowerPoint file as PNG.

    Files are written to ``output_dir`` as ``slide{S}_image{N}.png`` where
    ``S`` is the zero-based slide index and ``N`` is a counter that runs across
    the whole presentation. The counter gives every picture its own file; a
    name based on the slide index alone would make several pictures on one
    slide overwrite each other.

    NOTE: this redefines the function of the same name declared earlier in the
    notebook; whichever definition ran last is the one in effect.

    Args:
        pptx_path: Path of the .pptx file to read.
        output_dir: Directory the PNG files are written to. It is created
            (including missing parents) when it does not exist yet.
    """
    os.makedirs(output_dir, exist_ok=True)

    presentation = Presentation(pptx_path)
    image_counter = 0
    for idx, slide in enumerate(presentation.slides):
        for shape in slide.shapes:
            # Shapes without an `image` attribute are skipped.
            if not hasattr(shape, "image"):
                continue
            # Decode the embedded image bytes with Pillow and re-save them as PNG.
            with Image.open(BytesIO(shape.image.blob)) as img:
                img.save(os.path.join(output_dir, f"slide{idx}_image{image_counter}.png"))
            image_counter += 1



Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Extract images from each PowerPoint file into a folder named after the file
pptx_files = ["EPC Colony Quiz 2 - unlabeled70.pptx", "EPC Colony Quiz_unlabeled.pptx", "EPC-CFU Labeled-images.pptx"]
for pptx_file in pptx_files:
    # splitext removes only the final extension, so a file name that contains
    # additional dots keeps its full stem (split('.')[0] would cut at the first dot).
    folder_name = os.path.splitext(pptx_file)[0]
    extract_images_from_pptx(os.path.join(dataset_path, pptx_file), os.path.join(output_image_dir, folder_name))
