In [1]:
!pip install visualkeras



In [3]:
# Standard library for system-specific parameters and functions
import sys

# Standard library for interacting with the operating system
import os
from os import listdir

# Libraries for numerical operations
import numpy as np

# Libraries for creating visualizations
import matplotlib.pyplot as plt
import plotly.express as px
import seaborn as sns

# Library for data manipulation and analysis
import pandas as pd

# Library for image processing
from PIL import Image

# Standard library for generating random numbers
import random

# Set the seed for the random number generator for reproducibility
random.seed(100)
np.random.seed(100)

# Library for displaying images
import matplotlib.image as mpimg

# Library for machine learning and deep learning tasks
import tensorflow as tf

# Library for computer vision tasks
import cv2

# Library for splitting data into training and testing sets
from sklearn.model_selection import train_test_split

# Metrics for model evaluation
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score, confusion_matrix, ConfusionMatrixDisplay
from sklearn.metrics import roc_curve, auc
# Keras utilities for one-hot encoding
from tensorflow.keras.utils import to_categorical

# Keras libraries for building sequential models
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv2D, MaxPooling2D, Flatten, Dense, Dropout

# Optimizer library from Keras
from tensorflow.keras.optimizers import Adam

# Library for augmenting image data
from tensorflow.keras.preprocessing.image import ImageDataGenerator

# Utility function from Keras for plotting model architecture
from tensorflow.keras.utils import plot_model

# Pre-trained models from Keras
from tensorflow.keras.applications import VGG16

# Keras layers for various purposes
from tensorflow.keras.layers import GlobalAveragePooling2D, Input

# Callbacks for early stopping and model checkpoint during training
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint

# Library for using pre-trained models from TensorFlow Hub
import tensorflow_hub as hub

# Library for visualizing neural network architectures
import visualkeras

# Library to suppress warnings for cleaner output
import warnings
warnings.filterwarnings("ignore")

# Set random seed for reproducibility in TensorFlow
tf.random.set_seed(100)


In [4]:
# The drive module ships with the google.colab package, so this cell only works
# inside a Google Colab runtime.
from google.colab import drive

# Mount Google Drive at '/content/drive' so files stored there can be read by
# ordinary file paths from this notebook.
drive.mount('/content/drive')

# Mounting prompts for authorization to access the Google account's Drive.
# Once mounted, the Drive contents are reachable under '/content/drive/MyDrive/',
# which is the prefix used by the CSV and JPEG paths in the cells below.
# Re-running this cell when the Drive is already mounted just reports that fact.

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [54]:
# Read DICOM information from a CSV file
dicom_df = pd.read_csv('/content/drive/MyDrive/csv/dicom_info.csv')
dicom_df.head()  # Display the first few rows of the dataframe

Unnamed: 0,file_path,image_path,AccessionNumber,BitsAllocated,BitsStored,BodyPartExamined,Columns,ContentDate,ContentTime,ConversionType,...,SecondaryCaptureDeviceManufacturerModelName,SeriesDescription,SeriesInstanceUID,SeriesNumber,SmallestImagePixelValue,SpecificCharacterSet,StudyDate,StudyID,StudyInstanceUID,StudyTime
0,CBIS-DDSM/dicom/1.3.6.1.4.1.9590.100.1.2.12930...,CBIS-DDSM/jpeg/1.3.6.1.4.1.9590.100.1.2.129308...,,16,16,BREAST,351,20160426,131732.685,WSD,...,MATLAB,cropped images,1.3.6.1.4.1.9590.100.1.2.129308726812851964007...,1,23078,ISO_IR 100,20160720.0,DDSM,1.3.6.1.4.1.9590.100.1.2.271867287611061855725...,214951.0
1,CBIS-DDSM/dicom/1.3.6.1.4.1.9590.100.1.2.24838...,CBIS-DDSM/jpeg/1.3.6.1.4.1.9590.100.1.2.248386...,,16,16,BREAST,3526,20160426,143829.101,WSD,...,MATLAB,full mammogram images,1.3.6.1.4.1.9590.100.1.2.248386742010678582309...,1,0,ISO_IR 100,20160720.0,DDSM,1.3.6.1.4.1.9590.100.1.2.161516517311681906612...,193426.0
2,CBIS-DDSM/dicom/1.3.6.1.4.1.9590.100.1.2.26721...,CBIS-DDSM/jpeg/1.3.6.1.4.1.9590.100.1.2.267213...,,16,16,BREAST,1546,20160503,111956.298,WSD,...,MATLAB,full mammogram images,1.3.6.1.4.1.9590.100.1.2.267213171011171858918...,1,0,ISO_IR 100,20160807.0,DDSM,1.3.6.1.4.1.9590.100.1.2.291043622711253836701...,161814.0
3,CBIS-DDSM/dicom/1.3.6.1.4.1.9590.100.1.2.38118...,CBIS-DDSM/jpeg/1.3.6.1.4.1.9590.100.1.2.381187...,,16,16,BREAST,97,20160503,115347.77,WSD,...,MATLAB,cropped images,1.3.6.1.4.1.9590.100.1.2.381187369611524586537...,1,32298,ISO_IR 100,20170829.0,DDSM,1.3.6.1.4.1.9590.100.1.2.335006093711888937440...,180109.0
4,CBIS-DDSM/dicom/1.3.6.1.4.1.9590.100.1.2.38118...,CBIS-DDSM/jpeg/1.3.6.1.4.1.9590.100.1.2.381187...,,8,8,Left Breast,3104,20160503,115347.77,WSD,...,MATLAB,,1.3.6.1.4.1.9590.100.1.2.381187369611524586537...,1,0,ISO_IR 100,,DDSM,1.3.6.1.4.1.9590.100.1.2.335006093711888937440...,


In [55]:
# Display a concise summary of the DataFrame, including the number of non-null entries, column data types, and memory usage
dicom_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10237 entries, 0 to 10236
Data columns (total 38 columns):
 #   Column                                       Non-Null Count  Dtype  
---  ------                                       --------------  -----  
 0   file_path                                    10237 non-null  object 
 1   image_path                                   10237 non-null  object 
 2   AccessionNumber                              0 non-null      float64
 3   BitsAllocated                                10237 non-null  int64  
 4   BitsStored                                   10237 non-null  int64  
 5   BodyPartExamined                             10237 non-null  object 
 6   Columns                                      10237 non-null  int64  
 7   ContentDate                                  10237 non-null  int64  
 8   ContentTime                                  10237 non-null  float64
 9   ConversionType                               10237 non-null  object 
 10

In [56]:
# Display unique values in the 'SeriesDescription' column
# This is useful for understanding the different types of series available in the dataset
dicom_df.SeriesDescription.unique()

array(['cropped images', 'full mammogram images', nan, 'ROI mask images'],
      dtype=object)

In [57]:
# Root folder on the mounted Drive that holds the JPEG images
base_image_directory = '/content/drive/MyDrive/jpeg'


def _relocate(path):
    """Replace the 'CBIS-DDSM/jpeg' part of a CSV path with base_image_directory."""
    return path.replace('CBIS-DDSM/jpeg', base_image_directory)


# One Series of image paths per series type: rows are selected on
# 'SeriesDescription', then every path is rewritten to point at the Drive folder.
series_description = dicom_df['SeriesDescription']
full_mammograms = dicom_df.loc[series_description == 'full mammogram images', 'image_path'].apply(_relocate)
cropped_mammograms = dicom_df.loc[series_description == 'cropped images', 'image_path'].apply(_relocate)
roi_mask_mammograms = dicom_df.loc[series_description == 'ROI mask images', 'image_path'].apply(_relocate)

# Sanity check: show the first rewritten full-mammogram path
print("First full mammogram image path:", full_mammograms.iloc[0])

First full mammogram image path: /content/drive/MyDrive/jpeg/1.3.6.1.4.1.9590.100.1.2.248386742010678582309005372213277814849/1-249.jpg


In [58]:
# Shape of the full_mammograms Series (one entry per full mammogram image path)
full_mammograms.shape

(2857,)

In [59]:
# First path in the cropped_mammograms Series.
# Handy as a sample path for loading and displaying an image later.
cropped_mammograms.iloc[0]

'/content/drive/MyDrive/jpeg/1.3.6.1.4.1.9590.100.1.2.129308726812851964007517874181459556304/1-172.jpg'

In [60]:
# Shape of the cropped_mammograms Series (one entry per cropped image path)
cropped_mammograms.shape

(3567,)

In [61]:
# First path in the roi_mask_mammograms Series.
# Handy as a sample path for loading and displaying an image later.
roi_mask_mammograms.iloc[0]

'/content/drive/MyDrive/jpeg/1.3.6.1.4.1.9590.100.1.2.153339052913121382622526066491844156138/2-270.jpg'

In [62]:
# Shape of the roi_mask_mammograms Series (one entry per ROI mask image path)
roi_mask_mammograms.shape

(3247,)

In [63]:
# Build one lookup table per image type, mapping the name of the folder that
# directly contains an image to that image's path.
def _index_by_parent_folder(paths):
    """Return {parent folder name: path} for every path in ``paths``.

    The key is the second-to-last "/"-separated component, i.e. the folder the
    file sits in. Taking it relative to the end of the path (instead of a fixed
    index counted from the start) keeps the keys correct if base_image_directory
    is moved to a folder at a different depth.

    When several paths share a parent folder, the last one encountered wins,
    because later assignments to the same dict key overwrite earlier ones.
    """
    return {path.split("/")[-2]: path for path in paths}


# Lookup tables used later to resolve the paths stored in the case-description CSVs
full_mammogram_dict = _index_by_parent_folder(full_mammograms)
cropped_dict = _index_by_parent_folder(cropped_mammograms)
roi_mask_dict = _index_by_parent_folder(roi_mask_mammograms)


In [64]:
# Peek at the first (key, path) pair of full_mammogram_dict without needing to
# know any key: take an iterator over the items and advance it once.
full_mammogram_items = iter(full_mammogram_dict.items())
first_item_full_mammogram = next(full_mammogram_items)
print(first_item_full_mammogram)

('1.3.6.1.4.1.9590.100.1.2.248386742010678582309005372213277814849', '/content/drive/MyDrive/jpeg/1.3.6.1.4.1.9590.100.1.2.248386742010678582309005372213277814849/1-249.jpg')


In [65]:
# Size in bytes of the full_mammogram_dict object itself.
# sys.getsizeof is shallow: it counts only the dict's own storage, not the key
# and value strings the dict refers to, so the total memory held is larger
# than the number shown here.
sys.getsizeof(full_mammogram_dict)

147552

In [66]:
# Peek at the first (key, path) pair of cropped_dict without needing to know
# any key: take an iterator over the items and advance it once.
cropped_items = iter(cropped_dict.items())
first_item_cropped_dict = next(cropped_items)
print(first_item_cropped_dict)

('1.3.6.1.4.1.9590.100.1.2.129308726812851964007517874181459556304', '/content/drive/MyDrive/jpeg/1.3.6.1.4.1.9590.100.1.2.129308726812851964007517874181459556304/1-172.jpg')


In [67]:
# Size in bytes reported by sys.getsizeof for the cropped_mammograms Series
# (note: this measures the Series of paths, not cropped_dict)
sys.getsizeof(cropped_mammograms)

594817

In [68]:
# Peek at the first (key, path) pair of roi_mask_dict without needing to know
# any key: take an iterator over the items and advance it once.
roi_mask_items = iter(roi_mask_dict.items())
first_item_roi_mask = next(roi_mask_items)
print(first_item_roi_mask)


('1.3.6.1.4.1.9590.100.1.2.153339052913121382622526066491844156138', '/content/drive/MyDrive/jpeg/1.3.6.1.4.1.9590.100.1.2.153339052913121382622526066491844156138/2-270.jpg')


In [69]:
# Size in bytes reported by sys.getsizeof for the roi_mask_mammograms Series
# (note: this measures the Series of paths, not roi_mask_dict)
sys.getsizeof(roi_mask_mammograms)

541457

In [87]:
# Mass case descriptions: training set first, then test set
mass_train_df, mass_test_df = (
    pd.read_csv(csv_path)
    for csv_path in (
        '/content/drive/MyDrive/csv/mass_case_description_train_set.csv',
        '/content/drive/MyDrive/csv/mass_case_description_test_set.csv',
    )
)

# Calcification case descriptions: training set first, then test set
calc_case_train_df, calc_case_test_df = (
    pd.read_csv(csv_path)
    for csv_path in (
        '/content/drive/MyDrive/csv/calc_case_description_train_set.csv',
        '/content/drive/MyDrive/csv/calc_case_description_test_set.csv',
    )
)

In [88]:
# Display the first few rows of the mass_train_df dataframe to get an overview of the dataset
mass_train_df.head()

Unnamed: 0,patient_id,breast_density,left or right breast,image view,abnormality id,abnormality type,mass shape,mass margins,assessment,pathology,subtlety,image file path,cropped image file path,ROI mask file path
0,P_00001,3,LEFT,CC,1,mass,IRREGULAR-ARCHITECTURAL_DISTORTION,SPICULATED,4,MALIGNANT,4,Mass-Training_P_00001_LEFT_CC/1.3.6.1.4.1.9590...,Mass-Training_P_00001_LEFT_CC_1/1.3.6.1.4.1.95...,Mass-Training_P_00001_LEFT_CC_1/1.3.6.1.4.1.95...
1,P_00001,3,LEFT,MLO,1,mass,IRREGULAR-ARCHITECTURAL_DISTORTION,SPICULATED,4,MALIGNANT,4,Mass-Training_P_00001_LEFT_MLO/1.3.6.1.4.1.959...,Mass-Training_P_00001_LEFT_MLO_1/1.3.6.1.4.1.9...,Mass-Training_P_00001_LEFT_MLO_1/1.3.6.1.4.1.9...
2,P_00004,3,LEFT,CC,1,mass,ARCHITECTURAL_DISTORTION,ILL_DEFINED,4,BENIGN,3,Mass-Training_P_00004_LEFT_CC/1.3.6.1.4.1.9590...,Mass-Training_P_00004_LEFT_CC_1/1.3.6.1.4.1.95...,Mass-Training_P_00004_LEFT_CC_1/1.3.6.1.4.1.95...
3,P_00004,3,LEFT,MLO,1,mass,ARCHITECTURAL_DISTORTION,ILL_DEFINED,4,BENIGN,3,Mass-Training_P_00004_LEFT_MLO/1.3.6.1.4.1.959...,Mass-Training_P_00004_LEFT_MLO_1/1.3.6.1.4.1.9...,Mass-Training_P_00004_LEFT_MLO_1/1.3.6.1.4.1.9...
4,P_00004,3,RIGHT,MLO,1,mass,OVAL,CIRCUMSCRIBED,4,BENIGN,5,Mass-Training_P_00004_RIGHT_MLO/1.3.6.1.4.1.95...,Mass-Training_P_00004_RIGHT_MLO_1/1.3.6.1.4.1....,Mass-Training_P_00004_RIGHT_MLO_1/1.3.6.1.4.1....


In [89]:
# Display the first few rows of the mass_test_df dataframe to get an overview of the dataset
mass_test_df.head()

Unnamed: 0,patient_id,breast_density,left or right breast,image view,abnormality id,abnormality type,mass shape,mass margins,assessment,pathology,subtlety,image file path,cropped image file path,ROI mask file path
0,P_00016,4,LEFT,CC,1,mass,IRREGULAR,SPICULATED,5,MALIGNANT,5,Mass-Test_P_00016_LEFT_CC/1.3.6.1.4.1.9590.100...,Mass-Test_P_00016_LEFT_CC_1/1.3.6.1.4.1.9590.1...,Mass-Test_P_00016_LEFT_CC_1/1.3.6.1.4.1.9590.1...
1,P_00016,4,LEFT,MLO,1,mass,IRREGULAR,SPICULATED,5,MALIGNANT,5,Mass-Test_P_00016_LEFT_MLO/1.3.6.1.4.1.9590.10...,Mass-Test_P_00016_LEFT_MLO_1/1.3.6.1.4.1.9590....,Mass-Test_P_00016_LEFT_MLO_1/1.3.6.1.4.1.9590....
2,P_00017,2,LEFT,CC,1,mass,ROUND,CIRCUMSCRIBED,4,MALIGNANT,4,Mass-Test_P_00017_LEFT_CC/1.3.6.1.4.1.9590.100...,Mass-Test_P_00017_LEFT_CC_1/1.3.6.1.4.1.9590.1...,Mass-Test_P_00017_LEFT_CC_1/1.3.6.1.4.1.9590.1...
3,P_00017,2,LEFT,MLO,1,mass,ROUND,ILL_DEFINED,4,MALIGNANT,4,Mass-Test_P_00017_LEFT_MLO/1.3.6.1.4.1.9590.10...,Mass-Test_P_00017_LEFT_MLO_1/1.3.6.1.4.1.9590....,Mass-Test_P_00017_LEFT_MLO_1/1.3.6.1.4.1.9590....
4,P_00032,3,RIGHT,CC,1,mass,ROUND,OBSCURED,0,BENIGN,2,Mass-Test_P_00032_RIGHT_CC/1.3.6.1.4.1.9590.10...,Mass-Test_P_00032_RIGHT_CC_1/1.3.6.1.4.1.9590....,Mass-Test_P_00032_RIGHT_CC_1/1.3.6.1.4.1.9590....


In [73]:
# Display the first few rows of the calc_case_train_df dataframe to get an overview of the dataset
calc_case_train_df.head()

Unnamed: 0,patient_id,breast density,left or right breast,image view,abnormality id,abnormality type,calc type,calc distribution,assessment,pathology,subtlety,image file path,cropped image file path,ROI mask file path
0,P_00005,3,RIGHT,CC,1,calcification,AMORPHOUS,CLUSTERED,3,MALIGNANT,3,Calc-Training_P_00005_RIGHT_CC/1.3.6.1.4.1.959...,Calc-Training_P_00005_RIGHT_CC_1/1.3.6.1.4.1.9...,Calc-Training_P_00005_RIGHT_CC_1/1.3.6.1.4.1.9...
1,P_00005,3,RIGHT,MLO,1,calcification,AMORPHOUS,CLUSTERED,3,MALIGNANT,3,Calc-Training_P_00005_RIGHT_MLO/1.3.6.1.4.1.95...,Calc-Training_P_00005_RIGHT_MLO_1/1.3.6.1.4.1....,Calc-Training_P_00005_RIGHT_MLO_1/1.3.6.1.4.1....
2,P_00007,4,LEFT,CC,1,calcification,PLEOMORPHIC,LINEAR,4,BENIGN,4,Calc-Training_P_00007_LEFT_CC/1.3.6.1.4.1.9590...,Calc-Training_P_00007_LEFT_CC_1/1.3.6.1.4.1.95...,Calc-Training_P_00007_LEFT_CC_1/1.3.6.1.4.1.95...
3,P_00007,4,LEFT,MLO,1,calcification,PLEOMORPHIC,LINEAR,4,BENIGN,4,Calc-Training_P_00007_LEFT_MLO/1.3.6.1.4.1.959...,Calc-Training_P_00007_LEFT_MLO_1/1.3.6.1.4.1.9...,Calc-Training_P_00007_LEFT_MLO_1/1.3.6.1.4.1.9...
4,P_00008,1,LEFT,CC,1,calcification,,REGIONAL,2,BENIGN_WITHOUT_CALLBACK,3,Calc-Training_P_00008_LEFT_CC/1.3.6.1.4.1.9590...,Calc-Training_P_00008_LEFT_CC_1/1.3.6.1.4.1.95...,Calc-Training_P_00008_LEFT_CC_1/1.3.6.1.4.1.95...


In [80]:
# Display the first few rows of the calc_case_test_df dataframe to get an overview of the dataset
calc_case_test_df.head()

Unnamed: 0,patient_id,breast density,left or right breast,image view,abnormality id,abnormality type,calc type,calc distribution,assessment,pathology,subtlety,image file path,cropped image file path,ROI mask file path
0,P_00038,2,LEFT,CC,1,calcification,PUNCTATE-PLEOMORPHIC,CLUSTERED,4,BENIGN,2,Calc-Test_P_00038_LEFT_CC/1.3.6.1.4.1.9590.100...,Calc-Test_P_00038_LEFT_CC_1/1.3.6.1.4.1.9590.1...,Calc-Test_P_00038_LEFT_CC_1/1.3.6.1.4.1.9590.1...
1,P_00038,2,LEFT,MLO,1,calcification,PUNCTATE-PLEOMORPHIC,CLUSTERED,4,BENIGN,2,Calc-Test_P_00038_LEFT_MLO/1.3.6.1.4.1.9590.10...,Calc-Test_P_00038_LEFT_MLO_1/1.3.6.1.4.1.9590....,Calc-Test_P_00038_LEFT_MLO_1/1.3.6.1.4.1.9590....
2,P_00038,2,RIGHT,CC,1,calcification,VASCULAR,,2,BENIGN_WITHOUT_CALLBACK,5,Calc-Test_P_00038_RIGHT_CC/1.3.6.1.4.1.9590.10...,Calc-Test_P_00038_RIGHT_CC_1/1.3.6.1.4.1.9590....,Calc-Test_P_00038_RIGHT_CC_1/1.3.6.1.4.1.9590....
3,P_00038,2,RIGHT,CC,2,calcification,VASCULAR,,2,BENIGN_WITHOUT_CALLBACK,5,Calc-Test_P_00038_RIGHT_CC/1.3.6.1.4.1.9590.10...,Calc-Test_P_00038_RIGHT_CC_2/1.3.6.1.4.1.9590....,Calc-Test_P_00038_RIGHT_CC_2/1.3.6.1.4.1.9590....
4,P_00038,2,RIGHT,MLO,1,calcification,VASCULAR,,2,BENIGN_WITHOUT_CALLBACK,5,Calc-Test_P_00038_RIGHT_MLO/1.3.6.1.4.1.9590.1...,Calc-Test_P_00038_RIGHT_MLO_1/1.3.6.1.4.1.9590...,Calc-Test_P_00038_RIGHT_MLO_1/1.3.6.1.4.1.9590...


In [75]:
# Tally how often each pathology label occurs in the mass training and test sets
training_case_distribution = mass_train_df['pathology'].value_counts()
testing_case_distribution = mass_test_df['pathology'].value_counts()

# Print each tally under its heading (the test heading begins with a blank line)
print("Distribution of Cases in the Mass Training Set:", training_case_distribution, sep="\n")
print("\nDistribution of Cases in the Mass Test Set:", testing_case_distribution, sep="\n")

Distribution of Cases in the Mass Training Set:
pathology
MALIGNANT                  637
BENIGN                     577
BENIGN_WITHOUT_CALLBACK    104
Name: count, dtype: int64

Distribution of Cases in the Mass Test Set:
pathology
BENIGN                     194
MALIGNANT                  147
BENIGN_WITHOUT_CALLBACK     37
Name: count, dtype: int64


In [76]:
# Tally how often each pathology label occurs in the calcification training and test sets
train_case_counts = calc_case_train_df['pathology'].value_counts()
test_case_counts = calc_case_test_df['pathology'].value_counts()

# Print each tally under its heading (the test heading begins with a blank line)
for heading, counts in (
    ("Calcification Training Set Case Counts:", train_case_counts),
    ("\nCalcification Test Set Case Counts:", test_case_counts),
):
    print(heading)
    print(counts)


Calcification Training Set Case Counts:
pathology
MALIGNANT                  544
BENIGN                     528
BENIGN_WITHOUT_CALLBACK    474
Name: count, dtype: int64

Calcification Test Set Case Counts:
pathology
BENIGN                     130
MALIGNANT                  129
BENIGN_WITHOUT_CALLBACK     67
Name: count, dtype: int64


In [90]:
# Function to correct image paths in the given dataset
def updated_path_mass(dataset, full_lookup=None, cropped_lookup=None, roi_lookup=None):
    """
    Replace the image paths stored in a case-description dataset with resolved paths.

    The three path columns are addressed by position: 11 (full mammogram),
    12 (cropped image) and 13 (ROI mask). For every cell, the third
    "/"-separated component of the stored path is looked up in the matching
    table; on a hit the cell is overwritten with the table's value.

    Cells are left untouched when the value is not a string (e.g. NaN), when
    the path has fewer than three components, or when the component is not a
    key of the table. Paths that were already rewritten therefore stay as they
    are unless their third component happens to be a key.

    Args:
        dataset (pd.DataFrame): Dataset whose columns 11, 12 and 13 hold the
            full-mammogram, cropped-image and ROI-mask paths.
        full_lookup (dict, optional): Folder name -> full mammogram path.
            Defaults to the notebook-level ``full_mammogram_dict``.
        cropped_lookup (dict, optional): Folder name -> cropped image path.
            Defaults to the notebook-level ``cropped_dict``.
        roi_lookup (dict, optional): Folder name -> ROI mask path.
            Defaults to the notebook-level ``roi_mask_dict``.

    Returns:
        None: The dataset is modified in place.
    """
    # Fall back to the lookup tables built earlier in the notebook; the names
    # are only touched when no explicit table was supplied.
    if full_lookup is None:
        full_lookup = full_mammogram_dict
    if cropped_lookup is None:
        cropped_lookup = cropped_dict
    if roi_lookup is None:
        roi_lookup = roi_mask_dict

    column_lookups = ((11, full_lookup), (12, cropped_lookup), (13, roi_lookup))
    for position, lookup in column_lookups:
        # Snapshot the column first so the loop never iterates over data it is mutating
        stored_paths = dataset.iloc[:, position].tolist()
        for row, stored_path in enumerate(stored_paths):
            resolved = _resolve_image_path(stored_path, lookup)
            if resolved is not None:
                dataset.iloc[row, position] = resolved


def _resolve_image_path(stored_path, lookup):
    """Return ``lookup``'s entry for the third "/"-separated component of ``stored_path``, or None."""
    if not isinstance(stored_path, str):
        return None
    components = stored_path.split("/")
    if len(components) < 3:
        return None
    return lookup.get(components[2])


In [91]:
# Correct the file paths for the training dataset of mass images
updated_path_mass(mass_train_df)

# Correct the file paths for the testing dataset of mass images
updated_path_mass(mass_test_df)

In [92]:
# Display the first few rows of the mass_train_df dataframe
mass_train_df.head()

Unnamed: 0,patient_id,breast_density,left or right breast,image view,abnormality id,abnormality type,mass shape,mass margins,assessment,pathology,subtlety,image file path,cropped image file path,ROI mask file path
0,P_00001,3,LEFT,CC,1,mass,IRREGULAR-ARCHITECTURAL_DISTORTION,SPICULATED,4,MALIGNANT,4,/content/drive/MyDrive/jpeg/1.3.6.1.4.1.9590.1...,/content/drive/MyDrive/jpeg/1.3.6.1.4.1.9590.1...,/content/drive/MyDrive/jpeg/1.3.6.1.4.1.9590.1...
1,P_00001,3,LEFT,MLO,1,mass,IRREGULAR-ARCHITECTURAL_DISTORTION,SPICULATED,4,MALIGNANT,4,/content/drive/MyDrive/jpeg/1.3.6.1.4.1.9590.1...,/content/drive/MyDrive/jpeg/1.3.6.1.4.1.9590.1...,/content/drive/MyDrive/jpeg/1.3.6.1.4.1.9590.1...
2,P_00004,3,LEFT,CC,1,mass,ARCHITECTURAL_DISTORTION,ILL_DEFINED,4,BENIGN,3,/content/drive/MyDrive/jpeg/1.3.6.1.4.1.9590.1...,/content/drive/MyDrive/jpeg/1.3.6.1.4.1.9590.1...,/content/drive/MyDrive/jpeg/1.3.6.1.4.1.9590.1...
3,P_00004,3,LEFT,MLO,1,mass,ARCHITECTURAL_DISTORTION,ILL_DEFINED,4,BENIGN,3,/content/drive/MyDrive/jpeg/1.3.6.1.4.1.9590.1...,/content/drive/MyDrive/jpeg/1.3.6.1.4.1.9590.1...,/content/drive/MyDrive/jpeg/1.3.6.1.4.1.9590.1...
4,P_00004,3,RIGHT,MLO,1,mass,OVAL,CIRCUMSCRIBED,4,BENIGN,5,/content/drive/MyDrive/jpeg/1.3.6.1.4.1.9590.1...,/content/drive/MyDrive/jpeg/1.3.6.1.4.1.9590.1...,/content/drive/MyDrive/jpeg/1.3.6.1.4.1.9590.1...


In [93]:
# Display the first few rows of the mass_test_df dataframe
mass_test_df.head()

Unnamed: 0,patient_id,breast_density,left or right breast,image view,abnormality id,abnormality type,mass shape,mass margins,assessment,pathology,subtlety,image file path,cropped image file path,ROI mask file path
0,P_00016,4,LEFT,CC,1,mass,IRREGULAR,SPICULATED,5,MALIGNANT,5,/content/drive/MyDrive/jpeg/1.3.6.1.4.1.9590.1...,/content/drive/MyDrive/jpeg/1.3.6.1.4.1.9590.1...,/content/drive/MyDrive/jpeg/1.3.6.1.4.1.9590.1...
1,P_00016,4,LEFT,MLO,1,mass,IRREGULAR,SPICULATED,5,MALIGNANT,5,/content/drive/MyDrive/jpeg/1.3.6.1.4.1.9590.1...,/content/drive/MyDrive/jpeg/1.3.6.1.4.1.9590.1...,/content/drive/MyDrive/jpeg/1.3.6.1.4.1.9590.1...
2,P_00017,2,LEFT,CC,1,mass,ROUND,CIRCUMSCRIBED,4,MALIGNANT,4,/content/drive/MyDrive/jpeg/1.3.6.1.4.1.9590.1...,/content/drive/MyDrive/jpeg/1.3.6.1.4.1.9590.1...,/content/drive/MyDrive/jpeg/1.3.6.1.4.1.9590.1...
3,P_00017,2,LEFT,MLO,1,mass,ROUND,ILL_DEFINED,4,MALIGNANT,4,/content/drive/MyDrive/jpeg/1.3.6.1.4.1.9590.1...,/content/drive/MyDrive/jpeg/1.3.6.1.4.1.9590.1...,/content/drive/MyDrive/jpeg/1.3.6.1.4.1.9590.1...
4,P_00032,3,RIGHT,CC,1,mass,ROUND,OBSCURED,0,BENIGN,2,/content/drive/MyDrive/jpeg/1.3.6.1.4.1.9590.1...,/content/drive/MyDrive/jpeg/1.3.6.1.4.1.9590.1...,/content/drive/MyDrive/jpeg/1.3.6.1.4.1.9590.1...


In [94]:
# Fix the image paths in calc_case_train_df. updated_path_mass addresses the
# path columns by position, so it applies to the calcification frames as well.
updated_path_mass(calc_case_train_df)

# Fix the image paths in calc_case_test_df
updated_path_mass(calc_case_test_df)

In [95]:
# Display the first few rows of the calc_case_train_df dataframe
calc_case_train_df.head()

Unnamed: 0,patient_id,breast density,left or right breast,image view,abnormality id,abnormality type,calc type,calc distribution,assessment,pathology,subtlety,image file path,cropped image file path,ROI mask file path
0,P_00005,3,RIGHT,CC,1,calcification,AMORPHOUS,CLUSTERED,3,MALIGNANT,3,/content/drive/MyDrive/jpeg/1.3.6.1.4.1.9590.1...,/content/drive/MyDrive/jpeg/1.3.6.1.4.1.9590.1...,/content/drive/MyDrive/jpeg/1.3.6.1.4.1.9590.1...
1,P_00005,3,RIGHT,MLO,1,calcification,AMORPHOUS,CLUSTERED,3,MALIGNANT,3,/content/drive/MyDrive/jpeg/1.3.6.1.4.1.9590.1...,/content/drive/MyDrive/jpeg/1.3.6.1.4.1.9590.1...,/content/drive/MyDrive/jpeg/1.3.6.1.4.1.9590.1...
2,P_00007,4,LEFT,CC,1,calcification,PLEOMORPHIC,LINEAR,4,BENIGN,4,/content/drive/MyDrive/jpeg/1.3.6.1.4.1.9590.1...,/content/drive/MyDrive/jpeg/1.3.6.1.4.1.9590.1...,/content/drive/MyDrive/jpeg/1.3.6.1.4.1.9590.1...
3,P_00007,4,LEFT,MLO,1,calcification,PLEOMORPHIC,LINEAR,4,BENIGN,4,/content/drive/MyDrive/jpeg/1.3.6.1.4.1.9590.1...,/content/drive/MyDrive/jpeg/1.3.6.1.4.1.9590.1...,/content/drive/MyDrive/jpeg/1.3.6.1.4.1.9590.1...
4,P_00008,1,LEFT,CC,1,calcification,,REGIONAL,2,BENIGN_WITHOUT_CALLBACK,3,/content/drive/MyDrive/jpeg/1.3.6.1.4.1.9590.1...,/content/drive/MyDrive/jpeg/1.3.6.1.4.1.9590.1...,/content/drive/MyDrive/jpeg/1.3.6.1.4.1.9590.1...


In [96]:
# Display the first few rows of the calc_case_test_df dataframe
# NOTE(review): in the recorded output for these rows only 'cropped image file path'
# was rewritten; 'image file path' and 'ROI mask file path' still hold the original
# 'Calc-Test_...' values, i.e. no lookup entry matched them. Check how many rows are
# affected before relying on those two columns.
calc_case_test_df.head()

Unnamed: 0,patient_id,breast density,left or right breast,image view,abnormality id,abnormality type,calc type,calc distribution,assessment,pathology,subtlety,image file path,cropped image file path,ROI mask file path
0,P_00038,2,LEFT,CC,1,calcification,PUNCTATE-PLEOMORPHIC,CLUSTERED,4,BENIGN,2,Calc-Test_P_00038_LEFT_CC/1.3.6.1.4.1.9590.100...,/content/drive/MyDrive/jpeg/1.3.6.1.4.1.9590.1...,Calc-Test_P_00038_LEFT_CC_1/1.3.6.1.4.1.9590.1...
1,P_00038,2,LEFT,MLO,1,calcification,PUNCTATE-PLEOMORPHIC,CLUSTERED,4,BENIGN,2,Calc-Test_P_00038_LEFT_MLO/1.3.6.1.4.1.9590.10...,/content/drive/MyDrive/jpeg/1.3.6.1.4.1.9590.1...,Calc-Test_P_00038_LEFT_MLO_1/1.3.6.1.4.1.9590....
2,P_00038,2,RIGHT,CC,1,calcification,VASCULAR,,2,BENIGN_WITHOUT_CALLBACK,5,Calc-Test_P_00038_RIGHT_CC/1.3.6.1.4.1.9590.10...,/content/drive/MyDrive/jpeg/1.3.6.1.4.1.9590.1...,Calc-Test_P_00038_RIGHT_CC_1/1.3.6.1.4.1.9590....
3,P_00038,2,RIGHT,CC,2,calcification,VASCULAR,,2,BENIGN_WITHOUT_CALLBACK,5,Calc-Test_P_00038_RIGHT_CC/1.3.6.1.4.1.9590.10...,/content/drive/MyDrive/jpeg/1.3.6.1.4.1.9590.1...,Calc-Test_P_00038_RIGHT_CC_2/1.3.6.1.4.1.9590....
4,P_00038,2,RIGHT,MLO,1,calcification,VASCULAR,,2,BENIGN_WITHOUT_CALLBACK,5,Calc-Test_P_00038_RIGHT_MLO/1.3.6.1.4.1.9590.1...,/content/drive/MyDrive/jpeg/1.3.6.1.4.1.9590.1...,Calc-Test_P_00038_RIGHT_MLO_1/1.3.6.1.4.1.9590...


# Data Cleaning

In [97]:
# check unique values in pathology column
mass_train_df.pathology.unique()

array(['MALIGNANT', 'BENIGN', 'BENIGN_WITHOUT_CALLBACK'], dtype=object)

In [98]:
# check unique values in pathology column
calc_case_train_df.pathology.unique()

array(['MALIGNANT', 'BENIGN', 'BENIGN_WITHOUT_CALLBACK'], dtype=object)

In [99]:
# check unique values in pathology column
# NOTE(review): this repeats the previous cell on calc_case_train_df verbatim.
# Possibly one of the test frames (e.g. calc_case_test_df) was intended here;
# confirm, or remove the duplicate cell.
calc_case_train_df.pathology.unique()

array(['MALIGNANT', 'BENIGN', 'BENIGN_WITHOUT_CALLBACK'], dtype=object)

In [100]:
# Display the summary information of the mass_train_df DataFrame
# This includes details such as the number of entries, column names, non-null counts, and data types.
mass_train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1318 entries, 0 to 1317
Data columns (total 14 columns):
 #   Column                   Non-Null Count  Dtype 
---  ------                   --------------  ----- 
 0   patient_id               1318 non-null   object
 1   breast_density           1318 non-null   int64 
 2   left or right breast     1318 non-null   object
 3   image view               1318 non-null   object
 4   abnormality id           1318 non-null   int64 
 5   abnormality type         1318 non-null   object
 6   mass shape               1314 non-null   object
 7   mass margins             1275 non-null   object
 8   assessment               1318 non-null   int64 
 9   pathology                1318 non-null   object
 10  subtlety                 1318 non-null   int64 
 11  image file path          1318 non-null   object
 12  cropped image file path  1318 non-null   object
 13  ROI mask file path       1318 non-null   object
dtypes: int64(4), object(10)
memory usage: 14

In [101]:
# Display the summary information of the calc_case_train_df DataFrame
# This includes details such as the number of entries, column names, non-null counts, and data types.
calc_case_train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1546 entries, 0 to 1545
Data columns (total 14 columns):
 #   Column                   Non-Null Count  Dtype 
---  ------                   --------------  ----- 
 0   patient_id               1546 non-null   object
 1   breast density           1546 non-null   int64 
 2   left or right breast     1546 non-null   object
 3   image view               1546 non-null   object
 4   abnormality id           1546 non-null   int64 
 5   abnormality type         1546 non-null   object
 6   calc type                1526 non-null   object
 7   calc distribution        1170 non-null   object
 8   assessment               1546 non-null   int64 
 9   pathology                1546 non-null   object
 10  subtlety                 1546 non-null   int64 
 11  image file path          1546 non-null   object
 12  cropped image file path  1546 non-null   object
 13  ROI mask file path       1546 non-null   object
dtypes: int64(4), object(10)
memory usage: 16

In [102]:
# Columns of mass_train_df whose names contain spaces; each one is renamed to
# the same name with underscores so it can be used as an attribute-style identifier.
mass_columns_with_spaces = [
    'left or right breast',
    'image view',
    'abnormality id',
    'abnormality type',
    'mass shape',
    'mass margins',
    'image file path',
    'cropped image file path',
    'ROI mask file path',
]
mass_train_df = mass_train_df.rename(
    columns={name: name.replace(' ', '_') for name in mass_columns_with_spaces}
)

# Show the first rows to confirm the new column names
print(mass_train_df.head())

  patient_id  breast_density left_or_right_breast image_view  abnormality_id  \
0    P_00001               3                 LEFT         CC               1   
1    P_00001               3                 LEFT        MLO               1   
2    P_00004               3                 LEFT         CC               1   
3    P_00004               3                 LEFT        MLO               1   
4    P_00004               3                RIGHT        MLO               1   

  abnormality_type                          mass_shape   mass_margins  \
0             mass  IRREGULAR-ARCHITECTURAL_DISTORTION     SPICULATED   
1             mass  IRREGULAR-ARCHITECTURAL_DISTORTION     SPICULATED   
2             mass            ARCHITECTURAL_DISTORTION    ILL_DEFINED   
3             mass            ARCHITECTURAL_DISTORTION    ILL_DEFINED   
4             mass                                OVAL  CIRCUMSCRIBED   

   assessment  pathology  subtlety  \
0           4  MALIGNANT         4   
1   

In [103]:
# Columns of calc_case_train_df whose names contain spaces; each one is renamed
# to the same name with underscores, matching the naming used for mass_train_df.
calc_columns_with_spaces = [
    'left or right breast',
    'breast density',
    'image view',
    'abnormality id',
    'abnormality type',
    'calc type',
    'calc distribution',
    'image file path',
    'cropped image file path',
    'ROI mask file path',
]
calc_case_train_df = calc_case_train_df.rename(
    columns={name: name.replace(' ', '_') for name in calc_columns_with_spaces}
)

# Show the first rows to confirm the new column names
print(calc_case_train_df.head())

  patient_id  breast_density left_or_right_breast image_view  abnormality_id  \
0    P_00005               3                RIGHT         CC               1   
1    P_00005               3                RIGHT        MLO               1   
2    P_00007               4                 LEFT         CC               1   
3    P_00007               4                 LEFT        MLO               1   
4    P_00008               1                 LEFT         CC               1   

  abnormality_type    calc_type calc_distribution  assessment  \
0    calcification    AMORPHOUS         CLUSTERED           3   
1    calcification    AMORPHOUS         CLUSTERED           3   
2    calcification  PLEOMORPHIC            LINEAR           4   
3    calcification  PLEOMORPHIC            LINEAR           4   
4    calcification          NaN          REGIONAL           2   

                 pathology  subtlety  \
0                MALIGNANT         3   
1                MALIGNANT         3   
2       

In [104]:
# check for null values
mass_train_df.isnull().sum()

patient_id                  0
breast_density              0
left_or_right_breast        0
image_view                  0
abnormality_id              0
abnormality_type            0
mass_shape                  4
mass_margins               43
assessment                  0
pathology                   0
subtlety                    0
image_file_path             0
cropped_image_file_path     0
ROI_mask_file_path          0
dtype: int64

In [105]:
# check for null values
calc_case_train_df.isnull().sum()

patient_id                   0
breast_density               0
left_or_right_breast         0
image_view                   0
abnormality_id               0
abnormality_type             0
calc_type                   20
calc_distribution          376
assessment                   0
pathology                    0
subtlety                     0
image_file_path              0
cropped_image_file_path      0
ROI_mask_file_path           0
dtype: int64

In [106]:
# Fill missing values by back-filling: each gap takes the next non-null value
# further down the same column.
# Series.bfill() is used instead of fillna(method='bfill'), whose `method`
# keyword is deprecated in recent pandas releases; the result is the same.
# NOTE(review): a back-fill copies the value from a following row, which may
# describe a different case — confirm this imputation is acceptable. A gap in
# the last row would also stay null, which the check below would reveal.
mass_train_df['mass_shape'] = mass_train_df['mass_shape'].bfill()
mass_train_df['mass_margins'] = mass_train_df['mass_margins'].bfill()

# Confirm that no null values remain
mass_train_df.isnull().sum()

patient_id                 0
breast_density             0
left_or_right_breast       0
image_view                 0
abnormality_id             0
abnormality_type           0
mass_shape                 0
mass_margins               0
assessment                 0
pathology                  0
subtlety                   0
image_file_path            0
cropped_image_file_path    0
ROI_mask_file_path         0
dtype: int64

In [107]:
# Fill the gaps in the calcification descriptors with a backward fill: each
# missing entry takes the value of the next non-missing row below it.
# Series.bfill() is used instead of fillna(method='bfill') because the
# `method` keyword of fillna is deprecated in recent pandas releases; the
# filled values are the same.
# Note: a backward fill cannot fill a gap in the final row(s), so the null
# count at the end of this cell verifies that nothing was left unfilled.
calc_case_train_df['calc_type'] = calc_case_train_df['calc_type'].bfill()
calc_case_train_df['calc_distribution'] = calc_case_train_df['calc_distribution'].bfill()

# Re-count missing values per column to confirm the fill
calc_case_train_df.isnull().sum()

patient_id                 0
breast_density             0
left_or_right_breast       0
image_view                 0
abnormality_id             0
abnormality_type           0
calc_type                  0
calc_distribution          0
assessment                 0
pathology                  0
subtlety                   0
image_file_path            0
cropped_image_file_path    0
ROI_mask_file_path         0
dtype: int64

In [108]:
# Count the missing (NaN) entries in each column of the mass test set
mass_test_df.isna().sum()

patient_id                  0
breast_density              0
left or right breast        0
image view                  0
abnormality id              0
abnormality type            0
mass shape                  0
mass margins               17
assessment                  0
pathology                   0
subtlety                    0
image file path             0
cropped image file path     0
ROI mask file path          0
dtype: int64

In [109]:
# Show the column labels of mass_test_df as loaded (several contain spaces)
print(mass_test_df.columns)
print('\n')

# Replace the space-separated labels with snake_case ones so they match the
# column names used by the mass training DataFrame
mass_test_df = mass_test_df.rename(
    {
        'left or right breast': 'left_or_right_breast',
        'image view': 'image_view',
        'abnormality id': 'abnormality_id',
        'abnormality type': 'abnormality_type',
        'mass shape': 'mass_shape',
        'mass margins': 'mass_margins',
        'image file path': 'image_file_path',
        'cropped image file path': 'cropped_image_file_path',
        'ROI mask file path': 'ROI_mask_file_path',
    },
    axis='columns',
)

# Show the column labels again to confirm the rename
print(mass_test_df.columns)

Index(['patient_id', 'breast_density', 'left or right breast', 'image view',
       'abnormality id', 'abnormality type', 'mass shape', 'mass margins',
       'assessment', 'pathology', 'subtlety', 'image file path',
       'cropped image file path', 'ROI mask file path'],
      dtype='object')


Index(['patient_id', 'breast_density', 'left_or_right_breast', 'image_view',
       'abnormality_id', 'abnormality_type', 'mass_shape', 'mass_margins',
       'assessment', 'pathology', 'subtlety', 'image_file_path',
       'cropped_image_file_path', 'ROI_mask_file_path'],
      dtype='object')


In [110]:
# Fill the gaps in 'mass_margins' with a backward fill: each missing entry
# takes the value of the next non-missing row below it.
# Series.bfill() is used instead of fillna(method='bfill') because the
# `method` keyword of fillna is deprecated in recent pandas releases; the
# filled values are the same.
# Note: a backward fill cannot fill a gap in the final row(s), so the null
# count at the end of this cell verifies that nothing was left unfilled.
mass_test_df['mass_margins'] = mass_test_df['mass_margins'].bfill()

# Re-count missing values per column to confirm the fill
mass_test_df.isnull().sum()


patient_id                 0
breast_density             0
left_or_right_breast       0
image_view                 0
abnormality_id             0
abnormality_type           0
mass_shape                 0
mass_margins               0
assessment                 0
pathology                  0
subtlety                   0
image_file_path            0
cropped_image_file_path    0
ROI_mask_file_path         0
dtype: int64

In [111]:
# Show the column labels of calc_case_test_df as loaded (several contain spaces)
print(calc_case_test_df.columns)
print('\n')

# Replace the space-separated labels with snake_case ones so they match the
# column names used by the calcification training DataFrame
calc_case_test_df = calc_case_test_df.rename(
    {
        'left or right breast': 'left_or_right_breast',
        'breast density': 'breast_density',
        'image view': 'image_view',
        'abnormality id': 'abnormality_id',
        'abnormality type': 'abnormality_type',
        'calc type': 'calc_type',
        'calc distribution': 'calc_distribution',
        'image file path': 'image_file_path',
        'cropped image file path': 'cropped_image_file_path',
        'ROI mask file path': 'ROI_mask_file_path',
    },
    axis='columns',
)

# Show the column labels again to confirm the rename
print(calc_case_test_df.columns)


Index(['patient_id', 'breast density', 'left or right breast', 'image view',
       'abnormality id', 'abnormality type', 'calc type', 'calc distribution',
       'assessment', 'pathology', 'subtlety', 'image file path',
       'cropped image file path', 'ROI mask file path'],
      dtype='object')


Index(['patient_id', 'breast_density', 'left_or_right_breast', 'image_view',
       'abnormality_id', 'abnormality_type', 'calc_type', 'calc_distribution',
       'assessment', 'pathology', 'subtlety', 'image_file_path',
       'cropped_image_file_path', 'ROI_mask_file_path'],
      dtype='object')


In [112]:
# Count the missing (NaN) entries in each column of the calcification test set
calc_case_test_df.isna().sum()

patient_id                  0
breast_density              0
left_or_right_breast        0
image_view                  0
abnormality_id              0
abnormality_type            0
calc_type                   4
calc_distribution          63
assessment                  0
pathology                   0
subtlety                    0
image_file_path             0
cropped_image_file_path     0
ROI_mask_file_path          0
dtype: int64

In [113]:
# Fill the gaps in the calcification descriptors with a backward fill: each
# missing entry takes the value of the next non-missing row below it.
# Series.bfill() is used instead of fillna(method='bfill') because the
# `method` keyword of fillna is deprecated in recent pandas releases; the
# filled values are the same.
# Note: a backward fill cannot fill a gap in the final row(s), so the null
# count at the end of this cell verifies that nothing was left unfilled.
calc_case_test_df['calc_type'] = calc_case_test_df['calc_type'].bfill()
calc_case_test_df['calc_distribution'] = calc_case_test_df['calc_distribution'].bfill()

# Re-count missing values per column to confirm the fill
calc_case_test_df.isnull().sum()


patient_id                 0
breast_density             0
left_or_right_breast       0
image_view                 0
abnormality_id             0
abnormality_type           0
calc_type                  0
calc_distribution          0
assessment                 0
pathology                  0
subtlety                   0
image_file_path            0
cropped_image_file_path    0
ROI_mask_file_path         0
dtype: int64