### Breast Cancer Detection - Exploratory Data Analysis (EDA)

Download the mass-case (train and test) CSV data from Kaggle: https://www.kaggle.com/datasets/awsaf49/cbis-ddsm-breast-cancer-image-dataset/data

In [38]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [44]:
# Read the mass-case training description CSV into a DataFrame for exploration.
# The separator and quote character are passed explicitly.
df = pd.read_csv(
    'https://raw.githubusercontent.com/Periyzat/ml-zoomcamp/refs/heads/main/midtermProject/data/mass_case_description_train_set.csv',
    sep=',',
    quotechar='"',
)

# Normalise every column label to lowercase so later cells can use one spelling.
df.columns = [label.lower() for label in df.columns]

# Show the first rows as the cell output.
df.head()

Unnamed: 0,patient_id,breast_density,left or right breast,image view,abnormality id,abnormality type,mass shape,mass margins,assessment,pathology,subtlety,image file path,cropped image file path,roi mask file path
0,P_00001,3,LEFT,CC,1,mass,IRREGULAR-ARCHITECTURAL_DISTORTION,SPICULATED,4,MALIGNANT,4,Mass-Training_P_00001_LEFT_CC/1.3.6.1.4.1.9590...,Mass-Training_P_00001_LEFT_CC_1/1.3.6.1.4.1.95...,Mass-Training_P_00001_LEFT_CC_1/1.3.6.1.4.1.95...
1,P_00001,3,LEFT,MLO,1,mass,IRREGULAR-ARCHITECTURAL_DISTORTION,SPICULATED,4,MALIGNANT,4,Mass-Training_P_00001_LEFT_MLO/1.3.6.1.4.1.959...,Mass-Training_P_00001_LEFT_MLO_1/1.3.6.1.4.1.9...,Mass-Training_P_00001_LEFT_MLO_1/1.3.6.1.4.1.9...
2,P_00004,3,LEFT,CC,1,mass,ARCHITECTURAL_DISTORTION,ILL_DEFINED,4,BENIGN,3,Mass-Training_P_00004_LEFT_CC/1.3.6.1.4.1.9590...,Mass-Training_P_00004_LEFT_CC_1/1.3.6.1.4.1.95...,Mass-Training_P_00004_LEFT_CC_1/1.3.6.1.4.1.95...
3,P_00004,3,LEFT,MLO,1,mass,ARCHITECTURAL_DISTORTION,ILL_DEFINED,4,BENIGN,3,Mass-Training_P_00004_LEFT_MLO/1.3.6.1.4.1.959...,Mass-Training_P_00004_LEFT_MLO_1/1.3.6.1.4.1.9...,Mass-Training_P_00004_LEFT_MLO_1/1.3.6.1.4.1.9...
4,P_00004,3,RIGHT,MLO,1,mass,OVAL,CIRCUMSCRIBED,4,BENIGN,5,Mass-Training_P_00004_RIGHT_MLO/1.3.6.1.4.1.95...,Mass-Training_P_00004_RIGHT_MLO_1/1.3.6.1.4.1....,Mass-Training_P_00004_RIGHT_MLO_1/1.3.6.1.4.1....


In [40]:
# Drop the patient identifier and the three file-path columns, then give the
# remaining multi-word columns snake_case names.
#
# The result is reassigned to df instead of using inplace=True, and the drop
# uses errors='ignore', so re-running this cell is harmless: on a second run the
# columns are already gone and no KeyError is raised. rename() skips labels that
# are not present by default, so it is re-runnable as well.
df = (
    df
    .drop(
        columns=[
            'patient_id',
            'image file path',
            'cropped image file path',
            'roi mask file path',
        ],
        errors='ignore',
    )
    .rename(columns={
        'left or right breast': 'left_right',
        'image view': 'image_view',
        'abnormality id': 'abnormality_id',
        'abnormality type': 'abnormality_type',
        'mass shape': 'mass_shape',
        'mass margins': 'mass_margins',
    })
)

# Preview the cleaned frame.
df.head()

Unnamed: 0,breast_density,left_right,image_view,abnormality_id,abnormality_type,mass_shape,mass_margins,assessment,pathology,subtlety
0,3,LEFT,CC,1,mass,IRREGULAR-ARCHITECTURAL_DISTORTION,SPICULATED,4,MALIGNANT,4
1,3,LEFT,MLO,1,mass,IRREGULAR-ARCHITECTURAL_DISTORTION,SPICULATED,4,MALIGNANT,4
2,3,LEFT,CC,1,mass,ARCHITECTURAL_DISTORTION,ILL_DEFINED,4,BENIGN,3
3,3,LEFT,MLO,1,mass,ARCHITECTURAL_DISTORTION,ILL_DEFINED,4,BENIGN,3
4,3,RIGHT,MLO,1,mass,OVAL,CIRCUMSCRIBED,4,BENIGN,5


In [41]:
# Gather every row that has an exact twin (keep=False flags all copies, not
# only the later ones) and order them by all columns so identical rows end up
# next to each other.
duplicate_rows_df = (
    df.loc[df.duplicated(keep=False)]
    .sort_values(by=df.columns.tolist())
)

# Report the shape of the duplicate subset, then show a sample of it.
print('number of duplicate rows: ', duplicate_rows_df.shape)
print(duplicate_rows_df.head(10))

# Remove the duplicates, keeping the first occurrence of each row.
df = df.drop_duplicates(keep='first')

number of duplicate rows:  (472, 10)
      breast_density left_right image_view  abnormality_id abnormality_type  \
106                1       LEFT         CC               1             mass   
116                1       LEFT         CC               1             mass   
737                1       LEFT         CC               1             mass   
1236               1       LEFT         CC               1             mass   
296                1       LEFT         CC               1             mass   
606                1       LEFT         CC               1             mass   
595                1       LEFT         CC               1             mass   
872                1       LEFT         CC               1             mass   
953                1       LEFT         CC               1             mass   
1003               1       LEFT         CC               1             mass   

     mass_shape    mass_margins  assessment  pathology  subtlety  
106   IRREGULAR      SPICU

In [42]:
# display the columns and their data types
# print() emits the dtype listing as plain text; the bare last expression makes
# the (rows, columns) tuple the cell's displayed value.
print(df.dtypes)
df.shape

breast_density       int64
left_right          object
image_view          object
abnormality_id       int64
abnormality_type    object
mass_shape          object
mass_margins        object
assessment           int64
pathology           object
subtlety             int64
dtype: object


(1022, 10)

In [9]:
# Count missing values per column (sum of the boolean NA mask down each column).
df.isna().sum(axis=0)

patient_id                 0
breast_density             0
left or right breast       0
image view                 0
abnormality id             0
abnormality type           0
mass shape                 0
mass margins               0
assessment                 0
pathology                  0
subtlety                   0
image file path            0
cropped image file path    0
roi mask file path         0
dtype: int64

In [16]:
# Distinct values present in the 'subtlety' column, in order of first appearance.
df['subtlety'].unique()

array([4, 3, 5, 1, 2, 0], dtype=int64)

In [19]:
# Number of rows for each 'subtlety' value, most frequent first.
df['subtlety'].value_counts()

5    543
4    375
3    257
2    100
1     41
0      2
Name: subtlety, dtype: int64

In [36]:
# Count how many rows each patient_id contributes.
# NOTE(review): the earlier "dropping unrelated columns" cell removes
# 'patient_id' from df, so on a fresh top-to-bottom run this attribute lookup
# fails. The execution counts indicate this cell was run before that drop;
# move it above the drop, or query a copy of the unmodified frame instead.
df.patient_id.value_counts()

P_00106    14
P_01039    12
P_01103     6
P_00797     6
P_00207     6
           ..
P_01553     1
P_01299     1
P_00720     1
P_01559     1
P_01556     1
Name: patient_id, Length: 691, dtype: int64