In [4]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the read-only input directory and collect the path of every file in it.
# The bare os.path.join(...) expression used here before discarded its result,
# so the cell listed nothing. Paths are gathered into a list instead, and only
# a count plus a short preview is printed: the image folders hold one DICOM
# file per image, so printing every path would flood the cell output.
input_files = []
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        input_files.append(os.path.join(dirname, filename))

print(f"{len(input_files)} files found under /kaggle/input")
for path in input_files[:10]:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

**FILES**

**[train/test]_images/[patient_id]/[image_id].dcm** The mammograms, in DICOM format. You can expect roughly 8,000 patients in the hidden test set. There are usually, but not always, 4 images per patient. Note that many of the images use the JPEG 2000 format, which you may need special libraries to load.

**sample_submission.csv** A valid sample submission. Only the first few rows are available for download.

**[train/test].csv** Metadata for each patient and image. Only the first few rows of the test set are available for download.

# Install Libraries

In [5]:
!pip install -qU python-gdcm pydicom pylibjpeg

[0m

# Imports

In [6]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import gc

from cv2 import resize
from numpy import amax, uint8
from pathlib import Path
from pydicom import dcmread
from pydicom.pixel_data_handlers.util import apply_voi_lut as avl
from skimage import exposure as ex
from collections import defaultdict

import warnings
warnings.filterwarnings('ignore')

# Data Description
**site_id** - ID code for the source hospital.\
**patient_id** - ID code for the patient.\
**image_id** - ID code for the image.\
**laterality** - Whether the image is of the left or right breast.\
**view** - The orientation of the image. The default for a screening exam is to capture two views per breast.\
**age** - The patient's age in years.\
**implant** - Whether or not the patient had breast implants. Site 1 only provides breast implant information at the patient level, not at the breast level.\
**density** - A rating for how dense the breast tissue is, with A being the least dense and D being the most dense. Extremely dense tissue can make diagnosis more difficult. Only provided for train.\
**machine_id** - An ID code for the imaging device.\
**cancer** - Whether or not the breast was positive for cancer. The target value. Only provided for train.\
**biopsy** - Whether or not a follow-up biopsy was performed on the breast. Only provided for train.\
**invasive** - If the breast is positive for cancer, whether or not the cancer proved to be invasive. Only provided for train.\
**BIRADS** - 0 if the breast required follow-up, 1 if the breast was rated as negative for cancer, and 2 if the breast was rated as normal. Only provided for train.\
**prediction_id** - The ID for the matching submission row. Multiple images will share the same prediction ID. Test only.\
**difficult_negative_case** - True if the case was unusually difficult. Only provided for train.

# Loading Data

In [7]:
# Root folder of the competition dataset. Defined once so the location can be
# changed in a single place instead of in every read_csv call.
DATA_DIR = Path('/kaggle/input/rsna-breast-cancer-detection')

# train:  per-image metadata including the labels (cancer, biopsy, ...).
# test:   per-image metadata without labels, plus the prediction_id to submit.
# sample: example submission with one row per prediction_id.
train = pd.read_csv(DATA_DIR / 'train.csv')
test = pd.read_csv(DATA_DIR / 'test.csv')
sample = pd.read_csv(DATA_DIR / 'sample_submission.csv')

In [8]:
# Display the training metadata; for a frame this long pandas renders only the
# first and last rows, with an ellipsis row in between.
train

Unnamed: 0,site_id,patient_id,image_id,laterality,view,age,cancer,biopsy,invasive,BIRADS,implant,density,machine_id,difficult_negative_case
0,2,10006,462822612,L,CC,61.0,0,0,0,,0,,29,False
1,2,10006,1459541791,L,MLO,61.0,0,0,0,,0,,29,False
2,2,10006,1864590858,R,MLO,61.0,0,0,0,,0,,29,False
3,2,10006,1874946579,R,CC,61.0,0,0,0,,0,,29,False
4,2,10011,220375232,L,CC,55.0,0,0,0,0.0,0,,21,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
54701,1,9973,1729524723,R,MLO,43.0,0,0,0,1.0,0,C,49,False
54702,1,9989,63473691,L,MLO,60.0,0,0,0,,0,C,216,False
54703,1,9989,1078943060,L,CC,60.0,0,0,0,,0,C,216,False
54704,1,9989,398038886,R,MLO,60.0,0,0,0,0.0,0,C,216,True


In [9]:
# Display the publicly available test metadata (only a few rows are provided).
test

Unnamed: 0,site_id,patient_id,image_id,laterality,view,age,implant,machine_id,prediction_id
0,2,10008,736471439,L,MLO,81,0,21,10008_L
1,2,10008,1591370361,L,CC,81,0,21,10008_L
2,2,10008,68070693,R,MLO,81,0,21,10008_R
3,2,10008,361203119,R,CC,81,0,21,10008_R


In [10]:
# Display the sample submission: one cancer value per prediction_id.
sample

Unnamed: 0,prediction_id,cancer
0,10008_L,0.021168
1,10008_R,0.021168


In [11]:
# Print the column labels of the training frame, a blank separator line,
# and then the number of columns.
train_columns = train.columns
print("Features:", train_columns, end="\n\n")
print("No. of features in training dataset:", len(train_columns))

Features: Index(['site_id', 'patient_id', 'image_id', 'laterality', 'view', 'age',
       'cancer', 'biopsy', 'invasive', 'BIRADS', 'implant', 'density',
       'machine_id', 'difficult_negative_case'],
      dtype='object')

No. of features in training dataset: 14


In [12]:
# Count the distinct values in each column of the training metadata.
train.nunique()

site_id                        2
patient_id                 11913
image_id                   54706
laterality                     2
view                           6
age                           63
cancer                         2
biopsy                         2
invasive                       2
BIRADS                         3
implant                        2
density                        4
machine_id                    10
difficult_negative_case        2
dtype: int64

In [15]:
# Count the distinct values in each column of the test metadata.
test.nunique()

site_id          1
patient_id       1
image_id         4
laterality       2
view             2
age              1
implant          1
machine_id       1
prediction_id    2
dtype: int64

# Insights from data
- There are a total of 54,706 unique training images each having 14 features.
- There are only 2 training sites where imaging took place.
- There are 11,913 unique patients out of 54,706 training samples, meaning that patients are represented multiple times.
- There are 10 different machines that performed imaging during training.
- There are 4 testing samples to work with, each containing 9 features.

- There are 5 features in the training set that don't appear in the testing set. There may be an opportunity here to use soft labeling to predict those features that don't appear in the testing set to help us with the classification task, if those features end up being at least partially correlated with the target variable. In particular, we may want to see if we can build classifiers that filter for:
  -  biopsy
  - invasive
  - BIRADS
  - density
  - difficult_negative_case - this in particular may be a good starting point for generating a coarse first-pass filter

- Patients are represented multiple times. This makes sense because each patient has an L (left) and an R (right) laterality, usually with more than one view per side. What we need to check is whether findings can disagree for the same patient on the same side. In other words, if there are multiple images of the same breast, can one image be labeled as cancer while another is not? If so, why?

- Given there are 10 different machines that performed imaging, we should check and see if there is any bias or skew between a machine used and whether it is a stronger or weaker predictor for cancer.

# Acknowledgements
https://www.kaggle.com/code/craigmthomas/rsna-2022-eda