# Data Selection

In [None]:
import pandas as pd
import numpy as np
import random
import os
import data_selection_utils as utils

## BCDR-DN01 + INBreast (Normal Mammographies)

In [None]:
# Load the BCDR-DN01 image index, report its shape and preview the first rows.
bcdr_dn01_csv = 'd:/BCDR/BCDR-DN01_dataset/bcdr_dn01_img.csv'
bcdrN = pd.read_csv(bcdr_dn01_csv)
print('Normal Dataset: ', bcdrN.shape)
bcdrN.head(20)

In [None]:
# Project the BCDR-DN01 index onto the four columns used below, renaming
# image_type_name -> image_view and image_filename -> image_path.
bcdrN_source_cols = ['patient_id', 'image_type_name', 'image_filename', 'density']
bcdrN_target_cols = ['patient_id', 'image_view', 'image_path', 'density']
normal_mammographies1 = pd.DataFrame({})
normal_mammographies1[bcdrN_target_cols] = bcdrN[bcdrN_source_cols]
# Path rewriting is delegated to data_selection_utils (not visible here).
normal_mammographies1 = utils.fix_bcdrN_path(normal_mammographies1, 'image_path')

In [None]:
# Read the INbreast spreadsheet and derive a slim table from it:
#   image_view    = Laterality + View, concatenated row by row
#   filename      = 'File Name'
#   finding notes = 'Findings Notes (in Portuguese)'
xls_raw = pd.read_excel('D:/INBreast/INbreast.xls')
print('INbreast: ', xls_raw.shape)
xls = pd.DataFrame()
xls['image_view'] = [
    side + view
    for side, view in zip(xls_raw['Laterality'], xls_raw['View'])
]
xls[['filename', 'finding notes']] = xls_raw[['File Name', 'Findings Notes (in Portuguese)']]

In [None]:
# List the INbreast image folder and keep only entries whose name ends in
# 'dcm'. A single filtering pass replaces the previous "collect the rejects,
# then list.remove() each one" approach, which rescanned the list per reject.
path_list = [
    entry
    for entry in os.listdir('D:/INBreast/AllDICOMs/')
    if entry.endswith('dcm')
]

In [None]:
# Parse each DICOM file name into patient id and view code.
# The name is split on '_': token 1 is used as the patient id and tokens 3
# and 4 are concatenated into the view code.
patients = []
file_paths = []
image_views = []
for path in path_list:
    parts = path.split('_')
    # Tokens up to index 4 are read below, so at least five tokens are needed.
    # (The previous `len(...) > 1` guard let shorter names through and they
    # then failed with IndexError.)
    if len(parts) < 5:
        continue
    patients.append(parts[1])
    file_paths.append(path)
    image_views.append(parts[3] + parts[4])
images_df = pd.DataFrame({'patient_id': patients, 'image_view': image_views, 'image_path': file_paths})
# Path rewriting is delegated to data_selection_utils (not visible here).
images_df = utils.fix_inbreast_path(images_df, 'image_path')

In [None]:
# Attach the spreadsheet's finding notes to every image, then keep the images
# whose note is exactly 'normal'.
#
# A plain column assignment from `xls` aligns on the row index, so row i of
# the directory listing would receive the notes of row i of the spreadsheet,
# which is only right if both happen to be in the same order. The notes are
# therefore looked up by file identifier instead.
# NOTE(review): assumes the first '_'-separated token of each DICOM file name
# equals the spreadsheet's 'File Name' value, and that utils.fix_inbreast_path
# kept the rows of images_df in the same order as file_paths — confirm.
file_ids = pd.to_numeric(
    pd.Series([name.split('_')[0] for name in file_paths], index=images_df.index),
    errors='coerce',
).astype('float64')
notes_by_file = (
    xls.assign(filename=pd.to_numeric(xls['filename'], errors='coerce').astype('float64'))
    .dropna(subset=['filename'])
    .drop_duplicates(subset=['filename'], keep='first')
    .set_index('filename')['finding notes']
)
# Images without a matching spreadsheet row get NaN and are excluded below.
images_df['finding notes'] = file_ids.map(notes_by_file)
normal_df = images_df[images_df['finding notes'] == 'normal']
normal_mammographies2 = normal_df[['patient_id', 'image_view', 'image_path']].copy()

In [None]:
# Stack the BCDR-DN01 rows and the INbreast 'normal' rows into one table with
# a fresh 0..n-1 index, then report its shape.
normal_sources = [normal_mammographies1, normal_mammographies2]
normal_mammographies = pd.concat(normal_sources, ignore_index=True)
print('Normal Dataset: ', normal_mammographies.shape)

## BCDR-D01 + BCDR-D02 (Lesion Mammographies)

In [None]:
# Load the image index and the feature table of BCDR-D01 and BCDR-D02.
# The image indexes are passed through utils.fix_view on 'image_type_name'
# (helper from data_selection_utils; its behaviour is not visible here).
# The feature-table paths contain backslashes, so they are raw strings:
# '\B' is not a valid escape sequence and a non-raw literal triggers an
# invalid-escape warning. The resulting path text is unchanged.
bcdr1_raw = pd.read_csv('d:/BCDR/BCDR-D01_dataset/bcdr_d01_img.csv')
bcdr1_raw = utils.fix_view(bcdr1_raw, 'image_type_name')
bcdr1_features_raw = pd.read_csv(r'D:\BCDR\BCDR-D01_dataset/bcdr_d01_features.csv')
bcdr2_raw = pd.read_csv('d:/BCDR/BCDR-D02_dataset/bcdr_d02_img.csv')
bcdr2_raw = utils.fix_view(bcdr2_raw, 'image_type_name')
bcdr2_features_raw = pd.read_csv(r'D:\BCDR\BCDR-D02_dataset/bcdr_d02_features.csv')

In [None]:
# Narrow the BCDR-D01 image index and feature table to the columns used
# below, then combine them via utils.merge_csv (features first, images second;
# the merge logic lives in data_selection_utils and is not visible here).
bcdr1_image_cols = ['patient_id', 'study_id', 'image_filename', 'image_type_name', 'density']
bcdr1_feature_cols = ['patient_id', 'study_id', 'image_view', 's_x_center_mass', 's_y_center_mass', 'density']
bcdr1 = bcdr1_raw[bcdr1_image_cols]
bcdr1_features = bcdr1_features_raw[bcdr1_feature_cols]

bcdr1 = utils.merge_csv(bcdr1_features, bcdr1)

In [None]:
# Narrow the BCDR-D02 image index and feature table to the columns used
# below, then combine them via utils.merge_csv (features first, images second;
# the merge logic lives in data_selection_utils and is not visible here).
bcdr2_image_cols = ['patient_id', 'study_id', 'image_filename', 'image_type_name', 'density']
bcdr2_feature_cols = ['patient_id', 'study_id', 'image_view', 's_x_center_mass', 's_y_center_mass', 'density']
bcdr2 = bcdr2_raw[bcdr2_image_cols]
bcdr2_features = bcdr2_features_raw[bcdr2_feature_cols]

bcdr2 = utils.merge_csv(bcdr2_features, bcdr2)

In [None]:
# Build the BCDR-D01 lesion table: copy six columns from the merged frame,
# renaming image_filename -> image_path and s_{x,y}_center_mass -> {x,y}_center.
d01_source_cols = ['patient_id', 'image_view', 'image_filename', 's_x_center_mass', 's_y_center_mass', 'density']
d01_target_cols = ['patient_id', 'image_view', 'image_path', 'x_center', 'y_center', 'density']
lesion_mammographies1 = pd.DataFrame({})
lesion_mammographies1[d01_target_cols] = bcdr1[d01_source_cols]
# Prefix every patient id with '1d' so ids from this dataset stay distinct
# from the '2d'-prefixed BCDR-D02 ids once the tables are concatenated.
new_patients = ['1d' + str(patient) for patient in lesion_mammographies1['patient_id']]
lesion_mammographies1['patient_id'] = new_patients
# Path rewriting is delegated to data_selection_utils (not visible here).
lesion_mammographies1 = utils.fix_bcdr1_path(lesion_mammographies1, 'image_path')

In [None]:
# Build the BCDR-D02 lesion table: copy six columns from the merged frame,
# renaming image_filename -> image_path and s_{x,y}_center_mass -> {x,y}_center.
d02_source_cols = ['patient_id', 'image_view', 'image_filename', 's_x_center_mass', 's_y_center_mass', 'density']
d02_target_cols = ['patient_id', 'image_view', 'image_path', 'x_center', 'y_center', 'density']
lesion_mammographies2 = pd.DataFrame({})
lesion_mammographies2[d02_target_cols] = bcdr2[d02_source_cols]
# Prefix every patient id with '2d' so ids from this dataset stay distinct
# from the '1d'-prefixed BCDR-D01 ids once the tables are concatenated.
new_patients = ['2d' + str(patient) for patient in lesion_mammographies2['patient_id']]
lesion_mammographies2['patient_id'] = new_patients
# Path rewriting is delegated to data_selection_utils (not visible here).
lesion_mammographies2 = utils.fix_bcdr2_path(lesion_mammographies2, 'image_path')
# Only the first 52 rows are kept.
# NOTE(review): the reason for the 52-row cut-off is not visible here — confirm.
lesion_mammographies2 = lesion_mammographies2[:52]

In [None]:
# Stack the BCDR-D01 and BCDR-D02 lesion tables into one table with a fresh
# 0..n-1 index, then report its shape.
lesion_sources = [lesion_mammographies1, lesion_mammographies2]
lesion_mammographies = pd.concat(lesion_sources, ignore_index=True)
print('Suspicious Dataset: ', lesion_mammographies.shape)

In [None]:
# Pass the 'image_view' column through utils.fix_view_back (helper from
# data_selection_utils; its behaviour is not visible here).
# NOTE(review): presumably the inverse of the utils.fix_view call applied to
# the raw BCDR image indexes — confirm against the helper.
lesion_mammographies = utils.fix_view_back(lesion_mammographies,'image_view')
# Preview the first ten rows.
lesion_mammographies.head(10)

## Patient w/ at least 4 views

### Normal Mammographies

In [None]:
# One entry per image row: the patient id of every normal mammography.
normal_patient_list = normal_mammographies['patient_id'].tolist()

In [None]:
# Map each normal patient to their number of image rows and keep only the
# patients with at least two rows.
# The tally is built in one pass; calling list.count() for every element
# rescanned the whole list each time. Key order (first appearance) is kept.
# NOTE(review): the section heading says "at least 4 views" but the cut-off
# here is 2 images — confirm which one is intended.
normal_image_counts = {}
for patient in normal_patient_list:
    normal_image_counts[patient] = normal_image_counts.get(patient, 0) + 1
normal_dict = {
    patient: images
    for patient, images in normal_image_counts.items()
    if images >= 2
}

print('Eligible Normal Patients: ', len(normal_dict))

In [None]:
# Keep only the rows of eligible normal patients and tag them 'Normal'.
# `.copy()` makes the selection an independent frame, so adding the label
# column does not write into a filtered selection of normal_mammographies
# (which raises pandas' SettingWithCopyWarning).
normal_dataframe = normal_mammographies[normal_mammographies['patient_id'].isin(normal_dict.keys())].copy()
normal_dataframe['label'] = 'Normal'

### Lesion Mammographies

In [None]:
# One entry per image row: the patient id of every lesion mammography.
lesion_patient_list = lesion_mammographies['patient_id'].tolist()

In [None]:
# Map each lesion patient to their number of image rows and keep only the
# patients with at least two rows.
# The tally is built in one pass; calling list.count() for every element
# rescanned the whole list each time. Key order (first appearance) is kept.
# NOTE(review): the section heading says "at least 4 views" but the cut-off
# here is 2 images — confirm which one is intended.
lesion_image_counts = {}
for patient in lesion_patient_list:
    lesion_image_counts[patient] = lesion_image_counts.get(patient, 0) + 1
lesion_dict = {
    patient: images
    for patient, images in lesion_image_counts.items()
    if images >= 2
}
print('Eligible Lesion Patients: ', len(lesion_dict))

In [None]:
# Keep only the rows of eligible lesion patients and tag them 'Suspicious'.
# `.copy()` makes the selection an independent frame, so adding the label
# column does not write into a filtered selection of lesion_mammographies
# (which raises pandas' SettingWithCopyWarning).
lesion_dataframe = lesion_mammographies[lesion_mammographies['patient_id'].isin(lesion_dict.keys())].copy()
lesion_dataframe['label'] = 'Suspicious'

## Copying Mammographies

In [None]:
# Combine the labelled normal and lesion rows into one table, stacked
# row-wise with a fresh 0..n-1 index.
dfs = [normal_dataframe, lesion_dataframe]
f_df = pd.concat(dfs, ignore_index=True)

In [None]:
# Merge the two patient -> image-count maps; on a shared key the lesion
# entry wins because it is applied last.
fdict = {**normal_dict, **lesion_dict}

In [None]:
# Rebuild the patient -> image-count map with its keys in random order.
# No seed is set in this cell, so the order differs from run to run unless
# the random module was seeded elsewhere.
keys = list(fdict)
random.shuffle(keys)
f_dict = {key: fdict[key] for key in keys}

In [None]:
# Hand the combined image table and the shuffled patient -> image-count map
# to utils.image_mover (helper from data_selection_utils; not visible here).
# It returns six values, unpacked in this order: patients, image views,
# labels, lesion x centers, lesion y centers and densities.
# NOTE(review): judging by the section title, this call presumably also
# copies the image files — confirm against the helper.
patient_list,image_view_list,label_list,lesion_center_x,lesion_center_y,density_list = utils.image_mover(f_df,f_dict)

### Save Ground Truth .csv

In [None]:
# Assemble the ground-truth table from the values returned by
# utils.image_mover and write it to patient_gt.csv without the index column.
ground_truth_columns = {
    'patient': patient_list,
    'label': label_list,
    'image view': image_view_list,
    'x_center': lesion_center_x,
    'y_center': lesion_center_y,
    'density': density_list,
}
patient_name_df = pd.DataFrame(ground_truth_columns)
patient_name_df.to_csv('D:/Architecture/patients/patient_gt.csv', index=False)

## CBIS-DDSM

In [None]:
# Load the CBIS-DDSM mass training description, keep the first row for each
# distinct 'image file path' (index renumbered 0..n-1), rewrite the paths via
# utils.fix_cbis_path (helper not visible here) and preview ten rows.
cbis_mass_train_csv = 'd:/CBIS-DDSM/mass_case_description_train_set.csv'
masses_training_raw = pd.read_csv(cbis_mass_train_csv)
masses_training_raw_1 = masses_training_raw.drop_duplicates(
    subset=['image file path'],
    keep='first',
    ignore_index=True,
)
masses_training = utils.fix_cbis_path(masses_training_raw_1, 'image file path')
masses_training.head(10)

In [None]:
# View code per image: first character of 'left or right breast' followed by
# the 'image view' value.
# The two columns are walked in row order with zip. The previous
# `column[i] for i in range(len(...))` form looked rows up by index label,
# which only matches row order when the index is exactly 0..n-1, while the
# resulting list is assigned back by position.
image_view = [
    side[0] + view
    for side, view in zip(masses_training['left or right breast'], masses_training['image view'])
]
masses_training['image_view'] = image_view

In [None]:
# Rebind lesion_mammographies to the CBIS-DDSM rows (replacing the BCDR table
# built earlier), keeping three columns and renaming
# 'image file path' -> image_path.
cbis_source_cols = ['patient_id', 'image_view', 'image file path']
cbis_target_cols = ['patient_id', 'image_view', 'image_path']
lesion_mammographies = pd.DataFrame({})
lesion_mammographies[cbis_target_cols] = masses_training[cbis_source_cols]

In [None]:
# One entry per image row: the patient id of every CBIS-DDSM mammography.
lesion_patient_list = lesion_mammographies['patient_id'].tolist()

In [None]:
# Map each CBIS-DDSM patient to their number of image rows and keep only the
# patients with at least four rows.
# The tally is built in one pass; calling list.count() for every element
# rescanned the whole list each time. Key order (first appearance) is kept.
lesion_image_counts = {}
for patient in lesion_patient_list:
    lesion_image_counts[patient] = lesion_image_counts.get(patient, 0) + 1
lesion_dict = {
    patient: images
    for patient, images in lesion_image_counts.items()
    if images >= 4
}
print('Eligible Lesion Patients: ', len(lesion_dict))

In [None]:
# Keep only the rows of eligible CBIS-DDSM patients and tag them 'Suspicious'.
# `.copy()` makes the selection an independent frame, so adding the label
# column does not write into a filtered selection of lesion_mammographies
# (which raises pandas' SettingWithCopyWarning).
lesion_dataframe = lesion_mammographies[lesion_mammographies['patient_id'].isin(lesion_dict.keys())].copy()
lesion_dataframe['label'] = 'Suspicious'