# Data Selection

In [None]:
import random
from collections import Counter

import numpy as np
import pandas as pd

import data_selection_utils as utils

## Normal Mammographies

In [None]:
# Load the BCDR-DN01 image CSV; it is used below as the source of normal cases.
normal_csv_path = 'd:/BCDR/BCDR-DN01_dataset/bcdr_dn01_img.csv'
bcdrN = pd.read_csv(normal_csv_path)
print('Normal Dataset: ', bcdrN.shape)
# Show the first rows for a quick visual check of the table.
bcdrN.head(10)

In [None]:
# Keep only the patient id, view and file name columns, renamed to the
# (patient_id, image_view, image_path) schema shared with the lesion table.
normal_mammographies = bcdrN[['patient_id', 'image_type_name', 'image_filename']].rename(
    columns={'image_type_name': 'image_view', 'image_filename': 'image_path'}
)
# Project helper applied to the 'image_path' column
# (presumably normalises the BCDR paths; confirm in data_selection_utils).
normal_mammographies = utils.fix_bcdr_path(normal_mammographies, 'image_path')

## Lesion Mammographies

In [None]:
# Load the CBIS-DDSM mass training case descriptions.
masses_csv_path = 'd:/CBIS-DDSM/mass_case_description_train_set.csv'
masses_training_raw = pd.read_csv(masses_csv_path)
# Keep a single row (the first) per image file path and renumber the index.
masses_training_raw_1 = masses_training_raw.drop_duplicates(
    subset=['image file path'],
    keep='first',
    ignore_index=True,
)
# Project helper applied to the 'image file path' column
# (presumably normalises the CBIS-DDSM paths; confirm in data_selection_utils).
masses_training = utils.fix_cbis_path(masses_training_raw_1, 'image file path')
# Debug aid: print('Masses Training Data Size: ',masses_training_raw.shape)
masses_training.head(10)

In [None]:
# Build one combined view code per image: the first character of
# 'left or right breast' followed by the 'image view' value.
# The two columns are walked positionally with zip() rather than looked up by
# label (series[i]), so the result does not depend on the index being 0..n-1.
sides = masses_training['left or right breast']
views = masses_training['image view']
image_view = [side[0] + view for side, view in zip(sides, views)]
masses_training['image_view'] = image_view

In [None]:
# Keep only the patient id, combined view code and file path, renamed to the
# (patient_id, image_view, image_path) schema used for the normal table.
lesion_mammographies = masses_training[['patient_id', 'image_view', 'image file path']].rename(
    columns={'image file path': 'image_path'}
)

## Patients with at least 4 views

### Normal Mammographies

In [None]:
def all_in(candidates, sequence):
    """Return True when every element of *sequence* occurs in *candidates*.

    An empty *sequence* yields True. Membership is tested with ``in``, so
    *candidates* can be any container supporting that operator.
    """
    return all(element in candidates for element in sequence)

In [None]:
# Views a patient must have to be kept (the leading spaces are part of the
# values being matched).
required_views = [' RO', ' RCC', ' LO', ' LCC']

# One sub-frame per patient, in groupby order.
groups = [df for _, df in normal_mammographies.groupby('patient_id')]

# Keep only patients whose images cover every required view. The view column is
# turned into a list because `in` on a Series tests the index, not the values.
complete_groups = [
    df for df in groups
    if all_in(list(df['image_view']), required_views)
]

# Concatenate once instead of growing a frame inside the loop (quadratic, and
# concatenating onto an empty DataFrame is deprecated in recent pandas).
# pd.concat raises on an empty list, so fall back to an empty frame.
if complete_groups:
    normal_df = pd.concat(complete_groups, ignore_index=True)
else:
    normal_df = pd.DataFrame()

In [None]:
# One entry per image row, so each patient id repeats once per image.
normal_patient_list = normal_df['patient_id'].tolist()

In [None]:
# Count images per patient in one pass (list.count inside a comprehension was
# quadratic). Counter keeps keys in first-occurrence order, like the original.
normal_image_counts = Counter(normal_patient_list)

# Patients with fewer than 4 images are not eligible.
patients_to_erase = [
    patient for patient, images in normal_image_counts.items() if images < 4
]

# Plain dict {patient_id: image_count} restricted to eligible patients.
normal_dict = {
    patient: images
    for patient, images in normal_image_counts.items()
    if images >= 4
}

In [None]:
# Report how many normal patients passed the 4-image filter.
print('Eligible Normal Patients: ', len(normal_dict))

In [None]:
# Select the image rows of eligible normal patients. The explicit .copy() makes
# the result an independent frame, so adding the label column below does not
# write into a slice of normal_mammographies (SettingWithCopyWarning).
is_eligible_normal = normal_mammographies['patient_id'].isin(list(normal_dict))
normal_dataframe = normal_mammographies[is_eligible_normal].copy()
normal_dataframe['label'] = 'Normal'

### Lesion Mammographies

In [None]:
# One entry per image row, so each patient id repeats once per image.
lesion_patient_list = lesion_mammographies['patient_id'].tolist()

In [None]:
# Count images per patient in one pass (list.count inside a comprehension was
# quadratic). Counter keeps keys in first-occurrence order, like the original.
lesion_image_counts = Counter(lesion_patient_list)

# Patients with fewer than 4 images are not eligible.
patients_to_erase = [
    patient for patient, images in lesion_image_counts.items() if images < 4
]

# Plain dict {patient_id: image_count} restricted to eligible patients.
lesion_dict = {
    patient: images
    for patient, images in lesion_image_counts.items()
    if images >= 4
}
print('Eligible Lesion Patients: ', len(lesion_dict))

In [None]:
# Select the image rows of eligible lesion patients. The explicit .copy() makes
# the result an independent frame, so adding the label column below does not
# write into a slice of lesion_mammographies (SettingWithCopyWarning).
is_eligible_lesion = lesion_mammographies['patient_id'].isin(list(lesion_dict))
lesion_dataframe = lesion_mammographies[is_eligible_lesion].copy()
lesion_dataframe['label'] = 'Suspicious'

## Copying Mammographies

In [None]:
# Stack the labelled normal and lesion tables row-wise (original indices kept).
dfs = [normal_dataframe, lesion_dataframe]
f_df = pd.concat(objs=dfs, axis='index')

In [None]:
# Merge both {patient_id: image_count} mappings; on a key clash the lesion
# entry wins, as it is applied last.
fdict = {**normal_dict, **lesion_dict}

In [None]:
# Shuffle the patient ids in place, then rebuild the mapping so that its
# insertion order follows the shuffled order.
keys = list(fdict)
random.shuffle(keys)
f_dict = {key: fdict[key] for key in keys}

In [None]:
# Pass the combined image table and the shuffled {patient_id: image_count}
# mapping to the project helper.
# NOTE(review): presumably this copies/moves the image files; confirm in
# data_selection_utils. The result is used below as a patient -> label mapping.
patient_list = utils.image_mover(f_df,f_dict)

### Save Ground Truth .csv

In [None]:
# Turn the patient -> label mapping into a two-column table and write it out
# as the ground-truth CSV (without the index column).
gt_patients = list(patient_list.keys())
gt_labels = list(patient_list.values())
patient_name_df = pd.DataFrame({'patient': gt_patients, 'label': gt_labels})
patient_name_df.to_csv('D:/Architecture/patients/patient_gt.csv', index=False)

In [None]:
# Read the ground-truth CSV back so the notebook displays what was written.
pd.read_csv('D:/Architecture/patients/patient_gt.csv')