In [1]:
import os
from os import path
from glob import glob

import cv2
import numpy as np
import pandas as pd
os.chdir('../')
from lib.csaw_utils import get_exam_level_meta, get_patient_level_meta 
from lib.preprocess_utils import read_resize_img,segment_breast,crop_img,add_img_margins,get_max_connected_area
from sklearn.model_selection import train_test_split

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
csaw_data = pd.read_csv('demo/anon_dataset_nonhidden_211125.csv', nrows=100)
train_set, test_set = train_test_split(csaw_data, test_size= 0.1, random_state=32)
train_set, val_set = train_test_split(train_set, test_size= 0.1, random_state=32)

In [6]:
train_set.to_csv('/mnt/h/datasets/csaw_train.csv',index=False)
val_set.to_csv('/mnt/h/datasets/csaw_val.csv',index=False)
test_set.to_csv('/mnt/h/datasets/csaw_test.csv',index=False)

In [2]:
CSAW_DIR = '/mnt/nas4/diskl/MMG/Data/MMG-R1/CSAW_1'
MASK_DIR =  CSAW_DIR + '/anon_annotations_nonhidden'
META_FILE = 'anon_dataset_nonhidden_211125.csv'

In [3]:
csaw_meta = pd.read_csv(path.join(CSAW_DIR, META_FILE))

In [4]:
cancer_images = csaw_meta[csaw_meta.x_case == 1]
cancer_patients = csaw_meta[csaw_meta.x_case == 1].anon_patientid.unique()

In [None]:
import csv
from tqdm import tqdm
fields=['patient,img_file,mask_file']
with open(os.path.join(CSAW_DIR,'csaw_dataset/','cancer_meta.csv'), 'a') as f:
    writer = csv.writer(f)
    # writer.writerow(fields)
    with tqdm(total=len(cancer_images)) as pbar:
        for patient in cancer_patients:
            patient_imgs = csaw_meta[csaw_meta.anon_patientid == patient]
            for img_file in patient_imgs.anon_filename.tolist():
                full_img_pathes = glob(CSAW_DIR+'/*/'+img_file)
                if len(full_img_path)==0 or path.exists(os.path.join(CSAW_DIR,'csaw_dataset/',img_file[:-4]+'.png')):
                    continue
                full_img_path = full_img_pathes[0]
                mask_img_pathes = glob(MASK_DIR+'/'+img_file[:-4]+'_mask.png')
                full_img = read_resize_img(full_img_path, (2000,3000),gs_255=False)
                full_img_segment,bbox = segment_breast(full_img,erosion=False)
                full_img_segment = cv2.resize(full_img_segment.astype(np.uint16),(2000,3000))
                cv2.imwrite(os.path.join(CSAW_DIR,'csaw_dataset/',img_file[:-4]+'.png'),full_img_segment)
                
            
                if len(mask_img_pathes) == 0:
                    writer.writerow([patient,img_file[:-4]+'.png','NULL'])
                else:
                    mask_img_path = mask_img_pathes[0]
                    mask_img =read_resize_img(mask_img_path, (2000,3000),gs_255=True)
                    mask_img = crop_img(mask_img,bbox)
                    mask_img = cv2.resize(mask_img,(2000,3000))
                    cv2.imwrite(os.path.join(CSAW_DIR,'csaw_dataset/',img_file[:-4]+'_mask.png'),mask_img)
                    writer.writerow([patient,img_file[:-4]+'.png',img_file[:-4]+'_mask.png'])
                pbar.update(1)

In [7]:
none_cancer_images = csaw_meta[csaw_meta.x_case == 0]
none_cancer_patients = csaw_meta[csaw_meta.x_case == 0].anon_patientid.unique()

In [8]:
import csv
from tqdm import tqdm
with open(os.path.join(CSAW_DIR,'csaw_dataset/','cancer_meta.csv'), 'a') as f:
    writer = csv.writer(f)
    # writer.writerow(fields)
    with tqdm(total=200) as pbar:
        count = 0
        for patient in none_cancer_patients:
            patient_imgs = csaw_meta[csaw_meta.anon_patientid == patient]
            for img_file in patient_imgs.anon_filename.tolist():
                full_img_pathes = glob(CSAW_DIR+'/*/'+img_file)
                if len(full_img_path)==0 or path.exists(os.path.join(CSAW_DIR,'csaw_dataset/',img_file[:-4]+'.png')):
                    continue
                full_img_path = full_img_pathes[0]
                mask_img_pathes = glob(MASK_DIR+'/'+img_file[:-4]+'_mask.png')
                full_img = read_resize_img(full_img_path, (2000,3000),gs_255=False)
                full_img_segment,bbox = segment_breast(full_img,erosion=False)
                full_img_segment = cv2.resize(full_img_segment.astype(np.uint16),(2000,3000))
                cv2.imwrite(os.path.join(CSAW_DIR,'csaw_dataset/',img_file[:-4]+'.png'),full_img_segment)
                
            
                if len(mask_img_pathes) == 0:
                    writer.writerow([patient,img_file[:-4]+'.png','NULL'])
                else:
                    mask_img_path = mask_img_pathes[0]
                    mask_img =read_resize_img(mask_img_path, (2000,3000),gs_255=True)
                    mask_img = crop_img(mask_img,bbox)
                    mask_img = cv2.resize(mask_img,(2000,3000))
                    cv2.imwrite(os.path.join(CSAW_DIR,'csaw_dataset/',img_file[:-4]+'_mask.png'),mask_img)
                    writer.writerow([patient,img_file[:-4]+'.png',img_file[:-4]+'_mask.png'])
            count += 1
            if count == 200:
                break
            pbar.update(1)

100%|█████████▉| 199/200 [39:20<00:11, 11.86s/it]


In [9]:
images = glob('/mnt/nas4/diskl/MMG/Data/MMG-R1/CSAW_1/csaw_dataset/*.png')

In [12]:
with tqdm(total=len(images)) as pbar:
    for img_f in images:
        img = cv2.imread(img_f, cv2.IMREAD_GRAYSCALE)
        img = cv2.resize(
            img, dsize=(512, 1024), 
            interpolation=cv2.INTER_CUBIC)
        cv2.imwrite(img_f,img)
        pbar.update(1)

100%|██████████| 10317/10317 [24:04<00:00,  7.14it/s]
