#### Code to extract test data

In [None]:
%load_ext autoreload
%autoreload 2

In [None]:
cd ../src

## Imports

In [None]:
import os
import cv2
import glob
import gdcm
import pydicom
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

from PIL import Image
from tqdm.notebook import tqdm

In [None]:
from params import *
from data.extraction import *

### Params

In [None]:
# Fall back to the main data folder when the dedicated DICOM folder does not
# exist on this machine.
DCM_PATH = DCM_PATH if os.path.exists(DCM_PATH) else DATA_PATH

In [None]:
# Side length (in pixels) the extracted images are resized to.
SIZE = 512
# Name of the sub-folder of DCM_PATH that holds the DICOM files to extract.
DATABASE = "test"
# Output folder for the extracted PNG files.
SAVE_DIR = DATA_PATH + f"{DATABASE}_{SIZE}/"

# makedirs(exist_ok=True) also creates missing parent folders and does not
# fail when the folder already exists (e.g. when the cell is re-run).
os.makedirs(SAVE_DIR, exist_ok=True)

In [None]:
# Load the sample submission file; it is filtered and merged with the DICOM
# listing by prepare_df_test below.
sub = pd.read_csv(DATA_PATH + "sample_submission.csv")

In [None]:
def prepare_df_test(sub, root):
    """
    Builds the dataframe of test images to extract.

    DICOM files are looked up under `root` following the layout
    `<root>/<study>/<series>/<image>.dcm`. Each file yields an id of the form
    `<image>_image` together with the name of its study folder.

    Args:
        sub (pandas DataFrame): Submission dataframe. Expected to have an "id"
            and a "PredictionString" column.
        root (str): Folder containing the study folders.

    Returns:
        pandas DataFrame: Rows of `sub` whose "PredictionString" contains
            "none", merged with the DICOM listing on their shared columns,
            which adds the "StudyInstanceUID" column.
    """
    img_ids = []
    study_ids = []

    # Build and parse paths with os.path so this does not depend on "/" being
    # the separator used in the paths returned by glob.
    for dcm_path in glob.glob(os.path.join(root, "*", "*", "*.dcm")):
        series_dir, file_name = os.path.split(dcm_path)
        study = os.path.basename(os.path.dirname(series_dir))

        img_ids.append(os.path.splitext(file_name)[0] + "_image")
        study_ids.append(study)

    df = pd.DataFrame({"id": img_ids, "StudyInstanceUID": study_ids})

    # na=False: rows with a missing PredictionString are dropped instead of
    # raising a TypeError.
    is_image_row = sub["PredictionString"].str.contains(
        "none", regex=False, na=False
    )
    sub = sub[is_image_row].reset_index(drop=True)

    # Inner merge on the columns common to both frames.
    return sub.merge(df)

In [None]:
# List the images to extract from the DICOM files found under
# DCM_PATH/DATABASE.
df = prepare_df_test(sub, os.path.join(DCM_PATH, DATABASE))

In [None]:
# Columns of the per-image metadata table, in output order.
info_columns = [
    "study_id",
    "series_id",
    "image_id",
    "save_name",
    "shape",
    "shape_crop",
    "window",
    "crop_starts",
    "photometric_interpretation",
    "series_number",
    "instance_number",
]

records = []
rows = zip(df['StudyInstanceUID'], df['id'])
for i, (study, row_id) in enumerate(tqdm(rows, total=len(df))):
    # Ids look like "<image>_image"; keep the part before the first "_".
    image = row_id.split('_')[0]

    # The series folder is not known in advance: search every series folder
    # of the study for the image.
    study_path = os.path.join(DCM_PATH, DATABASE, study)
    img_path = None
    for series_id in os.listdir(study_path):
        candidate = os.path.join(study_path, series_id, image + ".dcm")
        if os.path.exists(candidate):
            img_path = candidate
            break

    if img_path is None:
        print(f'Image {i} not found')
        continue

    # read_xray, auto_windowing and remove_padding come from data.extraction.
    # NOTE(review): presumably they apply intensity windowing and crop the
    # image borders - confirm against their definitions.
    img, meta = read_xray(img_path)
    shape = img.shape

    img, window = auto_windowing(img)

    img, crop_starts = remove_padding(img)
    shape_crop = img.shape

    # Display one image out of 500 as a visual sanity check.
    if i % 500 == 0:
        plt.figure(figsize=(9, 9))
        plt.imshow(img, cmap="gray")
        plt.axis(False)
        plt.show()

    # Resize to a SIZE x SIZE square, whatever the original aspect ratio.
    img = cv2.resize(img, (SIZE, SIZE), interpolation=cv2.INTER_LINEAR)

    save_name = f"{study}_{image}.png"
    # cv2.imwrite reports failure through its return value instead of raising;
    # do not record metadata for an image that was not written.
    if not cv2.imwrite(SAVE_DIR + save_name, img):
        print(f'Image {i} could not be saved to {SAVE_DIR + save_name}')
        continue

    records.append({
        "study_id": study,
        "series_id": series_id,
        "image_id": image,
        "save_name": save_name,
        "shape": shape,
        "shape_crop": shape_crop,
        "window": window,
        "crop_starts": crop_starts,
        "photometric_interpretation": meta.PhotometricInterpretation,
        "series_number": meta.SeriesNumber,
        "instance_number": meta.InstanceNumber,
    })

# Build the table once from the collected rows. Passing the columns explicitly
# keeps the csv header even if no image was processed.
infos = pd.DataFrame(records, columns=info_columns)
infos.to_csv(DATA_PATH + f'df_{DATABASE}_{SIZE}.csv', index=False)

In [None]:
# Preview the first rows of the extracted metadata table.
infos.head()