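**About**: This notebook prepares the data. For every train and validation record it builds a false-color image, saves the image and its human pixel mask as PNG files, and writes a metadata CSV.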

In [None]:
# Load the autoreload extension and enable mode 2 for this kernel.
%load_ext autoreload
%autoreload 2

In [None]:
# Switch the working directory to ../src/ -- presumably so the project modules
# imported below (util, params, data) can be resolved; all relative paths in
# the following cells are relative to this directory.
cd ../src/

## Initialization

### Imports

In [None]:
import os
import sys
import cv2
import glob
import json
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

from tqdm import tqdm
from sklearn.linear_model import *

# Widen the textual DataFrame display so rows are not wrapped early.
pd.set_option('display.width', 500)
# Use the fully-qualified option name: abbreviated keys such as 'max_colwidth'
# are resolved by pattern matching and can break if pandas adds another option
# with a similar name.
pd.set_option('display.max_colwidth', 100)

In [None]:
from util.plots import *
from params import *
from data.preparation import load_record, get_false_color_img

## Data

In [None]:
# Folder containing the training records (relative to the working directory).
BASE_DIR: str = '../input/train/'

# NOTE(review): presumably the number of frames before the labelled one, and
# therefore the index of the labelled frame on the time axis -- confirm.
N_TIMES_BEFORE: int = 4

# Id of the single record that the next cells load and plot as an example.
record_id: str = '1704010292581573769'

In [None]:
# Load the example record from BASE_DIR (defined in the cell above) rather than
# repeating the same path literal.
bands, masks = load_record(record_id, folder=BASE_DIR)

In [None]:
# Show which entries the loaded record provides: mask keys first, then band keys.
masks.keys(), bands.keys()

In [None]:
# Keep the entry stored under 'human_pixel_masks'; this is the mask that gets
# plotted below and saved by the loop.
mask = masks['human_pixel_masks']

In [None]:
# `bands` and `mask` were already produced by the cells above, so the record is
# not read from disk a second time here.
false_color = get_false_color_img(bands)

# Plot the frame at index N_TIMES_BEFORE (= 4, set in the config cell) of the
# last axis together with the human mask, instead of a bare magic number.
plot_sample(false_color[..., N_TIMES_BEFORE], mask)

## Loop

In [None]:
# Collect one path per record from the train and validation folders.
# glob returns matches in arbitrary, OS-dependent order, so each split is
# sorted to make the processing order (and the row order of the metadata)
# reproducible. Train records still come before validation records.
# NOTE(review): DATA_PATH is not defined in this notebook -- presumably it
# comes from `from params import *`.
folders = (
    sorted(glob.glob(DATA_PATH + "train/*"))
    + sorted(glob.glob(DATA_PATH + "validation/*"))
)

In [None]:
# When True, the output folders are created and the loop writes image/mask PNGs.
SAVE: bool = True
# When True, every record is plotted; otherwise only every 1000th one is.
PLOT: bool = False

In [None]:
# Root folder for everything this notebook writes.
SAVE_FOLDER = "../input/false_color/"

if SAVE:
    # Create the root plus one sub-folder per artefact type (images, masks).
    for subdir in ("", "img/", "mask/"):
        os.makedirs(SAVE_FOLDER + subdir, exist_ok=True)

In [None]:
# Convert every record into a false-color PNG plus a mask PNG and collect one
# metadata row per record.
metadata = []
for i, folder in enumerate(tqdm(folders)):
    # basename/normpath instead of split('/'): also correct with Windows path
    # separators or a trailing slash.
    record_id = os.path.basename(os.path.normpath(folder))
    img_path = SAVE_FOLDER + "img/" + record_id + ".png"
    mask_path = SAVE_FOLDER + "mask/" + record_id + ".png"

    # `folder` is already the full path of the record, hence the empty prefix.
    # NOTE(review): presumably load_record joins `folder` and the id -- confirm.
    bands, masks = load_record(folder, folder="")

    # Keep only the frame at index N_TIMES_BEFORE (= 4) of the last axis.
    false_color = get_false_color_img(bands)
    img = false_color[..., N_TIMES_BEFORE]
    mask = masks['human_pixel_masks']
    h, w = img.shape[:2]

    # Plot everything when PLOT is set, otherwise every 1000th record as a
    # visual sanity check.
    if PLOT or (i + 1) % 1000 == 0:
        plot_sample(img, mask)

    if SAVE:
        # NOTE(review): the * 255 scaling assumes `img` values lie in [0, 1]
        # -- confirm against get_false_color_img.
        outputs = (
            (img_path, (img * 255).astype(np.uint8)),
            (mask_path, mask.astype(np.uint8)),
        )
        for path, array in outputs:
            # cv2.imwrite signals failure by returning False rather than
            # raising, so check it to avoid silently missing files.
            if not cv2.imwrite(path, array):
                raise IOError("Could not write " + path)

    metadata.append({
        "record_id": record_id,
        "folder": folder + "/",
        "h": h,
        "w": w,
        "has_contrail": mask.sum() > 0,
        "img_path": img_path,
        "mask_path": mask_path,
    })

In [None]:
# One row per processed record.
df = pd.DataFrame(metadata)

# Honour the SAVE flag here as well: with SAVE = False the output folder may
# not exist and nothing else is written to disk either.
if SAVE:
    df.to_csv(SAVE_FOLDER + "df.csv", index=False)

df.head()

In [None]:
# Sanity check: number of metadata rows next to the number of files in the
# mask and image folders (presumably all three should match after a clean run).
len(df), len(os.listdir(SAVE_FOLDER + "mask/")), len(os.listdir(SAVE_FOLDER + "img/"))

Done!