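**About**: This notebook prepares the data. It inspects sample records and the metadata, fits linear band-combination coefficients on one record, and exports the resulting images, masks, and a metadata CSV.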

In [None]:
%load_ext autoreload
%autoreload 2

In [None]:
cd ../src/

## Initialization

### Imports

In [None]:
import os
import sys
import cv2
import glob
import json
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

from tqdm import tqdm
from sklearn.linear_model import *

# Widen pandas' console output so wide frames and long strings stay readable.
# Options are addressed by their full names: abbreviated names are only
# pattern-matched and can become ambiguous if pandas adds similar options.
pd.set_option('display.width', 500)
pd.set_option('display.max_colwidth', 100)

In [None]:
from util.plots import *
from params import *
from data.preparation import load_record, get_false_color_img

## Data

In [None]:
# Directory whose entries are used as record ids (see the random-record cell further down).
BASE_DIR = '../input/train/'

# NOTE(review): not referenced anywhere else in this notebook; presumably it
# explains the `[..., 4]` index used when slicing bands — confirm.
N_TIMES_BEFORE = 4

# Record used by the first exploratory cells.
record_id = '1704010292581573769'

In [None]:
# Load the bands and masks of the selected record.
bands, masks = load_record(record_id, folder="../input/train/")

In [None]:
# Show which entries the loaded masks and bands dictionaries contain.
masks.keys(), bands.keys()

In [None]:
# Keep the 'human_pixel_masks' entry; it is the mask plotted and fitted against below.
mask = masks['human_pixel_masks']

In [None]:
# Load bands and mask together so the plot never pairs freshly loaded bands
# with a `mask` variable left over from an earlier cell.
bands, masks = load_record(record_id, folder=BASE_DIR)
mask = masks['human_pixel_masks']

false_color = get_false_color_img(bands)

# Plot the frame at index 4 of the last axis next to the mask.
plot_sample(false_color[..., 4], mask)

### Metadata

In [None]:
# Load the metadata of both splits. The context managers close the files
# right after parsing instead of leaving the handles open.
with open(DATA_PATH + "train_metadata.json") as f:
    train_meta = json.load(f)
with open(DATA_PATH + "validation_metadata.json") as f:
    val_meta = json.load(f)

In [None]:
# Stack the train and validation metadata into a single frame with a fresh 0..n-1 index.
df_meta = pd.concat([
    pd.DataFrame(train_meta),
    pd.DataFrame(val_meta),
], ignore_index=True)

In [None]:
# Look at the raw timestamp of the first row, before any conversion.
df_meta['timestamp'][0]

In [None]:
# Sanity-check the conversion on one literal epoch value given in seconds.
# Passing the unit avoids converting to nanoseconds through float arithmetic.
ts = pd.Timestamp(1571008800, unit="s")

In [None]:
# Display the parsed timestamp.
ts

In [None]:
# Calendar date of the parsed timestamp.
ts.date()

In [None]:
# Convert the epoch-second timestamps to datetimes in one vectorized call.
# Unlike multiplying each value by 1e9, this does not raise when the cell is
# re-run on a column that already holds datetimes.
df_meta['timestamp'] = pd.to_datetime(df_meta['timestamp'], unit='s')

In [None]:
# Derive the calendar date and the hour of day through the vectorized `.dt`
# accessor rather than a Python lambda per row.
df_meta['date'] = df_meta['timestamp'].dt.date
df_meta['hour'] = df_meta['timestamp'].dt.hour

In [None]:
# Display the metadata frame with the added date and hour columns.
df_meta

In [None]:
# Check that the first converted value is accepted by pd.Timestamp.
pd.Timestamp(df_meta['timestamp'][0])

In [None]:
# Rows count as duplicates when they agree on every one of these columns.
dup_keys = ["projection_wkt", "row_min", "row_size", "col_min", "col_size", "date", "hour"]

# keep=False flags every member of a duplicated group, not only the repeats.
is_dup = df_meta.duplicated(subset=dup_keys, keep=False)
df_dup = df_meta[is_dup].reset_index(drop=True)

In [None]:
# Show the duplicated rows that share the `row_min` value of the first duplicated row.
df_dup[df_dup['row_min'] == df_dup['row_min'][0]]

In [None]:
# NOTE(review): presumably one of the duplicated entries listed above — confirm.
record_id = "3029407526724839139"

# Use the shared BASE_DIR constant instead of repeating the folder literal.
bands, masks = load_record(record_id, folder=BASE_DIR)

mask = masks['human_pixel_masks']
false_color = get_false_color_img(bands)

# Plot the frame at index 4 of the last axis next to the mask.
plot_sample(false_color[..., 4], mask)

In [None]:
# NOTE(review): presumably the counterpart of the record plotted in the previous cell — confirm.
record_id = "1009826116837013927"

# Use the shared BASE_DIR constant instead of repeating the folder literal.
bands, masks = load_record(record_id, folder=BASE_DIR)

mask = masks['human_pixel_masks']
false_color = get_false_color_img(bands)

# Plot the frame at index 4 of the last axis next to the mask.
plot_sample(false_color[..., 4], mask)

### Regression

In [None]:
# Record on which the regression coefficients below are fitted.
record_id = '1704010292581573769'

In [None]:
# Load the record and keep its 'human_pixel_masks' entry as the regression target source.
bands, masks = load_record(record_id, folder="../input/train/")
mask = masks['human_pixel_masks']

In [None]:
# NOTE(review): repeats the assignment made in the previous cell; harmless but redundant.
mask = masks['human_pixel_masks']

In [None]:
# Design matrix: one row per pixel, one column per band, taken at index 4 of
# the last axis. The band count comes from the data instead of a literal 9.
frames = np.array(list(bands.values()))[..., 4]
x = frames.reshape(len(frames), -1).T

# Min-max scale every band. A constant band has a zero range, which would
# give 0 / 0 = NaN; dividing by 1 instead keeps that column at 0.
x_min = x.min(0, keepdims=True)
x_span = x.max(0, keepdims=True) - x_min
x = (x - x_min) / np.where(x_span > 0, x_span, np.ones_like(x_span))

In [None]:
# Regression target: the mask flattened to one value per pixel.
y = np.array(mask.flatten())

In [None]:
# Accumulators filled by the model cells below: one normalized image and one
# coefficient vector per fitted model. Re-running a model cell appends again.
gens = []
coefs = []

In [None]:
# Ordinary least squares from the scaled band values to the mask.
model = LinearRegression()
model.fit(x, y)

coefs.append(model.coef_)
print("Coefs", dict(zip(bands.keys(), np.round(model.coef_, 4))))

# Weighted sum of the bands for every pixel (the intercept is left out),
# folded back into the image grid given by the mask instead of a fixed 256x256.
gen = (x @ model.coef_).reshape(mask.shape[:2])
gen = normalize_range(gen)

gens.append(gen)

# Repeat the single channel three times so it can be shown as an image.
gen = np.array([gen, gen, gen]).transpose(1, 2, 0)

In [None]:
# Show the image produced by the last fitted model next to the mask.
plot_sample(gen, mask)

In [None]:
# Same pipeline as the previous fit, with an L2-regularized linear model.
model = Ridge(alpha=1, max_iter=10000)
model.fit(x, y)

coefs.append(model.coef_)
print("Coefs", dict(zip(bands.keys(), np.round(model.coef_, 4))))

# Weighted sum of the bands for every pixel (the intercept is left out),
# folded back into the image grid given by the mask instead of a fixed 256x256.
gen = (x @ model.coef_).reshape(mask.shape[:2])
gen = normalize_range(gen)

gens.append(gen)

# Repeat the single channel three times so it can be shown as an image.
gen = np.array([gen, gen, gen]).transpose(1, 2, 0)

In [None]:
# Show the image produced by the last fitted model next to the mask.
plot_sample(gen, mask)

In [None]:
# Same pipeline as the previous fits, with an L1-regularized linear model.
model = Lasso(alpha=0.001, max_iter=10000)
model.fit(x, y)

coefs.append(model.coef_)
# Printed with 8 decimals, unlike the 4 used for the other two models.
print("Coefs", dict(zip(bands.keys(), np.round(model.coef_, 8))))

# Weighted sum of the bands for every pixel (the intercept is left out),
# folded back into the image grid given by the mask instead of a fixed 256x256.
gen = (x @ model.coef_).reshape(mask.shape[:2])
gen = normalize_range(gen)

gens.append(gen)

# Repeat the single channel three times so it can be shown as an image.
gen = np.array([gen, gen, gen]).transpose(1, 2, 0)

In [None]:
# Show the image produced by the last fitted model next to the mask.
plot_sample(gen, mask)

In [None]:
# Compare the coefficient each model assigns to each band.
# The band names are materialized as a list: matplotlib expects a sequence,
# not a dict view.
band_names = list(bands.keys())

fig, ax = plt.subplots()
for c in coefs:
    ax.scatter(band_names, c, marker="x")
ax.set(title="Fitted coefficients per band", xlabel="Band", ylabel="Coefficient")
plt.show()

In [None]:
# np.save("../output/coefs.npy", np.array(coefs))

In [None]:
# Design matrix: one row per pixel, one column per band, taken at index 4 of
# the last axis. The band count comes from the data instead of a literal 9.
frames = np.array(list(bands.values()))[..., 4]
x = frames.reshape(len(frames), -1).T

# Min-max scale every band. A constant band has a zero range, which would
# give 0 / 0 = NaN; dividing by 1 instead keeps that column at 0.
x_min = x.min(0, keepdims=True)
x_span = x.max(0, keepdims=True) - x_min
x = (x - x_min) / np.where(x_span > 0, x_span, np.ones_like(x_span))

In [None]:
# Side length of the image, assuming the pixel rows of `x` come from a square grid.
hw = int(np.sqrt(x.shape[0]))

# One output channel per stored coefficient vector.
gen_ = np.matmul(x, np.array(coefs).T)

# Min-max scale every channel independently, then restore the image layout.
gen_low = gen_.min(0, keepdims=True)
gen_high = gen_.max(0, keepdims=True)
gen_ = (gen_ - gen_low) / (gen_high - gen_low)
gen_ = gen_.reshape(hw, hw, -1)

In [None]:
# Show the multi-channel image (one channel per coefficient vector) next to the mask.
plot_sample(gen_, mask)

#### Inference

In [None]:
# Prefer the coefficients stored on disk. When the file is missing (the cell
# that would save it is commented out), fall back to the coefficients fitted
# earlier in this session instead of stopping with FileNotFoundError.
COEFS_PATH = "../output/coefs.npy"
if os.path.exists(COEFS_PATH):
    coefs = np.load(COEFS_PATH)
else:
    print(f"{COEFS_PATH} not found, using the coefficients fitted in this session")
    coefs = np.array(coefs)

In [None]:
# Plot the coefficient vectors currently in `coefs`, one marker series per vector.
# The band names are materialized as a list: matplotlib expects a sequence,
# not a dict view.
band_names = list(bands.keys())

fig, ax = plt.subplots()
for c in coefs:
    ax.scatter(band_names, c, marker="x")
ax.set(title="Coefficients per band", xlabel="Band", ylabel="Coefficient")
plt.show()

In [None]:
# Visit the records of BASE_DIR in random order until one has a non-empty mask.
# The directory is listed once rather than on every retry, and the search
# ends with an explicit error instead of looping forever when no record qualifies.
record_ids = os.listdir(BASE_DIR)
for record_id in np.random.permutation(record_ids):
    bands, masks = load_record(record_id, folder=BASE_DIR)
    mask = masks['human_pixel_masks']
    if mask.sum():
        break
else:
    raise RuntimeError(f"No record with a non-empty mask found in {BASE_DIR}")

print(record_id)

In [None]:
def _min_max_scale(values):
    """Rescale `values` to [0, 1] along axis 0; constant columns become 0."""
    low = values.min(0, keepdims=True)
    span = values.max(0, keepdims=True) - low
    # A zero span would give 0 / 0 = NaN; dividing by 1 keeps such columns at 0.
    safe_span = np.where(span > 0, span, np.ones_like(span))
    return (values - low) / safe_span


def apply_coefs(bands, coefs, frame=4):
    """
    Combine the bands of one record into image channels with linear coefficients.

    Every band is sliced at `frame` on its last axis, min-max scaled, and
    multiplied by each coefficient vector; every resulting channel is then
    min-max scaled again.

    Args:
        bands (dict): Maps a band name to an array whose last axis is indexed by `frame`.
        coefs (array-like [n_channels x n_bands]): One coefficient vector per output
            channel, ordered like the values of `bands`.
        frame (int, optional): Index taken on the last axis of every band. Defaults to 4.

    Returns:
        np.ndarray [h x w x n_channels]: Channels scaled to [0, 1].

    Raises:
        ValueError: If `bands` is empty or `coefs` does not hold one value per band.
    """
    if not len(bands):
        raise ValueError("bands is empty")

    stack = np.array(list(bands.values()))[..., frame]  # n_bands x (pixels...)
    n_bands = stack.shape[0]

    weights = np.array(coefs)
    if weights.ndim == 0 or weights.shape[-1] != n_bands:
        raise ValueError(
            f"coefs must hold one value per band ({n_bands}), got shape {weights.shape}"
        )

    # One row per pixel, one column per band.
    x = _min_max_scale(stack.reshape(n_bands, -1).T)

    if stack.ndim == 3:
        # Use the real grid so that non-square images are supported.
        h, w = stack.shape[1:3]
    else:
        # Flat pixel layout: keep the previous square-image assumption.
        h = w = int(np.sqrt(x.shape[0]))

    gen_ = _min_max_scale(np.matmul(x, weights.T))
    gen_ = gen_.reshape(h, w, -1)

    return gen_

In [None]:
# Apply the loaded coefficients to the record selected above.
gen_ = apply_coefs(bands, coefs)

In [None]:
# for i in range(3):
#     plot_sample(np.array([gen_[:, :, i], gen_[:, :, i], gen_[:, :, i]]).transpose(1, 2, 0), mask)

In [None]:
# plot_sample(gen_, mask)

In [None]:
# false_color = get_false_color_img(bands)
# false_color = false_color[..., 4]

# false_color = false_color.reshape(-1, 3)
# false_color = (false_color - false_color.min(0, keepdims=True)) / (false_color.max(0, keepdims=True) - false_color.min(0, keepdims=True))
# false_color = false_color.reshape(256, 256, 3)

# for i in range(3):
#     plot_sample(1 - np.array([false_color[:, :, i], false_color[:, :, i], false_color[:, :, i]]).transpose(1, 2, 0), mask)

In [None]:
# false_color = get_false_color_img(bands)

# plot_sample(false_color[..., 4], mask)

### Bands

In [None]:
# for k in bands.keys():
#     print(f"Band {k}")
#     img = normalize_range(bands[k][..., 4])
#     img = np.array([img, img, img]).transpose(1, 2, 0)

#     plot_sample(img, mask)
# #     break

## Loop

In [None]:
from params import *

In [None]:
# Record folders of both splits. glob returns matches in arbitrary order, so
# each listing is sorted to make the export and the resulting CSV reproducible.
folders = sorted(glob.glob(DATA_PATH + "train/*")) + sorted(glob.glob(DATA_PATH + "validation/*"))

In [None]:
# SAVE: create the output folders and write the generated PNGs.
SAVE = True
# PLOT: plot every generated sample; when False only every 1000th record is plotted.
PLOT = False

In [None]:
# Root of the exported dataset; images and masks go to separate sub-folders.
SAVE_FOLDER = "../input/reg/"
if SAVE:
    for sub_dir in ("", "img/", "mask/"):
        os.makedirs(SAVE_FOLDER + sub_dir, exist_ok=True)

In [None]:
# Export loop: for every record folder, reuse the PNGs already on disk or
# build them from the raw bands, and collect one metadata row per record.
metadata = []
for i, folder in enumerate(tqdm(folders)):
    # basename/normpath instead of split('/'): also correct when the path
    # uses another separator or ends with one.
    record_id = os.path.basename(os.path.normpath(folder))
    img_path = SAVE_FOLDER + "img/" + record_id + ".png"
    mask_path = SAVE_FOLDER + "mask/" + record_id + ".png"

    img, mask = None, None
    if os.path.exists(img_path) and os.path.exists(mask_path):
        # cv2.imread returns None (no exception) for unreadable files, e.g. a
        # PNG truncated by an interrupted run; such records are rebuilt below.
        img = cv2.imread(img_path)
        mask = cv2.imread(mask_path, 0)

    if img is None or mask is None:
        # The full folder path is passed as the record id with an empty prefix.
        bands, masks = load_record(folder, folder="")

        img = apply_coefs(bands, coefs)
        mask = masks['human_pixel_masks']

        if PLOT or not ((i + 1) % 1000):
            plot_sample(img, mask)

        if SAVE:
            cv2.imwrite(img_path, (img * 255).astype(np.uint8))
            cv2.imwrite(mask_path, mask.astype(np.uint8))
    else:
        # Add a trailing channel axis to the mask read back in grayscale mode.
        mask = mask[:, :, None]

    h, w = img.shape[:2]

    metadata.append({
        "record_id": record_id,
        "folder": folder + "/",
        "h": h,
        "w": w,
        "has_contrail": mask.sum() > 0,
        "img_path": img_path,
        "mask_path": mask_path,
    })

#     if i > 20:
#         break

In [None]:
# One row per record. The CSV is written regardless of the SAVE flag, so
# SAVE_FOLDER must already exist at this point.
df = pd.DataFrame(metadata)
df.to_csv(SAVE_FOLDER + "df.csv", index=False)
df.head()

In [None]:
# Compare the number of metadata rows with the number of files in mask/ and img/.
len(df), len(os.listdir(SAVE_FOLDER + "mask/")), len(os.listdir(SAVE_FOLDER + "img/"))

In [None]:
# sns.histplot(df.h)

Done!