# Create Metadataset
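This notebook creates a metadataset from the raw data, with one row per satellite tile. Besides the `tile_id` and the `in_train` flag, the metadataset contains the following columns, each stored as a fraction of the tile's pixels (between 0 and 1) rather than as a percentage:
- `cloud`: fraction of clouds
- `land`: fraction of land
- `missing_landsat`: fraction of missing landsat data
- `kelp`: fraction of kelp (if labeled, i.e. only for training tiles)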

The metadataset is saved in `../data/processed/metadata.csv`

In [2]:
from pathlib import Path

import numpy as np
import pandas as pd
import rasterio
from tqdm.notebook import tqdm

Start by reading the raw metadataset

In [3]:
# Load the raw metadata table that ships with the dataset and preview it.
raw_metadata_csv = '../data/raw/metadata_fTq0l2T.csv'
metadata = pd.read_csv(raw_metadata_csv)
metadata.head()

Unnamed: 0,tile_id,filename,md5_hash,filesize_bytes,type,in_train
0,JW725114,JW725114_satellite.tif,97b19f0747260df89e23f33caced3632,1105392,satellite,True
1,UX493605,UX493605_satellite.tif,fbdd888f115ede68ee83996071c007dc,1270901,satellite,True
2,OU500661,OU500661_satellite.tif,2003e7d0eaf10894e796721b5d344eeb,1238008,satellite,True
3,DC227980,DC227980_satellite.tif,8498ccaff72b8d7cfcc7f9404baf36f2,1252483,satellite,True
4,SS602790,SS602790_satellite.tif,525618743f94a065158f432bd5ee2d32,1535964,satellite,True


In [4]:
# Keep only the rows that describe satellite images.
is_satellite = metadata['type'].eq('satellite')
metadata = metadata.loc[is_satellite]

In [5]:
def process_image(tile_id: str, filename: str, in_train: bool) -> pd.Series:
    """Compute per-tile summary statistics for one satellite image.

    Args:
        tile_id: Identifier of the tile. Used to locate the kelp label file
            (``<tile_id>_kelp.tif``) when ``in_train`` is true.
        filename: File name of the satellite image inside the train/test
            satellite folder.
        in_train: Whether the tile belongs to the training set. Selects the
            folder the image is read from and whether a kelp label is read.

    Returns:
        A Series indexed by ``tile_id``, ``cloud``, ``land``,
        ``missing_landsat``, ``kelp`` and ``in_train``. ``cloud``, ``land``,
        ``missing_landsat`` and ``kelp`` are divided by the number of pixels
        in the tile, so they are fractions rather than percentages. ``kelp``
        is ``None`` when ``in_train`` is false.
    """
    folder = Path('../data/raw/train_satellite' if in_train else '../data/raw/test_satellite')
    with rasterio.open(folder / filename) as src:
        img = src.read()  # indexed below as (band, row, col)
    imsize = img.shape[1] * img.shape[2]

    # Band 5 is summed as-is -- assumes it is a 0/1 cloud mask (TODO confirm).
    cloud = np.sum(img[5, :, :]) / imsize
    # Band 6 > 0 is counted as land -- presumably an elevation band; verify.
    land = np.sum(img[6, :, :] > 0) / imsize
    # A pixel counts as missing when any of bands 0-4 is negative there.
    missing_mask = np.any(img[0:5, :, :] < 0, axis=0)
    missing_landsat = np.sum(missing_mask) / imsize

    kelp = None
    if in_train:
        kelp_file = Path('../data/raw/train_kelp') / f'{tile_id}_kelp.tif'
        with rasterio.open(kelp_file) as src:
            kelp_label = src.read()
        # Normalised by the satellite image's pixel count.
        kelp = np.sum(kelp_label) / imsize

    return pd.Series([tile_id, cloud, land, missing_landsat, kelp, in_train],
                     index=['tile_id', 'cloud', 'land', 'missing_landsat', 'kelp', 'in_train'])

In [6]:
# Build the whole result frame in a single row-wise apply (instead of
# appending rows one at a time), with a tqdm progress bar.
tqdm.pandas()


def _stats_for_row(row):
    """Forward the relevant columns of one metadata row to process_image."""
    return process_image(row['tile_id'], row['filename'], row['in_train'])


df = metadata.progress_apply(_stats_for_row, axis=1)

  0%|          | 0/7061 [00:00<?, ?it/s]

  dataset = DatasetReader(path, driver=driver, sharing=sharing, **kwargs)


In [7]:
# Persist the per-tile statistics. Create the output folder first so the save
# also works on a fresh checkout where ../data/processed does not exist yet
# (to_csv raises OSError for a missing parent directory).
output_path = Path('../data/processed/metadata.csv')
output_path.parent.mkdir(parents=True, exist_ok=True)
df.to_csv(output_path, index=False)