In [1]:
# Install required libraries.
!pip install -q boto3 wandb

In [2]:
# Remove data folder if present.
!rm -r ./data

In [3]:
# Configuration of SEVIR data location.
BUCKET_NAME = 'sevir'
DATA_PATH    = '/content/data'
CATALOG_PATH = '/content/CATALOG.csv'
LOCAL_DIRECTORY = './'

# Set the random state for reproducibility.
RANDOM_STATE = 54

# Configuration of dataset sizes.
N_TRAIN = 200
N_VAL = 8
N_TEST_PER_EVENT_TYPE = 5

# Set the image types that each event should contain.
img_types = set(['vis', 'ir069', 'ir107', 'vil'])
# Set data scaling factor.
scaling_factors = { 'vis': 1e-4, 'vil': 1, 'ir069': 1e-2, 'ir107': 1e-2 }

In [4]:
import boto3
from botocore.handlers import disable_signing
import os
from typing import List

def sync_s3_bucket(
    bucket_name: str,
    local_directory: str,
    file_type_to_get: str
    ) -> None:
    s3 = boto3.resource('s3')
    s3.meta.client.meta.events.register('choose-signer.s3.*', disable_signing)
    bucket = s3.Bucket(bucket_name)

    for obj in bucket.objects.all():
        # Generate the local file path by appending the object key to the local directory
        local_file_path = f"{local_directory}/{obj.key}"

        # Get just Random Events of 2018, Storm Events of 2019 and the Catalog.
        if file_type_to_get in obj.key:
          try:
              # Download the object to the local file path
              os.makedirs(os.path.dirname(local_file_path), exist_ok=True)
              print(obj)
              bucket.download_file(obj.key, local_file_path)
          except:
              print(f'error: {obj}')

sync_s3_bucket(BUCKET_NAME, LOCAL_DIRECTORY, 'CATALOG')

s3.ObjectSummary(bucket_name='sevir', key='CATALOG.csv')


In [5]:
import h5py # needs conda/pip install h5py
import pandas as pd

# Read the catalog.
catalog = pd.read_csv(CATALOG_PATH, parse_dates=['time_utc'], low_memory=False)

In [6]:
print('Shape of the catalog:', catalog.shape)

Shape of the catalog: (76004, 21)


In [7]:
catalog.head(5)

Unnamed: 0,id,file_name,file_index,img_type,time_utc,minute_offsets,episode_id,event_id,event_type,llcrnrlat,...,urcrnrlat,urcrnrlon,proj,size_x,size_y,height_m,width_m,data_min,data_max,pct_missing
0,R18032505027684,vis/2018/SEVIR_VIS_RANDOMEVENTS_2018_0321_0331.h5,0,vis,2018-03-25 05:00:00,-120:-115:-110:-105:-100:-95:-90:-85:-80:-75:-...,,,,33.216708,...,36.336627,-87.070254,+proj=laea +lat_0=38 +lon_0=-98 +units=m +a=63...,768,768,384000.0,384000.0,-0.003361,0.0056,0.0
1,R18032505027677,vis/2018/SEVIR_VIS_RANDOMEVENTS_2018_0321_0331.h5,1,vis,2018-03-25 05:00:00,-120:-115:-110:-105:-100:-95:-90:-85:-80:-75:-...,,,,33.084309,...,36.213723,-87.301535,+proj=laea +lat_0=38 +lon_0=-98 +units=m +a=63...,768,768,384000.0,384000.0,-0.003361,0.0056,0.0
2,R18032505027721,vis/2018/SEVIR_VIS_RANDOMEVENTS_2018_0321_0331.h5,2,vis,2018-03-25 05:00:00,-120:-115:-110:-105:-100:-95:-90:-85:-80:-75:-...,,,,46.661866,...,50.883159,-120.009277,+proj=laea +lat_0=38 +lon_0=-98 +units=m +a=63...,768,768,384000.0,384000.0,-0.00287,0.005548,0.0
3,R18032516508249,vis/2018/SEVIR_VIS_RANDOMEVENTS_2018_0321_0331.h5,3,vis,2018-03-25 16:50:00,-120:-115:-110:-105:-100:-95:-90:-85:-80:-75:-...,,,,40.883237,...,43.686191,-79.903987,+proj=laea +lat_0=38 +lon_0=-98 +units=m +a=63...,768,768,384000.0,384000.0,0.029911,0.7856,0.0
4,R18032516507621,vis/2018/SEVIR_VIS_RANDOMEVENTS_2018_0321_0331.h5,4,vis,2018-03-25 16:50:00,-120:-115:-110:-105:-100:-95:-90:-85:-80:-75:-...,,,,44.946047,...,49.169436,-120.575175,+proj=laea +lat_0=38 +lon_0=-98 +units=m +a=63...,768,768,384000.0,384000.0,0.02856,0.79261,0.0


In [8]:
from typing import List, Optional, Set

def get_desired_event_ids(
    catalog: pd.DataFrame,
    img_types: Set[str],
    event_kind: str,
    event_type: Optional[str] = None
    ) -> List[str]:
    # Keep just the catalog entries of the desired event.
    catalog = catalog[catalog['file_name'].str.contains(event_kind)]
    # Filter by event type.
    if event_type:
      catalog = catalog[catalog.event_type == event_type]
    # Keep just the catalog entries with no missing values across the frames.
    catalog = catalog[catalog['pct_missing'] == 0]
    # Group by id and filter out the events that do not have all desired img_types.
    events = catalog.groupby('id').filter(
        lambda x: img_types.issubset(set(x['img_type']))).groupby('id')
    # Get the filtered id keys.
    return list(events.groups.keys())

In [9]:
# Get the event ids of kind RANDOMEVENTS_2018
event_ids = get_desired_event_ids(catalog, img_types, 'RANDOMEVENTS_2018')
print(f'Found {len(event_ids)} RANDOMEVENTS_2018 events matching: {", ".join(img_types)}')

Found 4552 RANDOMEVENTS_2018 events matching: ir069, vil, ir107, vis


In [10]:
from typing import List
import random

def get_sampled_events_ids(
    event_ids: List[str],
    n_samples: int,
    random_state: int) -> List[str]:
    # Start of the range.
    range_start = 0
    # End of the range.
    range_end = len(event_ids)

    # Get the random samples.
    random_generator = random.Random(random_state)
    random_samples = random_generator.sample(range(range_start, range_end), n_samples)

    # Assert that n_tot samples have been selected.
    assert len(pd.unique(random_samples)) == n_samples
    # Return the selected event ids.
    return [event_ids[idx] for idx in random_samples]


In [11]:
# Get total number of events for training and validation.
n_tot = N_TRAIN + N_VAL

# Get the sampled event ids.
sampled_events_ids = get_sampled_events_ids(event_ids, n_tot, RANDOM_STATE)

# Get the train and validation samples.
train_samples_ids = sampled_events_ids[:N_TRAIN]
val_samples_ids = sampled_events_ids[-N_VAL:]

# Assert that the train and validation samples are different.
assert not list(set(train_samples_ids) & set(val_samples_ids))

In [12]:
print(
    'Possible event types:',
    ', '.join([str(event) for event in catalog.event_type.unique()]),
    end='.\n')

Possible event types: nan, Tornado, Thunderstorm Wind, Hail, Funnel Cloud, Flash Flood, Heavy Rain, Flood, Lightning.


In [13]:
# Get possible event types.
event_types = [event for event in catalog.event_type.unique() if type(event) == str]

test_samples_ids = []

for event_type in event_types:
    # Get the event ids of kind STORMEVENTS_2019 by event type
    event_ids = get_desired_event_ids(catalog, img_types, 'STORMEVENTS_2019', event_type)
    print(f'Found {len(event_ids)} STORMEVENTS_2019 of type "{event_type}" events matching: {", ".join(img_types)}')

    # Get the sampled event ids.
    sampled_events_ids = get_sampled_events_ids(event_ids, N_TEST_PER_EVENT_TYPE, RANDOM_STATE)

    # Get the event ids of kind STORMEVENTS_2019 by event type
    test_samples_ids += sampled_events_ids

# Assert 5 events per type have been chosen.
assert len(set(test_samples_ids)) == N_TEST_PER_EVENT_TYPE * len(event_types)


Found 46 STORMEVENTS_2019 of type "Tornado" events matching: ir069, vil, ir107, vis
Found 387 STORMEVENTS_2019 of type "Thunderstorm Wind" events matching: ir069, vil, ir107, vis
Found 207 STORMEVENTS_2019 of type "Hail" events matching: ir069, vil, ir107, vis
Found 16 STORMEVENTS_2019 of type "Funnel Cloud" events matching: ir069, vil, ir107, vis
Found 58 STORMEVENTS_2019 of type "Flash Flood" events matching: ir069, vil, ir107, vis
Found 21 STORMEVENTS_2019 of type "Heavy Rain" events matching: ir069, vil, ir107, vis
Found 49 STORMEVENTS_2019 of type "Flood" events matching: ir069, vil, ir107, vis
Found 11 STORMEVENTS_2019 of type "Lightning" events matching: ir069, vil, ir107, vis


In [14]:
from typing import List
import numpy as np

def get_event_images(
    catalog: pd.DataFrame,
    event_ids: List[str],
    data_type: str,
    DATA_PATH: str
    ) -> np.ndarray:
    events_array = []
    for id in event_ids:
        # Get the row of the event.
        sample_event = catalog.loc[
            (catalog.id == id) & (catalog.img_type == data_type)].iloc[0]
        # Get file name and file index.
        file_name = sample_event.file_name
        file_index = sample_event.file_index
        event_type = sample_event.event_type

        with h5py.File(f'{DATA_PATH}/{file_name}','r') as hf:
            # Get the event images of the event and the data type.
            images = np.array(hf[data_type][file_index])
            # Scale the images by the defined scaling factor.
            images = images * scaling_factors[data_type]
            # Append the event images to the array of events with the event type if present.
            if type(event_type) == str:
                events_array.append((images, event_type))
            else:
                events_array.append(images)

    return np.array(events_array)

### Get train and Validation samples of `ir069`

> Blocco con rientro



In [15]:
import os
import numpy as np

!rm -r ./data

sync_s3_bucket(BUCKET_NAME, LOCAL_DIRECTORY, 'IR069_RANDOMEVENTS_2018')

!du -sh ./data

os.makedirs('./sevir', exist_ok=True)

# Get train samples.
ir069_train_samples = get_event_images(catalog, train_samples_ids, 'ir069', DATA_PATH)
np.save('./sevir/ir069_train_samples.npy', ir069_train_samples)

# Get validation samples.
ir069_val_samples = get_event_images(catalog, val_samples_ids, 'ir069', DATA_PATH)
np.save('./sevir/ir069_val_samples.npy', ir069_val_samples)

rm: cannot remove './data': No such file or directory
s3.ObjectSummary(bucket_name='sevir', key='data/ir069/2018/SEVIR_IR069_RANDOMEVENTS_2018_0101_0430.h5')
s3.ObjectSummary(bucket_name='sevir', key='data/ir069/2018/SEVIR_IR069_RANDOMEVENTS_2018_0501_0831.h5')
s3.ObjectSummary(bucket_name='sevir', key='data/ir069/2018/SEVIR_IR069_RANDOMEVENTS_2018_0901_1231.h5')
17G	./data


### Get train and Validation samples of `ir107`

In [16]:
import os
import numpy as np

!rm -r ./data

sync_s3_bucket(BUCKET_NAME, LOCAL_DIRECTORY, 'IR107_RANDOMEVENTS_2018')

!du -sh ./data

os.makedirs('./sevir', exist_ok=True)

# Get train samples.
ir107_train_samples = get_event_images(catalog, train_samples_ids, 'ir107', DATA_PATH)
np.save('./sevir/ir107_train_samples.npy', ir107_train_samples)

# Get validation samples.
ir107_val_samples = get_event_images(catalog, val_samples_ids, 'ir107', DATA_PATH)
np.save('./sevir/ir107_val_samples.npy', ir107_val_samples)

s3.ObjectSummary(bucket_name='sevir', key='data/ir107/2018/SEVIR_IR107_RANDOMEVENTS_2018_0101_0430.h5')
s3.ObjectSummary(bucket_name='sevir', key='data/ir107/2018/SEVIR_IR107_RANDOMEVENTS_2018_0501_0831.h5')
s3.ObjectSummary(bucket_name='sevir', key='data/ir107/2018/SEVIR_IR107_RANDOMEVENTS_2018_0901_1231.h5')
17G	./data


### Get train and Validation samples of `vil`

In [17]:
import os
import numpy as np

!rm -r ./data

sync_s3_bucket(BUCKET_NAME, LOCAL_DIRECTORY, 'VIL_RANDOMEVENTS_2018')

!du -sh ./data

os.makedirs('./sevir', exist_ok=True)

# Get train samples.
vil_train_samples = get_event_images(catalog, train_samples_ids, 'vil', DATA_PATH)
np.save('./sevir/vil_train_samples.npy', vil_train_samples)

# Get validation samples.
vil_val_samples = get_event_images(catalog, val_samples_ids, 'vil', DATA_PATH)
np.save('./sevir/vil_val_samples.npy', vil_val_samples)

s3.ObjectSummary(bucket_name='sevir', key='data/vil/2018/SEVIR_VIL_RANDOMEVENTS_2018_0101_0430.h5')
s3.ObjectSummary(bucket_name='sevir', key='data/vil/2018/SEVIR_VIL_RANDOMEVENTS_2018_0501_0831.h5')
s3.ObjectSummary(bucket_name='sevir', key='data/vil/2018/SEVIR_VIL_RANDOMEVENTS_2018_0901_1231.h5')
43G	./data


### Get train and Validation samples of `vis`

In [18]:
'''import os
import numpy as np

!rm -r ./data

sync_s3_bucket(BUCKET_NAME, LOCAL_DIRECTORY, 'VIS_RANDOMEVENTS_2018')

!du -sh ./data

os.makedirs('./sevir', exist_ok=True)

# Get train samples.
vis_train_samples = get_event_images(catalog, train_samples_ids, 'vis', DATA_PATH)
np.save('./sevir/vis_train_samples.npy', vis_train_samples)

# Get validation samples.
vis_val_samples = get_event_images(catalog, val_samples_ids, 'vis', DATA_PATH)
np.save('./sevir/vis_val_samples.npy', vis_val_samples)''';

In [19]:
#!du -sh ./data

In [20]:
'''import numpy as np

os.makedirs('./sevir', exist_ok=True)

# Get train samples.
ir069_train_samples = get_event_images(catalog, train_samples_ids, 'ir069', DATA_PATH)
np.save('./sevir/ir069_train_samples.npy', ir069_train_samples)

ir107_train_samples = get_event_images(catalog, train_samples_ids, 'ir107', DATA_PATH)
np.save('./sevir/ir107_train_samples.npy', ir107_train_samples)

vis_train_samples = get_event_images(catalog, train_samples_ids, 'vis', DATA_PATH)
np.save('./sevir/vis_train_samples.npy', vis_train_samples)

vil_train_samples = get_event_images(catalog, train_samples_ids, 'vil', DATA_PATH)
np.save('./sevir/vil_train_samples.npy', vil_train_samples)

# Get validation samples.
ir069_val_samples = get_event_images(catalog, val_samples_ids, 'ir069', DATA_PATH)
np.save('./sevir/ir069_val_samples.npy', ir069_val_samples)

ir107_val_samples = get_event_images(catalog, val_samples_ids, 'ir107', DATA_PATH)
np.save('./sevir/ir107_val_samples.npy', ir107_val_samples)

vis_val_samples = get_event_images(catalog, val_samples_ids, 'vis', DATA_PATH)
np.save('./sevir/vis_val_samples.npy', vis_val_samples)

vil_val_samples = get_event_images(catalog, val_samples_ids, 'vil', DATA_PATH)
np.save('./sevir/vil_val_samples.npy', vil_val_samples)''';

In [21]:
!rm -r ./data

In [22]:
sync_s3_bucket(BUCKET_NAME, LOCAL_DIRECTORY, 'STORMEVENTS_2019')

s3.ObjectSummary(bucket_name='sevir', key='data/ir069/2019/SEVIR_IR069_STORMEVENTS_2019_0101_0630.h5')
s3.ObjectSummary(bucket_name='sevir', key='data/ir069/2019/SEVIR_IR069_STORMEVENTS_2019_0701_1231.h5')
s3.ObjectSummary(bucket_name='sevir', key='data/ir107/2019/SEVIR_IR107_STORMEVENTS_2019_0101_0630.h5')
s3.ObjectSummary(bucket_name='sevir', key='data/ir107/2019/SEVIR_IR107_STORMEVENTS_2019_0701_1231.h5')
s3.ObjectSummary(bucket_name='sevir', key='data/vil/2019/SEVIR_VIL_STORMEVENTS_2019_0101_0630.h5')
s3.ObjectSummary(bucket_name='sevir', key='data/vil/2019/SEVIR_VIL_STORMEVENTS_2019_0701_1231.h5')
s3.ObjectSummary(bucket_name='sevir', key='data/vis/2019/SEVIR_VIS_STORMEVENTS_2019_0101_0131.h5')
s3.ObjectSummary(bucket_name='sevir', key='data/vis/2019/SEVIR_VIS_STORMEVENTS_2019_0201_0228.h5')
s3.ObjectSummary(bucket_name='sevir', key='data/vis/2019/SEVIR_VIS_STORMEVENTS_2019_0301_0331.h5')
s3.ObjectSummary(bucket_name='sevir', key='data/vis/2019/SEVIR_VIS_STORMEVENTS_2019_0401_0430

In [23]:
!du -sh ./data

95G	./data


In [24]:
import os
import numpy as np

os.makedirs('./sevir', exist_ok=True)

# Get and save the test samples.
ir069_test_samples = get_event_images(catalog, test_samples_ids, 'ir069', DATA_PATH)
np.save('./sevir/ir069_test_samples.npy', ir069_test_samples)

ir107_test_samples = get_event_images(catalog, test_samples_ids, 'ir107', DATA_PATH)
np.save('./sevir/ir107_test_samples.npy', ir107_test_samples)

#vis_test_samples = get_event_images(catalog, test_samples_ids, 'vis', DATA_PATH)
#np.save('./sevir/vis_test_samples.npy', vis_test_samples)

vil_test_samples = get_event_images(catalog, test_samples_ids, 'vil', DATA_PATH)
np.save('./sevir/vil_test_samples.npy', vil_test_samples)


  return np.array(events_array)


In [25]:
import numpy as np

class Scaler():
    def __init__(self, dataset: np.ndarray) -> None:
        self.min = dataset.min()
        self.max = dataset.max()

    def scale(self, array: np.ndarray) -> np.ndarray:
        return .5 - (array - self.min) / (self.max - self.min)

    def unscale(self, array: np.ndarray) -> np.ndarray:
        return ((.5 - array) * (self.max - self.min)) + self.min

In [26]:
ir_069_scaler = Scaler(ir069_train_samples)
ir_107_scaler = Scaler(ir107_train_samples)
vil_scaler = Scaler(vil_train_samples)

In [27]:
import pickle
import os

os.makedirs('./scaler', exist_ok=True)

# Save the scalers fit on the train data.
with open('./scaler/ir069.pickle', 'wb') as f:
    pickle.dump(ir_069_scaler, f, pickle.HIGHEST_PROTOCOL)

with open('./scaler/ir107.pickle', 'wb') as f:
    pickle.dump(ir_107_scaler, f, pickle.HIGHEST_PROTOCOL)

with open('./scaler/vil.pickle', 'wb') as f:
    pickle.dump(vil_scaler, f, pickle.HIGHEST_PROTOCOL)

In [28]:
import os
import wandb

wandb.login(key='d3d31786e16c5dffe9ee01690d0bb069cac55e84')

[34m[1mwandb[0m: W&B API key is configured. Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


True

In [29]:
run = wandb.init(project="sevir", job_type="add-dataset")
artifact = wandb.Artifact(name="SEVIR", type="dataset")

# Add dataset directories to artifact
artifact.add_dir(local_path="./sevir")

# Logs the artifact version
run.log_artifact(artifact)

run.finish()

[34m[1mwandb[0m: Currently logged in as: [33mriccardo-spolaor94[0m ([33mai-industry[0m). Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: Tracking run with wandb version 0.15.10
[34m[1mwandb[0m: Run data is saved locally in [35m[1m/content/wandb/run-20230917_162557-33hfqozb[0m
[34m[1mwandb[0m: Run [1m`wandb offline`[0m to turn off syncing.
[34m[1mwandb[0m: Syncing run [33mwoven-sunset-4[0m
[34m[1mwandb[0m: ⭐️ View project at [34m[4mhttps://wandb.ai/ai-industry/sevir[0m
[34m[1mwandb[0m: 🚀 View run at [34m[4mhttps://wandb.ai/ai-industry/sevir/runs/33hfqozb[0m
[34m[1mwandb[0m: Adding directory to artifact (./sevir)... Done. 350.8s
[34m[1mwandb[0m: Waiting for W&B process to finish... [32m(success).[0m
[34m[1mwandb[0m: 🚀 View run [33mwoven-sunset-4[0m at: [34m[4mhttps://wandb.ai/ai-industry/sevir/runs/33hfqozb[0m
[34m[1mwandb[0m: Synced 5 W&B file(s), 0 media file(s), 9 artifact file(s) and 0 other file(s)
[34m

In [30]:
run = wandb.init(project="sevir", job_type="add-scalers")
artifact = wandb.Artifact(name="scalers", type="pickle")

# Add scaler directory to artifact
artifact.add_dir(local_path="./scaler")

# Logs the artifact version
run.log_artifact(artifact)

run.finish()

[34m[1mwandb[0m: Tracking run with wandb version 0.15.10
[34m[1mwandb[0m: Run data is saved locally in [35m[1m/content/wandb/run-20230917_163813-35imscqs[0m
[34m[1mwandb[0m: Run [1m`wandb offline`[0m to turn off syncing.
[34m[1mwandb[0m: Syncing run [33mdutiful-field-5[0m
[34m[1mwandb[0m: ⭐️ View project at [34m[4mhttps://wandb.ai/ai-industry/sevir[0m
[34m[1mwandb[0m: 🚀 View run at [34m[4mhttps://wandb.ai/ai-industry/sevir/runs/35imscqs[0m
[34m[1mwandb[0m: Adding directory to artifact (./scaler)... Done. 0.0s
[34m[1mwandb[0m: Waiting for W&B process to finish... [32m(success).[0m
[34m[1mwandb[0m: 🚀 View run [33mdutiful-field-5[0m at: [34m[4mhttps://wandb.ai/ai-industry/sevir/runs/35imscqs[0m
[34m[1mwandb[0m: Synced 5 W&B file(s), 0 media file(s), 3 artifact file(s) and 0 other file(s)
[34m[1mwandb[0m: Find logs at: [35m[1m./wandb/run-20230917_163813-35imscqs/logs[0m
