# Code pour effectuer les pre-processing sur les données allemandes et les tranformeer en Tensor (TFRecord)

https://github.com/yyyujintang/PostRainBench?tab=readme-ov-file

In [63]:
import os
import gc
import numpy as np
import xarray as xr
import pandas as pd
import tensorflow as tf
import matplotlib.pyplot as plt
import sklearn.utils as utils

In [46]:
load_path = 'data/'
save_path = 'preprocessed/'


### Splits train, test and validation data

In [47]:
set = "01_tst" #test, vld ou trn

In [48]:
time = np.load(load_path + set + '_t.npy')
x = np.load(load_path + set + '_x.npy')
c = x[:,:,:,97]
y = np.load(load_path + set + '_y.npy')
print('x =', x.shape, '---> y =', y.shape)

x = (2671, 36, 36, 143) ---> y = (2671, 72, 72)


In [49]:
## exclude nans
nans = np.isnan(y)
n_nans = np.sum(np.sum(nans, axis=1), axis=1)
mask = n_nans == 0
print('Removed', (np.sum(~mask)/y.shape[0]) * 100 , '% of timesteps' )
x = x[mask]
c = c[mask]
y = y[mask]
time = time[mask]

Removed 0.0 % of timesteps


In [50]:
## Split taking 4 days per month
time = pd.DatetimeIndex(time)
mask_test  = np.isin(time.day, [1, 9, 17, 25])
mask_valid = np.isin(time.day, [5, 13, 21, 28])
mask_train = np.logical_or(mask_test, mask_valid) ## Is it right?
mask_train = np.logical_not(mask_train)
print('Number of common elements from groups =', np.sum(mask_test * mask_valid * mask_train))

Number of common elements from groups = 0


In [51]:
tst_x = x[mask_test]
tst_c = c[mask_test]
tst_y = y[mask_test]
tst_t = time[mask_test]

vld_x = x[mask_valid]
vld_c = c[mask_valid]
vld_y = y[mask_valid]
vld_t = time[mask_valid]

trn_x = x[mask_train]
trn_c = c[mask_train]
trn_y = y[mask_train]
trn_t = time[mask_train]

print('Train: x =', trn_x.shape, '---> y =', trn_y.shape, 'cosmo:', trn_c.shape, 'time:', trn_t.shape)
print('Test: x =', tst_x.shape, '---> y =', tst_y.shape,  'cosmo:', tst_c.shape, 'time:', tst_t.shape)
print('Valid: x =', vld_x.shape, '---> y =', vld_y.shape, 'cosmo:', vld_c.shape, 'time:', vld_t.shape)

Train: x = (0, 36, 36, 143) ---> y = (0, 72, 72) cosmo: (0, 36, 36) time: (0,)
Test: x = (2671, 36, 36, 143) ---> y = (2671, 72, 72) cosmo: (2671, 36, 36) time: (2671,)
Valid: x = (0, 36, 36, 143) ---> y = (0, 72, 72) cosmo: (0, 36, 36) time: (0,)


In [52]:
np.save(save_path + set + '_t.npy', trn_t)


In [53]:
x_mean = np.mean(trn_x, axis=0)
x_std  = np.std(trn_x, axis=0)

trn_x = (trn_x - x_mean)/x_std
tst_x = (tst_x - x_mean)/x_std
vld_x = (vld_x - x_mean)/x_std

print('Train: x =', trn_x.shape, '---> y =', trn_y.shape)
print('Test: x =', tst_x.shape, '---> y =', tst_y.shape)
print('Valid: x =', vld_x.shape, '---> y =', vld_y.shape)

  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = um.true_divide(
  ret = _var(a, axis=axis, dtype=dtype, out=out, ddof=ddof,
  arrmean = um.true_divide(arrmean, div, out=arrmean,
  ret = um.true_divide(


Train: x = (0, 36, 36, 143) ---> y = (0, 72, 72)
Test: x = (2671, 36, 36, 143) ---> y = (2671, 72, 72)
Valid: x = (0, 36, 36, 143) ---> y = (0, 72, 72)


In [54]:
if(set == "01_tst"):
    s_x = tst_x
    s_y = tst_y
    s_c = tst_c
elif(set == "01_trn"):
    s_x = trn_x
    s_y = trn_y
    s_c = trn_c
elif(set == "01_vld"):
    s_x = vld_x
    s_y = vld_y
    s_c = vld_c

np.save(save_path + set + '_x.npy', s_x)
np.save(save_path + set + '_y.npy', s_y)
np.save(save_path + set + '_c.npy', s_c)

In [55]:
del time, x, y, c, trn_x, trn_y, trn_c, tst_x, tst_y, tst_c, vld_x, vld_y, vld_c, s_x, s_y, s_c

# Convolutional approach

## Exec une fois qu'on a fait les pre preprocess precedents pour les 3 datasets (train test val)

In [56]:
#trn_x = np.load(save_path + '01_trn_x.npy')
#trn_y = np.load(save_path + '01_trn_y.npy')
tst_x = np.load(save_path + '01_tst_x.npy')
tst_y = np.load(save_path + '01_tst_y.npy')
vld_x = np.load(save_path + '01_vld_x.npy')
vld_y = np.load(save_path + '01_vld_y.npy')
#print('Train: x =', trn_x.shape, '---> y =', trn_y.shape)
print('Test: x =', tst_x.shape, '---> y =', tst_y.shape)
print('Valid: x =', vld_x.shape, '---> y =', vld_y.shape)

Test: x = (2671, 36, 36, 143) ---> y = (2671, 72, 72)
Valid: x = (2725, 36, 36, 143) ---> y = (2725, 72, 72)


### Write tfrecords

In [60]:
def write_tfrecords(x, y, n_records, name):

    x = np.array_split(x, n_records)
    y = np.array_split(y, n_records)

    ## Write n_records files
    for i, (forecast, prec) in enumerate(zip(x, y)):

        ## Inside each file do:
        with tf.io.TFRecordWriter('tfrecord/'+ name + '/{:03d}'.format(i) +'.tfrecord') as tfrecord:

            for idx in range(2, forecast.shape[0]):


                features = {

                    'feature' : tf.train.Feature(float_list=tf.train.FloatList( value = forecast[idx-2:idx].flatten() )),
                    'label'   : tf.train.Feature(float_list=tf.train.FloatList( value = prec[idx].flatten()     ))}

                example = tf.train.Example(features=tf.train.Features(feature=features))
                tfrecord.write(example.SerializeToString())

        print(name, str(i)+'/'+str(n_records)+' wrote')

In [62]:
#write_tfrecords(trn_x, trn_y, 100, 'train_set')
write_tfrecords(tst_x, tst_y, 10, 'test_set')
write_tfrecords(vld_x, vld_y, 10, 'validation_set')

test_set 0/10 wrote
test_set 1/10 wrote
test_set 2/10 wrote
test_set 3/10 wrote
test_set 4/10 wrote
test_set 5/10 wrote
test_set 6/10 wrote
test_set 7/10 wrote
test_set 8/10 wrote
test_set 9/10 wrote
validation_set 0/10 wrote
validation_set 1/10 wrote
validation_set 2/10 wrote
validation_set 3/10 wrote
validation_set 4/10 wrote
validation_set 5/10 wrote
validation_set 6/10 wrote
validation_set 7/10 wrote
validation_set 8/10 wrote
validation_set 9/10 wrote
