# Data Preprocessing

In [None]:
# Mount Google Drive at /content/gdrive/ so the project folder can be read and written.
# google.colab is only importable inside a Colab runtime.
from google.colab import drive
drive.mount('/content/gdrive/')

Mounted at /content/gdrive/


In [None]:
import os
# Make the project folder on Drive the working directory, so that relative paths
# used by later cells (e.g. 'test.h5') resolve inside that folder.
path_of_this_jupyer_notebook_without_filename=r'/content/gdrive/MyDrive/CEE498_Project'
os.chdir(path_of_this_jupyer_notebook_without_filename)

In [1]:
# Extract the training archive stored on Drive into /content.
# Later cells expect the extracted .h5 files under /content/training.
!unzip "/content/gdrive/MyDrive/CEE498_Project/training.zip" -d "/content";

In [None]:
import numpy as np
import h5py
import os
import torch
import torch.nn as nn
from tqdm import tqdm

### Example of lowering resolution of inputs

In [None]:
# Reduce the number of pixels from 495*436 to 70*72.
# Every output cell summarises a 7*6 window of input pixels, i.e. about
# 700m * 600m for 100m * 100m input pixels. The pooling layer uses no padding,
# so the last 5 rows and last 4 columns, which do not fill a window, are dropped.

# Copy the whole array into memory and close the file right away.
with h5py.File('/content/training/2019-06-06_CHICAGO_8ch.h5','r') as h5_in:
  f = np.array(h5_in['array']) # original shape (288, 495, 436, 8)
f_mov_array = np.moveaxis(f,-1,1) # channels first: shape (288, 8, 495, 436)
print('first move axis',f_mov_array.shape)

f_v = f_mov_array[:,[0,2,4,6],:,:] # even channels: volume
f_s = f_mov_array[:,[1,3,5,7],:,:] # odd channels: speed

m = nn.MaxPool2d((7,6))

# Volume: maximum over each window.
f_v = torch.from_numpy(f_v).float() # numpy to torch
output_v = m(f_v) # to shape(288, 4, 70, 72)
print('after max pooling of volume',output_v.shape)

# Speed: smallest non-zero value in each window, or 0 if the window is all zeros.
# Only a max pool is available, so the minimum is computed as -max(-x).
# Zeros are first replaced by the sentinel 256 so they cannot win the minimum;
# a pooled value of 256 therefore means the window held only zeros.
# NOTE(review): assumes every real speed value is below 256 - confirm against the data.
f_s = torch.from_numpy(f_s).float() # numpy to torch
f_s = f_s.where(f_s!=0, torch.tensor(256.0)) * (-1)
output_s = m(f_s) * (-1) # to shape(288, 4, 70, 72)
output_s = output_s.where(output_s!=256, torch.tensor(0.0))
print('after min pooling of speed',output_s.shape)

# Back to channels-last. The channel order is now the 4 volume channels
# followed by the 4 speed channels, not interleaved as in the input.
output_v = np.moveaxis(output_v.numpy(),1,-1)
output_s = np.moveaxis(output_s.numpy(),1,-1)
output_final = np.concatenate((output_v,output_s),axis=-1) # to shape(288, 70, 72, 8)
print('final output shape',output_final.shape)
# The result is kept as a NumPy array; convert with torch.from_numpy when it is used.

first move axis (288, 8, 495, 436)
after max pooling of volume torch.Size([288, 4, 70, 72])
after min pooling of speed torch.Size([288, 4, 70, 72])
final output shape (288, 70, 72, 8)


In [None]:
# write into h5 file
file = h5py.File('test.h5','w')
file.create_dataset('data', data = output_final)
file.close()

In [None]:
# Read the example file back from the project folder as a round-trip check.
# np.array copies the dataset into memory, so the file can be closed immediately.
with h5py.File('/content/gdrive/MyDrive/CEE498_Project/test.h5','r') as h5_in:
  aa = np.array(h5_in['data'])

---------


# Preprocess the data

## Train data *2019-01-02 -- 2019-05-31*

In [None]:
# extract file name from 2019-01-02
train_file= [sorted(os.listdir(r'/content/training'))[0][:10]] 

# extract matrices from 2019-01-02
f = np.array(h5py.File('/content/training/'+sorted(os.listdir(r'/content/training'))[100],'r')['array']) # 0, 50, 100

f_mov_array = np.moveaxis((f),-1,1) # to shape(288, 8, 495, 436)

f_v = f_mov_array[:,[0,2,4,6],:,:] # index by volume
f_s = f_mov_array[:,[1,3,5,7],:,:] # index by speed

m = nn.MaxPool2d((7, 6)) 

# Max volume
f_v = torch.from_numpy(f_v).float() # numpy to torch
output_v = m(f_v) # to shape(288, 4, 70, 72)

# Max (0,Min(speed))
f_s = torch.from_numpy(f_s).float() # numpy to torch
f_s = f_s.where(f_s!=0, torch.tensor(256.0)) * (-1)
output_s = m(f_s) * (-1) # to shape(288, 4, 70, 72)
output_s = output_s.where(output_s!=256, torch.tensor(0.0))

output_v = np.moveaxis(output_v.numpy(),1,-1) 
output_s = np.moveaxis(output_s.numpy(),1,-1) 
train_data = np.concatenate((output_v,output_s),axis=-1) # to shape(288, 70, 72, 8)

type(train_data),train_data.shape

(numpy.ndarray, (288, 70, 72, 8))

In [None]:
# Append the remaining days of the current chunk to train_file / train_data.
# The chunk seeded above starts at file 100 and is saved as Train_7072_3.h5, so
# this loop covers files 101..149 (49 files, giving 50 * 288 = 14400 rows in total).
# For the other chunks use 1:50 (seed 0) or 51:100 (seed 50).
training_files = sorted(os.listdir(r'/content/training'))

m = nn.MaxPool2d((7, 6)) # identical for every file, so build it once

# Collect the per-day arrays and join them once at the end; concatenating
# inside the loop would re-copy everything accumulated so far on each iteration.
daily_arrays = [train_data]

for filename in tqdm(training_files[101:150]):
  # extract the date
  train_file.append(filename[:10])
  # extract matrice and change the resolution
  with h5py.File('/content/training/'+filename,'r') as h5_in:
    f = np.array(h5_in['array'])

  f_mov_array = np.moveaxis(f,-1,1) # to shape(288, 8, 495, 436)

  f_v = f_mov_array[:,[0,2,4,6],:,:] # index by volume
  f_s = f_mov_array[:,[1,3,5,7],:,:] # index by speed

  # Volume: maximum over each 7*6 window
  f_v = torch.from_numpy(f_v).float() # numpy to torch
  output_v = m(f_v) # to shape(288, 4, 70, 72)

  # Speed: smallest non-zero value per window, 0 if the window is all zeros
  f_s = torch.from_numpy(f_s).float() # numpy to torch
  f_s = f_s.where(f_s!=0, torch.tensor(256.0)) * (-1) # zeros become -256, the smallest value
  output_s = m(f_s) * (-1) # to shape(288, 4, 70, 72)
  output_s = output_s.where(output_s!=256, torch.tensor(0.0)) # convert 256 back to 0

  output_v = np.moveaxis(output_v.numpy(),1,-1)
  output_s = np.moveaxis(output_s.numpy(),1,-1)
  output_final = np.concatenate((output_v,output_s),axis=-1) # to shape(288, 70, 72, 8)

  daily_arrays.append(output_final)

# Kept as a NumPy array; convert to torch only when the data is used.
train_data = np.concatenate(daily_arrays)
train_data.shape

100%|██████████| 49/49 [04:34<00:00,  5.59s/it]


(14400, 70, 72, 8)

In [None]:
# write into h5 file
file = h5py.File('Train_7072_3.h5','w')
file.create_dataset('data', data = train_data)
file.close()

In [None]:
# Move the training chunk into the project folder on Drive.
# The destination is absolute because an earlier cell changes the working
# directory to the project folder; the relative path gdrive/MyDrive/... only
# resolves when this cell is run from /content. If the file is already in the
# project folder, mv just reports that source and destination are the same file.
!mv Train_7072_3.h5 /content/gdrive/MyDrive/CEE498_Project/

## Test data *2019-06-01 -- 2019-06-30*

In [None]:
# extract file name from 2019-06-01
test_file= [sorted(os.listdir(r'/content/training'))[150][:10]] 

# extract matrices from 2019-06-01
f = np.array(h5py.File('/content/training/'+sorted(os.listdir(r'/content/training'))[150],'r')['array'])

f_mov_array = np.moveaxis((f),-1,1) # to shape(288, 8, 495, 436)

f_v = f_mov_array[:,[0,2,4,6],:,:] # index by volume
f_s = f_mov_array[:,[1,3,5,7],:,:] # index by speed

m = nn.MaxPool2d((7,6)) 

# Max volume
f_v = torch.from_numpy(f_v).float() # numpy to torch
output_v = m(f_v) # to shape(288, 4, 70, 72)

# Max (0,Min(speed))
f_s = torch.from_numpy(f_s).float() # numpy to torch
f_s = f_s.where(f_s!=0, torch.tensor(256.0)) * (-1)
output_s = m(f_s) * (-1) # to shape(288, 4, 70, 72)
output_s = output_s.where(output_s!=256, torch.tensor(0.0))

output_v = np.moveaxis(output_v.numpy(),1,-1) 
output_s = np.moveaxis(output_s.numpy(),1,-1) 
test_data = np.concatenate((output_v,output_s),axis=-1) # to shape(288, 70, 72, 8)

type(test_data),test_data.shape

(numpy.ndarray, (288, 70, 72, 8))

In [None]:
# Append the remaining test days (files 151..179, 2019-06-02 -- 2019-06-30)
# to test_file / test_data.
training_files = sorted(os.listdir(r'/content/training'))

m = nn.MaxPool2d((7, 6)) # identical for every file, so build it once

# Collect the per-day arrays and join them once at the end; concatenating
# inside the loop would re-copy everything accumulated so far on each iteration.
daily_arrays = [test_data]

for filename in tqdm(training_files[151:180]):
  # extract the date
  test_file.append(filename[:10])
  # extract matrice and change the resolution
  with h5py.File('/content/training/'+filename,'r') as h5_in:
    f = np.array(h5_in['array'])

  f_mov_array = np.moveaxis(f,-1,1) # to shape(288, 8, 495, 436)

  f_v = f_mov_array[:,[0,2,4,6],:,:] # index by volume
  f_s = f_mov_array[:,[1,3,5,7],:,:] # index by speed

  # Volume: maximum over each 7*6 window
  f_v = torch.from_numpy(f_v).float() # numpy to torch
  output_v = m(f_v) # to shape(288, 4, 70, 72)

  # Speed: smallest non-zero value per window, 0 if the window is all zeros
  f_s = torch.from_numpy(f_s).float() # numpy to torch
  f_s = f_s.where(f_s!=0, torch.tensor(256.0)) * (-1) # zeros become -256, the smallest value
  output_s = m(f_s) * (-1) # to shape(288, 4, 70, 72)
  output_s = output_s.where(output_s!=256, torch.tensor(0.0)) # convert 256 back to 0

  output_v = np.moveaxis(output_v.numpy(),1,-1)
  output_s = np.moveaxis(output_s.numpy(),1,-1)
  output_final = np.concatenate((output_v,output_s),axis=-1) # to shape(288, 70, 72, 8)

  daily_arrays.append(output_final)

# Kept as a NumPy array; convert to torch only when the data is used.
test_data = np.concatenate(daily_arrays)
test_data.shape

100%|██████████| 29/29 [02:23<00:00,  4.97s/it]


(8640, 70, 72, 8)

In [None]:
# write into h5 file
file = h5py.File('Test_7072.h5','w')
file.create_dataset('data', data = test_data)
file.close()

In [None]:
# Move the test set written by the previous cell into the project folder on Drive.
# (This cell used to move Train2.h5, a file this notebook never creates.)
# The destination is absolute because an earlier cell changes the working
# directory to the project folder; the relative path gdrive/MyDrive/... only
# resolves when this cell is run from /content. If the file is already in the
# project folder, mv just reports that source and destination are the same file.
!mv Test_7072.h5 /content/gdrive/MyDrive/CEE498_Project/