# Installing Dependencies

In [1]:
!pip install netCDF4


Collecting netCDF4
  Downloading netCDF4-1.7.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (1.8 kB)
Collecting cftime (from netCDF4)
  Downloading cftime-1.6.4.post1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (8.7 kB)
Downloading netCDF4-1.7.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (9.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m9.1/9.1 MB[0m [31m105.2 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading cftime-1.6.4.post1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m56.4 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: cftime, netCDF4
Successfully installed cftime-1.6.4.post1 netCDF4-1.7.2


# Importing Libraries

In [2]:
import os
import gzip
import netCDF4
import numpy as np
from concurrent.futures import ThreadPoolExecutor, as_completed

# Colab-only step: mount Google Drive at /content/drive so the project data
# directories referenced by the later cells can be read and written.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


## Loading the Lat-Long Grid File From Google Drive


In [9]:
# Dataset locations on the mounted Google Drive.
root_dir = "/content/drive/My Drive/Graduate - Academic Work/Semester 3/EEE598 Deep Learning/Project Proposal/Data"
raw_data_dir = root_dir + "/Raw"
processed_data_dir = root_dir + "/Processed"
lat_long_file = root_dir + "/" + "lat_long.gz"


# Decompress the grid file and open the resulting bytes as an in-memory NetCDF
# dataset; the 'dummy' file name is a placeholder because the content is
# supplied through the `memory` argument.
with gzip.open(lat_long_file) as gz:
    # Opening the dataset as a context manager guarantees it is closed (and its
    # buffer released) once the coordinate grids have been copied out.
    with netCDF4.Dataset('dummy', mode='r', memory=gz.read()) as grid_dataset:
        print(grid_dataset.variables)
        # Slicing with [:] reads the variables into arrays that remain usable
        # after the dataset is closed.
        lat_grid_raw = grid_dataset['latitude'][:]
        lon_grid_raw = grid_dataset['longitude'][:]

{'latitude': <class 'netCDF4.Variable'>
float32 latitude(lines, elems)
    long_name: latitude of GHE (positive North)
    units: degrees
    parameter_type: GHE rain
    valid_range: [-65.  65.]
    _FillValue: -9999.0
unlimited dimensions: 
current shape = (4800, 10020)
filling on, 'longitude': <class 'netCDF4.Variable'>
float32 longitude(lines, elems)
    long_name: longitude of GHE (positive East)
    units: degrees
    parameter_type: GHE rain
    valid_range: [-180.  180.]
    _FillValue: -9999.0
unlimited dimensions: 
current shape = (4800, 10020)
filling on}


## Retrieving Samples For Each Rainy Day

In [10]:
# Dates (YYYY/MM/DD) to process. Each date doubles as the sub-directory of
# `raw_data_dir` that holds that day's gzipped NetCDF files.
rainy_days = [
    '2021/07/02', '2021/07/03', '2021/07/04', '2021/07/05', '2021/07/06', '2021/07/07',
    '2021/07/09', '2021/07/10', '2021/07/11', '2021/07/12', '2021/07/13', '2021/07/14',
    '2021/07/15', '2021/07/16', '2021/07/17', '2021/07/18', '2021/07/19', '2021/07/20',
    '2021/07/21', '2021/07/22', '2021/07/23', '2021/07/24', '2021/07/25', '2021/07/26',
    '2021/07/27', '2021/07/28', '2021/07/29', '2021/07/30', '2021/07/31',
    '2021/08/01', '2021/08/02', '2021/08/03', '2021/08/04', '2021/08/06', '2021/08/08',
    '2021/08/09', '2021/08/10', '2021/08/16', '2021/08/19', '2021/08/20', '2021/08/21',
    '2021/08/22', '2021/08/28', '2021/08/29', '2021/08/30', '2021/08/31',
    '2021/09/01', '2021/09/02', '2021/09/03', '2021/09/04', '2021/09/05', '2021/09/06',
    '2021/09/07', '2021/09/08', '2021/09/09', '2021/09/10', '2021/09/11', '2021/09/12',
    '2021/09/13', '2021/09/14', '2021/09/15', '2021/09/17', '2021/09/18', '2021/09/19',
    '2021/09/20', '2021/09/21', '2021/09/22', '2021/09/23', '2021/09/24', '2021/09/25',
    '2021/09/26', '2021/09/27', '2021/09/28', '2021/09/29', '2021/09/30',
]

# Defining region of interest
min_lat, max_lat = 23.65101718199114, 28.571809429628
min_lon, max_lon = 66.40570152424772, 71.273030867477
lat_grid_raw = np.array(lat_grid_raw)
lon_grid_raw = np.array(lon_grid_raw)

# Find the grid rows / columns that fall inside the region of interest.
# Latitude is sampled from the first column and longitude from the first row.
# NOTE(review): this assumes a regular grid where latitude depends only on the
# row and longitude only on the column -- confirm against the grid file.
lat_indices = np.where((lat_grid_raw[:, 0] >= min_lat) & (lat_grid_raw[:, 0] <= max_lat))[0]
lon_indices = np.where((lon_grid_raw[0, :] >= min_lon) & (lon_grid_raw[0, :] <= max_lon))[0]

# Fail early with a readable message instead of an opaque reduction error from
# .min() / .max() on an empty index array.
if lat_indices.size == 0 or lon_indices.size == 0:
    raise ValueError("Region of interest does not overlap the latitude/longitude grid")

# The bounding window is the same for every file, so compute it once.
row_slice = slice(lat_indices.min(), lat_indices.max() + 1)
col_slice = slice(lon_indices.min(), lon_indices.max() + 1)

for date in rainy_days:
    print('Processing: ' + date)
    day_dir = os.path.join(raw_data_dir, date)
    year = date.split('/')[0]
    filenames = os.listdir(day_dir)

    # All files of a given date are written to the same per-year directory.
    output_dir = os.path.join(processed_data_dir, year)
    os.makedirs(output_dir, exist_ok=True)

    for filename in filenames:
        file_path = os.path.join(day_dir, filename)

        with gzip.open(file_path) as gz:
            # The context manager closes the in-memory dataset even if reading
            # the 'rain' variable raises.
            with netCDF4.Dataset('dummy', mode='r', memory=gz.read()) as dataset:
                precipitation = np.array(dataset['rain'][:])

        # Crop first, then clip: only the saved window needs its negative
        # (fill) values zeroed, which yields the same saved data as clipping
        # the whole array.
        precipitation_subset = precipitation[row_slice, col_slice]
        precipitation_subset[precipitation_subset < 0] = 0  # Remove fill values
        np.save(os.path.join(output_dir, filename + '.npy'), precipitation_subset)







Processing: 2021/07/02
Processing: 2021/07/03
Processing: 2021/07/04
Processing: 2021/07/05
Processing: 2021/07/06
Processing: 2021/07/07
Processing: 2021/07/09
Processing: 2021/07/10
Processing: 2021/07/11
Processing: 2021/07/12
Processing: 2021/07/13
Processing: 2021/07/14
Processing: 2021/07/15
Processing: 2021/07/16
Processing: 2021/07/17
Processing: 2021/07/18
Processing: 2021/07/19
Processing: 2021/07/20
Processing: 2021/07/21
Processing: 2021/07/22
Processing: 2021/07/23
Processing: 2021/07/24
Processing: 2021/07/25
Processing: 2021/07/26
Processing: 2021/07/27
Processing: 2021/07/28
Processing: 2021/07/29
Processing: 2021/07/30
Processing: 2021/07/31
Processing: 2021/08/01
Processing: 2021/08/02
Processing: 2021/08/03
Processing: 2021/08/04
Processing: 2021/08/06
Processing: 2021/08/08
Processing: 2021/08/09
Processing: 2021/08/10
Processing: 2021/08/16
Processing: 2021/08/19
Processing: 2021/08/20
Processing: 2021/08/21
Processing: 2021/08/22
Processing: 2021/08/28
Processing: