# Data Prep

This script splits the meteorological data into training and validation data sets of the following kinds:
    1. Single frame
    2. Extreme Frames
    3. Middle Frames
These will be used for the single-frame, late-fusion, and early- & slow-fusion models respectively.

The split between training and validation is made by year. The validation years are as follows:
1. 1997
2. 2001
3. 2005
4. 2009
5. 2013

This is to encourage a range of training and validation data across the time frame of this study.

In [1]:
from tqdm.notebook import tqdm
import numpy as np
import os

## 1. Load Data

To begin, the meteorological data is loaded from pre-extracted files into training and validation lists. These lists are then used to create the training and validation sets for each kind of data set highlighted in the introduction.

In [2]:
# Years held out for validation; every other year from 1994 to 2015 is used for training.
validation_years = [1997, 2001, 2005, 2009, 2013]
training_years = [
    year for year in range(1994, 2016) if year not in validation_years
]

# Input locations: the rainfall values and the root folder of the meteorological files.
rainfall_file = "./data/rainfall/truth_rf.npy"
data_folder = "E:/31-12-2020/forecastee-data/"

In [3]:
def load_data(years, data_folder, rainfall_file, mo_rainfall_file="./data/rainfall/mo_rf.npy"):
    """ Loads the meteorology (mean sea level pressure and 2m air temperature) and the rainfall
        for every month of each year provided. For each month the MSLP and 2m air temperature are
        combined into a single matrix (expected size [2, time, 61, 121]) and the matching rows of
        both rainfall tables are collected. A month is skipped, with a message, when any of its
        files cannot be loaded or when it has no row in either rainfall table, so the three
        outputs always stay aligned month by month.
        Parameters:
            years (iterable<int>): The years to be extracted for.
            data_folder (string): Root folder of the meteorological data. Files are read from
                                  "<data_folder><variable>/forecasted-months/<month>-<year>.npy"
                                  for the variables 'msl' and 't2m'.
            rainfall_file (string): Where is the CEH-GEAR rainfall stored? Rows are expected in the
                                    format [month, year, region_0_rainfall, ..., region_12_rainfall].
            mo_rainfall_file (string): Where is the Met Office rainfall stored (same row format)?
        Returns:
            List<Numpy Matrix>: List of monthly matrices, msl first and t2m second.
            Numpy Matrix: CEH-GEAR Rainfall values for each month loaded, in the format:
                            [Month, Year, rain_region_0, ..., rain_region_12]
            Numpy Matrix: Met Office Rainfall values for each month loaded, in the same format.
            Note: the rainfall outputs are squeezed, so a single loaded month yields a 1D array."""
    # Honour the paths passed in by the caller rather than hard-coded locations.
    rainfall = np.load(rainfall_file)
    mo_rainfall = np.load(mo_rainfall_file)
    monthly_meteo = []
    monthly_rain = []
    monthly_mo_rain = []
    for y in tqdm(years):
        for m in range(1, 13):
            month_data = []
            source = None  # what was being read when a failure happened (for the message)
            try:
                for source in ['msl', 't2m']:
                    data_file = data_folder + "{}/forecasted-months/{}-{}.npy".format(source, m, y)
                    data = np.load(data_file)
                    if len(data.shape) != 3:
                        # Files with an extra leading axis: keep only its first entry.
                        data = data[0, :, :, :]
                    month_data.append(data)
                # Get rainfall values
                source = 'rainfall'
                mrain = rainfall[(rainfall[:, 0] == m) & (rainfall[:, 1] == y), :]
                morain = mo_rainfall[(mo_rainfall[:, 0] == m) & (mo_rainfall[:, 1] == y), :]
                if len(mrain) == 0 or len(morain) == 0:
                    # An empty selection would misalign meteorology and rainfall downstream.
                    raise LookupError("no rainfall row for this month")
            except Exception as e:
                # Best effort: report the reason and carry on with the next month.
                print("Unable to load {}/{}-{} ({})".format(source, m, y, e))
            else:
                monthly_meteo.append(np.array(month_data))
                monthly_rain.append(mrain)
                monthly_mo_rain.append(morain)
    return monthly_meteo, np.squeeze(monthly_rain), np.squeeze(monthly_mo_rain)

In [5]:
# Load the training years, the validation years and the complete 1994-2015 period.
training_meteo_raw, training_rainfall, training_mo_rainfall = load_data(
    years=training_years, data_folder=data_folder, rainfall_file=rainfall_file
)
validation_meteo_raw, validation_rainfall, validation_mo_rainfall = load_data(
    years=validation_years, data_folder=data_folder, rainfall_file=rainfall_file
)
all_meteo_raw, all_rainfall, all_mo_rainfall = load_data(
    years=range(1994, 2016), data_folder=data_folder, rainfall_file=rainfall_file
)

HBox(children=(FloatProgress(value=0.0, max=17.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=5.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=22.0), HTML(value='')))




## 2. Preparation Methods

The methods defined below split a given list of numpy matrices into a regular-sized training matrix. After extracting each data set they are saved for use later. Firstly, we define a folder to hold the resulting data sets:

In [6]:
# Folder prefix for the prepared data sets saved below (output file names are appended to it).
prepared_data_folder = "D:/PHD_DATA/Video_01-03-2021/prepared-data/"

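Next, the per-pixel mean and standard deviation profiles are extracted for both MSLP and 2AT. The result has shape (2, 2, 61, 121): index 0 along the first axis holds the means and index 1 the standard deviations, each of shape (2, 61, 121) covering MSLP and 2AT.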

In [8]:
def combine_all(meteo_data):
    """ Joins all monthly matrices into one matrix along the time dimension (axis 1).
        Parameters:
            - meteo_data List<Numpy Matrix>: The monthly matrices; they must agree on every
              dimension except axis 1.
        Returns:
            Numpy Matrix: The matrices joined in input order along axis 1. None if the input
                          is empty; the single input matrix itself if there is only one."""
    months = list(tqdm(meteo_data))
    if not months:
        return None
    if len(months) == 1:
        return months[0]
    # Concatenate once: growing the result month by month would re-copy everything
    # accumulated so far on every iteration.
    return np.concatenate(months, axis=1)

def get_stats(meteo_data, output_folder="D:/PHD_DATA/Video_01-03-2021/prepared-data/"):
    """ Returns the mean and standard deviation over the time dimension, cached in "stats.npy".
        If "stats.npy" already exists in the output folder it is loaded as-is and meteo_data is
        not consulted; otherwise the statistics are computed, saved and then read back.
        Parameters:
            - meteo_data List<Numpy Matrix>: The monthly matrices, joined along axis 1 before
              the statistics are taken.
            - output_folder (string): Folder prefix (including the trailing separator) holding
              "stats.npy".
        Returns:
            Numpy Matrix: Means at index 0 and standard deviations at index 1 of the first axis;
                          (2, 2, 61, 121) for inputs of size [2, time, 61, 121]."""
    stats_path = output_folder + "stats.npy"
    if os.path.exists(stats_path):
        return np.load(stats_path)
    merged = combine_all(meteo_data)
    profile = np.array([np.mean(merged, axis=1), np.std(merged, axis=1)])
    np.save(stats_path, profile)
    # Read back from disk so the returned value is exactly what later runs will load.
    return np.load(stats_path)
    
# Mean / standard deviation profiles over all loaded years (or the cached stats.npy, if one
# already exists in get_stats' default output folder).
stats = get_stats(all_meteo_raw)

HBox(children=(FloatProgress(value=0.0, max=264.0), HTML(value='')))




### 2.1 Single Frame

This first method averages across all days in each month to provide an average forecast MSLP and 2AT. The validation and training sets are then saved under the names defined below.

In [9]:
# Output files for the single-frame data sets.
training_file, validation_file, all_file = (
    prepared_data_folder + file_name
    for file_name in ("single_train.npy", "single_valid.npy", "single_all.npy")
)

In [10]:
def single_frame(monthly_data, stats):
    """ Collapses the time dimension of every monthly matrix by averaging, then standardises the
        averaged frame pixel by pixel, so that all months end up with the same size.
        Parameters:
            - monthly_data List<Numpy Matrix>: The matrices, each should have a size of [2, time, 61, 121].
            - stats Numpy Matrix (2, 2, 61, 121): The MSLP & 2AT means (0), and standard deviations (1) of each pixel.
        Returns:
            List<Numpy Matrix>: One standardised time-mean matrix of size [2, 61, 121] per input
                                month, in input order."""
    means, stds = stats[0, :, :, :], stats[1, :, :, :]
    return [(np.mean(month, axis=1) - means) / stds for month in monthly_data]

In [11]:
# Build the single-frame version of each raw data set.
training_single = single_frame(monthly_data=training_meteo_raw, stats=stats)
validation_single = single_frame(monthly_data=validation_meteo_raw, stats=stats)
all_single = single_frame(monthly_data=all_meteo_raw, stats=stats)

In [12]:
# Persist the single-frame data sets.
np.save(file=training_file, arr=training_single)
np.save(file=validation_file, arr=validation_single)
np.save(file=all_file, arr=all_single)

### 2.2 Middle Frames

This final method takes the middle 28 days of data and combines them into a single matrix. 28 days is chosen because this is the minimum number of days in a month. These are then also saved as separate datasets.

In [13]:
# Output files for the middle-frames data sets.
training_file, validation_file, all_file = (
    prepared_data_folder + file_name
    for file_name in ("middle_train.npy", "middle_valid.npy", "middle_all.npy")
)

In [14]:
def middle_frames(monthly_data, stats, n_frames=28):
    """ Takes the middle n_frames entries across the time dimension in each matrix and
        standardises them pixel by pixel, producing matrices of equal size. The input
        matrices are left unmodified.
        Parameters:
            - monthly_data List<Numpy Matrix>: The matrices, each should have a size of [2, time, 61, 121].
            - stats Numpy Matrix (2, 2, 61, 121): The MSLP & 2AT means (0), and standard deviations (1) of each pixel.
            - n_frames (int): Number of consecutive time entries to keep from each month (default 28).
        Returns:
            Numpy Matrix:   The standardised middle frames of every month.
                            Size: [no. months, 2, n_frames, 61, 121]
        Raises:
            ValueError: If a month has fewer than n_frames entries in its time dimension."""
    # Insert a time axis so the per-pixel statistics broadcast over every selected frame.
    means = stats[0, :, np.newaxis, :, :]
    stds = stats[1, :, np.newaxis, :, :]
    matrices = []
    for m in monthly_data:
        available = m.shape[1]
        if available < n_frames:
            raise ValueError(
                "Month has {} time entries, at least {} are required".format(available, n_frames))
        # Centre the window; any odd surplus frame is dropped from the end of the month.
        start_index = (available - n_frames) // 2
        window = m[:, start_index:start_index + n_frames, :, :]
        # The arithmetic builds a new array: `window` is only a view, so writing the result
        # back into it would overwrite the caller's raw data.
        matrices.append((window - means) / stds)
    return np.array(matrices)

In [15]:
# Build the middle-frames version of each raw data set.
training_middle = middle_frames(monthly_data=training_meteo_raw, stats=stats)
validation_middle = middle_frames(monthly_data=validation_meteo_raw, stats=stats)
all_middle = middle_frames(monthly_data=all_meteo_raw, stats=stats)

In [16]:
# Persist the middle-frames data sets.
np.save(file=training_file, arr=training_middle)
np.save(file=validation_file, arr=validation_middle)
np.save(file=all_file, arr=all_middle)

### 2.3 Rainfall

Now, save the rainfall values in training and validation files.

In [17]:
# Output files for the unscaled CEH-GEAR rainfall values.
training_file, validation_file, all_file = (
    prepared_data_folder + file_name
    for file_name in ("expected_train.npy", "expected_valid.npy", "expected_all.npy")
)

In [18]:
# Persist the unscaled CEH-GEAR rainfall values.
np.save(file=training_file, arr=training_rainfall)
np.save(file=validation_file, arr=validation_rainfall)
np.save(file=all_file, arr=all_rainfall)

In [19]:
# Output files for the unscaled Met Office rainfall values.
training_file, validation_file, all_file = (
    prepared_data_folder + file_name
    for file_name in ("mo_train.npy", "mo_valid.npy", "mo_all.npy")
)

In [20]:
# Persist the unscaled Met Office rainfall values.
np.save(file=training_file, arr=training_mo_rainfall)
np.save(file=validation_file, arr=validation_mo_rainfall)
np.save(file=all_file, arr=all_mo_rainfall)

Next, we min-max scale the rainfall separately for each region, using that region's minimum and maximum rainfall.

In [22]:
def get_stats(rainfall, output_folder="D:/PHD_DATA/Video_25-02-2021/prepared-data/"):
    """ Returns the minimum and maximum of every rainfall column, cached in "rainfall_stats.npy".
        If "rainfall_stats.npy" already exists in the output folder it is loaded as-is and
        rainfall is not consulted; otherwise the statistics are computed, saved and read back.
        NOTE(review): this definition replaces the get_stats defined earlier in this file for
        the meteorological statistics, and its default folder ("Video_25-02-2021") differs from
        the one used there ("Video_01-03-2021") - confirm both are intended.
        Parameters:
            - rainfall Numpy Matrix: Rainfall rows; the first two columns are skipped and the
              remaining columns are treated as one region each.
            - output_folder (string): Folder prefix (including the trailing separator) holding
              "rainfall_stats.npy".
        Returns:
            Numpy Matrix: One row per region in the format [minimum, maximum]."""
    stats_path = output_folder + "rainfall_stats.npy"
    if os.path.exists(stats_path):
        return np.load(stats_path)
    regional = rainfall[:, 2:]
    lows = np.min(regional, axis=0)
    highs = np.max(regional, axis=0)
    np.save(stats_path, np.array([lows, highs]).T)
    # Read back from disk so the returned value is exactly what later runs will load.
    return np.load(stats_path)

# Per-region [min, max] of the CEH-GEAR rainfall over all loaded years.
# NOTE(review): this relies on get_stats' default output folder ("Video_25-02-2021"), which
# differs from prepared_data_folder ("Video_01-03-2021") used for the other outputs - confirm
# that a rainfall_stats.npy cached there is the intended one.
rainfall_stats = get_stats(all_rainfall)

In [23]:
def scale_regionally(rainfall, stats):
    """ Min-max scales the rainfall of every region with that region's own minimum and maximum.
        Works for any number of regions, and also accepts the 1D row of a single month.
        Parameters:
            - rainfall Numpy Matrix: Rainfall rows in the format
              [month, year, rain_region_0, ..., rain_region_k]; the first two columns are dropped.
            - stats Numpy Matrix (no. regions, 2): The [minimum, maximum] rainfall of each region.
        Returns:
            Numpy Matrix: The scaled rainfall, transposed to size [no. regions, no. months].
                          Values outside the [minimum, maximum] of their region fall outside
                          [0, 1]; a region whose minimum equals its maximum yields nan/inf."""
    # Put regions on the first axis, dropping the month and year columns.
    regional = np.atleast_2d(rainfall).T[2:, :]
    # Column vectors broadcast across all months of the matching region.
    mins = stats[:, 0][:, np.newaxis]
    maxs = stats[:, 1][:, np.newaxis]
    return (regional - mins) / (maxs - mins)

In [24]:
# Output files for the regionally scaled CEH-GEAR rainfall values.
training_file, validation_file, all_file = (
    prepared_data_folder + file_name
    for file_name in (
        "expected_train_standardized.npy",
        "expected_valid_standardized.npy",
        "expected_all_standardized.npy",
    )
)

In [25]:
# Scale each CEH-GEAR rainfall set per region and persist it.
np.save(file=training_file, arr=scale_regionally(rainfall=training_rainfall, stats=rainfall_stats))
np.save(file=validation_file, arr=scale_regionally(rainfall=validation_rainfall, stats=rainfall_stats))
np.save(file=all_file, arr=scale_regionally(rainfall=all_rainfall, stats=rainfall_stats))

In [26]:
# Output files for the regionally scaled Met Office rainfall values.
training_file, validation_file, all_file = (
    prepared_data_folder + file_name
    for file_name in (
        "mo_train_standardized.npy",
        "mo_valid_standardized.npy",
        "mo_all_standardized.npy",
    )
)

In [27]:
# Scale each Met Office rainfall set per region (with the same rainfall_stats) and persist it.
np.save(file=training_file, arr=scale_regionally(rainfall=training_mo_rainfall, stats=rainfall_stats))
np.save(file=validation_file, arr=scale_regionally(rainfall=validation_mo_rainfall, stats=rainfall_stats))
np.save(file=all_file, arr=scale_regionally(rainfall=all_mo_rainfall, stats=rainfall_stats))