In [5]:
from numpy import array
from numpy import split
import numpy as np
import pandas as pd
from sklearn.preprocessing import MinMaxScaler

# def split_data_to_x_y(data, input_length, output_length=20, only_window=True, forecast_column_index=4):  # older version
#     """
#     Takes in a the data that has been passed through chunk_data. Reshapes it back into a numpy matrix,
#     Then starts to chunk the data again. It will grab an inputted amount of lagged values that are to be used in
#     the lstm. Will do the same with the forecasted value. Then return those values so that they are ready to be
#     used by the lstm.
#
#     :param forecast_column_index: index column of the what uni variate value I want to be forecasted
#     :param data: my input data, will have passed through chunk_data and be in numpy arrays
#     :param input_length: amount of lagged values that I want to use with each timestep
#     :param output_length:
#     :param only_window: controller on the type of output. If true is is univariate. If false still
#     univariate but now also forecasts a series of timesteps
#     :return: the x_train an y_train, info and forecast for the lstm
#     """
#     # put data back into continuous matrix
#     data = data.reshape((data.shape[0] * data.shape[1], data.shape[2]))
#     x_train = []
#     y_train = []
#     start_point = 0  # start gathering training data at the beginning
#
#     # move over data one time step at a time
#     for _ in range(len(data)):
#         input_data_end = start_point + input_length
#         # output will currently cover timestep forecasts t+1 to t+20
#         output_data_end = input_data_end + output_length
#
#         if output_data_end <= len(data):
#             x_train.append(data[start_point:input_data_end, 0:-1])
#             if only_window:
#                 # arrays are set to get the info into [samples, timestep, feature]
#                 y_train.append([[data[input_data_end, -1]]])
#             else:
#                 y_train.append(data[input_data_end:output_data_end, forecast_column_index])
#
#         start_point += 1  # move ahead one time step
#
#     return array(x_train), array(y_train)

def normalize_data(df):
    """
    Normalizes the data to number between [-1, 1] inclusive.

    :param df: data to normalize
    :return: the newly scaled data
    """
    x_train = df.values
    scaler = MinMaxScaler()
    x_scaled = scaler.fit_transform(x_train)
    return pd.DataFrame(x_scaled, columns=df.columns)


# def chunk_data(data, chunk_size):  # older version
#     """
#     function chunks the data into several equally sized data chunks, all of them meant to
#     be inputted into the LSTM as such. This is a point to messing around to test performance
#     to see if smaller or larger chunk sizes get better performance.
#
#     :param data: data input, mean to be numpy array from dataFrame.values
#     :param chunk_size: number of samples to take from the data given
#     :return: chunked data
#     """
#     balancer = data.shape[0] % chunk_size
#     if balancer == 0:
#         return array(split(data, len(data)/chunk_size))
#     else:
#         # needed since all arrays must be balanced for number of features
#         # This removes the extra features that would result in an uneven balance on the last array
#         # only removes the final elements
#         removal_list = []
#         for i in range(balancer, 0, -1):
#             removal_list.append(data.shape[0] - i)
#
#         data = np.delete(data, removal_list, axis=0)
#         return array(split(data, len(data) / chunk_size))

def chunk_and_split_data_to_x_y_one_step_drop_off(df, chunk_size, expected_cols):
    """
    Similar to chunk_and_split_data_to_x_y but instead of new data sections for each step. Each new
    data matrix only moves one row ahead.

    Takes in a dataframe whose data you want to use to train a model with. Removes the expected data output from the training dataset.
    Sets up the expected data in the correct format and sets up the training data in the correct format
    Mainly used to organize data for training with a neural network such as keras

    :param df: A dataframe
    :param chunk_size: The size of the 2D matrix that you want to use to train the model. Can handle any non-negative int lower in size than the input dataframe
    :param expected_cols: The expected output columns that are the correct answer for the model. Aka your y
    :return: training data and expected data
    """
    balance_num = df.shape[0] % chunk_size
    expected = df.loc[:, expected_cols]
    expected = expected[chunk_size + balance_num:].values  # drops the balance value and drops the first chunk size number of values
    expected = expected.reshape(expected.shape[0], 1)
    df = df.drop(columns=expected_cols)
    if balance_num == 0:
        training_data = df.values
        data_out = []
        for i in range(training_data.shape[0] - chunk_size):  # drops the last chunk size of values. Since otherwise we would be operating out of the data range.
            data_out.append(training_data[i:i + chunk_size])
    else:
        df = df.drop(df.index[range(balance_num)])
        training_data = df.values
        data_out = []
        for i in range(training_data.shape[0] - chunk_size):  # drops the last chunk size of values. Since otherwise we would be operating out of the data range.
            data_out.append(training_data[i:i + chunk_size])

    df = df.drop(df.index[range(chunk_size + balance_num)])

    return np.array(data_out), expected, df


def chunk_and_split_data_to_x_y(df, chunk_size, expected_cols):
    """
    Built to replace both chunk_data and split_data_to_x_y. Does the exact same job but now with a lot less steps.

    Built to separate out chunks of data. Each data chunk will be of dataframe columns and chunk_sze 2D matrix. The row values will not
    repeat. So each data chunk contains new values. If you want repeating one step moving 2D matrixes use chunk_and_split_data_to_x_y_one_step_drop_off

    Takes in a dataframe whose data you want to use to train a model with. Removes the expected data output from the training dataset.
    Sets up the expected data in the correct format and sets up the training data in the correct format
    Mainly used to organize data for training with a neural network such as keras

    :param df: A dataframe
    :param chunk_size: The size of the 2D matrix that you want to use to train the model. Can handle any non-negative int lower in size than the input dataframe
    :param expected_cols: The expected output columns that are the correct answer for the model. Aka your y
    :return: training data and expected data
    """
    balance_num = df.shape[0] % chunk_size
    expected = df.loc[:, expected_cols]
    expected = expected[chunk_size + balance_num:].values
    expected_out = []
    for i in range(0, expected.shape[0], chunk_size):
        expected_out.append(i)
    expected_out = np.array(expected_out)
    expected_out = expected_out.reshape(expected_out.shape[0], 1)

    df = df.drop(columns=expected_cols)
    df = df.drop(df.index[range((chunk_size * -1), 0)])  # drops the last chunk size of values. Since otherwise we would be operating out of the data range.
    if balance_num == 0:
        training_data = df.values
        training_data = np.array(np.split(training_data, len(training_data)/chunk_size))
    else:
        df = df.drop(df.index[range(balance_num)])
        training_data = df.values
        training_data = np.array(np.split(training_data, len(training_data)/chunk_size))

    return training_data, expected_out


# Small example of using the above code

In [6]:
df = pd.read_csv("/home/nelson/PycharmProjects/my_notes/Dummy_data_for_examples/flagstaff.csv", index_col="date_time", parse_dates=True)
df

Unnamed: 0_level_0,altimeter_set_1,wind_cardinal_direction_set_1d,wind_gust_set_1,cloud_layer_2_code_set_1,cloud_layer_3_code_set_1,wind_chill_set_1d,weather_summary_set_1d,heat_index_set_1d,sea_level_pressure_set_1,sea_level_pressure_set_1d,...,air_temp_set_1,air_temp_high_6_hour_set_1,dew_point_temperature_set_1d,ceiling_set_1,dew_point_temperature_set_1,air_temp_low_24_hour_set_1,precip_accum_six_hour_set_1,metar_origin_set_1,weather_cond_code_set_1,wind_direction_set_1
date_time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2005-01-01 00:10:00+00:00,78191.09,SW,11.32,212.0,0.0,-3.99,scattered,0.0,0.0,78112.91,...,2.0,0.0,-1.05,0.0,0.0,0.0,0.0,0.0,0.0,220.0
2005-01-01 00:20:00+00:00,78191.09,SSW,11.32,223.0,0.0,-3.81,broken,0.0,0.0,78112.91,...,2.0,0.0,-1.05,0.0,0.0,0.0,0.0,0.0,0.0,210.0
2005-01-01 00:55:00+00:00,78191.09,SW,10.29,853.0,0.0,-3.81,broken,0.0,101220.0,78134.39,...,1.7,0.0,-1.14,0.0,0.0,0.0,0.0,0.0,0.0,220.0
2005-01-01 01:45:00+00:00,78191.09,SSW,10.29,203.0,803.0,-3.62,broken,0.0,0.0,78112.91,...,2.0,0.0,-1.05,0.0,0.0,0.0,0.0,0.0,0.0,210.0
2005-01-01 01:55:00+00:00,78191.09,SW,11.32,222.0,0.0,-3.62,scattered,0.0,101180.0,78112.91,...,2.0,0.0,-1.05,0.0,0.0,0.0,0.0,0.0,0.0,220.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2010-12-31 19:57:00+00:00,77694.89,WSW,0.00,0.0,0.0,0.00,clear,0.0,101290.0,78513.41,...,-10.0,0.0,-21.84,0.0,0.0,0.0,0.0,0.0,0.0,240.0
2010-12-31 20:57:00+00:00,77694.89,NW,0.00,0.0,0.0,-15.33,clear,0.0,101270.0,78427.57,...,-8.9,0.0,-20.74,0.0,0.0,0.0,0.0,0.0,0.0,310.0
2010-12-31 21:57:00+00:00,77747.12,N,7.20,0.0,0.0,-12.95,clear,0.0,101300.0,78433.78,...,-8.3,0.0,-20.74,0.0,0.0,0.0,0.0,0.0,0.0,350.0
2010-12-31 22:57:00+00:00,77799.35,0,0.00,0.0,0.0,0.00,clear,0.0,101420.0,78533.03,...,-8.9,0.0,-21.84,0.0,0.0,0.0,0.0,0.0,0.0,0.0


The first step that you need to do is shift the columns of the data that you are using to get the correct setup for forecasting.
So in this case you want to forecast what the next visibility range is going to be in the timeseries.
An easy example since this is just focused on showing what the code can do but the following steps will also work for n future time-steps along with
forecasting whatever number of variable Dataframe columns that you want. Though keep in mind that forecasting is difficult so the more columns that you
are forecasting and the father away the timestep the more likely the results will be terrible.


In [7]:
time_step_shift = 1

df["expected_visibility"] = df["visibility_set_1"].shift(time_step_shift)  # you will have to do this for every single column that you want to forecast
df = df[time_step_shift:]  # drops all values where the expected columns are going to be nan

Keep in mind that in the data above to actually use it for forecasting it would require you to convert the text data into strings and possible normalize some of the
columns before feeding it to a neural network. As such it currently will not work.

In [8]:
train_input_data, train_expected_data = chunk_and_split_data_to_x_y(df, chunk_size=20, expected_cols="expected_visibility")

In [9]:
train_input_data[0]

array([[78191.09, 'SSW', 11.32, 223.0, 0.0, -3.81, 'broken', 0.0, 0.0,
        78112.91, 80.49, 166.0, '0', 0.0, 0.0, 0.0, 0.0, 60339.29, 0.0,
        10.0, 0.0, 8.23, 0.0, 0.0, 2.0, 0.0, -1.05, 0.0, 0.0, 0.0, 0.0,
        0.0, 0.0, 210.0],
       [78191.09, 'SW', 10.29, 853.0, 0.0, -3.81, 'broken', 0.0,
        101220.0, 78134.39, 81.63, 176.0, '0', 0.0, 0.0, 0.0, 0.0,
        60339.29, 0.0, 10.0, 0.0, 7.2, 0.0, 0.0, 1.7, 0.0, -1.14, 0.0,
        0.0, 0.0, 0.0, 0.0, 0.0, 220.0],
       [78191.09, 'SSW', 10.29, 203.0, 803.0, -3.62, 'broken', 0.0, 0.0,
        78112.91, 80.49, 152.0, '0', 0.0, 0.0, 0.0, 0.0, 60339.29, 0.0,
        10.0, 0.0, 7.72, 0.0, 0.0, 2.0, 0.0, -1.05, 0.0, 0.0, 0.0, 0.0,
        0.0, 0.0, 210.0],
       [78191.09, 'SW', 11.32, 222.0, 0.0, -3.62, 'scattered', 0.0,
        101180.0, 78112.91, 80.49, 152.0, '0', 0.0, 0.0, 0.0, 0.0,
        60339.29, 0.0, 10.0, 0.0, 7.72, 0.0, 0.0, 2.0, 0.0, -1.05, 0.0,
        0.0, 0.0, 0.0, 0.0, 0.0, 220.0],
       [78217.2, 'SW', 0

In [10]:
train_expected_data[0]

array([9.])

Final step to feed into network. Just giving the network the parameters that you will need to inform it as to what the input shapes are.

In [None]:
n_timesteps, n_features, n_outputs = train_input_data.shape[1], train_input_data.shape[2], train_expected_data.shape[1]