In [1]:
import pandas as pd
from sklearn.preprocessing import MinMaxScaler

In [2]:
# Raw inputs, one table per source file. File names are kept exactly as they
# exist on disk (including their spelling).
meterology = pd.read_excel("meteorlogy_daily.xlsx")

# Only the date column and the two temperature columns used later are read;
# both temperature columns are forced to float.
surface_temp = pd.read_csv(
    "Surface_Temperature_without_remove.csv",
    usecols=['Date', 'Sup.', 'Huron'],
    dtype={'Sup.': float, 'Huron': float},
)

ice_data = pd.read_excel("Ice_Data_without_remove.xlsx")
climate_indices = pd.read_csv("climate_ind_daily.csv")

In [4]:
# Windowing configuration. Both window lengths are counted in daily records.
# look_back: how many consecutive daily records are placed side by side in one
# input row (one shifted copy of the feature table per step).
look_back = 3
# predict_interval: used together with look_back to decide how many leading
# target rows are discarded (look_back + predict_interval - 1).
predict_interval = 3
# number of features
# NOTE(review): presumably the per-day feature count (the merged table shows
# 12 feature columns per lag step) -- confirm against the model definition.
num_features = 12

## Input Data

In [5]:
def prepare_input_data(look_back, predict_interval, meterology, surface_temp, ice_data,
                       climate_indices=None):
    """Build the lagged (windowed) input table.

    The daily meteorology table is extended with surface-temperature,
    climate-index and ice columns, then ``look_back`` shifted copies of the
    resulting feature table are placed side by side. Columns of the copy
    shifted by ``i`` rows are suffixed ``_<look_back - i - 1>``, so ``_0``
    holds the latest record of each window.

    Parameters
    ----------
    look_back : int
        Number of consecutive daily records per output row.
    predict_interval : int
        Accepted for interface compatibility; not used by this function.
    meterology : pandas.DataFrame
        Must contain a ``date`` column; every other column is treated as a
        feature. The frame is NOT modified.
    surface_temp : pandas.DataFrame
        Provides the ``Sup.`` and ``Huron`` columns.
    ice_data : pandas.DataFrame
        Provides the ``Ice`` column (stored as feature ``ice``).
    climate_indices : pandas.DataFrame, optional
        Provides ``NAO``, ``PNA``, ``EPNP`` and ``SOI``. When omitted, the
        notebook-level ``climate_indices`` variable is used, which keeps the
        original five-argument call working.

    Returns
    -------
    pandas.DataFrame
        ``date`` followed by the suffixed feature columns, with the last
        ``look_back`` rows removed.

    Raises
    ------
    KeyError
        If a source table lacks a required column or a row label that is
        present in ``meterology``.
    NameError
        If ``climate_indices`` is neither passed nor defined globally.

    Notes
    -----
    NOTE(review): only ``look_back`` trailing rows are trimmed here, while
    ``predict_interval`` is ignored -- confirm that the row count is
    reconciled with the target series before training.
    """
    if climate_indices is None:
        # Backward compatibility with the original implicit global lookup.
        climate_indices = globals().get("climate_indices")
        if climate_indices is None:
            raise NameError("name 'climate_indices' is not defined")

    # Work on a copy so the caller's frame keeps its 'date' column and its
    # original column names; this makes the function safe to call repeatedly.
    merged_df = meterology.copy()
    merged_df_date = merged_df.pop('date')

    # (feature name, source table, source column), in output column order.
    extra_columns = (
        ('Sup.', surface_temp, 'Sup.'),
        ('Huron', surface_temp, 'Huron'),
        ('NAO', climate_indices, 'NAO'),
        ('PNA', climate_indices, 'PNA'),
        ('EPNP', climate_indices, 'EPNP'),
        ('SOI', climate_indices, 'SOI'),
        ('ice', ice_data, 'Ice'),
    )
    for feature_name, source, source_col in extra_columns:
        # Label-based lookup on the meteorology index mirrors the original
        # per-row `source[col][i]` access and fails loudly on missing rows,
        # without chained assignment.
        values = source.loc[merged_df.index, source_col]
        merged_df[feature_name] = values.to_numpy(dtype=float)

    frames = [merged_df_date]
    for step in range(look_back):
        # shift(-step) pulls the record `step` days ahead onto the current
        # row; the suffix counts down so that "_0" is the newest record.
        suffix = "_" + str(look_back - step - 1)
        frames.append(merged_df.shift(-step).add_suffix(suffix))

    train_df = pd.concat(frames, axis=1)

    # Remove the last `look_back` records (the incomplete windows).
    keep = max(len(train_df) - look_back, 0)
    return train_df.iloc[:keep]

# Build the lagged input table from the loaded sources and preview it.
train_df = prepare_input_data(
    look_back=look_back,
    predict_interval=predict_interval,
    meterology=meterology,
    surface_temp=surface_temp,
    ice_data=ice_data,
)
train_df.head()

Unnamed: 0,date,u-wind_2,v-wind_2,pressure_2,humidity_2,temperature_2,Sup._2,Huron_2,NAO_2,PNA_2,...,pressure_0,humidity_0,temperature_0,Sup._0,Huron_0,NAO_0,PNA_0,EPNP_0,SOI_0,ice_0
0,1995-01-01,2.38125,-0.628125,98903.125,30.375,270.178146,3.29,4.5,0.93,0.66,...,98652.5,17.125,265.636627,3.27,4.39,0.93,0.66,1.03,-0.4,0.0
1,1995-01-02,4.033125,-1.283125,98336.875,25.5,267.7145,3.28,4.45,0.93,0.66,...,99073.125,7.375,259.524513,3.27,4.33,0.93,0.66,1.03,-0.4,0.0
2,1995-01-03,5.93875,0.4825,98652.5,17.125,265.636627,3.27,4.39,0.93,0.66,...,98769.375,16.625,264.535141,3.22,4.18,0.93,0.66,1.03,-0.4,0.0
3,1995-01-04,3.7225,-1.425625,99073.125,7.375,259.524513,3.27,4.33,0.93,0.66,...,97803.125,36.5,269.011017,3.19,4.06,0.93,0.66,1.03,-0.4,0.0
4,1995-01-05,7.3925,2.583125,98769.375,16.625,264.535141,3.22,4.18,0.93,0.66,...,98220.0,8.0,262.210144,3.14,3.9,0.93,0.66,1.03,-0.4,0.0


## Output Data

In [6]:
# Discard the first (look_back + predict_interval - 1) target rows by label.
# errors="ignore" makes the cell safe to re-run: once those labels have been
# dropped, a second execution is a no-op instead of raising KeyError.
ice_data = ice_data.drop(
    index=list(range(0, look_back + predict_interval - 1)),
    errors="ignore",
)
ice_data.head()

Unnamed: 0,date,Ice
5,1995-01-06,0.0
6,1995-01-07,0.0
7,1995-01-08,0.0
8,1995-01-09,62.446242
9,1995-01-10,77.650264


## Normalization

In [20]:
# Normalize the input features.
# Every column except the first (the date) is rescaled with a MinMaxScaler
# configured for the range (0, 1).
scaler_train = MinMaxScaler(feature_range=(0,1))

feature_block = train_df.iloc[:,1:]
input_columns = feature_block.columns
standard_array = scaler_train.fit_transform(feature_block.values)
train_df_norm = pd.DataFrame(standard_array, columns=input_columns)

# The scaled frame has a fresh 0..n-1 index, so the date column's index is
# reset before the two are joined side by side.
date_column = train_df.iloc[:,0].to_frame().reset_index(drop=True)
train_df_norm = pd.concat([date_column, train_df_norm], axis = 1)
train_df_norm.head()

Unnamed: 0,date,u-wind,v-wind,pressure,humidity,temperature,Sup.,Huron,NAO,PNA,...,pressure.1,humidity.1,temperature.1,Sup..1,Huron.1,NAO.1,PNA.1,EPNP,SOI,ice
0,1995-01-01,0.615115,0.473178,0.5167,0.384348,0.417669,0.153884,0.190603,0.718531,0.650442,...,0.469706,0.2,0.334305,0.152888,0.185727,0.718531,0.650442,0.608403,0.45,0.0
1,1995-01-02,0.711702,0.427545,0.410524,0.316522,0.372446,0.153386,0.188387,0.718531,0.650442,...,0.548576,0.064348,0.22211,0.152888,0.183067,0.718531,0.650442,0.608403,0.45,0.0
2,1995-01-03,0.823125,0.550553,0.469706,0.2,0.334305,0.152888,0.185727,0.718531,0.650442,...,0.491621,0.193043,0.314086,0.150398,0.176418,0.718531,0.650442,0.608403,0.45,0.0
3,1995-01-04,0.693539,0.417617,0.548576,0.064348,0.22211,0.152888,0.183067,0.718531,0.650442,...,0.310442,0.469565,0.396245,0.148904,0.171099,0.718531,0.650442,0.608403,0.45,0.0
4,1995-01-05,0.908127,0.6969,0.491621,0.193043,0.314086,0.150398,0.176418,0.718531,0.650442,...,0.388609,0.073043,0.271408,0.146414,0.164007,0.718531,0.650442,0.608403,0.45,0.0


In [21]:
# Normalize the output (target) data.
# A separate scaler is fitted on every column after the first one of
# ice_data, again with the range (0, 1).
scaler_test = MinMaxScaler(feature_range=(0,1))

target_block = ice_data.iloc[:,1:]
# NOTE: input_columns and standard_array are reassigned here and no longer
# refer to the input-feature values after this cell has run.
input_columns = target_block.columns
standard_array = scaler_test.fit_transform(target_block.values)
test_df = pd.DataFrame(standard_array, columns=input_columns)

# Re-attach the first column with a reset index so rows line up with test_df.
target_dates = ice_data.iloc[:,0].to_frame().reset_index(drop=True)
test_df_norm = pd.concat([target_dates, test_df], axis = 1)
test_df_norm.head()

Unnamed: 0,date,Ice
0,1995-01-06,0.0
1,1995-01-07,0.0
2,1995-01-08,0.0
3,1995-01-09,0.624472
4,1995-01-10,0.776515
