# LSTM Model Type 1

In [1]:
import pandas as pd
import numpy as np
from numpy import array
from collections import defaultdict

import keras_tuner
from keras.layers import LSTM
from keras.layers import Dense
from keras.models import Sequential

from LSTM_Model_Type1_scale import split_sequence, new_split_sequence, split_position, LSTM_FitPredict
from LSTM_Model_Type1_scale import minima_df, maxima_df, scale_df, unscale_prediction

### Read in Raw Data & Preprocessing

In [3]:
# Read in the raw age-sex cohort data without the remainder area
raw_data = pd.read_csv('../Data/true_1000_fulldata.csv')

# Prepare the unique SA3 codes and names for LSTM fitting selection.
# Both arrays are taken from the same de-duplicated rows (first occurrence of
# each SA3 code) so they always have equal length and sa3_names[i] is the name
# belonging to sa3_codes[i]; later cells pair the two arrays by position.
sa3_lookup = raw_data.drop_duplicates(subset='SA3 Code')
sa3_codes = sa3_lookup['SA3 Code'].to_numpy()
sa3_names = sa3_lookup['SA3 Name'].to_numpy()
sa3_num = len(sa3_codes)

### Store maxima and minima per area-cohort

In [4]:
# Store the per-area, per-cohort minima and maxima using the helpers imported
# from LSTM_Model_Type1_scale. Both dicts are passed to unscale_prediction and
# LSTM_FitPredict further down the notebook.
minima_dict = minima_df(raw_data, sa3_codes)
maxima_dict = maxima_df(raw_data, sa3_codes)

In [6]:
# Inspect the stored maxima: a defaultdict keyed by SA3 code, then by cohort column name.
maxima_dict

defaultdict(dict,
            {10101: {'m0-4': 2605,
              'm5-9': 2592,
              'm10-14': 2532,
              'm15-19': 2472,
              'm20-24': 2183,
              'm25-29': 2392,
              'm30-34': 2483,
              'm35-39': 2476,
              'm40-44': 2494,
              'm45-49': 2577,
              'm50-54': 2555,
              'm55-59': 2407,
              'm60-64': 2422,
              'm65-69': 1950,
              'm70-74': 1483,
              'm75-79': 1010,
              'm80-84': 743,
              'm85+': 490,
              'f0-4': 2400,
              'f5-9': 2495,
              'f10-14': 2428,
              'f15-19': 2277,
              'f20-24': 1830,
              'f25-29': 2152,
              'f30-34': 2339,
              'f35-39': 2403,
              'f40-44': 2426,
              'f45-49': 2474,
              'f50-54': 2418,
              'f55-59': 2314,
              'f60-64': 2230,
              'f65-69': 1907,
              'f70-74': 140

### Rescale data points

In [4]:
# Rescale the cohort columns with the scale_df helper from LSTM_Model_Type1_scale.
# NOTE(review): presumably min-max scaling per SA3 area and cohort — confirm in scale_df.
rescaled_data = scale_df(raw_data, sa3_codes)
rescaled_data

Unnamed: 0,Year,SA3 Code,SA3 Name,m0-4,m5-9,m10-14,m15-19,m20-24,m25-29,m30-34,...,f45-49,f50-54,f55-59,f60-64,f65-69,f70-74,f75-79,f80-84,f85+,Total
0,1991,10101,Goulburn - Yass,0.996169,0.908475,0.895105,1.000000,0.988713,1.000000,0.993007,...,0.000000,0.000000,0.000000,0.016241,0.000000,0.000000,0.050847,0.000000,0.0,61667
1,1991,10102,Queanbeyan,0.000000,0.000000,0.000000,0.138354,0.675579,0.670455,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,35281
2,1991,10103,Snowy Mountains,1.000000,0.456000,0.000000,0.322404,1.000000,1.000000,1.000000,...,0.000000,0.000000,0.000000,0.020270,0.000000,0.262821,0.000000,0.000000,0.0,18092
3,1991,10104,South Coast,0.934426,0.032258,0.000000,0.003363,0.136029,0.848148,1.000000,...,0.000000,0.000000,0.000000,0.127720,0.356771,0.000000,0.000000,0.000000,0.0,53440
4,1991,10201,Gosford,0.661438,0.034637,0.000000,0.140805,0.000000,1.000000,0.855769,...,0.000000,0.000000,0.000000,0.120136,0.479065,0.248756,0.000000,0.000000,0.0,135708
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6820,2011,80105,North Canberra,0.746914,1.000000,0.960784,0.395623,1.000000,1.000000,1.000000,...,0.870130,1.000000,1.000000,0.952830,0.274914,0.000000,0.333333,0.694767,1.0,49910
6821,2011,80106,South Canberra,0.902778,0.616279,0.256881,0.159292,0.616788,1.000000,1.000000,...,0.867692,1.000000,1.000000,1.000000,0.360424,0.160643,0.000000,0.752381,1.0,25176
6822,2011,80107,Tuggeranong,0.082286,0.000557,0.000000,0.644778,0.973705,0.596556,0.000633,...,0.918682,0.985169,1.000000,1.000000,1.000000,0.978495,0.984556,1.000000,1.0,89319
6823,2011,80108,Weston Creek,1.000000,0.566978,0.058824,0.000000,0.000000,0.026316,0.000000,...,0.000000,0.018939,0.396040,1.000000,1.000000,1.000000,1.000000,0.788462,1.0,23354


In [5]:
# Sanity check on the scaling: the column-wise maximum of every cohort column
# in the rescaled frame is displayed (expected to be 1.0 for each).
rescaled_data[[
    'm0-4', 'm5-9', 'm10-14', 'm15-19', 'm20-24', 'm25-29',
    'm30-34', 'm35-39', 'm40-44', 'm45-49', 'm50-54', 'm55-59',
    'm60-64', 'm65-69', 'm70-74', 'm75-79', 'm80-84', 'm85+',
    'f0-4', 'f5-9', 'f10-14', 'f15-19', 'f20-24', 'f25-29',
    'f30-34', 'f35-39', 'f40-44', 'f45-49', 'f50-54', 'f55-59',
    'f60-64', 'f65-69', 'f70-74', 'f75-79', 'f80-84', 'f85+',
]].max()

m0-4      1.0
m5-9      1.0
m10-14    1.0
m15-19    1.0
m20-24    1.0
m25-29    1.0
m30-34    1.0
m35-39    1.0
m40-44    1.0
m45-49    1.0
m50-54    1.0
m55-59    1.0
m60-64    1.0
m65-69    1.0
m70-74    1.0
m75-79    1.0
m80-84    1.0
m85+      1.0
f0-4      1.0
f5-9      1.0
f10-14    1.0
f15-19    1.0
f20-24    1.0
f25-29    1.0
f30-34    1.0
f35-39    1.0
f40-44    1.0
f45-49    1.0
f50-54    1.0
f55-59    1.0
f60-64    1.0
f65-69    1.0
f70-74    1.0
f75-79    1.0
f80-84    1.0
f85+      1.0
dtype: float64

In [None]:
# Companion sanity check: display the column-wise minimum of every cohort
# column in the rescaled frame.
rescaled_data[[
    'm0-4', 'm5-9', 'm10-14', 'm15-19', 'm20-24', 'm25-29',
    'm30-34', 'm35-39', 'm40-44', 'm45-49', 'm50-54', 'm55-59',
    'm60-64', 'm65-69', 'm70-74', 'm75-79', 'm80-84', 'm85+',
    'f0-4', 'f5-9', 'f10-14', 'f15-19', 'f20-24', 'f25-29',
    'f30-34', 'f35-39', 'f40-44', 'f45-49', 'f50-54', 'f55-59',
    'f60-64', 'f65-69', 'f70-74', 'f75-79', 'f80-84', 'f85+',
]].min()

In [6]:
# Check the scaling by hand for a single area/cohort: recompute
# (x - min) / (max - min) on the raw "f85+" series of SA3 80105 and subtract it
# from the corresponding rescaled values. Every difference should be 0.0.
sample = raw_data.loc[raw_data["SA3 Code"] == 80105, "f85+"]
sample_min, sample_max = sample.min(), sample.max()
rescaled_sample = (sample - sample_min) / (sample_max - sample_min)

rescaled_data.loc[rescaled_data["SA3 Code"] == 80105, "f85+"] - rescaled_sample

320     0.0
645     0.0
970     0.0
1295    0.0
1620    0.0
1945    0.0
2270    0.0
2595    0.0
2920    0.0
3245    0.0
3570    0.0
3895    0.0
4220    0.0
4545    0.0
4870    0.0
5195    0.0
5520    0.0
5845    0.0
6170    0.0
6495    0.0
6820    0.0
Name: f85+, dtype: float64

### Preprocessing by Splitting Population into Sex-Group

In [7]:
# Split the rescaled data by sex into nested dicts of the form
#   {SA3 code: {year: [one scaled value per age group]}}
age_groups = ['0-4','5-9', '10-14', '15-19', '20-24', '25-29', '30-34', '35-39','40-44', '45-49', '50-54', '55-59', '60-64', '65-69', '70-74','75-79', '80-84', '85+']

# Cohort column names are the age-group labels prefixed with 'm' / 'f';
# deriving them keeps the column order tied to age_groups.
male_columns = ['m' + age_group for age_group in age_groups]
female_columns = ['f' + age_group for age_group in age_groups]

population_m_dict = defaultdict(dict)
population_f_dict = defaultdict(dict)

# save population for each year into lists
for sa3_code in sa3_codes:
  population_m_dict[sa3_code] = dict()
  population_f_dict[sa3_code] = dict()
  # Filter the area's rows once instead of re-scanning the full frame for every year.
  area_rows = rescaled_data[rescaled_data['SA3 Code'] == sa3_code]
  for year in range(1991, 2012):
    year_rows = area_rows[area_rows['Year'] == year]
    # Years with no record for this area are skipped (no key is created).
    if len(year_rows) > 0:
      # Only the first matching row is used, as a flat list of cohort values.
      population_m_dict[sa3_code][year] = year_rows[male_columns].values.tolist()[0]
      population_f_dict[sa3_code][year] = year_rows[female_columns].values.tolist()[0]

In [8]:
# Take the scaled male series for SA3 area 10101 (one list of cohort values per
# year) and window it with split_sequence.
# NOTE(review): the second argument (5) is presumably the number of time steps
# per window — confirm against split_sequence in LSTM_Model_Type1_scale.
male_sample_data = list(population_m_dict[10101].values())
X, y = split_sequence(male_sample_data, 5)

In [9]:
# Show the raw 1991 row for SA3 area 10101 as a reference for the unscaling check that follows.
raw_data[(raw_data['Year']==1991) & (raw_data['SA3 Code']==10101)]

Unnamed: 0,Year,SA3 Code,SA3 Name,m0-4,m5-9,m10-14,m15-19,m20-24,m25-29,m30-34,...,f45-49,f50-54,f55-59,f60-64,f65-69,f70-74,f75-79,f80-84,f85+,Total
0,1991,10101,Goulburn - Yass,2603,2565,2517,2472,2178,2392,2478,...,1859,1591,1412,1382,1252,967,799,493,353,61667


In [10]:
# Round-trip check: unscaling the first time step of the first window for area
# 10101 (males) should give back the raw 1991 male counts shown above.
unscale_prediction(X[0][0], 10101, "Males", minima_dict, maxima_dict)

array([2603., 2565., 2517., 2472., 2178., 2392., 2478., 2312., 2296.,
       1999., 1735., 1502., 1513., 1170.,  765.,  506.,  260.,  159.])

### Create Dataframe for Storing Prediction Result

In [11]:
# Build the empty results table: one row per (area, sex, age group) and one
# column per forecast year 2002-2011. Within each area's block of rows the
# female cohorts come first, followed by the male cohorts.
n_cohorts = len(age_groups)       # rows per sex within an area
rows_per_area = 2 * n_cohorts     # females + males

output = pd.DataFrame(
    index=range(sa3_num * rows_per_area),
    columns=['Code', 'Area name', 'Sex', 'Age group'] + list(range(2002, 2012)),
)

# area_idx is the position of the area in sa3_codes / sa3_names (not the SA3 code itself).
for area_idx in range(sa3_num):
  first_row = area_idx * rows_per_area
  for cohort_idx, cohort_label in enumerate(age_groups):
    # Year columns are not in the dicts below; they are filled in later by the prediction step.
    output.loc[first_row + cohort_idx] = {'Code':sa3_codes[area_idx],'Area name':sa3_names[area_idx],'Sex':'Females','Age group':cohort_label}
    output.loc[first_row + n_cohorts + cohort_idx] = {'Code':sa3_codes[area_idx],'Area name':sa3_names[area_idx],'Sex':'Males','Age group':cohort_label}

### Start LSTM Fitting & Predicting

##### Define Variable for Splitting the Full Dataset

In [12]:
# Number of time steps in each LSTM input window
n_steps = 1

# Width of each time step (LSTM input_shape) and of the Dense output layer:
# both are 18 in this notebook.
n_features = n_age_groups = 18

# Obtain the split positions for the training set, the validation set, and the first input_x
train_bounds, val_bounds, test_bounds = split_position(
    n_steps,
    train_start=1991,
    train_end=2001,
    fixed_num_val=2,
)

##### LSTM Model Defining

In [13]:
def lstm_model(lstm1_units, activ, optimizer, loss_fun):
    """Define and compile the LSTM model.

    The network is a single LSTM layer followed by a Dense layer. The input
    shape ``(n_steps, n_features)`` and the Dense width ``n_age_groups`` are
    read from notebook-level variables.

    Parameters
    ----------
    lstm1_units : number of units in the LSTM layer.
    activ : activation passed to the LSTM layer.
    optimizer : optimizer passed to ``model.compile``.
    loss_fun : loss passed to ``model.compile``.

    Returns
    -------
    The compiled ``Sequential`` model, tracking
    ``mean_absolute_percentage_error`` as a metric.
    """
    model = Sequential()
    model.add(LSTM(lstm1_units, activation=activ, input_shape=(n_steps, n_features)))
    model.add(Dense(n_age_groups))
    model.compile(optimizer=optimizer, loss=loss_fun, metrics=["mean_absolute_percentage_error"])
    return model

##### LSTM Model Tuning

In [14]:
# Hyperparameter search space used by build_model
# LSTM layer width: minimum, maximum and step of the integer range
lstm1_min, lstm1_max, lstm1_step = 100, 1000, 100
# Categorical choices offered to the tuner
activ_functions = ["relu"]
optimizers = ["adam", "adagrad"]
loss_functions = ["mse"]

def build_model(hp):
    """Model-building function handed to the keras_tuner tuner.

    Draws one value for each hyperparameter from ``hp`` (LSTM units,
    activation function, optimizer, loss function), using the notebook-level
    search-space variables, and returns the model produced by ``lstm_model``
    for that combination.
    """
    lstm1_units = hp.Int("LSTM units", min_value=lstm1_min, max_value=lstm1_max, step=lstm1_step)
    activation = hp.Choice("Activation Function", activ_functions)
    optimizer = hp.Choice("Optimizer", optimizers)
    loss_fun = hp.Choice("Loss Function", loss_functions)
    return lstm_model(lstm1_units, activation, optimizer, loss_fun)

# Create the keras_tuner random-search tuner over build_model, minimising the
# validation mean absolute percentage error.
# NOTE(review): no directory / project_name / overwrite is passed, and the
# recorded output shows the tuner reloading state from an existing
# `untitled_project` folder — confirm that reusing earlier trials is intended.
tuner = keras_tuner.RandomSearch(
    hypermodel=build_model,
    objective=keras_tuner.Objective("val_mean_absolute_percentage_error", "min"),
)

INFO:tensorflow:Reloading Oracle from existing project .\untitled_project\oracle.json
INFO:tensorflow:Reloading Tuner from .\untitled_project\tuner0.json


##### Generate Prediction Result

In [None]:
# Fit and predict each sex in turn (males first, then females). Every call
# receives the current results table and returns the updated one, which is
# written to CSV once both sexes are done.
# NOTE(review): the meaning of the literal arguments 1000 and 2002 is defined by
# LSTM_FitPredict in LSTM_Model_Type1_scale — confirm there.
for sex_label, population_dict in (("Males", population_m_dict), ("Females", population_f_dict)):
    output = LSTM_FitPredict(sa3_codes, population_dict, n_steps, train_bounds, val_bounds, test_bounds, n_features, 1000, sex_label, 2002, tuner, output, minima_dict, maxima_dict)

output.to_csv('Iterative_predict_output_Step1_1000_nonneg.csv', index=False, header=True)

INFO:tensorflow:Oracle triggered exit
Area code:  10101
Year:  2002
Scaled prediction:  [[0.2801685  1.0771079  0.68347996 0.         0.31643033 0.08487374
  0.48968178 0.5497272  0.84839225 0.38576773 0.71732056 0.49519825
  0.15942056 0.13401572 0.5355805  0.23839521 0.16949674 0.24399735]]
Unscaled prediction:  [[2229.24795884 2614.74683237 2486.73763496 2129.         1880.17863631
  1752.3267414  2118.12247294 2336.86570525 2456.09806204 2221.97374684
  2323.20286036 1950.15441608 1580.57998534 1274.53226477 1149.54681039
   626.1511879   341.86692777  239.76312296]]
Area code:  10101
Year:  2003
Scaled prediction:  [[0.3741965  1.0706202  0.4275305  0.00400493 0.27373067 0.26564673
  0.5291519  0.7469282  0.60675645 0.46267873 0.6087022  0.4265636
  0.100113   0.20507926 0.46821362 0.28033388 0.16662109 0.2016336 ]]
Unscaled prediction:  [[2278.33057278 2612.83295286 2450.13686112 2130.37369008 1861.26268479
  1878.6870614  2146.3436203  2397.80081844 2395.68911219 2266.42830622
 