# LSTM Model Type 2

In [1]:
import pandas as pd
import numpy as np
from numpy import array
from collections import defaultdict
!pip install keras-tuner
import keras_tuner
from keras.layers import LSTM
from keras.layers import Dense
from keras.models import Sequential

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [2]:
from lstmtype2 import split_sequence, new_split_sequence, split_position, LSTM_FitPredict
from lstmtype2 import minima_df, maxima_df, scale_df, unscale_prediction

### Read in Raw Data & Preprocessing

In [3]:
# Read in the raw age-sex cohort data without the remainder area
raw_data = pd.read_csv('true_1000_fulldata.csv')

# Prepare the unique sa3 names, codes for LSTM fitting selection.
# Both arrays are taken from ONE de-duplicated code/name table (first occurrence
# of each code wins, in order of appearance) so that sa3_names[i] is always the
# name of sa3_codes[i]. Calling .unique() on each column separately would yield
# arrays of different length/order whenever a code appears with more than one
# name spelling or two codes share a name.
sa3_lookup = raw_data[['SA3 Code', 'SA3 Name']].drop_duplicates(subset='SA3 Code')
sa3_codes = sa3_lookup['SA3 Code'].to_numpy()
sa3_names = sa3_lookup['SA3 Name'].to_numpy()
sa3_num = len(sa3_codes)

### Store maxima and minima per area-cohort

In [10]:
# Per area-cohort minima and maxima of the raw data, computed by the lstmtype2
# helpers. Both results are handed to LSTM_FitPredict further down — presumably
# to map scaled predictions back to population counts (confirm in lstmtype2).
minima_dict = minima_df(raw_data, sa3_codes)
maxima_dict = maxima_df(raw_data, sa3_codes)

### Rescale data points

In [11]:
# Rescale the raw cohort values with the lstmtype2 helper. Judging by the rendered
# output, the age-sex columns land in [0, 1] while 'Total' keeps its raw value —
# NOTE(review): confirm against scale_df.
rescaled_data = scale_df(raw_data, sa3_codes)
# Bare last expression so the notebook renders the rescaled frame for inspection
rescaled_data

Unnamed: 0,Year,SA3 Code,SA3 Name,m0-4,m5-9,m10-14,m15-19,m20-24,m25-29,m30-34,...,f45-49,f50-54,f55-59,f60-64,f65-69,f70-74,f75-79,f80-84,f85+,Total
0,1991,10101,Goulburn - Yass,0.996169,0.908475,0.895105,1.000000,0.988713,1.000000,0.993007,...,0.000000,0.000000,0.000000,0.016241,0.000000,0.000000,0.050847,0.000000,0.0,61667
1,1991,10102,Queanbeyan,0.000000,0.000000,0.000000,0.138354,0.675579,0.670455,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,35281
2,1991,10103,Snowy Mountains,1.000000,0.456000,0.000000,0.322404,1.000000,1.000000,1.000000,...,0.000000,0.000000,0.000000,0.020270,0.000000,0.262821,0.000000,0.000000,0.0,18092
3,1991,10104,South Coast,0.934426,0.032258,0.000000,0.003363,0.136029,0.848148,1.000000,...,0.000000,0.000000,0.000000,0.127720,0.356771,0.000000,0.000000,0.000000,0.0,53440
4,1991,10201,Gosford,0.661438,0.034637,0.000000,0.140805,0.000000,1.000000,0.855769,...,0.000000,0.000000,0.000000,0.120136,0.479065,0.248756,0.000000,0.000000,0.0,135708
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6820,2011,80105,North Canberra,0.746914,1.000000,0.960784,0.395623,1.000000,1.000000,1.000000,...,0.870130,1.000000,1.000000,0.952830,0.274914,0.000000,0.333333,0.694767,1.0,49910
6821,2011,80106,South Canberra,0.902778,0.616279,0.256881,0.159292,0.616788,1.000000,1.000000,...,0.867692,1.000000,1.000000,1.000000,0.360424,0.160643,0.000000,0.752381,1.0,25176
6822,2011,80107,Tuggeranong,0.082286,0.000557,0.000000,0.644778,0.973705,0.596556,0.000633,...,0.918682,0.985169,1.000000,1.000000,1.000000,0.978495,0.984556,1.000000,1.0,89319
6823,2011,80108,Weston Creek,1.000000,0.566978,0.058824,0.000000,0.000000,0.026316,0.000000,...,0.000000,0.018939,0.396040,1.000000,1.000000,1.000000,1.000000,0.788462,1.0,23354


### Preprocessing by Splitting Population into Sex-Group

In [12]:
# Split the data by sex
age_groups = ['0-4','5-9', '10-14', '15-19', '20-24', '25-29', '30-34', '35-39','40-44', '45-49', '50-54', '55-59', '60-64', '65-69', '70-74','75-79', '80-84', '85+']

# Cohort columns are the age-group label prefixed with the sex letter
# ('m0-4', ..., 'm85+' / 'f0-4', ..., 'f85+'); deriving them from age_groups
# keeps the column selection and the label list from drifting apart.
male_columns = ['m' + age_group for age_group in age_groups]
female_columns = ['f' + age_group for age_group in age_groups]

# {sa3_code: {year: [value per age group, in age_groups order]}}
population_m_dict = defaultdict(dict)
population_f_dict = defaultdict(dict)

# save population for each year into lists
for sa3_code in sa3_codes:
  # Explicit entry so every area has a key even when no year matches
  population_m_dict[sa3_code] = dict()
  population_f_dict[sa3_code] = dict()
  # Filter the area once instead of re-scanning the whole frame for every year
  area_rows = rescaled_data[rescaled_data['SA3 Code'] == sa3_code]
  for year in range(1991, 2012):
    year_rows = area_rows[area_rows['Year'] == year]
    if year_rows.empty:
      # No record for this area/year: leave the year out of both dicts
      continue
    # Only the first matching row is used
    population_m_dict[sa3_code][year] = year_rows[male_columns].values.tolist()[0]
    population_f_dict[sa3_code][year] = year_rows[female_columns].values.tolist()[0]

### Create Dataframe for Storing Prediction Result

In [13]:
# Result table: 36 rows per area (18 female age groups followed by 18 male age
# groups); the prediction-year columns 2002-2011 start out empty.
rows_per_area = 36
male_offset = 18
label_columns = ['Code', 'Area name', 'Sex', 'Age group']
year_columns = list(range(2002, 2012))
output = pd.DataFrame(index=range(sa3_num * rows_per_area), columns=label_columns + year_columns)

for area_idx in range(sa3_num):
  # First row of this area's 36-row slab
  base_row = area_idx * rows_per_area
  area_code = sa3_codes[area_idx]
  area_name = sa3_names[area_idx]
  for age_idx in range(18):
    age_label = age_groups[age_idx]
    female_row = base_row + age_idx
    male_row = base_row + male_offset + age_idx
    output.loc[female_row] = {'Code': area_code, 'Area name': area_name, 'Sex': 'Females', 'Age group': age_label}
    output.loc[male_row] = {'Code': area_code, 'Area name': area_name, 'Sex': 'Males', 'Age group': age_label}

### Start LSTM Fitting & Predicting

##### Define Variable for Splitting the Full Dataset

In [14]:
# Number of time steps in each LSTM input window
# (used as the first dimension of the model's input_shape)
n_steps = 1

# One value per age group on both sides of the network:
# n_features sizes the LSTM input, n_age_groups sizes the Dense output
n_age_groups = 18
n_features = 18

# Obtain the split positions for the training set, the validation set,
# and the first input_x (as returned by the lstmtype2 helper)
train_val_bounds, test_bounds = split_position(
    n_steps,
    train_start=1991,
    train_end=2001,
    fixed_num_val=2,
)

##### LSTM Model Defining

In [15]:
def lstm_model(lstm1_units, activ, optimizer, loss_fun):
    """Define and compile the LSTM model.

    The network is a single LSTM layer followed by a Dense layer. The input
    shape and the output width are read from the notebook-level variables
    n_steps, n_features and n_age_groups.

    Parameters:
        lstm1_units: number of units in the LSTM layer.
        activ: activation passed to the LSTM layer.
        optimizer: optimizer passed to ``compile``.
        loss_fun: loss passed to ``compile``.

    Returns:
        The compiled Sequential model, tracking mean absolute percentage
        error as a metric.
    """
    recurrent_layer = LSTM(
        lstm1_units,
        activation=activ,
        input_shape=(n_steps, n_features),
    )
    output_layer = Dense(n_age_groups)
    network = Sequential([recurrent_layer, output_layer])
    network.compile(
        optimizer=optimizer,
        loss=loss_fun,
        metrics=["mean_absolute_percentage_error"],
    )
    return network

##### LSTM Model Tuning

In [16]:
# Define HYPERPARAMETERS
# Search space read by build_model when it registers each hyperparameter.
lstm1_min = 100  # smallest LSTM unit count tried
lstm1_max = 1000  # largest LSTM unit count tried
lstm1_step = 100  # spacing between candidate unit counts
activ_functions = ["relu"]  # single choice, so the activation is effectively fixed
optimizers = ["adam","adagrad"]  # the only categorical choice with more than one option
loss_functions = ["mse"]  # single choice, so the loss is effectively fixed

def build_model(hp):
    """Model-building callback for LSTM tuning.

    Registers the four hyperparameters on ``hp`` (LSTM units, activation
    function, optimizer, loss function) using the notebook-level search-space
    variables, and returns the model built by ``lstm_model`` from the values
    ``hp`` hands back.
    """
    # Arguments are evaluated left to right, so the hyperparameters are
    # registered in the order: units, activation, optimizer, loss.
    return lstm_model(
        hp.Int("LSTM units", min_value=lstm1_min, max_value=lstm1_max, step=lstm1_step),
        hp.Choice("Activation Function", activ_functions),
        hp.Choice("Optimizer", optimizers),
        hp.Choice("Loss Function", loss_functions),
    )

# Generate the tuned model with keras_tuner package.
# The search samples hyperparameter combinations at random; a fixed seed makes
# the sampled trials the same on every run of the notebook.
TUNER_SEED = 42
# NOTE(review): no directory / project_name / overwrite is passed, so
# keras_tuner's defaults apply — check whether a rerun picks up trial state
# left on disk by an earlier run.
tuner = keras_tuner.RandomSearch(
    hypermodel=build_model,
    # Minimise the validation MAPE reported by the compiled model's metric
    objective=keras_tuner.Objective("val_mean_absolute_percentage_error", "min"),
    max_trials=10,  # keras_tuner's default, spelled out for the reader
    seed=TUNER_SEED,
)

##### Generate Prediction Result

In [None]:
# Generate each sex's prediction result of each age-cohort for all areas and
# store them in a csv. Males are processed first, then females; each call
# receives the current `output` frame and its return value replaces `output`.
# NOTE(review): the positional 1000 and 2002 are passed through unchanged —
# their meaning is defined by LSTM_FitPredict in lstmtype2.
sex_runs = (
    ("Males", population_m_dict),
    ("Females", population_f_dict),
)
for sex_label, sex_population in sex_runs:
    output = LSTM_FitPredict(
        sa3_codes,
        sex_population,
        n_steps,
        train_val_bounds,
        test_bounds,
        n_features,
        1000,
        sex_label,
        2002,
        tuner,
        output,
        minima_dict,
        maxima_dict,
    )

output.to_csv('Iterative_predict_output_Step1_1000_randomtrain.csv', index=False, header=True)



[[[2134.28398067 2331.02434446 2399.56306206 2147.58457182 1772.18484797
   1746.34308137 1857.43432331 2202.91752757 2263.31935921 2044.70294338
   1806.04621768 1567.22060126 1433.1037516  1198.3802969   816.81448957
    537.17301404  282.89691946  167.02347643]]]
[[[2120.92325746 2321.38911036 2397.46727859 2146.23439215 1766.74906476
   1737.8199075  1824.75139282 2191.52733508 2256.85425387 2036.27716531
   1784.94421557 1545.58460257 1442.66435    1203.10463905  805.53191181
    530.61576617  276.81816872  167.83844024]]]
[[[2119.48037089 2320.54739882 2397.26629104 2145.88570792 1766.07374072
   1736.80387177 1822.25630849 2190.46177352 2256.44409196 2035.21424657
   1783.35530579 1544.45478312 1442.48683767 1202.76668273  804.89228613
    530.1544812   276.69867619  167.86210843]]]
[[[2119.34421378 2320.4575434  2397.24826605 2145.84241564 1765.99666659
   1736.69763989 1822.02233407 2190.36765608 2256.40286045 2035.11036889
   1783.18696223 1544.38767901 1442.47402104 1202.762

