
1. Read data

In [1]:
import pandas as pd
import numpy as np
from collections import defaultdict

In [None]:
from google.colab import files
data = files.upload()

In [2]:
#raw_data = pd.read_csv('true_1000_fulldata.csv')

In [3]:
raw_data = pd.read_csv('/content/drive/MyDrive/Colab/true_1000_fulldata.csv')


In [None]:
sa3_num = len(raw_data['SA3 Code'].unique())
sa3_names = raw_data['SA3 Name'].unique()

In [6]:
raw_data['m_total'] = raw_data.apply(lambda x: x['m0-4']+x['m5-9']+x['m10-14']+x['m15-19']+
                                               x['m20-24']+x['m25-29']+x['m30-34']+x['m35-39']+
                                               x['m40-44']+x['m45-49']+x['m50-54']+x['m55-59']+
                                               x['m60-64']+x['m65-69']+x['m70-74']+x['m75-79']+
                                               x['m80-84']+x['m85+'], axis=1)

In [7]:
raw_data['f_total'] = raw_data.apply(lambda x: x['f0-4']+x['f5-9']+x['f10-14']+x['f15-19']+
                                               x['f20-24']+x['f25-29']+x['f30-34']+x['f35-39']+
                                               x['f40-44']+x['f45-49']+x['f50-54']+x['f55-59']+
                                               x['f60-64']+x['f65-69']+x['f70-74']+x['f75-79']+
                                               x['f80-84']+x['f85+'], axis=1)

In [8]:
assert((raw_data['m_total']+raw_data['f_total'] == raw_data['Total']).all())

In [10]:
sa3_codes = raw_data['SA3 Code'].unique()

2. Split Train, Validation, Test set

In [11]:
train_data = raw_data[(raw_data['Year']>=1991) & (raw_data['Year']<=2001)]
#val_data = raw_data[(raw_data['Year']>=2002) & (raw_data['Year']<=2005)]
# remove vali 10-10
test_data = raw_data[(raw_data['Year']>=2002) & (raw_data['Year']<=2011)]

In [12]:
# Split the data by sex
age_groups = ['0-4','5-9', '10-14', '15-19', '20-24', '25-29', '30-34', '35-39','40-44', '45-49', '50-54', '55-59', '60-64', '65-69', '70-74','75-79', '80-84', '85+']
population_m_dict = defaultdict(dict)
population_f_dict = defaultdict(dict)

In [13]:
# save population for each year into lists
for sa3_code in sa3_codes:
  population_m_dict[sa3_code] = dict()
  population_f_dict[sa3_code] = dict()
  for year in range(1991,2012):
    if(raw_data[(raw_data['Year']==year) & (raw_data['SA3 Code']==sa3_code)]['Total'].size>0):
      population_m_dict[sa3_code][year] = raw_data[(raw_data['Year']==year) & (raw_data['SA3 Code']==sa3_code)][['m0-4',
                              'm5-9', 'm10-14', 'm15-19', 'm20-24', 'm25-29', 'm30-34', 'm35-39',
                              'm40-44', 'm45-49', 'm50-54', 'm55-59', 'm60-64', 'm65-69', 'm70-74',
                              'm75-79', 'm80-84', 'm85+']].values.tolist()[0]
      population_f_dict[sa3_code][year] = raw_data[(raw_data['Year']==year) & (raw_data['SA3 Code']==sa3_code)][['f0-4', 'f5-9', 'f10-14', 'f15-19',
                              'f20-24', 'f25-29', 'f30-34', 'f35-39', 'f40-44', 'f45-49', 'f50-54',
                              'f55-59', 'f60-64', 'f65-69', 'f70-74', 'f75-79', 'f80-84', 'f85+']].values.tolist()[0]

In [None]:
pd.DataFrame(population_m_dict)

In [15]:
def split_sequence(sequence, n_steps):
	X, y = list(), list()
	for i in range(len(sequence)):
		# find the end of this pattern
		end_ix = i + n_steps
		# check if we are beyond the sequence
		if end_ix > len(sequence)-1:
			break
		# gather input and output parts of the pattern
		seq_x, seq_y = sequence[i:end_ix], sequence[end_ix]
		X.append(seq_x)
		y.append(seq_y)
	return np.array(X), np.array(y)

## 3.Train and Predict

In [16]:
# univariate lstm example
from numpy import array
from keras.models import Sequential
from keras.layers import LSTM
from keras.layers import Dense

In [None]:
output = pd.DataFrame(index = range(sa3_num*36), columns = ['Code','Area name','Sex','Age group',2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010, 2011])
for sa3_code in range(0,sa3_num):
  for age_group in range(18):
    output.loc[sa3_code*36+age_group] = {'Code':sa3_codes[sa3_code],'Area name':sa3_names[sa3_code],'Sex':'Females','Age group':age_groups[age_group]}
    output.loc[sa3_code*36+18+age_group] = {'Code':sa3_codes[sa3_code],'Area name':sa3_names[sa3_code],'Sex':'Males','Age group':age_groups[age_group]}

In [18]:
# choose a number of time steps
n_steps = 5
# reshape from [samples, timesteps] into [samples, timesteps, features]
n_features = 18
n_age_groups = 18

In [35]:
###
train_lb = None
train_ub = 6
train_bounds = slice(train_lb,train_ub)

val_lb = 6
val_ub = 10
val_bounds = slice(val_lb,val_ub)

test_lb = 11
test_ub = None
test_bounds = test_lb

In [36]:
def lstm_model(lstm1_units, activ, optimizer, loss_fun):
    model = Sequential()
    model.add(LSTM(lstm1_units, activation=activ, input_shape=(n_steps, n_features)))
    model.add(Dense(n_age_groups))
    model.compile(optimizer=optimizer, loss=loss_fun, metrics=["mean_absolute_percentage_error"])
    
    return model

In [37]:
###HYPERPARAMETERS
lstm1_min = 100
lstm1_max = 1000
lstm1_step = 100

activ_functions = ["relu"]
optimizers = ["adam","adagrad"]
loss_functions = ["mse"]

def build_model(hp):
    lstm1_units = hp.Int("LSTM units", min_value=lstm1_min,max_value=lstm1_max,step=lstm1_step)
    activation = hp.Choice("Activation Function", activ_functions)
    optimizer = hp.Choice("Optimizer", optimizers)
    loss_fun = hp.Choice("Loss Function", loss_functions)
    
    model = lstm_model(lstm1_units, activation, optimizer, loss_fun)
    return model
    

In [38]:
import keras_tuner
#tuner_objective = "val_accuracy"


tuner = keras_tuner.RandomSearch(hypermodel=build_model, objective=keras_tuner.Objective("val_mean_absolute_percentage_error", "min"))

INFO:tensorflow:Reloading Oracle from existing project .\untitled_project\oracle.json
INFO:tensorflow:Reloading Tuner from .\untitled_project\tuner0.json


In [None]:
for code in sa3_codes:
    male_sample_data = list(population_m_dict[code].values())
    X, y = split_sequence(male_sample_data, n_steps)
    train_x = X[train_bounds]
    train_y = y[train_bounds]
    val_x = X[val_bounds]
    val_y = y[val_bounds]
    test_x = X[test_bounds]
    test_y = y[test_bounds]
    train_x = train_x.reshape((train_x.shape[0], train_x.shape[1], n_features))
    
    tuner.search(train_x, train_y, epochs=200, validation_data=(val_x, val_y), verbose=0)
    best_model = tuner.get_best_models(num_models=2)[0]
    
    history = best_model.fit(train_x, train_y, epochs=200, verbose=0)
    
    x_input = test_x
    x_input = x_input.reshape((1, n_steps, n_features))
    
    prediction_list = []
    
    for iter in range(10):
        prediction = best_model.predict(x_input, verbose=0)
        prediction_list.append(prediction)
        prediction = prediction.reshape(1,1,18)
        output.loc[(output['Code']==code)&(output['Sex']=='Males'),2002+iter]=prediction
        x_input = np.hstack((x_input,prediction)) #add the latest prediction
        x_input = x_input[0][1:].reshape(1,n_steps,n_features)  #delete the first value

In [None]:
for code in sa3_codes:
    female_sample_data = list(population_m_dict[code].values())
    X, y = split_sequence(female_sample_data, n_steps)
    train_x = X[train_bounds]
    train_y = y[train_bounds]
    val_x = X[val_bounds]
    val_y = y[val_bounds]
    test_x = X[test_bounds]
    test_y = y[test_bounds]
    train_x = train_x.reshape((train_x.shape[0], train_x.shape[1], n_features))
    
    tuner.search(train_x, train_y, epochs=200, validation_data=(val_x, val_y), verbose=0)
    best_model = tuner.get_best_models(num_models=2)[0]
    
    history = best_model.fit(train_x, train_y, epochs=200, verbose=0)
    
    x_input = test_x
    x_input = x_input.reshape((1, n_steps, n_features))
    
    prediction_list = []
    
    for iter in range(10):
        prediction = best_model.predict(x_input, verbose=0)
        prediction_list.append(prediction)
        prediction = prediction.reshape(1,1,18)
        output.loc[(output['Code']==code)&(output['Sex']=='Females'),2002+iter]=prediction
        x_input = np.hstack((x_input,prediction)) #add the latest prediction
        x_input = x_input[0][1:].reshape(1,n_steps,n_features)  #delete the first value

In [None]:
output.to_csv('/content/drive/MyDrive/Colab/1000_Iterative_predict_output.csv',index=False,header=True)