# 1. Read data

In [4]:
import pandas as pd
import numpy as np
from collections import defaultdict

In [5]:
# Load the population table from the CSV in the working directory.
raw_data = pd.read_csv('above1000.csv')
# Count the distinct SA3 codes present (a NaN code, if any, counts as one value).
sa3_num = raw_data['SA3 Code'].unique().size

In [6]:
# Preview the first rows of the loaded table as a quick sanity check.
raw_data.head()

Unnamed: 0.1,Unnamed: 0,index,Year,S/T Code,S/T Name,GCCSA Code,GCCSA Name,SA4 Code,SA4 Name,SA3 Code,...,f45-49,f50-54,f55-59,f60-64,f65-69,f70-74,f75-79,f80-84,f85+,Total
0,0,1,1991,1.0,New South Wales,12.0,Rest of NSW,101.0,Capital Region,10101.0,...,1859,1591,1412,1382,1252,967,799,493,353,61667
1,1,2,1991,1.0,New South Wales,12.0,Rest of NSW,101.0,Capital Region,10102.0,...,991,745,611,575,544,393,275,177,105,35281
2,2,3,1991,1.0,New South Wales,12.0,Rest of NSW,101.0,Capital Region,10103.0,...,504,413,328,320,258,237,152,97,59,18092
3,3,4,1991,1.0,New South Wales,12.0,Rest of NSW,101.0,Capital Region,10104.0,...,1318,1370,1651,2166,2075,1437,958,455,314,53440
4,4,5,1991,1.0,New South Wales,11.0,Greater Sydney,102.0,Central Coast,10201.0,...,3709,2884,2884,3567,4002,3573,2661,1565,965,135708


In [7]:
# List every column name; the m*/f* columns are the per-sex age-band counts summed below.
raw_data.columns

Index(['Unnamed: 0', 'index', 'Year', 'S/T Code', 'S/T Name', 'GCCSA Code',
       'GCCSA Name', 'SA4 Code', 'SA4 Name', 'SA3 Code', 'SA3 Name', 'm0-4',
       'm5-9', 'm10-14', 'm15-19', 'm20-24', 'm25-29', 'm30-34', 'm35-39',
       'm40-44', 'm45-49', 'm50-54', 'm55-59', 'm60-64', 'm65-69', 'm70-74',
       'm75-79', 'm80-84', 'm85+', 'f0-4', 'f5-9', 'f10-14', 'f15-19',
       'f20-24', 'f25-29', 'f30-34', 'f35-39', 'f40-44', 'f45-49', 'f50-54',
       'f55-59', 'f60-64', 'f65-69', 'f70-74', 'f75-79', 'f80-84', 'f85+',
       'Total'],
      dtype='object')

In [8]:
# Male age-band columns: 'm0-4', 'm5-9', ..., 'm80-84', plus the open-ended 'm85+'.
male_age_cols = [f'm{lo}-{lo + 4}' for lo in range(0, 85, 5)] + ['m85+']

# Vectorized row total across the male age bands (replaces a slow row-wise apply).
# skipna=False keeps the original "+" semantics: a missing band makes the total NaN.
raw_data['m_total'] = raw_data[male_age_cols].sum(axis=1, skipna=False)

In [9]:
# Female age-band columns: 'f0-4', 'f5-9', ..., 'f80-84', plus the open-ended 'f85+'.
female_age_cols = [f'f{lo}-{lo + 4}' for lo in range(0, 85, 5)] + ['f85+']

# Vectorized row total across the female age bands (replaces a slow row-wise apply).
# skipna=False keeps the original "+" semantics: a missing band makes the total NaN.
raw_data['f_total'] = raw_data[female_age_cols].sum(axis=1, skipna=False)

In [10]:
# Sanity check: the male and female subtotals must add up to the supplied 'Total' on every row.
subtotal_sum = raw_data['m_total'] + raw_data['f_total']
assert subtotal_sum.eq(raw_data['Total']).all()

In [11]:
# Keep only the identifier columns, the per-sex age bands and the totals;
# the state / GCCSA / SA4 columns are dropped. Column order is preserved.
kept_columns = [
    'Unnamed: 0', 'index', 'Year', 'SA3 Code', 'SA3 Name',
    # male age bands
    'm0-4', 'm5-9', 'm10-14', 'm15-19', 'm20-24', 'm25-29',
    'm30-34', 'm35-39', 'm40-44', 'm45-49', 'm50-54', 'm55-59',
    'm60-64', 'm65-69', 'm70-74', 'm75-79', 'm80-84', 'm85+',
    # female age bands
    'f0-4', 'f5-9', 'f10-14', 'f15-19', 'f20-24', 'f25-29',
    'f30-34', 'f35-39', 'f40-44', 'f45-49', 'f50-54', 'f55-59',
    'f60-64', 'f65-69', 'f70-74', 'f75-79', 'f80-84', 'f85+',
    # totals
    'm_total', 'f_total', 'Total',
]
raw_data = raw_data[kept_columns]

In [12]:
# Distinct SA3 codes in order of first appearance; used below to key the per-region population dict.
sa3_codes = raw_data['SA3 Code'].unique()

# 2. Split into train, validation, and test sets

In [13]:
# Chronological split on 'Year' (both bounds inclusive):
#   train 1991-2001, validation 2002-2005, test 2006-2011.
survey_year = raw_data['Year']
train_data = raw_data[survey_year.between(1991, 2001)]
val_data = raw_data[survey_year.between(2002, 2005)]
# NOTE(review): the original comment here read "remove vali 10-10"; its meaning is unclear — confirm.
test_data = raw_data[survey_year.between(2006, 2011)]

In [22]:
# Create population dict for each region each year:
#   population_per_year_dict[sa3_code][year] -> 'Total' of the first matching row.
# Built in a single pass over the table instead of re-filtering the whole frame
# twice for every (region, year) pair.
population_per_year_dict = defaultdict(dict)
# Pre-seed every region so the key order follows sa3_codes, and regions with no
# rows in the year range still get an (empty) entry.
for sa3_code in sa3_codes:
    population_per_year_dict[sa3_code] = dict()

yearly_totals = (
    raw_data.loc[raw_data['Year'].isin(range(1991, 2012)), ['SA3 Code', 'Year', 'Total']]
    # a NaN code never matched the original equality filter, so drop such rows
    .dropna(subset=['SA3 Code'])
    # if a (region, year) pair occurs more than once, keep the first row
    .drop_duplicates(subset=['SA3 Code', 'Year'], keep='first')
    # insert years in ascending order so each inner dict is chronologically ordered
    .sort_values('Year', kind='stable')
)
for sa3_code, year, total in yearly_totals.itertuples(index=False, name=None):
    population_per_year_dict[sa3_code][int(year)] = total

In [23]:
# Display the dict as a table: one column per SA3 code, one row per year (NaN where a region has no entry for that year).
pd.DataFrame(population_per_year_dict)

Unnamed: 0,10101.0,10102.0,10103.0,10104.0,10201.0,10202.0,10301.0,10302.0,10303.0,10304.0,...,70205.0,80101.0,80103.0,80105.0,80106.0,80107.0,80108.0,80109.0,80104.0,90101.0
1991,61667,35281,18092,53440,135708,104122,36738,60023,43227,48674,...,15699,89443,1815,39850,20955,74412,27118,33921,,
1992,61751,36409,18263,54889,139242,107654,37164,60213,43364,49161,...,16178,88685,1795,39696,20955,81051,26663,33967,,
1993,61706,37298,18362,55973,142034,110404,37509,60035,43381,49512,...,16642,87620,1789,39210,21156,85293,26038,33666,3755.0,
1994,61651,38190,18477,57127,144430,113134,37984,60038,43409,49625,...,17127,86181,1745,38961,21262,87886,25557,33311,6746.0,1501.0
1995,61602,38936,18568,57799,147027,115558,38442,59652,43427,49588,...,17555,85706,1652,38596,21481,89386,25047,33202,10233.0,1644.0
1996,61786,39563,18699,58677,149899,118851,38792,59383,43392,49636,...,18038,85987,1587,38617,21618,90507,24874,33192,12741.0,1832.0
1997,61968,40322,18589,59581,151861,121670,39149,59308,43371,50036,...,18231,85246,1561,38332,21911,90439,24472,32827,15234.0,1675.0
1998,62283,40883,18706,60599,153483,124406,39518,59248,43345,50836,...,18497,85049,1519,38033,21924,90387,24090,32673,17344.0,1607.0
1999,62576,42104,18826,61743,155897,127424,39979,59152,43624,51464,...,18703,85041,1474,37995,22101,90620,23969,32644,19807.0,1545.0
2000,62952,44026,18748,62960,158288,130656,40308,59010,43593,51902,...,18860,85577,1441,38242,22157,91058,23749,32627,21860.0,1472.0


In [24]:
def split_sequence(sequence, n_steps):
    """Turn a univariate series into supervised (window, next value) pairs.

    Each sample is a run of ``n_steps`` consecutive items of ``sequence``;
    its target is the item that immediately follows that run.

    Args:
        sequence: Indexable, sliceable series of values (list or array).
        n_steps: Number of consecutive values in each input window.

    Returns:
        Tuple ``(X, y)`` of numpy arrays: ``X`` stacks the input windows and
        ``y`` holds the matching targets. Both are empty when the series has
        no more than ``n_steps`` items.
    """
    total = len(sequence)
    # A window starting at `start` needs its target at index start + n_steps,
    # so start + n_steps must not exceed total - 1 (and start stays below total).
    n_windows = min(total, total - n_steps)
    starts = range(n_windows)
    windows = [sequence[start:start + n_steps] for start in starts]
    targets = [sequence[start + n_steps] for start in starts]
    return np.array(windows), np.array(targets)

In [54]:
# Yearly totals for a single region (SA3 code 10101.0), in the dict's stored year order.
region_series = population_per_year_dict[10101.0]
sample_data = [region_series[year] for year in region_series]
# Each input window covers this many consecutive years.
n_steps = 5
# Build the (window, next-year value) training pairs.
X, y = split_sequence(sample_data, n_steps)
# Show every window next to the value it should predict.
for window, target in zip(X, y):
    print(window, target)

[61667 61751 61706 61651 61602] 61786
[61751 61706 61651 61602 61786] 61968
[61706 61651 61602 61786 61968] 62283
[61651 61602 61786 61968 62283] 62576
[61602 61786 61968 62283 62576] 62952
[61786 61968 62283 62576 62952] 63370
[61968 62283 62576 62952 63370] 63784
[62283 62576 62952 63370 63784] 64245
[62576 62952 63370 63784 64245] 64550
[62952 63370 63784 64245 64550] 64891
[63370 63784 64245 64550 64891] 65284
[63784 64245 64550 64891 65284] 65807
[64245 64550 64891 65284 65807] 66670
[64550 64891 65284 65807 66670] 67745
[64891 65284 65807 66670 67745] 68950
[65284 65807 66670 67745 68950] 69775


In [55]:
# Positional split of the windowed samples. With n_steps = 5 and a series that
# starts in 1991 (assumed — holds for the region sampled above), target y[k] is
# year 1996 + k, so these cut points mirror the year-based split:
#   y[:6] -> 1996-2001, y[6:10] -> 2002-2005, y[10:] -> 2006 onwards.
train_rows = slice(None, 6)
val_rows = slice(6, 10)
test_rows = slice(10, None)

# train data
train_x, train_y = X[train_rows], y[train_rows]
# validation data
val_x, val_y = X[val_rows], y[val_rows]
# test data
test_x, test_y = X[test_rows], y[test_rows]

In [56]:
# univariate lstm example
from numpy import array
from keras.models import Sequential
from keras.layers import LSTM
from keras.layers import Dense

In [62]:
# reshape from [samples, timesteps] into [samples, timesteps, features]
n_features = 1
train_x = train_x.reshape((train_x.shape[0], train_x.shape[1], n_features))
# define model: a single 50-unit LSTM layer feeding one linear output unit
# NOTE(review): no random seed is set in this cell, so fitted weights and the
# printed predictions will differ between runs.
model = Sequential()
model.add(LSTM(50, activation='relu', input_shape=(n_steps, n_features)))
model.add(Dense(1))
model.compile(optimizer='adam', loss='mse')
# fit model
model.fit(train_x, train_y, epochs=200, verbose=0)

# demonstrate prediction on the validation windows
# val_x is deliberately NOT reassigned here: the earlier version overwrote it
# with a single reshaped sample, so the validation inputs were lost after this
# cell ran and a re-run predicted only one row.
x_input = val_x
val_inputs = x_input.reshape((len(x_input), n_steps, n_features))
# one batched predict call instead of one call per sample
val_pred = model.predict(val_inputs, verbose=0)
for i in range(len(val_pred)):
    # keep the (1, 1) shape so each prediction prints as [[value]]
    yhat = val_pred[i:i + 1]
    print(yhat)





2022-07-09 14:13:30.535132: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:113] Plugin optimizer for device_type GPU is enabled.


[[63148.94]]
[[63507.652]]
[[63890.605]]
[[64290.35]]


2022-07-09 14:13:37.412380: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:113] Plugin optimizer for device_type GPU is enabled.


In [63]:
# Actual validation targets, shown for comparison with the predictions printed above.
val_y

array([63784, 64245, 64550, 64891])