# Imports

In [1]:
from collections import defaultdict

import IPython
import IPython.display
import matplotlib as mpl
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import tensorflow as tf
import tensorflow_probability as tfp
import keras_tuner as kt

mpl.rcParams['figure.figsize'] = (8, 6)
mpl.rcParams['axes.grid'] = False

# Function definitions

In [2]:
#new

def split_ts_data(data, val_start, test_start):
    year_min = min(data['Year'])
    year_max = max(data['Year'])
    year_range = year_max-year_min
    
    assert (val_start >= year_min) & (test_start >= year_min) & (val_start <= year_max) & (test_start <= year_max), "Parameter out of bounds"
    assert (val_start > year_min) & (test_start > year_min), "Training set is empty."
    assert val_start < test_start, "Validation set is empty."
    assert year_range > 0, "Data contains less than 2 years."
    
    
    train_data = data[(data['Year']<val_start) & (data['Year']<test_start)]
    val_data = data[(data['Year']>=val_start) & (data['Year']<test_start)]
    test_data = data[data['Year']>=test_start]
    
    return train_data, val_data, test_data

In [3]:
def make_dataset(df, input_width, label_width, shift):
    def create_window(tensor):
        #input -> length of time series used for training
        #shift -> how far off prediction is from last input
        #label -> points to predict
        total_window_size = input_width + shift
        label_start = total_window_size - label_width

        input_bounds = slice(0, input_width)
        label_bounds = slice(label_start, None)

        inputs = tensor[:,input_bounds,:]
        labels = tensor[:,label_bounds,:]

        inputs.set_shape([None, input_width, None])
        labels.set_shape([None, label_width, None])

        return inputs, labels
    
    total_window_size = input_width + shift
    
    arr = np.array(df, dtype=np.float32)
    ds = tf.keras.utils.timeseries_dataset_from_array(
      data=arr,
      targets=None,
      sequence_length=total_window_size,
      sequence_stride=1,
      shuffle=False,
      batch_size=32,)
    
    ds = ds.map(create_window)
    
    return ds

In [4]:
def compile_and_fit(model, num_epochs, input_optimizer='adam', input_loss='mse'):
    model.compile(optimizer=input_optimizer, loss=input_loss)
    history = model.fit(x=train_inputs,y=train_labels, batch_size = 32, epochs=num_epochs, validation_data=val_ds, shuffle=False)
    
    return history

In [5]:
def col_dict(np_df):
    return_dict = {col:index for index, col in enumerate(np_df.columns)}
    
    return return_dict

In [6]:
def plot(df, ds, input_width, label_width, shift, model=None, plot_col='10101 m0.4', max_subplots=3):
    #ensure that df and ds match e.g. train_df must be accompanied by train_ds
    col_indices = col_dict(df)
    
    total_window_size = label_width + shift
    input_slice = slice(0,input_width)
    input_indices = np.arange(total_window_size)[input_slice]
    label_start = total_window_size - label_width
    labels_slice = slice(label_start, None)
    label_indices = np.arange(total_window_size)[labels_slice]
    
    inputs = next(iter(ds))[0]
    labels = next(iter(ds))[1]
    plt.figure(figsize=(12, 8))
    plot_col_index = col_indices[plot_col] 
    max_n = min(max_subplots, len(inputs))
    
    for n in range(max_n):
        plt.subplot(max_n, 1, n+1)
        plt.ylabel(plot_col)
        plt.plot(input_indices, inputs[n, :, plot_col_index],
             label='Inputs', marker='.', zorder=-10)
        
        plt.scatter(label_indices, labels[n, :, plot_col_index],
                edgecolors='k', label='Labels', c='#2ca02c', s=64)
        
        if model is not None:
          predictions = model(inputs)
          plt.scatter(label_indices, predictions[n, :, plot_col_index],
                  marker='X', edgecolors='k', label='Predictions',
                  c='#ff7f0e', s=64)
            
        if n == 0:
          plt.legend()
        
    plt.xlabel('Year')

# Main code

## Edit parameters here, but do not rename variables

## Read, preprocess data

In [8]:
raw_data = pd.read_csv('../Data/newSA3.csv')



#Parameters
validation_start = 2002
test_start = 2006
#



train_df, val_df, test_df = split_ts_data(raw_data, validation_start, test_start)

train_df = train_df[train_df.columns.difference(["Unnamed: 0","Year"])]
val_df = val_df[val_df.columns.difference(["Unnamed: 0","Year"])]
test_df = test_df[test_df.columns.difference(["Unnamed: 0","Year"])]

In [9]:
## Create datasets

In [10]:
#Parameters
input_width = 2 #data used in prediction
label_width = 1 #points to predict
shift = 1 #how many years away is the last point to predict
#



train_ds = make_dataset(train_df, input_width, label_width, shift)
val_ds = make_dataset(val_df, input_width, label_width, shift)
test_ds = make_dataset(test_df, input_width, label_width, shift)

num_cols = next(iter(train_ds))[0].shape[2]

Metal device set to: Apple M1 Pro


2022-07-31 13:19:45.531275: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:305] Could not identify NUMA node of platform GPU ID 0, defaulting to 0. Your kernel may not have been built with NUMA support.
2022-07-31 13:19:45.531583: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:271] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 0 MB memory) -> physical PluggableDevice (device: 0, name: METAL, pci bus id: <undefined>)
2022-07-31 13:19:45.660168: W tensorflow/core/platform/profile_utils/cpu_utils.cc:128] Failed to get CPU frequency: 0 Hz


## Create and fit model

In [11]:
class SACohortModel(kt.HyperModel):
    def build(self,hp):
        #### Hyperparameters
        # add hyperparameters as needed when adding layers
        
        ##layer hyperparameters
        hp_lstm1_units = hp.Choice('units',[10,30,50])
        hp_lstm1_act = hp.Choice('activation', ["relu"])

        ##model hyperparameters -> adjust tf.keras.models type and model.add layers
        model = tf.keras.models.Sequential()
        model.add(tf.keras.layers.LSTM(units = hp_lstm1_units, 
                                       activation=hp_lstm1_act, 
                                       return_sequences=False))
        
        model.add(tf.keras.layers.Dense(label_width * num_cols))
        model.add(tf.keras.layers.Reshape([label_width,num_cols]))
        
        ##compilation hyperparameters
        hp_epochs = hp.Choice("epochs",[10,20,30])
        hp_input_optimizer = hp.Choice('input_optimizer',["adam", "adadelta"])
        loss_fun = "mse"
        
        ####
        
        #Do not edit
        model.compile(loss = loss_fun)
        
        return model
        #Do not edit

In [12]:
#Parameter
num_epochs = 10
#

train_inputs = next(iter(train_ds))[0]
train_labels = next(iter(train_ds))[1]

val_inputs = next(iter(val_ds))[0]
val_labels = next(iter(val_ds))[1]

test_inputs = next(iter(test_ds))[0]

tuner = kt.RandomSearch(
    SACohortModel(),
    objective='val_loss',
    max_trials=5)

tuner.search(train_inputs, train_labels, epochs = num_epochs, validation_data = (val_inputs, val_labels))

Trial 5 Complete [00h 00m 02s]
val_loss: 4691693.0

Best val_loss So Far: 1078770.75
Total elapsed time: 00h 00m 10s
INFO:tensorflow:Oracle triggered exit


## Create model with above parameters

In [13]:
model_optimizer = 'adam'
loss_fun = 'mse'


np.random.seed(1337)
full_model = tf.keras.models.Sequential()
full_model.add(tf.keras.layers.LSTM(units = 30, 
                                       activation="relu", 
                                       return_sequences=False))
        
full_model.add(tf.keras.layers.Dense(label_width * num_cols))
full_model.add(tf.keras.layers.Reshape([label_width,num_cols]))
compile_and_fit(full_model, num_epochs=20, input_optimizer=model_optimizer, input_loss=loss_fun)

Epoch 1/20


2022-07-31 13:19:56.386305: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:113] Plugin optimizer for device_type GPU is enabled.


Epoch 2/20
Epoch 3/20
Epoch 4/20

2022-07-31 13:19:56.668659: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:113] Plugin optimizer for device_type GPU is enabled.


Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<keras.callbacks.History at 0x29a049fd0>

In [14]:
full_model.summary()

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 lstm_1 (LSTM)               (None, 30)                1407720   
                                                                 
 dense_1 (Dense)             (None, 11700)             362700    
                                                                 
 reshape_1 (Reshape)         (None, 1, 11700)          0         
                                                                 
Total params: 1,770,420
Trainable params: 1,770,420
Non-trainable params: 0
_________________________________________________________________


## Store full model prediction

In [15]:
#1991-2001

full_train_inputs = next(iter(train_ds))[0] #pairs from 1991-2000
full_train_labels = next(iter(train_ds))[1] #1993-2001
full_train_predictions = full_model(full_train_inputs) #1993-2001

In [16]:
full_train_labels

<tf.Tensor: shape=(9, 1, 11700), dtype=float32, numpy=
array([[[2382., 2354., 2126., ...,  209.,  105.,   59.]],

       [[2357., 2351., 2084., ...,  212.,  112.,   65.]],

       [[2318., 2357., 2055., ...,  247.,  116.,   69.]],

       ...,

       [[2132., 2348., 2039., ...,  389.,  163.,   95.]],

       [[2107., 2335., 2029., ...,  439.,  190.,  103.]],

       [[2077., 2259., 2051., ...,  457.,  221.,  119.]]], dtype=float32)>

In [17]:
full_train_predictions

<tf.Tensor: shape=(9, 1, 11700), dtype=float32, numpy=
array([[[0.02000188, 0.01997657, 0.0200624 , ..., 0.01983915,
         0.01831472, 0.02011553]],

       [[0.02000188, 0.01997657, 0.0200624 , ..., 0.01983915,
         0.01831472, 0.02011553]],

       [[0.02000188, 0.01997657, 0.0200624 , ..., 0.01983915,
         0.01831472, 0.02011553]],

       ...,

       [[0.02000188, 0.01997657, 0.0200624 , ..., 0.01983915,
         0.01831472, 0.02011553]],

       [[0.02000188, 0.01997657, 0.0200624 , ..., 0.01983915,
         0.01831472, 0.02011553]],

       [[0.02000188, 0.01997657, 0.0200624 , ..., 0.01983915,
         0.01831472, 0.02011553]]], dtype=float32)>

In [18]:
#2002-2005

full_val_inputs = next(iter(val_ds))[0] #pairs 2002,2003 and 2003,2004
full_val_labels = next(iter(val_ds))[1] #2004 and 2005
full_val_predictions = full_model(val_inputs) #2004 and 2005

In [19]:
full_test_inputs = next(iter(test_ds))[0] #pairs from 2006-2010
full_test_labels = next(iter(test_ds))[1] #2008-2011
full_test_predictions = full_model(test_inputs) #2008-2011

In [20]:
train_df

Unnamed: 0,10101 f0.4,10101 f10.14,10101 f15.19,10101 f20.24,10101 f25.29,10101 f30.34,10101 f35.39,10101 f40.44,10101 f45.49,10101 f5.9,...,80109 m45.49,80109 m5.9,80109 m50.54,80109 m55.59,80109 m60.64,80109 m65.69,80109 m70.74,80109 m75.79,80109 m80.84,80109 m85.
0,2400,2347,2277,1820,2152,2315,2185,2146,1859,2495,...,1219,1002,1226,920,784,606,328,180,98,50
1,2392,2344,2194,1830,2067,2339,2204,2139,1954,2480,...,1257,1009,1209,931,794,617,376,200,104,55
2,2382,2354,2126,1813,1977,2329,2225,2139,2037,2451,...,1285,993,1154,963,792,634,413,209,105,59
3,2357,2351,2084,1777,1913,2316,2232,2158,2079,2418,...,1284,988,1123,998,778,649,459,212,112,65
4,2318,2357,2055,1734,1869,2272,2256,2165,2111,2403,...,1277,998,1105,1020,819,648,486,247,116,69
5,2280,2360,2060,1656,1869,2218,2290,2186,2150,2400,...,1244,1015,1112,1051,833,672,513,277,124,74
6,2242,2327,2054,1555,1905,2104,2340,2208,2153,2393,...,1188,985,1117,1086,835,676,527,316,123,79
7,2171,2338,2051,1480,1912,2054,2403,2171,2153,2400,...,1157,975,1127,1083,847,690,572,358,142,88
8,2132,2348,2039,1450,1892,2005,2395,2217,2195,2400,...,1149,975,1123,1051,851,694,598,389,163,95
9,2107,2335,2029,1452,1848,1991,2363,2251,2201,2393,...,1122,967,1163,997,892,717,577,439,190,103


In [21]:
full_train_inputs #pairs from 1991-2000

<tf.Tensor: shape=(9, 2, 11700), dtype=float32, numpy=
array([[[2400., 2347., 2277., ...,  180.,   98.,   50.],
        [2392., 2344., 2194., ...,  200.,  104.,   55.]],

       [[2392., 2344., 2194., ...,  200.,  104.,   55.],
        [2382., 2354., 2126., ...,  209.,  105.,   59.]],

       [[2382., 2354., 2126., ...,  209.,  105.,   59.],
        [2357., 2351., 2084., ...,  212.,  112.,   65.]],

       ...,

       [[2242., 2327., 2054., ...,  316.,  123.,   79.],
        [2171., 2338., 2051., ...,  358.,  142.,   88.]],

       [[2171., 2338., 2051., ...,  358.,  142.,   88.],
        [2132., 2348., 2039., ...,  389.,  163.,   95.]],

       [[2132., 2348., 2039., ...,  389.,  163.,   95.],
        [2107., 2335., 2029., ...,  439.,  190.,  103.]]], dtype=float32)>

In [22]:
#2000-2001 input for predicting 2002
input_2002 = tf.stack([full_train_labels[7,0,:], full_train_labels[8,0,:]],0)
#2001-2002 input for predicting 2003
input_2003 = tf.stack([full_train_labels[8,0,:], full_val_inputs[0,0,:]],0)
#2000-2001 and 2001-2002 inputs as tensor
input_2002_2003 = tf.stack([input_2002,input_2003],0)

input_2002_2003

<tf.Tensor: shape=(2, 2, 11700), dtype=float32, numpy=
array([[[2107., 2335., 2029., ...,  439.,  190.,  103.],
        [2077., 2259., 2051., ...,  457.,  221.,  119.]],

       [[2077., 2259., 2051., ...,  457.,  221.,  119.],
        [2068., 2272., 2072., ...,  452.,  253.,  128.]]], dtype=float32)>

In [23]:
val_df

Unnamed: 0,10101 f0.4,10101 f10.14,10101 f15.19,10101 f20.24,10101 f25.29,10101 f30.34,10101 f35.39,10101 f40.44,10101 f45.49,10101 f5.9,...,80109 m45.49,80109 m5.9,80109 m50.54,80109 m55.59,80109 m60.64,80109 m65.69,80109 m70.74,80109 m75.79,80109 m80.84,80109 m85.
11,2068,2272,2072,1450,1682,2080,2189,2323,2239,2319,...,1114,984,1132,986,960,692,590,452,253,128
12,2033,2317,2078,1466,1622,2116,2172,2394,2217,2299,...,1115,990,1078,1003,920,713,601,454,285,127
13,2035,2372,2058,1481,1585,2089,2152,2419,2220,2256,...,1136,978,1065,993,887,728,584,476,298,129
14,2038,2406,2052,1520,1571,2025,2171,2403,2240,2216,...,1192,979,1027,1030,828,754,599,471,313,156


In [24]:
full_val_inputs #pairs from 2002-2004

<tf.Tensor: shape=(2, 2, 11700), dtype=float32, numpy=
array([[[2068., 2272., 2072., ...,  452.,  253.,  128.],
        [2033., 2317., 2078., ...,  454.,  285.,  127.]],

       [[2033., 2317., 2078., ...,  454.,  285.,  127.],
        [2035., 2372., 2058., ...,  476.,  298.,  129.]]], dtype=float32)>

In [25]:
#2004-2005 input for predicting 2006
input_2006 = tf.stack([full_val_labels[0,0,:],full_val_labels[1,0,:]],0)
#2005-2006 input for predicting 2007
input_2007 = tf.stack([full_val_labels[1,0,:], full_test_inputs[0,0,:]],0)
#2004-2005 and 2005-2006 inputs as tensor
input_2006_2007 = tf.stack([input_2006,input_2007],0)

input_2006_2007

<tf.Tensor: shape=(2, 2, 11700), dtype=float32, numpy=
array([[[2035., 2372., 2058., ...,  476.,  298.,  129.],
        [2038., 2406., 2052., ...,  471.,  313.,  156.]],

       [[2038., 2406., 2052., ...,  471.,  313.,  156.],
        [2005., 2428., 2034., ...,  499.,  325.,  195.]]], dtype=float32)>

In [26]:
full_test_inputs #pairs from 2006-2010

<tf.Tensor: shape=(4, 2, 11700), dtype=float32, numpy=
array([[[2005., 2428., 2034., ...,  499.,  325.,  195.],
        [2011., 2375., 2112., ...,  480.,  356.,  214.]],

       [[2011., 2375., 2112., ...,  480.,  356.,  214.],
        [2044., 2343., 2145., ...,  484.,  361.,  234.]],

       [[2044., 2343., 2145., ...,  484.,  361.,  234.],
        [2109., 2330., 2182., ...,  481.,  363.,  250.]],

       [[2109., 2330., 2182., ...,  481.,  363.,  250.],
        [2170., 2336., 2230., ...,  492.,  360.,  264.]]], dtype=float32)>

In [27]:
test_df

Unnamed: 0,10101 f0.4,10101 f10.14,10101 f15.19,10101 f20.24,10101 f25.29,10101 f30.34,10101 f35.39,10101 f40.44,10101 f45.49,10101 f5.9,...,80109 m45.49,80109 m5.9,80109 m50.54,80109 m55.59,80109 m60.64,80109 m65.69,80109 m70.74,80109 m75.79,80109 m80.84,80109 m85.
15,2005,2428,2034,1570,1566,1936,2246,2337,2292,2232,...,1248,992,1030,1031,795,771,589,499,325,195
16,2011,2375,2112,1542,1583,1875,2288,2247,2352,2210,...,1279,962,1042,988,798,818,599,480,356,214
17,2044,2343,2145,1594,1640,1840,2314,2246,2418,2210,...,1283,954,1068,971,826,783,627,484,361,234
18,2109,2330,2182,1670,1662,1857,2312,2256,2463,2213,...,1296,968,1082,975,852,768,643,481,363,250
19,2170,2336,2230,1722,1691,1883,2274,2328,2474,2217,...,1262,983,1129,962,893,741,674,492,360,264
20,2225,2381,2178,1698,1681,1864,2252,2426,2430,2217,...,1201,974,1194,927,867,756,716,505,414,282


In [28]:
#all-in-one input

all_input = tf.concat([full_train_inputs,input_2002_2003,full_val_inputs,input_2006_2007,full_test_inputs],0)

In [33]:
#predictions for years 1993-2011

result = full_model(all_input) #1993-2011
result

<tf.Tensor: shape=(19, 1, 11700), dtype=float32, numpy=
array([[[0.02000188, 0.01997657, 0.0200624 , ..., 0.01983915,
         0.01831472, 0.02011553]],

       [[0.02000188, 0.01997657, 0.0200624 , ..., 0.01983915,
         0.01831472, 0.02011553]],

       [[0.02000188, 0.01997657, 0.0200624 , ..., 0.01983915,
         0.01831472, 0.02011553]],

       ...,

       [[0.02000188, 0.01997657, 0.0200624 , ..., 0.01983915,
         0.01831472, 0.02011553]],

       [[0.02000188, 0.01997657, 0.0200624 , ..., 0.01983915,
         0.01831472, 0.02011553]],

       [[0.02000188, 0.01997657, 0.0200624 , ..., 0.01983915,
         0.01831472, 0.02011553]]], dtype=float32)>

In [34]:
test_df.columns

Index(['10101 f0.4', '10101 f10.14', '10101 f15.19', '10101 f20.24',
       '10101 f25.29', '10101 f30.34', '10101 f35.39', '10101 f40.44',
       '10101 f45.49', '10101 f5.9',
       ...
       '80109 m45.49', '80109 m5.9', '80109 m50.54', '80109 m55.59',
       '80109 m60.64', '80109 m65.69', '80109 m70.74', '80109 m75.79',
       '80109 m80.84', '80109 m85.'],
      dtype='object', length=11700)

In [35]:
Code = []
Sex = []
Age = []
for sets in test_df.columns:
    code = sets.split()[0]
    sex = sets.split()[1][0]
    age = sets.split()[1][1:]
    Code.append(code)
    Sex.append(sex)
    Age.append(age)

In [36]:
year2002_2011 = result[-10:]
year2002_2011 = year2002_2011.numpy()
final_result = []
final_result.append(Code)
final_result.append(Sex)
final_result.append(Age)
for year in year2002_2011:
    print(year)
    final_result.append(year[0])

[[0.02000188 0.01997657 0.0200624  ... 0.01983915 0.01831472 0.02011553]]
[[0.02000188 0.01997657 0.0200624  ... 0.01983915 0.01831472 0.02011553]]
[[0.02000188 0.01997657 0.0200624  ... 0.01983915 0.01831472 0.02011553]]
[[0.02000188 0.01997657 0.0200624  ... 0.01983915 0.01831472 0.02011553]]
[[0.02000188 0.01997657 0.0200624  ... 0.01983915 0.01831472 0.02011553]]
[[0.02000188 0.01997657 0.0200624  ... 0.01983915 0.01831472 0.02011553]]
[[0.02000188 0.01997657 0.0200624  ... 0.01983915 0.01831472 0.02011553]]
[[0.02000188 0.01997657 0.0200624  ... 0.01983915 0.01831472 0.02011553]]
[[0.02000188 0.01997657 0.0200624  ... 0.01983915 0.01831472 0.02011553]]
[[0.02000188 0.01997657 0.0200624  ... 0.01983915 0.01831472 0.02011553]]


In [40]:
column_name = ['Code','Sex','Age', '2002','2003','2004','2005','2006','2007','2008','2009','2010','2011']
final_df = pd.DataFrame(final_result).T
final_df.columns = column_name
final_df

Unnamed: 0,Code,Sex,Age,2002,2003,2004,2005,2006,2007,2008,2009,2010,2011
0,10101,f,0.4,0.020002,0.020002,0.020002,0.020002,0.020002,0.020002,0.020002,0.020002,0.020002,0.020002
1,10101,f,10.14,0.019977,0.019977,0.019977,0.019977,0.019977,0.019977,0.019977,0.019977,0.019977,0.019977
2,10101,f,15.19,0.020062,0.020062,0.020062,0.020062,0.020062,0.020062,0.020062,0.020062,0.020062,0.020062
3,10101,f,20.24,0.019985,0.019985,0.019985,0.019985,0.019985,0.019985,0.019985,0.019985,0.019985,0.019985
4,10101,f,25.29,0.020032,0.020032,0.020032,0.020032,0.020032,0.020032,0.020032,0.020032,0.020032,0.020032
...,...,...,...,...,...,...,...,...,...,...,...,...,...
11695,80109,m,65.69,0.020051,0.020051,0.020051,0.020051,0.020051,0.020051,0.020051,0.020051,0.020051,0.020051
11696,80109,m,70.74,0.020029,0.020029,0.020029,0.020029,0.020029,0.020029,0.020029,0.020029,0.020029,0.020029
11697,80109,m,75.79,0.019839,0.019839,0.019839,0.019839,0.019839,0.019839,0.019839,0.019839,0.019839,0.019839
11698,80109,m,80.84,0.018315,0.018315,0.018315,0.018315,0.018315,0.018315,0.018315,0.018315,0.018315,0.018315


In [None]:
final_df.rename(column:{})