# Testing Notebook
I will use this model to test a baseline deep reinforcement learning framework. More modularized code will be used in the consequent files in this folder. Please just use this notebook for reference. **All findings will be in the other notebooks and scripts.**

In [5]:
# import dependencies
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
import random
from collections import deque
import gdown

In [6]:
# Download the Data from Google Drive to the temporary folder
merged_data_file_id = '1o_EEumVnswul9MVsrdDwBch5rt7JTr0m'
merged_data_url = f'https://drive.google.com/uc?id={merged_data_file_id}'
merged_data_filepath = '../../temporary_files/merged.csv'
gdown.download(merged_data_url, merged_data_filepath, quiet=False)

Downloading...
From: https://drive.google.com/uc?id=1o_EEumVnswul9MVsrdDwBch5rt7JTr0m
To: c:\Users\Daniel\Desktop\UMich\capstone\ss24-capstone-team23-datallah-nkitts-steveso\temporary_files\merged.csv
100%|██████████| 438k/438k [00:00<00:00, 6.62MB/s]


'../../temporary_files/merged.csv'

# Deep Q-Network
In this notebook we will experiment with using a deep reinforcement learning model to forecast the Zillow Home Value Index (ZHVI). The temporal cutoff point will be December 31, 2021. Our test data will be the remaining ZHVI indices in 2022. The evaluation metric will be the mean standard error (MSE). Our baseline is a mean baseline model.

## Preprocess data
We will be using our merged dataframe with various macroeconomic factors. A majority of the preprocessing has been done during data collection. The additional preprocessing for this model includes:
- temporal train-test split,
- quantitative column standardization.


In [7]:
# import data
merged = pd.read_csv(merged_data_filepath)
### TEST Chicago FOR NOW ###
city_nm = 'Chicago'
merged = merged[merged.City == city_nm]
### #################### ###
merged.Date = pd.to_datetime(merged.Date)
merged.sort_values('Date', inplace = True)

In [8]:
# create action space
merged['pct_chng'] = merged.ZHVI.pct_change()
# it isn't uncommon to see 7% swings in home value (0.583% month to month)
# so will label anything within 3 - 7% as reasonable increase/decrease
# anything less than that as relatively unchanged
# anything more than that as significant increase/decrease
def conditions(s):
    if s > 0.07/12: return 2
    elif s < -0.07/12: return -2
    elif s >= 0.03/12 and s <= 0.07/12: return 1
    elif s <= -0.03/12 and s >= -0.07/12: return -1
    elif s > -0.03/12 and s < 0.03/12: return 0
# apply conditions
merged['change'] = merged.pct_chng.apply(conditions)
# drop pct_chng and ZHVI so no data leakage
merged.drop(['ZHVI', 'pct_chng'], axis = 1, inplace = True)

In [9]:
# grab numeric columns to scale
numeric_cols = list(merged.drop(['City', 'Date', 'change'], axis = 1).columns)
scaler = MinMaxScaler()

In [10]:
# split based on year
train = merged[merged.Date.dt.year < 2022]
test = merged[merged.Date.dt.year >= 2022]
# scale all data before creating X and y
train_X = scaler.fit_transform(train[numeric_cols].astype(float))
test_X = scaler.fit_transform(test[numeric_cols].astype(float))
train_y = train.change.values
test_y = test.change.values

## Construct Model
I will be using PyTorch and creating my own model class. This will be a single layer neural net with a Q-learning agent. This will serve as my baseline model

In [11]:
# create LSTM
class QNetwork(torch.nn.Module):
  def __init__(self, input_dim, output_dim, hidden_dim, num_layers):
    super().__init__()
    self.lstm = nn.LSTM(input_size = input_dim, hidden_size = hidden_dim,
                        num_layers = num_layers, batch_first = True)
    self.linear = nn.Linear(in_features = hidden_dim, out_features = output_dim)

  def forward(self, state):
    x, _ = self.lstm(state)
    x = x[:, -1, :]
    x = self.linear(x)
    return x

In [12]:
# create time series
class TimeSeries:
  def __init__(self, X, y, window_size):
    self.X = X
    self.y = y
    self.window_size = window_size
    self.current_step = 0
    self.data_len = len(self.X)

  def reset(self):
    self.current_step = self.window_size
    return self.X[:self.current_step, :]

  def step(self, action):
    self.current_step += 1
    done = self.current_step >= self.data_len - 1
    next_state = self.X[self.current_step - self.window_size:self.current_step]
    actual = self.y[self.current_step]
    reward = -abs(actual - action)
    return next_state, reward, done

In [13]:
# create agent
class DQNAgent:
  def __init__(self, input_dim, output_dim, hidden_dim, window_size, lr, gamma, eps, 
               eps_decay, min_eps, memory_size, batch_size, num_layers = 1, seed = None,
               QNetwork = QNetwork):
    self.dqn = QNetwork(input_dim, output_dim, hidden_dim, num_layers)
    self.dqn_target = QNetwork(input_dim, output_dim, hidden_dim, num_layers)
    self.dqn_target.load_state_dict(self.dqn.state_dict())
    self.input_dim = input_dim
    self.output_dim = output_dim
    self.hidden_dim = hidden_dim
    self.window_size = window_size
    self.loss_fn = nn.MSELoss()
    self.optim = optim.Adam(self.dqn.parameters(), lr = lr)
    self.gamma = gamma
    self.epsilon = eps
    self.epsilon_decay = eps_decay
    self.min_epsilon = min_eps
    self.batch_size = batch_size
    self.replay_memory_buffer = deque(maxlen = memory_size)
    if seed is None:
        self.rng = np.random.default_rng()
    else:
        self.rng = np.random.default_rng(seed)

  def select_action(self, state):
    if self.rng.uniform() < self.epsilon:
      action = self.rng.choice(self.output_dim)
    else:
      state = torch.from_numpy(state).float().unsqueeze(0)
      self.dqn.eval()
      with torch.no_grad():
          q_values = self.dqn(state)
      self.dqn.train()
      action = torch.argmax(q_values).item()
    return action

  def train(self, s0, a0, r, s1, done):
    self.add_to_replay_memory(s0, a0, r, s1, done)

    if done:
      self.update_epsilon()
      self.target_update()

    if len(self.replay_memory_buffer) < self.batch_size:
      return

    mini_batch = self.get_random_sample_from_replay_mem()
    state_batch = torch.from_numpy(np.stack([i[0] for i in mini_batch])).float()
    action_batch = torch.from_numpy(np.vstack([i[1] for i in mini_batch])).int() #reshape(1, self.batch_size, 1)
    reward_batch = torch.from_numpy(np.vstack([i[2] for i in mini_batch])).float()
    next_state_batch = torch.from_numpy(np.stack([i[3] for i in mini_batch])).float()
    done_list = torch.from_numpy(np.vstack([i[4] for i in mini_batch]).astype(np.uint8)).float()
    
    current_qs = self.dqn(state_batch)
    current_q  = current_qs.gather(1, action_batch.type(torch.int64))
    next_q, _  = self.dqn_target(next_state_batch).max(dim = 1)
    next_q     = next_q.view(self.batch_size, 1)
    Q_targets  = reward_batch + self.gamma * next_q * (1 - done_list)
    loss       = self.loss_fn(current_q, Q_targets.detach())
    self.optim.zero_grad()
    loss.backward()
    self.optim.step()

  def add_to_replay_memory(self, state, action, reward, next_state, done):
    self.replay_memory_buffer.append((state, action, reward, next_state, done))

  def get_random_sample_from_replay_mem(self):
    random_sample = random.sample(self.replay_memory_buffer, self.batch_size)
    return random_sample

  def update_epsilon(self):
    if self.epsilon > self.min_epsilon:
      self.epsilon *= self.epsilon_decay
      self.epsilon = max(self.min_epsilon, self.epsilon)

  def target_update(self):
    self.dqn_target.load_state_dict(self.dqn.state_dict())

In [14]:
# define training looper
def episode_loop(X, y, max_reward = 0, maxlen = 100, window_size = 7, seed = 0, num_layers = 1,
                 hidden_dim = 24, lr = 0.001, gamma = 0.99, eps = 1, eps_decay = 0.995, 
                 min_eps = 0.01, memory_size = 36, batch_size = 12, num_episodes = 600):
  reward_queue = deque(maxlen = maxlen)
  all_rewards = []
  all_rewards_each_step = []
  env = TimeSeries(X, y, window_size)
  random.seed(seed)
  np.random.seed(seed)
  torch.manual_seed(seed)
  input_dim = X.shape[1]
  output_dim = len(np.unique(y[~np.isnan(y)]))
  agent = DQNAgent(input_dim, output_dim, hidden_dim, window_size, lr, gamma, eps, 
                   eps_decay, min_eps, memory_size, batch_size, num_layers, seed)
  # iterate through episodes and train
  for i in range(num_episodes):
    state = env.reset()
    done = False
    episodic_reward = 0
    episode_rewards = []
    while not done:
      action = agent.select_action(np.squeeze(state))
      next_state, reward, done = env.step(action)
      episode_rewards.append(reward)
      episodic_reward += reward
      agent.train(state, action, reward, next_state, done)
      state = next_state
    all_rewards.append(episodic_reward)
    all_rewards_each_step.append(episode_rewards)
    reward_queue.append(episodic_reward)
    if (i + 1) % 10 == 0 and len(reward_queue) == 100 and (i + 1) % 10 == 0:
      print(f'Training episode {i + 1}, reward: {episodic_reward}', end='')
    elif (i + 1) % 10 == 0: 
      print(f'Training episode {i + 1}, reward: {episodic_reward}')
    if len(reward_queue) == 100:
      avg_reward = sum(reward_queue) / 100
      if (i + 1) % 10 == 0:
          print(f', moving average reward: {avg_reward}')
  print('Average reward over 100 episodes: ', max_reward)
  # return variables for viz
  return all_rewards, all_rewards_each_step, agent

In [15]:
all_rewards, all_rewards_each_step, agent = episode_loop(train_X, train_y)

Training episode 10, reward: -224.0
Training episode 20, reward: -239.0
Training episode 30, reward: -229.0
Training episode 40, reward: -212.0
Training episode 50, reward: -197.0
Training episode 60, reward: -208.0
Training episode 70, reward: -184.0
Training episode 80, reward: -171.0
Training episode 90, reward: -186.0
Training episode 100, reward: -196.0, moving average reward: -212.62
Training episode 110, reward: -179.0, moving average reward: -205.05
Training episode 120, reward: -155.0, moving average reward: -197.47
Training episode 130, reward: -142.0, moving average reward: -189.89
Training episode 140, reward: -135.0, moving average reward: -182.33
Training episode 150, reward: -131.0, moving average reward: -174.69
Training episode 160, reward: -131.0, moving average reward: -167.89
Training episode 170, reward: -143.0, moving average reward: -161.91
Training episode 180, reward: -136.0, moving average reward: -155.89
Training episode 190, reward: -121.0, moving average re

### Save Model & Rewards

In [16]:
agent.dqn

QNetwork(
  (lstm): LSTM(82, 24, batch_first=True)
  (linear): Linear(in_features=24, out_features=5, bias=True)
)

In [17]:
name = 'base_Chicago_single_layer_test'
torch.save(agent.dqn, f'models/{name}.pth')
np.save(f'rewards/averaged/{name}.npy', np.array(all_rewards))
np.save(f'rewards/episodic/{name}.npy', np.array(all_rewards_each_step))

## Test
Use the held out data to test model performance and assess total reward. The closer the reward is to 0, the better. Given the architecture, the reward cannot be positive.

In [18]:
# reload model
loaded_model = torch.load('models/base_Chicago_single_layer.pth')
loaded_model

QNetwork(
  (lstm): LSTM(82, 24, batch_first=True)
  (linear): Linear(in_features=24, out_features=5, bias=True)
)

In [19]:
# create select_action function for testing
def select_action_test(model, state):
    state = torch.from_numpy(np.squeeze(state)).float().unsqueeze(0)
    model.eval()
    with torch.no_grad():
        q_values = model(state)
    action = torch.argmax(q_values).item()
    return action

### Compute Reward on Test Data

In [22]:
# init test env
# test for a year in the future
env_test = TimeSeries(test_X, test_y, window_size = 7)
state = env_test.reset()
done = False
total_reward = 0
# compute total reward
while not done:
    action = select_action_test(loaded_model, state)
    next_state, reward, done = env_test.step(action)
    total_reward += reward
    state = next_state
    print(reward)
print(f"Total reward on new data: {total_reward}")

-1.0
-1.0
-3.0
-2.0
Total reward on new data: -7.0
