In [None]:
# Mount Google Drive (Colab only) so the project folder under /content/drive
# becomes reachable from this runtime.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Enable IPython's autoreload extension (mode 2) so edits to imported local
# modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [None]:
# Work from the project folder: the data path and checkpoint paths used below
# are all relative ('./train.parquet', './checkpoints/...').
%cd /content/drive/MyDrive/Projects/Project_inf_571

/content/drive/MyDrive/Projects/Project_inf_571


In [None]:
# List the working directory to check that the data file and checkpoints are there.
!ls

checkpoints  dqn.py	dqn_v7.pt  q_for_dqn_v1.pt		 train.parquet
DoubleDQN    dqn_v1.pt	project    RL_for_JaneStreet_Data.ipynb  VanillaDQN


In [None]:
import torch
import torch.nn as nn 
import torch.nn.functional as F 
import torch.optim as optim 
from torch.distributions import Categorical

import pandas as pd
import numpy as np
import random

from tqdm.notebook import tqdm
import matplotlib.pyplot as plt

import copy
import fastprogress

import pickle

In [None]:
# Prefer the GPU when CUDA is available, otherwise run on the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Prepare Data

In [None]:
# Location of the training data, relative to the working directory set above.
path = './train.parquet'

In [None]:
def load_df(path):
    """Read the parquet file at ``path`` and pick out its feature columns.

    Args:
        path: Location of the parquet file, passed straight to ``pd.read_parquet``.

    Returns:
        A ``(df, features)`` tuple: the loaded DataFrame and the list of its
        column names that contain the substring ``'feature'``, in column order.
    """
    frame = pd.read_parquet(path)
    feature_names = [name for name in frame.columns if 'feature' in name]
    return frame, feature_names


# Load the full data set once; `features` is the list of feature column names
# that the environment and the agent use as the observation vector.
df, features = load_df(path)

In [None]:
# (rows, columns) of the raw data.
df.shape

(2390491, 138)

In [None]:
# Number of feature columns, i.e. the width of one observation.
len(features)

130

# Run this if you want to train the agent
- drop rows with weight == 0
- normalize the features
- add the ground-truth action

In [None]:
def add_actions(df, features):
    """Filter, normalize and label the data for training.

    Steps:
      1. Compute mean / std of every feature except the first one
         (``features[0]`` is left untouched) over *all* rows of ``df``.
      2. Drop rows whose ``weight`` is not positive; they do not contribute to
         the utility score.
      3. Fill missing feature values with the column mean and standardize.
      4. Add an ``action`` column: 1 where ``resp > 0``, else 0.

    Args:
        df: DataFrame with ``weight``, ``resp`` and the feature columns.
        features: List of feature column names.

    Returns:
        A new DataFrame (the input is not modified) with a fresh 0..n-1 index.
    """
    cols = features[1:]
    f_mean = df[cols].mean()
    # A constant column has std == 0; dividing by it would turn the whole
    # column into NaN (0 / 0). Scale such columns by 1 so they become 0.
    f_std = df[cols].std().replace(0, 1.)

    # delete all trading opportunities that are not taken into account for
    # utility score calculation
    df = df.query('weight > 0').reset_index(drop=True)

    # normalize each feature (missing values become the mean, i.e. 0 after scaling)
    df[cols] = df[cols].fillna(f_mean)
    df[cols] = (df[cols] - f_mean) / f_std

    # add the correct action that should be chosen for each trading opportunity
    df['action'] = (df['resp'] > 0).astype('int')
    return df


# add the action column
# NOTE: this rebinds `df`, so the cell is not idempotent -- running it a second
# time would re-normalize already normalized features. Reload `df` before re-running.
df = add_actions(df, features)

# Environment

In [None]:
class Env:
    """Walks through the rows of a DataFrame as a sequential environment.

    Each row is one step: the observation is the row's feature vector and the
    reward for taking ``action`` on it is ``weight * resp * action``.
    """

    def __init__(self, df, features):
        """Copy ``weight``, ``resp`` and the feature columns of ``df`` into float tensors."""
        self.n_samples = len(df)
        self.weight = torch.FloatTensor(df['weight'].values)
        self.resp = torch.FloatTensor(df['resp'].values)
        self.states = torch.FloatTensor(df[features].values)
        self.observation_space = self.states.shape[1]
        self.action_space = 2
        self.idx = 0

    def reset(self):
        """Rewind to the first row and return its observation with shape (1, n_features)."""
        self.idx = 0
        return self.states[self.idx].view(1, -1)

    def step(self, action):
        """Apply ``action`` to the current row and advance to the next one.

        Returns:
            ``(next_state, reward, done, info)``. ``done`` is True when the last
            row was just consumed; in that case the cursor wraps around and
            ``next_state`` is the first row again. ``info`` is always 0.
        """
        row = self.idx
        reward = self.weight[row] * self.resp[row] * action
        self.idx = row + 1
        done = self.idx >= self.n_samples
        if done:
            # wrap around so the returned observation is always a valid row
            self.idx = 0
        return self.states[self.idx].view(1, -1), reward, done, 0

# Configurations

In [None]:
class Config:
    """Hyper-parameters of one training run.

    Attributes:
        version: Identifier used in checkpoint file names.
        epsilon_start / epsilon_final / epsilon_decay: Parameters of the
            exponential epsilon-greedy schedule (see ``epsilon_by_frame``).
        gamma: Discount factor.
        lr: Optimizer learning rate.
        target_net_update_freq: Gradient steps between target-network syncs.
        memory_size: Capacity of the replay buffer.
        batch_size: Number of transitions per gradient step.
        learning_starts: Frames to collect before the first gradient step.
        max_frames: Number of frames per episode.
    """

    def __init__(self,
                 version=0,
                 epsilon_start=1.,
                 epsilon_final=0.01,
                 epsilon_decay=8000,
                 gamma=0.99,
                 lr=1e-4,
                 target_net_update_freq=1000,
                 memory_size=100000,
                 batch_size=128,
                 learning_starts=5000,
                 max_frames=10000000):

        self.version = version
        self.epsilon_start = epsilon_start
        self.epsilon_final = epsilon_final
        self.epsilon_decay = epsilon_decay

        self.gamma = gamma
        self.lr = lr

        self.target_net_update_freq = target_net_update_freq
        self.memory_size = memory_size
        self.batch_size = batch_size

        self.learning_starts = learning_starts
        self.max_frames = max_frames

    def epsilon_by_frame(self, i):
        """Return the exploration rate for frame ``i``.

        Decays exponentially from ``epsilon_start`` towards ``epsilon_final``
        with time constant ``epsilon_decay``. Defined as a method rather than a
        lambda attribute so that instances hold plain data only (and can be
        pickled / copied).
        """
        span = self.epsilon_start - self.epsilon_final
        return self.epsilon_final + span * np.exp(-1. * i / self.epsilon_decay)

# Experience Replay (run for training)

In [None]:
class ExperienceReplay:
    """Fixed-capacity replay buffer that overwrites its oldest transition when full.

    Transitions are indexable records laid out as
    ``(state, action, reward, next_state, mask)``.
    """

    def __init__(self, capacity):
        self.capacity = capacity
        self.memory = []
        # Slot that the next push writes to once the buffer is full.
        self._next_slot = 0

    def push(self, transition):
        """Store ``transition``, evicting the oldest one if the buffer is full.

        Uses a circular overwrite (O(1)) instead of deleting the head of the
        list, which shifts every remaining element on each push.
        """
        if self.capacity <= 0:
            # nothing can be retained in a buffer without room
            return
        if len(self.memory) < self.capacity:
            self.memory.append(transition)
        else:
            self.memory[self._next_slot] = transition
        self._next_slot = (self._next_slot + 1) % self.capacity

    def sample(self, batch_size):
        """Draw ``batch_size`` distinct transitions uniformly at random.

        Returns:
            Five lists ``(states, actions, rewards, next_states, dones)``, one
            entry per sampled transition.

        Raises:
            ValueError: If ``batch_size`` exceeds the number of stored
                transitions (raised by ``random.sample``).
        """
        batch = random.sample(self.memory, batch_size)

        states = [t[0] for t in batch]
        actions = [t[1] for t in batch]
        rewards = [t[2] for t in batch]
        next_states = [t[3] for t in batch]
        dones = [t[4] for t in batch]

        return states, actions, rewards, next_states, dones

    def __len__(self):
        return len(self.memory)

# DQN

In [None]:
class Qmodel(nn.Module):
    """Fully connected Q-network: obs -> 512 -> 256 -> ac with ReLU in between."""

    def __init__(self, obs, ac):
        """Build the network for ``obs`` input features and ``ac`` actions."""
        super().__init__()
        layers = []
        in_features = obs
        for width in (512, 256):
            layers += [nn.Linear(in_features, width), nn.ReLU()]
            in_features = width
        layers.append(nn.Linear(in_features, ac))
        # kept under the name `model` so state-dict keys stay `model.<idx>.*`
        self.model = nn.Sequential(*layers)

    def forward(self, x):
        """Return one Q-value per action for every row of ``x``."""
        return self.model(x)

In [None]:
class DoubleDQN(nn.Module):
    """Double DQN agent: an online Q-network plus a periodically synced target network.

    Args:
        obs: Number of features in one observation.
        ac: Number of discrete actions.
        config: Object exposing ``target_net_update_freq`` (gradient steps
            between target-network syncs).
    """

    def __init__(self, obs, ac, config):
        super().__init__()

        # Remembered so get_action can reshape its input without a hard-coded width.
        self.obs_dim = obs

        self.q = Qmodel(obs, ac).to(device)
        self.target = Qmodel(obs, ac).to(device)

        # The target network starts as an exact copy and is never trained directly.
        self.target.load_state_dict(self.q.state_dict())
        self.target.eval()

        self.target_net_update_freq = config.target_net_update_freq
        self.update_counter = 0

    def get_action(self, x):
        """Return the greedy action(s) of the online network.

        Args:
            x: One observation or a batch of observations (tensor, array or
                nested list); it is reshaped to ``(-1, obs)``.

        Returns:
            A Python int for a single observation, otherwise a NumPy array with
            one action index per row.
        """
        was_training = self.q.training
        self.q.eval()
        # reshape (not view) so non-contiguous inputs are accepted as well
        x = torch.as_tensor(x, dtype=torch.float32, device=device).reshape(-1, self.obs_dim)
        with torch.no_grad():
            a = self.q(x).max(1)[1]
        # restore whatever mode the caller had selected
        self.q.train(was_training)
        if len(x) > 1:
            return a.detach().cpu().numpy()
        return a.item()

    def update_policy(self, adam, memory, params):
        """Run one gradient step on a batch sampled from ``memory``.

        Args:
            adam: Optimizer over ``self.q.parameters()``.
            memory: Replay buffer whose ``sample(batch_size)`` returns
                ``(states, actions, rewards, next_states, masks)`` lists, where
                a mask of 0 marks a terminal transition.
            params: Object exposing ``batch_size`` and ``gamma`` (falls back to
                0.99 when ``gamma`` is absent).
        """
        b_states, b_actions, b_rewards, b_next_states, b_masks = \
            memory.sample(params.batch_size)

        states = torch.FloatTensor(b_states).to(device)
        actions = torch.LongTensor(b_actions).reshape(-1, 1).to(device)
        rewards = torch.FloatTensor(b_rewards).reshape(-1, 1).to(device)
        next_states = torch.FloatTensor(b_next_states).to(device)
        masks = torch.FloatTensor(b_masks).reshape(-1, 1).to(device)

        gamma = getattr(params, 'gamma', 0.99)

        current_q_values = self.q(states).gather(1, actions)
        with torch.no_grad():
            # Double DQN: the online network picks the next action, the target
            # network evaluates it. Taking the max over the target network alone
            # would be the vanilla DQN target.
            next_actions = self.q(next_states).argmax(1, keepdim=True)
            next_q_vals = self.target(next_states).gather(1, next_actions)
        # masks zero out the bootstrap term on terminal transitions
        expected_q_vals = rewards + gamma * next_q_vals * masks
        loss = F.mse_loss(current_q_values, expected_q_vals)

        adam.zero_grad()
        loss.backward()
        # element-wise gradient clipping to [-1, 1]
        nn.utils.clip_grad_value_(self.q.parameters(), 1.0)
        adam.step()

        self.update_counter += 1
        if self.update_counter % self.target_net_update_freq == 0:
            self.update_counter = 0
            self.target.load_state_dict(self.q.state_dict())
            self.target.eval()

# Training

In [None]:
# Positional 90 / 5 / 5 split (row order is kept): train, test, validation.
n_samples = len(df)
train_end = int(0.9 * n_samples)
test_end = int(0.95 * n_samples)

df_train = df.iloc[:train_end]
df_test = df.iloc[train_end:test_end]
df_valid = df.iloc[test_end:]

In [None]:
# Seed every random source used during training so a run can be reproduced:
# `random` (replay sampling), NumPy (epsilon-greedy draws) and torch (weight init).
SEED = 0
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)

env = Env(df_train, features)

# One episode is one full pass over the training rows; the replay buffer holds
# the most recent 1% of them.
config = Config(version=7,
                epsilon_start=1.,
                epsilon_final=0.01,
                epsilon_decay=8000,
                gamma=0.99,
                lr=1e-3,
                target_net_update_freq=1000,
                memory_size=env.n_samples // 100,
                batch_size=128,
                learning_starts=1000,
                max_frames=env.n_samples)

agent = DoubleDQN(env.observation_space, env.action_space, config)

memory = ExperienceReplay(config.memory_size)

# Only the online network is optimized; the target network is synced by the agent.
adam = optim.Adam(agent.q.parameters(), lr=config.lr)

In [None]:
n_episodes = 5  # aka epochs
# final cumulative reward of every finished episode
recap = []
# Frames seen across ALL episodes. The exploration schedule is driven by this
# counter so that epsilon keeps decaying instead of jumping back to
# epsilon_start at the beginning of every episode.
total_frames = 0

for episode in range(n_episodes):
    print('Episode №', episode)
    s = env.reset()
    # cumulative reward per epoch (sum of weight * resp over the rows of the
    # train set on which the agent chose to trade); kept as a Python float so
    # that millions of additions are not accumulated in float32
    ep_reward = 0.
    # snapshot of ep_reward after every 1000 rows of the train set
    rewards = []

    # context manager closes the bar even if the loop is interrupted
    with tqdm(total=config.max_frames) as p_bar:
        for frame in range(config.max_frames):
            epsilon = config.epsilon_by_frame(total_frames)
            total_frames += 1
            # epsilon-greedy action choice with a decaying exploration rate
            if np.random.random() > epsilon:
                action = agent.get_action(s)
            else:
                action = np.random.randint(0, env.action_space)
            # `done` is True when the environment has just wrapped around to
            # the first row, i.e. `ns` is the first frame again
            ns, r, done, _ = env.step(action)
            r = float(r)
            ep_reward += r
            if (frame + 1) % 1000 == 0 or frame == config.max_frames - 1:
                print(f'{frame + 1}/{config.max_frames}:', ep_reward, end='\r')
                rewards.append(ep_reward)
            if done:
                recap.append(ep_reward)
                p_bar.set_description('Rew: {:.3f}'.format(ep_reward))
                with open(f'rewards_ep{episode}.pkl', 'wb') as f:
                    pickle.dump(rewards, f)
                torch.save(agent.state_dict(), f'./checkpoints/dqn_v{config.version}_ep{episode}.pt')
            # mask = 0 for the transition from the last row back to the first, else 1
            memory.push((s.reshape(-1).numpy().tolist(), action, r,
                         ns.reshape(-1).numpy().tolist(), 0. if done else 1.))
            s = ns
            p_bar.update(1)
            # The first episode waits for `learning_starts` frames so the replay
            # buffer holds enough transitions; later episodes learn from frame 0.
            if episode > 0 or frame > config.learning_starts:
                agent.update_policy(adam, memory, config)

In [None]:
# Persist the per-episode totals; `episode` still holds the index of the last
# episode of the training loop, so the training cell must have been run first.
recap_path = f'recap_ep{episode}.pkl'
with open(recap_path, 'wb') as recap_file:
    pickle.dump(recap, recap_file)

# Assess and compare models

In [None]:
# Same positional 90 / 5 / 5 split as in the Training section, repeated here so
# the assessment cells can be run without executing the training cells.
n_samples = df.shape[0]
df_train = df[:int(0.9 * n_samples)]
df_test = df[int(0.9 * n_samples):int(0.95 * n_samples)]
df_valid = df[int(0.95 * n_samples):]

## Utility scores

In [None]:
def utility_score(df, action):
    """Compute the utility score of a vector of trade decisions.

    For every date ``i`` the profit is ``p_i = sum(weight * resp * action)``.
    With ``t = sum(p_i) / sqrt(sum(p_i ** 2)) * sqrt(250 / n_dates)`` the score
    is ``clip(t, 0, 6) * sum(p_i)``.

    Args:
        df: DataFrame with ``weight``, ``resp`` and non-negative integer
            ``date`` columns.
        action: Array with one entry per row of ``df`` (0 = pass, 1 = trade).

    Returns:
        The utility score. It is 0.0 when ``df`` is empty or when every daily
        profit is zero (e.g. the agent never trades); without this guard the
        ratio would be 0 / 0 and the result NaN.
    """
    weight = df['weight'].values
    resp = df['resp'].values
    date = df['date'].values
    count_i = len(np.unique(date))
    if count_i == 0:
        return 0.0
    Pi = np.bincount(date, weight * resp * action)
    total = np.sum(Pi)
    norm = np.sqrt(np.sum(Pi ** 2))
    if norm == 0:
        return 0.0
    t = total / norm * np.sqrt(250 / count_i)
    u = np.clip(t, 0, 6) * total
    return u

v1

In [None]:
# Checkpoint to assess -- CHANGE THIS
version = 1

# Env and Config are only needed to construct an agent of the right shape;
# none of the hyper-parameters below influence the assessment.
env = Env(df_test, features)
config = Config(
    epsilon_start=1.,
    epsilon_final=0.01,
    epsilon_decay=8000,
    gamma=0.99,
    lr=1e-4,
    target_net_update_freq=1000,
    memory_size=env.n_samples // 100,
    batch_size=128,
    learning_starts=5000,
    max_frames=env.n_samples,
)
agent = DoubleDQN(env.observation_space, env.action_space, config)

# Restore the trained weights.
PATH = f'./dqn_v{version}.pt'
state_dict = torch.load(PATH, map_location=torch.device('cpu'))
agent.load_state_dict(state_dict)
agent.eval()

# Split to assess -- CHANGE THIS
assess_df = df_valid
states = assess_df[features].values
with torch.no_grad():
    actions = agent.get_action(states)

# Share of rows on which the agent decides to trade, then the utility score.
share_of_ones = sum(actions) / len(actions)
print(f'For config v{version}:')
print('% of ones for val split = ', share_of_ones)
print('Utility score on val = ', utility_score(assess_df, np.array(actions)))

v0

In [None]:
# Checkpoint to assess -- CHANGE THIS
version = 0

# Env and Config are only needed to construct an agent of the right shape;
# none of the hyper-parameters below influence the assessment.
env = Env(df_test, features)
config = Config(
    epsilon_start=1.,
    epsilon_final=0.01,
    epsilon_decay=8000,
    gamma=0.99,
    lr=1e-4,
    target_net_update_freq=1000,
    memory_size=env.n_samples // 100,
    batch_size=128,
    learning_starts=5000,
    max_frames=env.n_samples,
)
agent = DoubleDQN(env.observation_space, env.action_space, config)

# Restore the trained weights -- CHANGE THIS if the checkpoint lives elsewhere
PATH = f'./dqn_v{version}.pt'
state_dict = torch.load(PATH, map_location=torch.device('cpu'))
agent.load_state_dict(state_dict)
agent.eval()

# Split to assess -- CHANGE THIS
assess_df = df_test
states = assess_df[features].values
with torch.no_grad():
    actions = agent.get_action(states)

# Share of rows on which the agent decides to trade, then the utility score.
share_of_ones = sum(actions) / len(actions)
print(f'For config v{version}:')
print('% of ones for test split = ', share_of_ones)
print('Utility score = ', utility_score(assess_df, np.array(actions)))

## Plot cumulative reward

In [None]:
# Replay the assessed split with the greedy policy and record the running reward.
# The environment is rebuilt from `assess_df` so that it always matches the
# split selected above, whatever `env` was last bound to.
env = Env(assess_df, features)
n_frames = assess_df.shape[0]

s = env.reset()
# running sum of weight * resp over the rows the agent traded on
ep_reward = 0.
# snapshot of ep_reward after every 1000 rows
rewards = []

for frame in range(n_frames):
    # purely greedy: no exploration during assessment
    action = agent.get_action(s)
    ns, r, done, infos = env.step(action)
    ep_reward += float(r)

    if (frame + 1) % 1000 == 0:
        print(f'{frame + 1}/{n_frames}:', ep_reward, end='\r')
        rewards.append(ep_reward)

    # after the last row the environment wraps around by itself and the loop ends
    s = ns

In [None]:
# `rewards` holds the cumulative reward sampled every 1000 frames of a single
# greedy pass, so the x axis counts thousands of frames, not episodes.
fig, ax = plt.subplots(figsize=(10, 10))
ax.plot(rewards)
ax.set_title("Cumulative reward during the greedy pass")
ax.set_xlabel("Frames (x1000)")
ax.set_ylabel("Cumulative reward");