In [None]:
import numpy as np
np.random.seed(1)
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.metrics import precision_score
import tqdm

In [None]:
def load_df(path):
    """Read a parquet file and pick out its feature columns.

    Parameters
    ----------
    path : str or path-like
        Location of the parquet file; handed unchanged to ``pd.read_parquet``.

    Returns
    -------
    tuple
        ``(df, features)`` where ``df`` is the loaded DataFrame and
        ``features`` is the list of column names that contain the substring
        ``'feature'``, in their original column order.
    """
    frame = pd.read_parquet(path)
    feature_names = list(filter(lambda name: 'feature' in name, frame.columns))
    return frame, feature_names


def preprocess(df_from_parquet, features):
    """Keep only the rows whose ``weight`` is strictly positive.

    Trading opportunities with a non-positive weight are not taken into
    account for the utility score calculation, so they are dropped here.
    The result gets a fresh 0..n-1 index; the input frame is not modified.

    Parameters
    ----------
    df_from_parquet : pandas.DataFrame
        Frame that has a ``weight`` column.
    features : list
        Accepted for call-site compatibility; not used by this function.

    Returns
    -------
    pandas.DataFrame
        The filtered, re-indexed frame.
    """
    weighted_rows = df_from_parquet.query('weight > 0')
    return weighted_rows.reset_index(drop=True)

# Load the training table, then drop the rows that do not count toward the
# utility score (weight <= 0).
data_df, features = load_df('/kaggle/input/janestreet-parquet/train.parquet')
data_df = preprocess(data_df, features)

# Positional split of the remaining rows: the first 90% for training, the
# next 5% for testing and the final 5% for validation.
n_samples = data_df.shape[0]
train_stop, test_stop = int(0.9 * n_samples), int(0.95 * n_samples)
train_df = data_df.iloc[:train_stop]
test_df = data_df.iloc[train_stop:test_stop]
valid_df = data_df.iloc[test_stop:]

In [None]:
# Constant-fill imputer: every NaN entry is replaced by 0.
imp = SimpleImputer(
    missing_values=np.nan,
    strategy='constant',
    fill_value=0,
)

In [None]:
# base agents

class Agent:
    """Minimal interface shared by the agents in this notebook.

    Subclasses must override ``policy``.  The other methods are defaults:
    ``predict`` and ``train`` delegate to ``policy`` while ``store_reward``
    and ``update`` do nothing.
    """

    @staticmethod
    def softmax_grad(softmax):
        """Return the Jacobian of a softmax output.

        ``softmax`` is flattened into a column vector ``s`` and the square
        matrix ``diag(s) - s s^T`` is returned.
        """
        column = softmax.reshape(-1, 1)
        outer_product = column.dot(column.T)
        return np.diagflat(column) - outer_product

    def policy(self, state):
        """Map a state to an action; concrete agents have to implement this."""
        raise NotImplementedError('You need to overwrite the policy method.')

    def train(self, *state):
        """Call ``policy`` with all positional arguments packed into one tuple."""
        packed_state = state
        return self.policy(packed_state)

    def predict(self, *state):
        """Call ``policy`` with all positional arguments packed into one tuple."""
        packed_state = state
        return self.policy(packed_state)

    def update(self):
        """Hook for applying a learning step; a no-op by default."""
        return None

    def store_reward(self, reward):
        """Hook for recording a reward; a no-op by default."""
        return None


class RandomAgent(Agent):
    """Baseline agent that ignores the state and acts uniformly at random.

    Parameters
    ----------
    n_actions : int
        Number of available actions; actions are the integers
        ``0 .. n_actions - 1``.

    Raises
    ------
    ValueError
        If ``n_actions`` is smaller than 1.
    """

    def __init__(self, n_actions):
        if n_actions < 1:
            raise ValueError('n_actions must be at least 1.')
        self.n_actions = n_actions

    def policy(self, state):
        """Return an action drawn uniformly from ``range(n_actions)``.

        A Bernoulli draw with p = 1 / n_actions can only ever yield 0 or 1
        and is uniform only for two actions, so the action index is sampled
        directly instead.
        """
        return np.random.randint(self.n_actions)
        
        
class AlwaysTradeAgent(Agent):
    """Baseline agent whose policy returns action ``1`` for every state."""

    # The single action this agent ever emits.
    _ACTION = 1

    def policy(self, state):
        """Ignore ``state`` and return the constant action."""
        return self._ACTION
    
    
class REINFORCE(Agent):
    """REINFORCE (Monte-Carlo policy gradient) agent with a linear softmax policy.

    Action probabilities are ``softmax(state @ w)`` with ``w`` of shape
    ``(state_dim, n_actions)``.  In training mode each prediction samples an
    action and stores the gradient of its log-probability; rewards arrive
    through ``store_reward`` and ``update`` applies the accumulated
    policy-gradient step and clears both buffers.

    Parameters
    ----------
    state_dim : int
        Number of entries in a (flattened) state vector.
    n_actions : int
        Number of discrete actions.
    learning_rate : float
        Step size of the weight update.
    gamma : float
        Discount factor applied once per time step to future rewards.
    train : bool
        When true, ``predict`` samples actions and records gradients;
        otherwise it returns the most probable action.
    """

    def __init__(self, state_dim, n_actions, learning_rate, gamma, train):
        # Small positive random initial weights in [0, 0.1).
        self.w = np.random.rand(state_dim, n_actions) * 0.1
        self.n_actions = n_actions
        self.lr = learning_rate
        self.g = gamma
        self.grads = []
        self.rewards = []
        self._train = train

    @staticmethod
    def preprocess_state(state):
        """Return ``state`` as a ``(1, n)`` array with every NaN replaced by 0.

        The zero fill is done directly in NumPy instead of fitting an imputer
        on each individual row: the result no longer depends on what a
        per-row ``fit`` decides to do with all-missing columns, and the
        per-sample overhead disappears.
        """
        # np.array copies, so the in-place fill never touches the caller's data.
        row = np.array([state]).reshape((1, -1))
        if row.dtype.kind == 'f':
            row[np.isnan(row)] = 0
        return row

    def _action_probs(self, state):
        """Softmax over the linear logits ``state @ w``; returns shape (1, n_actions)."""
        logits = state.dot(self.w)
        # Subtracting the maximum leaves the softmax unchanged mathematically
        # but keeps np.exp from overflowing on large logits.
        exp = np.exp(logits - np.max(logits))
        return exp / np.sum(exp)

    def policy(self, state):
        """Sample an action for a preprocessed ``(1, state_dim)`` state.

        Returns
        -------
        tuple
            ``(action, probs)`` with the sampled action index and the
            ``(1, n_actions)`` probability array it was drawn from.
        """
        probs = self._action_probs(state)
        action = np.random.choice(self.n_actions, p=probs[0])
        return action, probs

    def train(self, state):
        """Sample an action for a raw state and store its log-policy gradient."""
        state = self.preprocess_state(state)
        action, probs = self.policy(state)
        # Row `action` of the softmax Jacobian is d probs[action] / d logits;
        # dividing by probs[action] turns it into d log probs[action] / d logits.
        dsoftmax = self.softmax_grad(probs)[action, :]
        dlog = dsoftmax / probs[0, action]
        # Chain rule through logits = state @ w: a (state_dim, n_actions) gradient.
        grad = state.T.dot(dlog[None, :])
        self.grads.append(grad)
        return action

    def predict(self, state):
        """Return an action for a raw state.

        In training mode this is ``train(state)``.  Otherwise the most
        probable action is returned without sampling, so evaluation neither
        records gradients nor consumes random numbers.
        """
        if self._train:
            return self.train(state)
        state = self.preprocess_state(state)
        return np.argmax(self._action_probs(state)[0])

    def store_reward(self, reward):
        """Remember the reward that followed the most recent action."""
        self.rewards.append(reward)

    def update(self):
        """Apply the policy-gradient step for all stored steps and clear the buffers.

        For step ``i`` the weights move along ``grads[i]`` scaled by the
        discounted future return ``sum_t gamma**t * rewards[i + t]``.  The
        discount exponent is the time offset ``t``, not the reward value.
        """
        # Discounted returns in one backward pass: G_i = r_i + gamma * G_{i+1}.
        returns = []
        running = 0.0
        for reward in reversed(self.rewards):
            running = reward + self.g * running
            returns.append(running)
        returns.reverse()

        # zip stops at the shorter buffer: a gradient without a matching
        # reward contributes nothing, and rewards collected while no
        # gradients were recorded (evaluation mode) are simply discarded.
        for grad, future_return in zip(self.grads, returns):
            self.w += self.lr * grad * future_return

        self.grads = []
        self.rewards = []

In [None]:
# simulation environment

class SimulationEnv(object):
    """Replays a data frame date by date against an agent and scores the result.

    The frame must provide the columns ``date``, ``weight`` and ``resp`` plus
    the state columns, i.e. every column whose name contains ``'feature'``.

    Attributes
    ----------
    predictions : list
        Every action returned by the agent, in the order it was requested.
    rewards : list
        ``action * weight * resp`` for every processed row.
    ps : list
        Per-date reward totals of the last ``simulate`` call.
    utility : float
        Utility score of the last ``simulate`` call.
    """

    def __init__(self, df):
        self.predictions = []
        self.rewards = []
        self.utility = 0
        self.ps = []

        self.df = df

    def p(self, step_df, agent):
        """Run ``agent`` over the rows of one date and return their total reward.

        For every row the agent predicts an action, the reward
        ``action * weight * resp`` is logged and passed to
        ``agent.store_reward``, and ``agent.update`` is called right away.
        """
        result = 0
        X = step_df.loc[:, step_df.columns.str.contains('feature')].values
        # Convert once per date instead of doing a Series lookup on every row.
        Y = step_df.eval('weight * resp').to_numpy()
        for i in range(X.shape[0]):
            pred = agent.predict(X[i])
            self.predictions.append(pred)
            reward = pred * Y[i]
            self.rewards.append(reward)
            agent.store_reward(reward)
            result += reward

            agent.update()

        return result

    @staticmethod
    def _utility_score(ps):
        """Turn per-date reward totals into the clipped utility score."""
        if len(ps) == 0:
            return 0.0
        total = np.sum(ps)
        norm = np.sqrt(np.sum(np.power(ps, 2)))
        if norm == 0:
            # Every daily total is zero, so the ratio below would be 0 / 0
            # (NaN); the utility of an all-zero run is zero.
            return 0.0
        # NOTE(review): 250 presumably annualises by trading days per year - confirm.
        t = np.multiply(total / norm, np.sqrt(250 / len(ps)))
        # t is clipped to the range [0, 6] before scaling the total reward.
        return np.multiply(np.min([np.max([t, 0]), 6]), total)

    def simulate(self, agent):
        """Run ``agent`` over every date from the smallest to the largest in ``df``.

        Every integer date in that range is visited, so a date without rows
        contributes a total of 0.  An empty frame gives ``(0.0, [])``.

        Returns
        -------
        tuple
            ``(utility, ps)``: the utility score and the list of per-date
            reward totals.  Both are also stored on the instance.
        """
        ps = []
        if len(self.df) > 0:
            for i in tqdm.tqdm(range(self.df.date.min(), self.df.date.max() + 1)):
                ps.append(self.p(self.df[self.df.date == i], agent))

        utility = self._utility_score(ps)
        self.ps = ps

        self.utility = utility
        return utility, ps

    def reset(self):
        """Clear everything recorded by a previous run, including ``ps``."""
        self.predictions = []
        self.rewards = []
        self.utility = 0
        self.ps = []

    def print_results(self):
        """Print utility and precision of the last run and plot the per-date totals.

        Precision compares the logged predictions with ``resp > 0``.
        NOTE(review): this assumes the rows of ``df`` are ordered by date so
        that the predictions line up with the rows - confirm.
        """
        pred = self.predictions
        y = (self.df['resp'] > 0) * 1

        print(f'utility {self.utility}')
        print(f'precision {precision_score(y, pred)}')
        plt.plot(self.ps)

In [None]:
# Simulation environment that replays the training split.
env = SimulationEnv(df=train_df)

In [None]:
# Baseline: an agent that picks between two actions at random.
rd_agent = RandomAgent(n_actions=2)

env.reset()
env.simulate(rd_agent)

env.print_results()

In [None]:
# Baseline: an agent that returns action 1 for every row.
at_agent = AlwaysTradeAgent()

env.reset()
env.simulate(at_agent)

env.print_results()

In [None]:
# REINFORCE agent in training mode; it learns while the environment replays.
env.reset()
re_agent = REINFORCE(
    state_dim=130,
    n_actions=2,
    learning_rate=0.001,
    gamma=0.999,
    train=True,
)
env.simulate(re_agent)

env.print_results()

In [None]:
# Switch to an environment that replays the test split.
env = SimulationEnv(df=test_df)

In [None]:
# Score the random baseline created in an earlier cell on the current `env`;
# reset() first clears what the previous run logged.
env.reset()
env.simulate(rd_agent)

env.print_results()

In [None]:
# Score the always-trade baseline created in an earlier cell on the current
# `env`; reset() first clears what the previous run logged.
env.reset()
env.simulate(at_agent)

env.print_results()

In [None]:
# Put the REINFORCE agent into evaluation mode: with _train set to False,
# predict() returns the arg-max action and stores no gradients.
re_agent._train = False

env.reset()
env.simulate(re_agent)

env.print_results()