In [2]:
import math
import random
from collections import deque

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

In [22]:
# Download the Hillstrom MineThatData e-mail analytics challenge data set.
data = pd.read_csv("http://www.minethatdata.com/Kevin_Hillstrom_MineThatData_E-MailAnalytics_DataMiningChallenge_2008.03.20.csv")

# Categorical source column -> prefix used for its one-hot indicator columns.
categorical_prefixes = {
    "history_segment": "hs",
    "zip_code": "zc",
    "channel": "c",
    "segment": "s",
}

# Cast the categorical columns so that get_dummies emits one column per category.
for column in categorical_prefixes:
    data[column] = pd.Categorical(data[column])

# One indicator frame per categorical column, in the order declared above.
one_hot_hs, one_hot_zc, one_hot_c, one_hot_s = (
    pd.get_dummies(data[column], prefix=prefix)
    for column, prefix in categorical_prefixes.items()
)

# Columns that are carried over unchanged (features first, then the outcomes).
passthrough_columns = ["recency", "history", "mens", "womens", "newbie", "visit", "conversion", "spend"]

# Final frame: pass-through columns followed by the four indicator groups.
data = pd.concat(
    [data[passthrough_columns], one_hot_hs, one_hot_zc, one_hot_c, one_hot_s],
    axis=1,
)

In [23]:
# Preview the first two rows of the encoded frame.
data.iloc[:2]

Unnamed: 0,recency,history,mens,womens,newbie,visit,conversion,spend,hs_1) $0 - $100,hs_2) $100 - $200,...,"hs_7) $1,000 +",zc_Rural,zc_Surburban,zc_Urban,c_Multichannel,c_Phone,c_Web,s_Mens E-Mail,s_No E-Mail,s_Womens E-Mail
0,10,142.44,1,0,0,0,0,0.0,0,1,...,0,0,1,0,0,1,0,0,0,1
1,6,329.08,1,1,1,0,0,0.0,0,0,...,0,1,0,0,0,0,1,0,1,0


In [25]:
class DFN(nn.Module):
    """Feed-forward network that scores every treatment net of its cost.

    The network is ``Linear(context_n, hidden_n) -> ReLU -> Linear(hidden_n,
    treatment_n)``; the per-treatment cost vector ``Psi`` is subtracted from
    the output of the last layer.

    Args:
        context_n: int, number of input (context) features.
        treatment_n: int, number of treatments, i.e. number of outputs.
        Psi: sequence/array/tensor of length ``treatment_n`` holding the cost
            of every treatment.
        hidden_n: int, width of the hidden layer (defaults to 200).

    Raises:
        ValueError: if ``Psi`` is not a vector of length ``treatment_n``.
    """

    def __init__(self, context_n, treatment_n, Psi, hidden_n=200):
        super().__init__()

        self.context_n = context_n
        self.treatment_n = treatment_n

        # as_tensor avoids the copy-construct warning when Psi is already a
        # tensor; detach().clone() makes sure we never alias the caller's data.
        cost = torch.as_tensor(Psi, dtype=torch.float).detach().clone()
        if cost.shape != (treatment_n,):
            raise ValueError(
                "Psi must be a vector of length treatment_n={}, got shape {}".format(
                    treatment_n, tuple(cost.shape)
                )
            )

        # A buffer follows the module across .to(device)/.cuda() calls.
        # persistent=False keeps it out of state_dict(), so the state_dict
        # keys are exactly the layer parameters.
        self.register_buffer("Psi", cost, persistent=False)

        self.lin1 = nn.Linear(context_n, hidden_n)
        self.fc = nn.Linear(hidden_n, treatment_n)

    def forward(self, xb):
        """Return the per-treatment score minus the treatment cost for ``xb``."""
        xb = self.lin1(xb)
        xb = F.relu(xb)
        return self.fc(xb) - self.Psi
    

In [29]:
class Ucmab():
    """Epsilon-greedy contextual bandit agent backed by two DFN networks.

    ``model`` is the network being trained; ``target_model`` is a copy that
    supplies the regression targets for the treatments that were not taken
    and is only synchronised through ``refresh_target_model``.
    """

    def __init__(self, context_n, treatment_n, Psi):
        """
            context_n: int, dimension size of the contexts
            treatment_n: int, amount of possible treatments (including control)
            Psi: vector (size=treatment_n), cost of every treatment (including control)
        """

        self.epsilon = .1
        # Lower bound used by select_action. No decay is applied in this class,
        # so the effective exploration rate stays at max(epsilon, epsilon_min).
        self.epsilon_min = .01
        self.learning_rate = .001
        self.memory = deque(maxlen=2000)
        # Read by callers as the interval between target-network refreshes.
        self.C = 10

        self.Psi = Psi
        self.context_n = context_n
        self.treatment_n = treatment_n
        # Aliases kept because the methods below refer to these names.
        self.action_n = treatment_n
        self.state_n = context_n

        # Build both networks from the constructor arguments (not from
        # notebook globals) and give both the same cost vector.
        self.model = DFN(context_n, treatment_n, Psi)
        self.target_model = DFN(context_n, treatment_n, Psi)
        self.target_model.load_state_dict(self.model.state_dict())
        self.opt = optim.Adam(self.model.parameters(), lr=self.learning_rate)
        self.loss_f = nn.MSELoss()

    def remember(self, state, action, reward):
        """Append one observed (state, action, reward) triple to the replay memory."""
        self.memory.append((state, action, reward))

    def select_action(self, state):
        """Pick a treatment for ``state`` with an epsilon-greedy policy.

        With probability ``max(epsilon, epsilon_min)`` a uniformly random
        treatment index is returned; otherwise the index with the highest
        model output.
        """
        if np.random.binomial(1, max(self.epsilon, self.epsilon_min)):
            return random.randrange(self.action_n)

        state = torch.as_tensor(state, dtype=torch.float)
        # Pure inference: no autograd graph is needed here.
        with torch.no_grad():
            q_val = self.model(state)
        return q_val.argmax().item()

    def replay(self, batch_size): # this is where we train -> gradient!
        """Run one optimiser step per sample of a random minibatch from memory.

        At most ``len(self.memory)`` samples are drawn; an empty memory is a
        no-op.
        """
        sample_size = min(batch_size, len(self.memory))
        if sample_size == 0:
            return

        minibatch = random.sample(list(self.memory), sample_size)

        # The memory holds exactly what remember() stores: 3-tuples.
        for state, action, reward in minibatch:
            state_t = torch.as_tensor(state, dtype=torch.float).reshape(1, -1)

            # Targets: the frozen target network's outputs, with the entry of
            # the treatment actually taken replaced by the observed reward.
            # NOTE(review): the regression target is the raw `reward`; confirm
            # whether callers are expected to pass a reward already net of the
            # treatment cost.
            with torch.no_grad():
                target = self.target_model(state_t).clone()
            target[0, action] = reward

            out = self.model(state_t)
            loss = self.loss_f(out, target)

            self.opt.zero_grad()
            loss.backward()
            self.opt.step()

    def refresh_target_model(self):
        """Copy the current weights of ``model`` into ``target_model``."""
        self.target_model.load_state_dict(self.model.state_dict())
      
        

In [None]:
class Env:
    """Offline environment that serves random rows of a logged data set."""

    def __init__(self, data):
        """Store the logged interactions.

        data: pandas DataFrame to draw samples from.
        """
        self.data = data # pandas.df

    def sample(self, n):
        """Return ``n`` rows drawn at random (without replacement) from the data.

        NOTE(review): the complete sampled rows are returned; which columns a
        caller needs (context / treatment / outcome) is not established here,
        so the selection is left to the caller - confirm the intended split.
        """
        s = self.data.sample(n=n)
        return s
        

In [None]:
# Offline training loop: logged customers are replayed one at a time, the
# agent proposes a treatment, and the logged outcome is only used when the
# proposal matches the treatment that was actually logged for that customer.

EPISODES = 10000        # number of logged customers presented to the agent
state_n = 18            # number of context features fed to the network
action_n = 2            # number of treatments (including control)
Psi = [.0, .3]          # cost of every treatment (including control)
batch_size = 32
LOG_EVERY = 1000        # progress is printed every LOG_EVERY iterations

# NOTE(review): assumes action 0 = "No E-Mail" (control) and action 1 = any
# e-mail campaign, and that the reward is the customer's spend - TODO confirm.
CONTROL_COLUMN = "s_No E-Mail"
REWARD_COLUMN = "spend"

# Ucmab's signature is (context_n, treatment_n, Psi).
agent = Ucmab(state_n, action_n, Psi)

windowed_treatments = []
windowed_rewards = []

# Split the encoded frame into context, logged treatment and outcome parts.
outcome_columns = ["visit", "conversion", "spend"]
treatment_columns = [c for c in data.columns if c.startswith("s_")]
context_columns = [
    c for c in data.columns
    if c not in outcome_columns and c not in treatment_columns
]
if len(context_columns) != state_n:
    raise ValueError(
        "expected {} context columns, found {}".format(state_n, len(context_columns))
    )

# Convert once up front so the loop only does cheap array indexing.
contexts = data[context_columns].to_numpy(dtype=np.float32)
logged_actions = (data[CONTROL_COLUMN] == 0).to_numpy().astype(int)
logged_rewards = data[REWARD_COLUMN].to_numpy(dtype=np.float32)

for e in range(EPISODES):
    row = np.random.randint(len(data))
    state = contexts[row].reshape(1, state_n)

    action = agent.select_action(state)

    # The outcome is only known for the treatment that was actually logged,
    # so non-matching proposals are skipped.
    if action == logged_actions[row]:
        reward = float(logged_rewards[row])
        agent.remember(state, action, reward)
        windowed_treatments.append(action)
        windowed_rewards.append(reward)

    if e % agent.C == 0:
        agent.refresh_target_model()

    if len(agent.memory) > batch_size:
        agent.replay(batch_size)

    if (e + 1) % LOG_EVERY == 0 and windowed_rewards:
        print("episode {}/{}, score: {:.3f}, e: {:.2}".format(
            e + 1, EPISODES, np.mean(windowed_rewards[-LOG_EVERY:]), agent.epsilon))