In [180]:
from pycaret.classification import * # Preprocessing, modelling, interpretation, deployment...
import pandas as pd # Basic data manipulation
from sklearn.model_selection import train_test_split # Data split
from sdv.tabular import CopulaGAN, GaussianCopula, CTGAN, TVAE # Synthetic data
from sdv.evaluation import evaluate # Evaluate synthetic data
import sdmetrics
import sdv.sdv
import numpy as np
import sklearn

In [181]:
#PyTorch Modules
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import torchvision.transforms as T

In [182]:
import torch as T  
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

In [357]:
T.sigmoid(T.tensor(-1000, dtype=T.float).to(device))

tensor(0.)

In [297]:
INPUT_DIM = 6416
NUM_ACTIONS = 9

In [360]:
class ReplayBuffer():

    def __init__(self, max_size, input_shape=INPUT_DIM, n_actions=NUM_ACTIONS):
        self.mem_size = max_size
        self.mem_cntr = 0
        self.state_memory = np.zeros((self.mem_size, input_shape))
        self.new_state_memory = np.zeros((self.mem_size, input_shape))     
        self.action_memory = np.zeros((self.mem_size, n_actions))
        self.reward_memory = np.zeros(self.mem_size)
        self.terminal_memory = np.zeros(self.mem_size, dtype=np.bool)   
    
    def store_transition(self, state, action, reward, state_, done):

        index = self.mem_cntr % self.mem_size
        self.state_memory[index] = state
        self.new_state_memory[index] = state_
        self.terminal_memory[index] = done
        self.reward_memory[index] = reward
        self.action_memory[index] = action
        self.mem_cntr += 1

    def sample_buffer(self, batch_size):
        max_mem = min(self.mem_cntr, self.mem_size)   
        batch = np.random.choice(max_mem, batch_size)
        states = self.state_memory[batch]        
        states_ = self.new_state_memory[batch] 
        actions = self.action_memory[batch]
        rewards = self.reward_memory[batch]
        dones = self.terminal_memory[batch]    

        return states, actions, rewards, states_, dones

In [361]:
class CriticNetwork(nn.Module):
    def __init__(self, beta):
        super(CriticNetwork, self).__init__()
        self.input_dims = INPUT_DIM # default was 2
        #fb, insta
        self.fc1_dims = 256    #hidden layers
        self.fc2_dims = 256    #hidden layers
        self.n_actions = NUM_ACTIONS     #fb, insta
        self.fc1 = nn.Linear(2 + 2, self.fc1_dims) #state + action
        self.fc2 = nn.Linear(self.fc1_dims, self.fc2_dims)
        self.q1 = nn.Linear(self.fc2_dims, 1)
        self.optimizer = optim.Adam(self.parameters(), lr=beta)
        self.device = T.device('cuda' if T.cuda.is_available() else 'cpu')
        self.to(self.device)

    def CriticNetwork(self, state, action):
        q1_action_value = self.fc1(T.cat([state, action], dim=1 )) 
        q1_action_value = F.relu(q1_action_value) 
        q1_action_value = self.fc2(q1_action_value) 
        q1_action_value = F.relu(q1_action_value) 
        q1 = self.q1(q1_action_value) 
        return q1

In [362]:
class ActorNetwork(nn.Module):
    def __init__(self, alpha):
        super(ActorNetwork, self).__init__()
        self.input_dims = INPUT_DIM # default was 2
        self.fc1_dims = 256
        self.fc2_dims = 256
        self.n_actions = NUM_ACTIONS
        self.fc1 = nn.Linear(self.input_dims, self.fc1_dims)
        self.fc2 = nn.Linear(self.fc1_dims, self.fc2_dims)
        self.mu = nn.Linear(self.fc2_dims, self.n_actions)
        self.optimizer = optim.Adam(self.parameters(), lr=alpha)
        self.device = T.device('cuda' if T.cuda.is_available() else 'cpu')
        self.to(self.device)
    
    def forward(self, state):
        prob = self.fc1(state)
        prob = F.relu(prob)
        prob = self.fc2(prob)
        prob = F.relu(prob)
        #fixing each agent between 0 and 1 and transforming each action in env
        mu = T.sigmoid(self.mu(prob))
        return mu

In [365]:
noise = np.random.normal(scale=1.0,size=(NUM_ACTIONS))
noise

array([ 1.34969609,  0.85841395, -1.1551336 ,  0.17853389, -0.10926522,
        0.75080421,  0.18862847, -1.31264064, -0.28273341])

In [317]:
noise = np.random.normal(scale=1.0,size=(NUM_ACTIONS))
device = T.device('cpu')
noise_tensor = T.tensor(noise, dtype=T.float).to(device)
row_nums = np.vstack(np.arange(train_df.shape[0], dtype="float64"))
state = np.hstack([row_nums, np.array([[1.0 for i in range(train_df.shape[1])] for j in range(train_df.shape[0])])])
state_tensor = T.tensor(state, dtype=T.float).to(device)
state_tensor + noise_tensor

tensor([[ 7.9573e-01,  7.0418e-01,  2.9419e-01,  ..., -1.1315e+00,
          5.5064e-01,  1.6696e+00],
        [ 1.7957e+00,  7.0418e-01,  2.9419e-01,  ..., -1.1315e+00,
          5.5064e-01,  1.6696e+00],
        [ 2.7957e+00,  7.0418e-01,  2.9419e-01,  ..., -1.1315e+00,
          5.5064e-01,  1.6696e+00],
        ...,
        [ 7.9980e+02,  7.0418e-01,  2.9419e-01,  ..., -1.1315e+00,
          5.5064e-01,  1.6696e+00],
        [ 8.0080e+02,  7.0418e-01,  2.9419e-01,  ..., -1.1315e+00,
          5.5064e-01,  1.6696e+00],
        [ 8.0180e+02,  7.0418e-01,  2.9419e-01,  ..., -1.1315e+00,
          5.5064e-01,  1.6696e+00]])

In [358]:
class Agent(object):
    def __init__(self, alpha, beta, tau, env, input_dims=INPUT_DIM, gamma=0.99, n_actions=NUM_ACTIONS, max_size=1000000,  batch_size=64):
        self.gamma = gamma
        self.tau = tau
        self.memory = ReplayBuffer(max_size)
        self.batch_size = batch_size
        self.actor = ActorNetwork(alpha)
        self.critic = CriticNetwork(beta)
        self.target_actor = ActorNetwork(alpha)
        self.target_critic = CriticNetwork(beta)
        
        self.scale = 1.0
        self.noise = np.random.normal(scale=self.scale,size=(n_actions))
        self.update_network_parameters(tau=1)

    def choose_action(self, observation):
        self.actor.eval()
        observation = T.tensor(observation, dtype=T.float).to(self.actor.device)
        mu = self.actor.forward(observation).to(self.actor.device)
        print(mu)
        mu_prime = mu + T.tensor(self.noise,
                                 dtype=T.float).to(self.actor.device)
        self.actor.train()
        return mu_prime.cpu().detach().numpy()

    def remember(self, state, action, reward, new_state, done):
        self.memory.store_transition(state, action, reward, new_state, done)

    def learn(self):
        if self.memory.mem_cntr < self.batch_size:
            return
        state, action, reward, new_state, done = self.memory.sample_buffer(self.batch_size)
        
        reward = T.tensor(reward, dtype=T.float).to(self.critic.device)
        done = T.tensor(done).to(self.critic.device)
        new_state = T.tensor(new_state, dtype=T.float).to(self.critic.device)
        action = T.tensor(action, dtype=T.float).to(self.critic.device)
        state = T.tensor(state, dtype=T.float).to(self.critic.device)

        self.target_actor.eval()
        self.target_critic.eval()
        self.critic.eval()
        target_actions = self.target_actor.forward(new_state)
        critic_value_ = self.target_critic.forward(new_state, target_actions)
        critic_value = self.critic.forward(state, action)

        target = []
        for j in range(self.batch_size):
            target.append(reward[j] + self.gamma*critic_value_[j]*done[j])
        target = T.tensor(target).to(self.critic.device)
        target = target.view(self.batch_size, 1)

        self.critic.train()
        self.critic.optimizer.zero_grad()
        critic_loss = F.mse_loss(target, critic_value)
        critic_loss.backward()
        self.critic.optimizer.step()

        self.critic.eval()
        self.actor.optimizer.zero_grad()
        mu = self.actor.forward(state)
        self.actor.train()
        actor_loss = -self.critic.forward(state, mu)
        actor_loss = T.mean(actor_loss)
        actor_loss.backward()
        self.actor.optimizer.step()

        self.update_network_parameters()

    def update_network_parameters(self, tau=None):
        if tau is None:
            tau = self.tau

        actor_params = self.actor.named_parameters()
        critic_params = self.critic.named_parameters()
        target_actor_params = self.target_actor.named_parameters()
        target_critic_params = self.target_critic.named_parameters()

        critic_state_dict = dict(critic_params)
        actor_state_dict = dict(actor_params)
        target_critic_dict = dict(target_critic_params)
        target_actor_dict = dict(target_actor_params)

        for name in critic_state_dict:
            critic_state_dict[name] = tau*critic_state_dict[name].clone() + (1-tau)*target_critic_dict[name].clone()

        self.target_critic.load_state_dict(critic_state_dict)

        for name in actor_state_dict:
            actor_state_dict[name] = tau*actor_state_dict[name].clone() + (1-tau)*target_actor_dict[name].clone()
        self.target_actor.load_state_dict(actor_state_dict)

In [347]:
sample_size = 1
failed_sampling_reward = -1
class sampling_environment():
    def __init__(self, generator, train_data, test_data, target):
        self.generator = generator
        self.state_matrix = [[1.0 for i in range(train_data.shape[1])] for j in range(train_data.shape[0])]
        self.train_data = train_data
        self.test_data = test_data
        self.target = target
        self.sampled_data = generator.sample(train_data.shape[0])
        self.pycaret_model = "kr"
        regression.setup(train_data.sample(frac=1), 
            target = self.target, 
            test_data = self.test_data,
            fold_strategy = "kfold",
            silent = True,
            verbose = False)
        regresser = regression.create_model(self.pycaret_model, verbose=False)
        predictions = regression.predict_model(regresser, verbose=False)
        target_prediction = predictions["Label"]
        ground_truth = predictions[self.target]
        self.previous_metric = sklearn.metrics.mean_absolute_error(ground_truth, target_prediction)
    def get_reward(self):
        info = ""
        combined_data = pd.concat([self.train_data,self.sampled_data])
        regression.setup(combined_data.sample(frac=1), 
            target = self.target, 
            test_data = self.test_data,
            fold_strategy = "kfold",
            silent = True,
            verbose = False)
        regresser = regression.create_model(self.pycaret_model, verbose=False)
        predictions = regression.predict_model(regresser, verbose=False)
        target_prediction = predictions["Label"]
        ground_truth = predictions[self.target]
        mae = sklearn.metrics.mean_absolute_error(ground_truth, target_prediction)
        delta_metric = self.previous_metric - mae
        # TODO self.previous_metric = mae
        return delta_metric, info # reward
    def update_synthetic_data_row(self, row_index):
        """
        converts row of tabular data into standardized continuous and discrete conditions
        """
        row = self.state_matrix[row_index]
        conditions = {}
        info = ""
        for entry, dtype, column in zip(row, self.train_data.dtypes, self.train_data.columns):
            entry = entry + 1 # TODO check sigmoid -1 to 1
            if entry >= 1:
                entry=None
            if not entry is None:
                converted_entry = None 
                if dtype == "int64":
                    column_max = self.train_data[column].max()+1
                    column_min = self.train_data[column].min()
                    converted_entry = math.floor(entry * (column_max - column_min)) + column_min
                elif dtype == "float64":
                    column_max = self.train_data[column].max()
                    column_min = self.train_data[column].min()
                    converted_entry = entry * (column_max - column_min) + column_min
                elif dtype == "object":
                    column_length = len(self.train_data[column].unique())
                    converted_entry = self.train_data[column].unique()[math.floor(entry * column_length)]
                else:
                    raise ValueError(f"dtype in train_data not supported: {dtype}")
                conditions[column] = converted_entry
        if conditions:
            if len(conditions) == len(self.train_data.columns):
                key_to_remove = random.choice(list(conditions.keys()))
                info = key_to_remove
                conditions.pop(key_to_remove)
            row_data = self.generator.sample(1, conditions=conditions)
        else:
            row_data = self.generator.sample(1)
        self.sampled_data.loc[row_index]= row_data.head(1).squeeze()
        return info
    def get_sampled_data(self):
        return self.sampled_data
    def get_state(self):
        #row_nums = np.vstack(np.arange(self.train_data.shape[0], dtype="float64"))
        return np.array(self.state_matrix).flatten()
    def step(self, act):
        new_state, reward, done, info = self.state_matrix, failed_sampling_reward, False, act
        # TODO handle sampling failure
        # TODO, ensure only valid row entries
        s_row = math.floor((act[0] + 1)/2 * self.train_data.shape[0]) #row value to change
        row_values = act[1:] # synthetic data conditions
        self.state_matrix[s_row] = row_values
        sample_info = self.update_synthetic_data_row(s_row)
        new_state = self.get_state()
        reward, reward_info = self.get_reward() # classification/regression metric
        info = "---".join([sample_info, reward_info])

        done = False # boolean, true if it's time to stop
            
        return new_state, reward, done, info
synthetic_data = gaussian_copula.sample(sample_size)

In [348]:
train_df = pd.read_csv("regression_data/train.csv")
test_df = pd.read_csv("regression_data/test.csv")

In [349]:
env = sampling_environment(gaussian_copula, train_df, test_df, "charges")

In [350]:
agent = Agent(alpha=0.000025, beta=0.00025, tau=0.001, env=env,
              batch_size=64, n_actions=NUM_ACTIONS)

In [359]:
score_history = []
for i in range(1):
    done = False
    score = 0
    for j in range(10):
        obs = env.get_state()
        act = agent.choose_action(obs)
        print(act)
        new_state, reward, done, info = env.step(act)
        agent.remember(obs, act, reward, new_state, int(done))
        agent.learn()
        score += reward
        print(f"reward : {reward}")
    score_history.append(score)

[-1.407935    0.03821114  1.3325202   1.2316835  -0.8849725   2.15427
 -1.795933    0.9816257   0.18642378]
reward : -2452.052734375
[-1.407935    0.03821114  1.3325202   1.2316835  -0.8849725   2.15427
 -1.795933    0.9816257   0.18642378]
reward : -742.70458984375
[-1.407935    0.03821114  1.3325202   1.2316835  -0.8849725   2.15427
 -1.795933    0.9816257   0.18642378]
reward : -390.32958984375
[-1.407935    0.03821114  1.3325202   1.2316835  -0.8849725   2.15427
 -1.795933    0.9816257   0.18642378]
reward : -632.77685546875
[-1.407935    0.03821114  1.3325202   1.2316835  -0.8849725   2.15427
 -1.795933    0.9816257   0.18642378]
reward : -2704.18505859375
[-1.407935    0.03821114  1.3325202   1.2316835  -0.8849725   2.15427
 -1.795933    0.9816257   0.18642378]
reward : -1224.22021484375
[-1.407935    0.03821114  1.3325202   1.2316835  -0.8849725   2.15427
 -1.795933    0.9816257   0.18642378]
reward : -320.171875
[-1.407935    0.03821114  1.3325202   1.2316835  -0.8849725   2.15

In [None]:
env = OurCustomEnv(sales_function, obs_range, act_range)

agent = Agent(alpha=0.000025, beta=0.00025, tau=0.001, env=env,
              batch_size=64, n_actions=2)

score_history = []
for i in range(1):
    obs = env.get_state()
    done = False
    score = 0
    for j in range(10):
        act = agent.choose_action(obs)
        new_state, reward, done, info = env.step(act)
        agent.remember(obs, act, reward, new_state, int(done))
        agent.learn()
        score += reward
        obs = new_state
        
    score_history.append(score)

In [17]:
gaussian_copula = sdv.sdv.SDV.load("regression_generators/GuassianCopulaModel.pkl")

In [18]:
synthetic_data = [None]*train_df.shape[1]

In [44]:
from pycaret import regression

In [169]:
len(train_df.columns)

8

In [170]:
env = sampling_environment(gaussian_copula, train_df, test_df, "charges")
new_state, reward, done, info = env.step([0] + [.1]*8)

{'Unnamed: 0': 267, 'age': 27, 'sex': 'female', 'bmi': 23.394000000000002, 'children': 1, 'smoker': 'no', 'region': 'southeast', 'charges': 13416.073738000001}


In [171]:
info

'smoker---'

In [None]:
column_max = len(train_df["bmi"].max()
column_min
converted_entry = train_df["sex"].unique()[math.floor(.3 * column_length)]
print(converted_entry)

In [85]:
test_df.head(10)

Unnamed: 0.1,Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,764,45,female,25.175,2,no,northeast,9095.06825
1,887,36,female,30.02,0,no,northwest,5272.1758
2,890,64,female,26.885,0,yes,northwest,29330.98315
3,1293,46,male,25.745,3,no,northwest,9301.89355
4,259,19,male,31.92,0,yes,northwest,33750.2918
5,1312,34,male,42.9,1,no,southwest,4536.259
6,899,19,female,22.515,0,no,northwest,2117.33885
7,752,64,male,37.905,0,no,northwest,14210.53595
8,1286,28,female,17.29,0,no,northeast,3732.6251
9,707,49,male,28.69,3,no,northwest,10264.4421


In [24]:
state = [[None for i in range(train_df.shape[1])] for j in range(train_df.shape[0])]
action = 








[[None, None, None, None, None, None, None, None],
 [None, None, None, None, None, None, None, None],
 [None, None, None, None, None, None, None, None],
 [None, None, None, None, None, None, None, None],
 [None, None, None, None, None, None, None, None],
 [None, None, None, None, None, None, None, None],
 [None, None, None, None, None, None, None, None],
 [None, None, None, None, None, None, None, None],
 [None, None, None, None, None, None, None, None],
 [None, None, None, None, None, None, None, None],
 [None, None, None, None, None, None, None, None],
 [None, None, None, None, None, None, None, None],
 [None, None, None, None, None, None, None, None],
 [None, None, None, None, None, None, None, None],
 [None, None, None, None, None, None, None, None],
 [None, None, None, None, None, None, None, None],
 [None, None, None, None, None, None, None, None],
 [None, None, None, None, None, None, None, None],
 [None, None, None, None, None, None, None, None],
 [None, None, None, None, None,