### Imports

In [1]:
import random
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from itertools import count
from collections import namedtuple

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.distributions import Categorical

# one record per environment step: log-probability of the sampled action
# and the critic's value estimate for the state it was sampled in
SavedAction = namedtuple('SavedAction', ['log_prob', 'value'])
# train() prints a progress line every `log_interval` episodes
log_interval = 10
# discount factor applied to future rewards in finish_episode()
gamma = 0.99

### Food class

In [2]:
class Food:
    '''
    Bundle of nutrient amounts describing a food item.

    Attributes:
        calories: calorie amount.
        total_fat: fat amount.
        total_carbohydrates: carbohydrate amount.
        protein: protein amount.

    Every attribute defaults to 0, so Food() is an "empty" item.
    '''
    def __init__(self, calories=0, total_fat=0, total_carbohydrates=0, protein=0):
        # store the four nutrient amounts unchanged
        (self.calories,
         self.total_fat,
         self.total_carbohydrates,
         self.protein) = (calories, total_fat, total_carbohydrates, protein)

### Evaluate whether food was better

In [3]:
def evaluate(human, target_calories=2000, target_fat=70, target_carbs=310, target_protein=50):
    '''
    Score a nutrient state against target values.

    The score is the negated sum of squared deviations from the targets, so it
    is 0 when every nutrient is exactly on target and grows more negative the
    further the state drifts away.

    Args:
        human: object exposing `calories`, `total_fat`, `total_carbohydrates`
            and `protein` attributes (e.g. a Food instance).
        target_calories: desired calorie level.
        target_fat: desired fat level.
        target_carbs: desired carbohydrate level.
        target_protein: desired protein level.

    Returns:
        The negated total squared error (always <= 0).
    '''
    # squared error for each nutrient
    calories_loss = (target_calories - human.calories) ** 2
    fat_loss = (target_fat - human.total_fat) ** 2
    carb_loss = (target_carbs - human.total_carbohydrates) ** 2
    protein_loss = (target_protein - human.protein) ** 2

    total_loss = calories_loss + fat_loss + carb_loss + protein_loss

    # negated: a smaller loss must yield a larger score, because the caller
    # treats a higher value as better
    return -total_loss

### Environment class (Human)

### Food and human initialisation

In [4]:
# the four food items that can be offered to the human
food1 = Food(calories=400, total_fat=14, total_carbohydrates=60, protein=10)
food2 = Food(calories=600, total_fat=20, total_carbohydrates=100, protein=5)
food3 = Food(calories=200, total_fat=5, total_carbohydrates=30, protein=15)
food4 = Food(calories=300, total_fat=10, total_carbohydrates=50, protein=20)

# lookup table indexed by food id; Human.step() picks the offered food from it
actions = np.array([food1, food2, food3, food4])


In [5]:
class Human:
    '''
    Environment: a human whose nutrient levels change as food is eaten or skipped.

    Attributes:
        state: Food instance holding the current nutrient levels.
        decay_rate: fraction of every nutrient that is lost on each step.

    Observations returned by step() and reset() are numpy arrays laid out as
    [calories, total_fat, total_carbohydrates, protein, offered_food_id],
    where offered_food_id is a freshly drawn random index into `actions`.
    '''

    def __init__(self):
        # random nutrient levels around the targets used by evaluate()
        self.state = Food(
            np.random.randint(1800, 2200),
            np.random.randint(60, 80),
            np.random.randint(280, 340),
            np.random.randint(40, 60),
        )

        # drawn once per instance; reset() does not re-draw it
        self.decay_rate = np.random.uniform(0.1, 0.3)

    def _observe(self):
        '''Build the observation for the current state plus a newly offered food id.'''
        # sized from the table so adding/removing foods needs no change here
        offered_food = np.random.randint(0, len(actions))
        current = self.state
        return np.array([
            current.calories,
            current.total_fat,
            current.total_carbohydrates,
            current.protein,
            offered_food,
        ])

    def step(self, food_id, eat):
        '''
        Advance one step: optionally eat the offered food, then decay nutrients.

        Args:
            food_id: index into `actions` of the offered food (converted with
                int(), so a float taken from an observation array is accepted).
            eat: 1 to eat the offered food; any other value skips it.

        Returns:
            (observation, reward) where reward is the change in evaluate()
            score between the previous state and the new one.
        '''
        food_id = int(food_id)
        # skipping is modelled as eating an all-zero food
        food = actions[food_id] if eat == 1 else Food(0, 0, 0, 0)

        previous = self.state
        new_state = Food()
        # each nutrient gains the food's amount and loses decay_rate of its old level
        new_state.calories = previous.calories + food.calories - self.decay_rate * previous.calories
        new_state.total_fat = previous.total_fat + food.total_fat - self.decay_rate * previous.total_fat
        new_state.total_carbohydrates = previous.total_carbohydrates + food.total_carbohydrates - self.decay_rate * previous.total_carbohydrates
        new_state.protein = previous.protein + food.protein - self.decay_rate * previous.protein

        # positive when the step moved the human closer to the targets
        reward = evaluate(new_state) - evaluate(previous)

        self.state = new_state
        return self._observe(), reward

    def reset(self):
        '''
        Restart an episode with fixed nutrient levels (2000, 70, 310, 50).

        Unlike __init__, the start state is not randomised, and decay_rate
        keeps the value drawn at construction.

        Returns:
            The initial observation.
        '''
        self.state = Food(2000, 70, 310, 50)
        return self._observe()

### Policy Class

In [6]:
class Policy(nn.Module):
    """
    Actor-critic network: one shared hidden layer feeding two output heads.

    The actor head yields probabilities over 2 actions (eat or skip); the
    critic head yields a single state-value estimate. The module also carries
    the per-episode buffers `saved_actions` and `rewards`.
    """
    def __init__(self):
        super().__init__()

        # shared layer: 5 input features -> 128 hidden units
        self.affine1 = nn.Linear(5, 128)

        # actor: hidden -> one score per action (eat / skip)
        self.action_head = nn.Linear(128, 2)

        # critic: hidden -> scalar value estimate
        self.value_head = nn.Linear(128, 1)

        # per-episode buffers of saved actions and rewards
        self.saved_actions, self.rewards = [], []

    def forward(self, x):
        """
        Run actor and critic on the same hidden representation.

        Returns a 2-tuple:
            1. action probabilities (softmax over the last dimension)
            2. the critic's value for the input state
        """
        hidden = F.relu(self.affine1(x))

        # actor output, normalised into a probability distribution
        action_scores = self.action_head(hidden)
        action_prob = F.softmax(action_scores, dim=-1)

        # critic output
        state_values = self.value_head(hidden)

        return action_prob, state_values

# the single actor-critic network used by select_action() and finish_episode()
model = Policy()
# Adam over all network parameters (shared layer and both heads)
optimizer = optim.Adam(model.parameters(), lr=3e-2)
# float32 machine epsilon; added to the returns' std in finish_episode()
# so the normalisation never divides by zero
eps = np.finfo(np.float32).eps.item()

### Test

In [7]:
# environment instance that train() resets and steps through
env = Human()

In [8]:

def select_action(state):
    """
    Sample an action for `state` from the current policy.

    `state` is a numpy array that is converted to a float tensor and fed to
    the global `model`. The sampled action's log-probability and the critic's
    value estimate are appended to `model.saved_actions` for the later update.

    Returns the sampled action index as a Python int (train() passes it to
    env.step() as the `eat` flag).
    """
    observation = torch.from_numpy(state).float()
    action_probs, value_estimate = model(observation)

    # draw the action from the categorical distribution given by the actor
    distribution = Categorical(action_probs)
    choice = distribution.sample()

    # remember what finish_episode() needs to compute the losses
    record = SavedAction(log_prob=distribution.log_prob(choice), value=value_estimate)
    model.saved_actions.append(record)

    return choice.item()


def finish_episode():
    """
    Run one actor-critic update from the buffered episode, then clear the buffers.

    Reads `model.rewards` and `model.saved_actions`, computes discounted
    returns (discount `gamma`), normalises them, and takes one optimizer step
    on the sum of the policy loss (-log_prob * advantage) and the value loss
    (smooth L1 between the critic's value and the return).

    If either buffer is empty there is nothing to learn from: the buffers are
    cleared and no optimizer step is taken.
    """
    if not model.rewards or not model.saved_actions:
        # torch.stack() below would fail on empty lists
        del model.rewards[:]
        del model.saved_actions[:]
        return

    # discounted returns: accumulate back-to-front, then flip into time order
    # (append + reverse is linear; inserting at the front is quadratic)
    discounted = []
    running = 0
    for reward in reversed(model.rewards):
        running = reward + gamma * running
        discounted.append(running)
    discounted.reverse()

    # keep returns in float32 (the default dtype of the network outputs),
    # whatever numeric type the rewards arrived as
    returns = torch.tensor(discounted, dtype=torch.float32)
    centred = returns - returns.mean()
    if returns.numel() > 1:
        returns = centred / (returns.std() + eps)
    else:
        # std() of a single element is NaN, which would poison the weights;
        # with one step there is no spread to scale by, so only centre
        returns = centred

    policy_losses = []  # actor (policy) loss per step
    value_losses = []   # critic (value) loss per step
    for (log_prob, value), ret in zip(model.saved_actions, returns):
        # .item() detaches the critic's estimate so the policy loss does not
        # back-propagate into the value head
        advantage = ret - value.item()

        policy_losses.append(-log_prob * advantage)

        # reshape the scalar return to match the critic's 1-element output
        value_losses.append(F.smooth_l1_loss(value, ret.reshape(1)))

    optimizer.zero_grad()

    # total loss = summed actor loss + summed critic loss
    loss = torch.stack(policy_losses).sum() + torch.stack(value_losses).sum()

    loss.backward()
    optimizer.step()

    # start the next episode with empty buffers
    del model.rewards[:]
    del model.saved_actions[:]


def train(num_episodes=2000, steps_per_episode=999):
    """
    Train the global `model` on the global `env` and plot the learning curves.

    Args:
        num_episodes: number of episodes to run.
        steps_per_episode: environment steps taken in every episode (episodes
            have no terminal condition, so each runs for exactly this many).

    Side effects: one finish_episode() update per episode, a progress line
    every `log_interval` episodes, and two plots at the end (running average
    of the episode reward, and the calorie level at the end of each episode).
    """
    # seed of the exponential moving average below; it is arbitrary, so the
    # first points of the reward curve are biased toward it
    running_reward = 10
    rewards = []
    calories = []

    for i_episode in range(num_episodes):

        # reset environment and episode reward
        state = env.reset()
        ep_reward = 0

        for _ in range(steps_per_episode):
            # the policy decides whether to eat the offered food
            action = select_action(state)
            # last element of the observation is the offered food's index
            state, reward = env.step(state[-1], action)

            model.rewards.append(reward)
            ep_reward += reward

        # exponential moving average of the episode reward
        running_reward = 0.05 * ep_reward + (1 - 0.05) * running_reward
        rewards.append(running_reward)
        calories.append(env.state.calories)

        # one gradient update per episode
        finish_episode()

        if i_episode % log_interval == 0:
            print('Episode {}\tLast reward: {:.2f}\tAverage reward: {:.2f}'.format(
                  i_episode, ep_reward, running_reward))

    plt.plot(rewards)
    plt.xlabel('Episode')
    plt.ylabel('Running average reward')
    plt.show()
    plt.plot(calories)
    plt.xlabel('Episode')
    plt.ylabel('Calories at end of episode')
    plt.show()


# run the full training loop; prints progress lines and shows the plots when done
train()

Episode 0	Last reward: -847125.91	Average reward: -42346.80
Episode 10	Last reward: -1305183.33	Average reward: -539371.55
Episode 20	Last reward: -1287785.42	Average reward: -762963.58
Episode 30	Last reward: -2371938.80	Average reward: -954432.63
Episode 40	Last reward: -2466625.31	Average reward: -1340724.41
Episode 50	Last reward: -2034409.36	Average reward: -1231025.83
Episode 60	Last reward: -853193.62	Average reward: -1134320.36


KeyboardInterrupt: 