In [2]:
# importing the required libraries
library(reinforcelearn)
library(ReinforcementLearning)

In [3]:
# loading the cliff walking environment
env = makeEnvironment("cliff.walking")
env

<CliffWalking>
  Inherits from: <Gridworld>
  Public:
    action.names: 0 1 2 3
    action.space: Discrete
    actions: 0 1 2 3
    clone: function (deep = FALSE) 
    discount: 1
    done: FALSE
    episode: 0
    episode.return: 0
    episode.step: 0
    initial.state: 36
    initialize: function (...) 
    n.actions: 4
    n.states: 48
    n.step: 0
    previous.state: NULL
    reset: function () 
    resetEverything: function () 
    reward: NULL
    rewards: -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 ...
    state: 36
    state.space: Discrete
    states: 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21  ...
    step: function (action) 
    terminal.states: 47
    transitions: 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0  ...
    visualize: function () 
  Private:
    reset_: function (env) 
    step_: function (env, action) 
    visualize_: function (env) 

In [4]:
# creating the function to query the environment
sequences <- function(iter,env){
    actions <- env$actions
    data <- data.frame(matrix(ncol = 4, nrow = 0))
    colnames(data) <- c("State", "Action", "Reward","NextState")
    env$reset()
    for(i in 1:iter){
    current_state <- env$state
    current_action <- floor(runif(1,0,4))
    current_reward <- env$step(current_action)$reward
    next_state_iter <- env$step(current_action)$state
    iter_data <- cbind("State" = current_state,"Action" = current_action,"Reward"=current_reward,"NextState" = next_state_iter)
    data <- rbind(data,iter_data)
    if(env$done == "TRUE"){
        break;
    }
    }
   return(data) 
}

# getting the data from function

iter <- 1000
observations = sequences(iter,env)  

cols.name <- c("State","Action","NextState")
observations[cols.name] <- sapply(observations[cols.name],as.character)
sapply(observations, class)

# displaying first 20 records
head(observations,20)

State,Action,Reward,NextState
36,1,-1,36
36,3,-1,36
36,0,-1,36
36,0,-1,36
36,2,-1,12
12,3,-1,36
36,2,-1,12
12,3,-1,36
36,2,-1,12
12,1,-1,14


In [5]:
# performing reinforcement learning

control <- list(alpha = 0.2, gamma = 0.4, epsilon = 0.1)

model <- ReinforcementLearning(data = observations, s = "State", a = "Action", r = "Reward",
                               s_new = "NextState", iter = 1, control = control)

In [6]:
# printing the learnt state-action table which contains the Q-value of each state-action pair
print(model)

State-Action function Q
             0           1           2           3
X36 -1.5953639  -1.5952462  -1.4948993  -1.5949936
X37  0.0000000   0.0000000 -20.0000000   0.0000000
X38  0.0000000 -20.0000000 -36.1893567 -49.0059376
X24 -1.4648553  -1.3324273  -1.4812546  -1.4917518
X26 -1.3897756  -0.9741327  -1.3121494  -1.5148258
X28 -0.8636740  -0.8293248  -0.7959076  -1.3823701
X0  -1.5642927  -1.4662367  -1.5676150  -1.5102539
X2  -1.5380473  -1.2780173  -1.4251813  -1.3349818
X12 -1.4644123  -1.2590694  -1.5630369  -1.5862153
X30 -0.9595646  -0.2000000  -0.2288000  -0.5727028
X4  -1.3953639  -0.9776434  -1.3279273  -0.9610399
X14 -1.2198423  -0.8657823  -1.1166604  -0.7378560
X32  0.0000000   0.0000000   0.0000000  -0.3169141
X6  -1.0338219  -0.5904000  -0.7425920  -0.6064000
X16 -0.7560023  -0.3600000  -0.4278656   0.0000000
X8  -0.6791040   0.0000000  -0.6723200   0.0000000
X18 -0.2000000   0.0000000  -0.2288000   0.0000000

Policy
X36 X37 X38 X24 X26 X28  X0  X2 X12 X30  X4 X14 X3

## There is more...

Experience Replay

In [92]:
# getting 100 new sample data from the cliff walking environment
new_observations =  sequences(100,env) 
cols.name <- c("State","Action","NextState")
new_observations[cols.name] <- sapply(new_observations[cols.name],as.character)
sapply(new_observations, class)
head(new_observations)

State,Action,Reward,NextState
36,2,-1,12
12,2,-1,0
0,1,-1,2
2,2,-1,2
2,0,-1,0
0,2,-1,0


In [89]:
# providing our existing RL model as an argument to update the existing policy
newmodel <- ReinforcementLearning(new_observations, 
                                   s = "State", 
                                   a = "Action", 
                                   r = "Reward", 
                                   s_new = "NextState", 
                                   model = model)
print(newmodel)

State-Action function Q
            X0          X1          X2         X3
24  -1.1096681  -1.0985934  -1.1113893 -1.1124119
26  -1.0924338  -1.0325305  -1.1120072 -1.1123957
28  -0.9840692  -0.7796008  -1.1051685 -1.0663833
29   0.0000000   0.0000000   0.0000000 -0.7325490
30  -0.5651880  -0.8773772   0.0000000 -1.0066913
0   -1.1109654  -1.1101782  -1.1109736 -1.1097649
32  -0.8393290  -0.5748472  -0.8657922 -0.6671089
2   -1.1129358  -1.1059532  -1.1101509 -1.1022497
10  -0.2995810  -0.2972000   0.0000000 -0.6556262
33   0.0000000  -0.6556262   0.0000000  0.0000000
11  -0.6556262  -1.0067198  -1.0423613 -0.7520509
34  -0.5449394  -0.6900636   0.0000000  0.0000000
4   -1.1084350  -1.0618690  -1.1059733 -1.0732534
12  -1.1073482  -1.0750293  -1.1110217 -1.1108705
35  -0.6556262   0.0000000  -0.9214708 -0.3520000
5    0.0000000  -0.6556262   0.0000000 -0.6556262
6   -1.0792958  -0.9430099  -0.9793951 -0.6938900
14  -1.1012670  -1.0530209  -1.1042439 -0.7551120
36  -1.1107382  -1.1107402