In [3]:
from hiive.mdptoolbox.mdp import ValueIteration, PolicyIteration, QLearning
from hiive.mdptoolbox.example import forest
import gym
import numpy as np
from util import provide_scores, adjust_data_structure, show_decisions, tsting
import pandas as pd
import seaborn as sns
import sys
import os
from numpy.random import choice

In [14]:
def iteration_over_episode(probability_matrix, reward_matrix, policy, j, episodes, cumulative_reward, iterations_per_state=1000, gamma=0.9):
    iterative_reward = 0
    for tmp in range(iterations_per_state):
        r = 0
        discount = 1
        while True:
            # take step
            i = policy[j]
            # get next step using probability_matrix
            chance = probability_matrix[i][j]
            a = list(range(len(probability_matrix[i][j])))
            s_prime =  choice(a, 1, p=chance)[0]
            # get the score
            r_delta = reward_matrix[j][i] * discount
            discount *= gamma
            r += r_delta
            if s_prime == 0:
                break
        iterative_reward += r

    return iterative_reward

def iteration_over_state(probability_matrix, reward_matrix, policy, total_states, episodes, cumulative_reward, iterations_per_state, gamma):
    for j in range(total_states):
        iterative_reward = 0

        iterative_reward = iteration_over_episode(probability_matrix, reward_matrix, policy, j, episodes, cumulative_reward, iterations_per_state, gamma)
        
        cumulative_reward += iterative_reward

    return cumulative_reward


def testing(probability_matrix, reward_matrix, policy, iterations_per_state=1000, gamma=0.9):
    total_states = probability_matrix.shape[-1]
    episodes = total_states * iterations_per_state

    cumulative_reward = 0

    cumulative_reward = iteration_over_state(probability_matrix, reward_matrix, policy, total_states, episodes, cumulative_reward, iterations_per_state, gamma)
    
    return cumulative_reward / episodes

def value_iteration(probability_matrix, reward_matrix, epsilon, gamma=0.9):
    value_iteration_data_frame = pd.DataFrame(columns=["Epsilon", "Policy", "Iteration", "Time", "Reward", "Value Function"])
    for i in epsilon:
        value_iteration = ValueIteration(probability_matrix, reward_matrix, gamma=gamma, epsilon=i, max_iter=int(1e15))
        value_iteration.run()
        r = testing(probability_matrix, reward_matrix, value_iteration.policy)
        value_iteration_data_frame.loc[len(value_iteration_data_frame)] = [float(i), value_iteration.policy, value_iteration.iter, value_iteration.time, r, value_iteration.V]
    return value_iteration_data_frame

def Q_learning(probability_matrix, reward_matrix, gamma=0.9, learning_rate_decay=[0.99], learning_rate_cut_off=[0.001], epsilon=[1.0], epsilon_decay=[0.99], episodes=[1000000]):
    Q_learning_data_frame = pd.DataFrame(columns=["Iterations", "Alpha Decay", "Alpha Min", "Epsilon", "Epsilon Decay", "Reward", "Time", "Policy", "Value Function", "Training Rewards"])
    
    total = 0
    for i in episodes:
        for j in epsilon:
            for k in epsilon_decay:
                for learning_rate_d in learning_rate_decay:
                    for learning_rate_m in learning_rate_cut_off:
                        algo = QLearning(probability_matrix, reward_matrix, gamma, alpha_decay=learning_rate_d, alpha_min=learning_rate_m, epsilon=j, epsilon_decay=k, n_iter=i)
                        algo.run()
                        score = testing(probability_matrix, reward_matrix, algo.policy)
                        total += 1
                        print("{}: {}".format(total, score))
                        scores = [tmp['Reward'] for tmp in algo.run_stats]
                        
                        Q_learning_data_frame.loc[len(Q_learning_data_frame)] = [i, learning_rate_d, learning_rate_m, j, k, score, algo.time, algo.policy, algo.V, scores]

    return Q_learning_data_frame

def run_policy_iteration(probability_matrix, reward_matrix):
    print("Policy Iteration")

    policy_iteration = PolicyIteration(probability_matrix, reward_matrix, gamma=0.9, max_iter=1e6)
    policy_iteration.run()
    policy_iteration_policy = policy_iteration.policy
    policy_iteration_score = testing(probability_matrix, reward_matrix, policy_iteration_policy)
    print(policy_iteration.iter, policy_iteration.time, policy_iteration_score)
    
    display(policy_iteration_policy)


def run_forest_management(probability_matrix, reward_matrix):
    value_iteration_data_frame = value_iteration(probability_matrix, reward_matrix, epsilon=[1e-1, 1e-3, 1e-6, 1e-9, 1e-12, 1e-15])
    display(value_iteration_data_frame)
    
    run_policy_iteration(probability_matrix, reward_matrix)
    
    
    print("Q_learning")
    
    learning_rate_decay = [0.99, 0.999]
    learning_rate_cut_off =[0.001, 0.0001]
    i = [10.0, 1.0]
    k = [0.99, 0.999]
    episodes = [1000000, 10000000]
    Q_learning_data_frame = Q_learning(probability_matrix, reward_matrix, gamma=0.9, learning_rate_decay=learning_rate_decay, learning_rate_cut_off=learning_rate_cut_off, epsilon=i, epsilon_decay=k, episodes=episodes)
    
    
    
    testing(probability_matrix,reward_matrix, Q_learning_data_frame.Policy[18])
    
    display(Q_learning_data_frame)
    
    display(Q_learning_data_frame.groupby("Iterations").mean())
    
    display(Q_learning_data_frame.groupby("Epsilon Decay").mean())


if __name__ == "__main__":
    np.random.seed(44)
    
    print("20 States\n\n\n")
    
    probability_matrix, reward_matrix = forest(S=20, r1=10, r2=6, p=0.1)

    run_forest_management(probability_matrix, reward_matrix)
    
    print("500 States\n\n\n")
    
    probability_matrix, reward_matrix = forest(S=500, r1=100, r2= 15, p=0.01)
    
    run_forest_management(probability_matrix, reward_matrix)
    

20 States





Unnamed: 0,Epsilon,Policy,Iteration,Time,Reward,Value Function
0,0.1,"(0, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",33,0.005009,2.843259,"(4.328504830081768, 4.881518644971712, 4.88151..."
1,0.001,"(0, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",55,0.002171,2.842526,"(4.460720290173723, 5.013211594807497, 5.01321..."
2,1e-06,"(0, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",87,0.003533,2.920512,"(4.474643139169861, 5.027129333047953, 5.02712..."
3,1e-09,"(0, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",120,0.004727,2.884589,"(4.475122825121185, 5.027609012960728, 5.02760..."
4,1e-12,"(0, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",153,0.006271,2.885044,"(4.475137648839068, 5.027623836684378, 5.02762..."
5,1e-15,"(0, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",186,0.009037,2.913634,"(4.4751381069387985, 5.027624294784101, 5.0276..."


Policy Iteration
14 0.0066721439361572266 2.8669479098730797


(0, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0)

Q_learning
1: 3.2419625985195655
2: 3.45468712711018
3: 2.9725909845507967
4: 3.1199088702065234
5: 1.1
6: 3.4605589330321034
7: 3.1957922700013963
8: 0.8
9: 3.3193457298294207
10: 3.358040492544265
11: 3.052963918650888
12: 3.165223845408274
13: 3.4057947654678107
14: 3.4913868295783455
15: 3.200831840749126
16: 3.1403480871645533
17: 3.3697538256736776
18: 3.3341063241755533
19: 2.8207445098572754
20: 0.85
21: 0.95
22: 3.486291960372364
23: 3.271429634595206
24: 0.95
25: 3.1392181548026974
26: 3.4049870182429904
27: 3.1145079958705306
28: 2.9350139196004505
29: 3.455509245021752
30: 3.4166385707092233
31: 3.3223197902165595
32: 1.0


Unnamed: 0,Iterations,Alpha Decay,Alpha Min,Epsilon,Epsilon Decay,Reward,Time,Policy,Value Function,Training Rewards
0,1000000,0.99,0.001,10.0,0.99,3.241963,31.702035,"(0, 1, 1, 1, 1, 0, 0, 1, 0, 1, 1, 1, 0, 1, 1, ...","(4.469516616179936, 5.021068518048445, 5.02559...","[0.0, 0.0, 1.0, 0.0, 1.0, 0.0, 0.0, 0.0, 1.0, ..."
1,1000000,0.99,0.0001,10.0,0.99,3.454687,31.296608,"(0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, ...","(4.439242057739486, 4.991535438072977, 4.02348...","[0.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 1.0, 1.0, ..."
2,1000000,0.999,0.001,10.0,0.99,2.972591,32.328044,"(0, 1, 1, 1, 1, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, ...","(4.484346439269151, 5.034010306485361, 5.03050...","[1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, ..."
3,1000000,0.999,0.0001,10.0,0.99,3.119909,31.661542,"(0, 1, 1, 1, 0, 0, 1, 1, 0, 1, 0, 1, 1, 0, 1, ...","(4.472149848246118, 5.025254217665293, 4.80726...","[0.0, 0.0, 0.0, 1.0, 0.0, 1.0, 0.0, 0.0, 1.0, ..."
4,1000000,0.99,0.001,10.0,0.999,1.1,31.660677,"(0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, ...","(4.475160791588194, 5.030119252396098, 5.03305...","[6.0, 1.0, 0.0, 0.0, 1.0, 1.0, 1.0, 1.0, 0.0, ..."
5,1000000,0.99,0.0001,10.0,0.999,3.460559,31.85909,"(0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","(4.4320373705496445, 4.983831695810947, 4.0022...","[1.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 0.0, 1.0, ..."
6,1000000,0.999,0.001,10.0,0.999,3.195792,31.749441,"(0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 0, 0, 0, 1, ...","(4.4755272438345965, 5.026659913734769, 5.0244...","[10.0, 0.0, 1.0, 0.0, 1.0, 0.0, 0.0, 1.0, 1.0,..."
7,1000000,0.999,0.0001,10.0,0.999,0.8,31.852308,"(0, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 0, 0, 0, ...","(4.47387595179908, 5.02543011236457, 4.8995375...","[0.0, 0.0, 0.0, 0.0, 1.0, 10.0, 0.0, 0.0, 0.0,..."
8,1000000,0.99,0.001,1.0,0.99,3.319346,31.965991,"(0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, ...","(4.4715302899745675, 5.025254103920376, 5.0260...","[1.0, 0.0, 1.0, 0.0, 0.0, 0.0, 10.0, 0.0, 0.0,..."
9,1000000,0.99,0.0001,1.0,0.99,3.35804,31.957316,"(0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, ...","(4.440501282624428, 4.9917634204759755, 4.0250...","[0.0, 10.0, 0.0, 0.0, 10.0, 0.0, 0.0, 0.0, 1.0..."


Unnamed: 0_level_0,Alpha Decay,Alpha Min,Epsilon,Epsilon Decay,Reward,Time
Iterations,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1000000,0.9945,0.00055,5.5,0.9945,2.967465,31.784812
10000000,0.9945,0.00055,5.5,0.9945,2.676283,322.284788


Unnamed: 0_level_0,Alpha Decay,Alpha Min,Epsilon,Reward,Time
Epsilon Decay,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0.99,0.9945,0.00055,5.5,3.040816,177.656724
0.999,0.9945,0.00055,5.5,2.602931,176.412876


500 States





Unnamed: 0,Epsilon,Policy,Iteration,Time,Reward,Value Function
0,0.1,"(0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...",79,0.008641,2.725382,"(4.710556185449387, 5.239434944489701, 5.23943..."
1,0.001,"(0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...",119,0.010274,2.72602,"(4.7117745667154995, 5.240595870281114, 5.2405..."
2,1e-06,"(0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...",179,0.01592,2.750285,"(4.711792669916437, 5.240613400253226, 5.24061..."
3,1e-09,"(0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...",239,0.02341,2.755154,"(4.711792702216012, 5.240613431989174, 5.24061..."
4,1e-12,"(0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...",299,0.026353,2.735265,"(4.711792702273827, 5.240613432046434, 5.24061..."
5,1e-15,"(0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...",349,0.03134,2.743204,"(4.7117927022739305, 5.240613432046538, 5.2406..."


Policy Iteration
46 0.15190887451171875 2.7282127846897812


(0,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,


Q_learning
1: 2.6457202687964307
2: 2.649979984464296
3: 2.6392875993822655
4: 2.6315063673157337
5: 2.6334844975422964
6: 2.662812261587241
7: 2.579535594724274
8: 2.6408281271978216
9: 2.615333529405131
10: 0.854
11: 2.6253267906693583
12: 2.622869833551158
13: 2.64128539283331
14: 2.6388866548579846
15: 2.6272875183965785
16: 2.621154178848485
17: 2.753957755761328
18: 2.8307366870778288
19: 2.770383137892614
20: 2.8005122228172747
21: 2.7597817847014805
22: 2.8389751044306744
23: 2.7384627338584178
24: 2.8055208965447593
25: 2.750629574729021
26: 2.8216984597515604
27: 2.758731007363096
28: 2.7894710568318906
29: 2.7627961451992804
30: 2.831902914125298
31: 2.766254934955373
32: 2.781121279609314


Unnamed: 0,Iterations,Alpha Decay,Alpha Min,Epsilon,Epsilon Decay,Reward,Time,Policy,Value Function,Training Rewards
0,1000000,0.99,0.001,10.0,0.99,2.64572,48.420391,"(0, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 0, 0, 1, 1, ...","(4.711608976583683, 5.240736056301092, 5.24235...","[0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
1,1000000,0.99,0.0001,10.0,0.99,2.64998,48.929765,"(0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, ...","(4.672435445357042, 5.201171633515619, 4.37127...","[0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
2,1000000,0.999,0.001,10.0,0.99,2.639288,48.745058,"(0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, ...","(4.7127953406989, 5.240646359078045, 5.2397861...","[0.0, 1.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 1.0, ..."
3,1000000,0.999,0.0001,10.0,0.99,2.631506,48.816628,"(0, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 0, 1, 1, ...","(4.709399081256882, 5.2385485235715645, 5.0989...","[1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, ..."
4,1000000,0.99,0.001,10.0,0.999,2.633484,48.870897,"(0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, ...","(4.710522511015649, 5.239877278961145, 5.23978...","[1.0, 0.0, 0.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, ..."
5,1000000,0.99,0.0001,10.0,0.999,2.662812,50.872312,"(0, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, ...","(4.667861470968527, 5.196229858663855, 4.36511...","[1.0, 1.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
6,1000000,0.999,0.001,10.0,0.999,2.579536,49.520571,"(0, 1, 1, 1, 1, 0, 1, 0, 0, 1, 1, 0, 1, 1, 1, ...","(4.713127913126084, 5.24227256659081, 5.241421...","[1.0, 1.0, 1.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, ..."
7,1000000,0.999,0.0001,10.0,0.999,2.640828,48.303403,"(0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, ...","(4.708568498848688, 5.237654220967237, 5.13409...","[0.0, 1.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
8,1000000,0.99,0.001,1.0,0.99,2.615334,48.022415,"(0, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, ...","(4.709616563111147, 5.239579907160608, 5.24055...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, ..."
9,1000000,0.99,0.0001,1.0,0.99,0.854,47.887792,"(0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, ...","(4.671803722165624, 5.200420854442883, 4.37439...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."


Unnamed: 0_level_0,Alpha Decay,Alpha Min,Epsilon,Epsilon Decay,Reward,Time
Iterations,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1000000,0.9945,0.00055,5.5,0.9945,2.520581,48.74366
10000000,0.9945,0.00055,5.5,0.9945,2.785058,483.346224


Unnamed: 0_level_0,Alpha Decay,Alpha Min,Epsilon,Reward,Time
Epsilon Decay,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0.99,0.9945,0.00055,5.5,2.597509,260.076425
0.999,0.9945,0.00055,5.5,2.708131,272.013459
