<a href="https://colab.research.google.com/github/Sahanaka/fyp-main-blocks/blob/main/fyp_reinforecement_learning_model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## **Imports**

In [None]:
import json
import os
import random
import urllib.error
import urllib.request

import matplotlib.pyplot as plt
import numpy as np
import tensorflow as tf

## **Load Classifier Model**

In [None]:
# Load the saved Keras model from a path relative to the working directory.
# next_state_predictor_local() calls classifier.predict() on feature vectors.
classifier = tf.keras.models.load_model('level_classifier.h5')

## **Utilities**

In [None]:
# Action space
# Placeholder with three actions; each action is a 3-element list whose
# entries sum to 1.0. What the three components stand for is not defined
# in this notebook.
dummy_action_space = [[0.5, 0.3, 0.2], [0.4, 0.3, 0.3], [0.2, 0.4, 0.4]]
dummy_action_space

[[0.5, 0.3, 0.2], [0.4, 0.3, 0.3], [0.2, 0.4, 0.4]]

In [None]:
# Dummy Q table - one row per state, one column per action.
# NOTE(review): NUM_ACTIONS is 3600 while dummy_action_space holds 3 actions,
# so only the first three columns can ever be selected - confirm intent.
NUM_STATES = 4
NUM_ACTIONS = 3600
# Allocate directly as float, every entry starting at -100000.
Q_table = np.full((NUM_STATES, NUM_ACTIONS), -100000, dtype=float)
Q_table

array([[-100000., -100000., -100000., ..., -100000., -100000., -100000.],
       [-100000., -100000., -100000., ..., -100000., -100000., -100000.],
       [-100000., -100000., -100000., ..., -100000., -100000., -100000.],
       [-100000., -100000., -100000., ..., -100000., -100000., -100000.]])

In [None]:
# Row 0 of the table, i.e. the action values for the first entry of the state space.
Q_table[0]

array([-100000., -100000., -100000., ..., -100000., -100000., -100000.])

In [None]:
# Hard-coded sample feature vectors, 40 values each, named l6..l9.
# Presumably MFCC features for audio of the matching level - TODO confirm provenance.
mfcc_l9 = [-2.2616078e+02,  1.1042162e+02,  1.7644606e+01,  3.9272282e+01,
        2.5767040e-01,  1.0436158e+01, -1.7775553e+01,  7.2166715e+00,
       -5.6735168e+00,  4.3787255e+00, -1.0765321e+01,  6.8080320e+00,
       -1.2632023e+01,  1.0582538e+00, -8.2771330e+00, -5.5894885e+00,
       -1.1718090e+01,  8.8863030e-02, -5.1185455e+00, -9.2021751e-01,
       -6.1948314e+00,  1.4003986e-01, -3.5775151e+00, -2.6900642e+00,
        4.3989399e-01, -3.2532177e+00, -2.4641218e+00, -3.4447451e+00,
       -2.6360729e-01, -3.2318563e+00, -3.3647339e+00, -2.9836087e+00,
       -2.4917738e+00, -2.9659946e+00, -2.5206430e+00, -1.9369361e+00,
       -5.8292335e-01,  7.2926402e-01,  1.5391219e+00,  2.5633204e+00]

mfcc_l8 = [-5.2294684e+02,  8.4534813e+01,  2.4168377e+01,  3.4451473e+01,
        8.7442207e+00,  1.8378325e+01, -9.9937849e+00,  9.9003544e+00,
       -1.0024719e+01,  6.7565351e+00, -1.0130479e+01,  4.3018141e+00,
       -9.3382597e+00,  2.3299520e+00, -1.0686426e+01, -1.8922451e+00,
       -8.2489166e+00,  1.5566213e+00, -7.0209546e+00, -8.4980339e-02,
       -5.9372201e+00, -1.5920577e+00, -5.3634071e+00, -1.4472146e+00,
       -3.8888531e+00, -9.2686760e-01, -1.2819674e+00, -1.3986009e+00,
       -2.8101578e+00, -3.0583174e+00, -2.8488455e+00, -3.0050807e+00,
       -2.2769306e+00, -2.1759133e+00, -2.0109713e+00, -2.4061668e+00,
       -6.4653403e-01, -2.0239346e+00, -6.7868316e-01, -1.7535905e+00]

mfcc_l7 = [-3.2310739e+02,  1.0664188e+02,  1.4444141e+01,  2.3891953e+01,
        1.0235841e+01,  1.7881434e+00, -6.3062992e+00, -5.2657971e+00,
       -7.4907193e+00, -4.9236417e+00, -6.0691919e+00, -1.3964889e+00,
       -2.7519898e+00, -5.5401284e-01, -5.3687515e+00, -1.6156106e+00,
       -5.1873708e+00,  2.6306129e-01, -5.0289507e+00, -2.2933891e+00,
       -4.0139928e+00, -3.8437397e+00, -3.3339312e+00, -5.7967749e+00,
       -1.9290487e+00, -2.5632689e+00, -3.7700899e+00,  9.4822574e-01,
       -2.7207773e+00, -1.9501311e+00, -2.7910244e+00, -2.1752622e+00,
       -1.3678321e+00, -8.3080065e-01, -1.8515052e+00, -5.3641677e-01,
        2.5348690e+00,  2.9059832e+00,  4.1263595e+00,  4.6352372e+00]

mfcc_l6 = [-4.0882452e+02,  8.7575592e+01,  1.3600564e+01,  2.9316771e+01,
        8.1113062e+00,  9.1173639e+00, -1.4057755e+01,  9.0414810e+00,
       -1.1712031e+01,  4.6915336e+00, -6.9471889e+00,  6.3146310e+00,
       -8.4353857e+00,  1.2940543e+00, -2.7033405e+00,  7.8395933e-01,
       -6.2810631e+00, -3.9431450e+00, -1.5546079e+00, -6.1311955e+00,
       -6.9518204e+00, -2.2822838e+00, -2.4060762e+00, -3.2523236e+00,
       -2.1788366e+00, -1.8065616e+00, -1.3067272e+00, -1.5816931e-01,
       -1.6498529e+00, -2.0879676e+00, -8.7649304e-01, -4.6968803e+00,
       -3.7902091e+00, -4.7349529e+00, -2.8898137e+00, -2.5146618e+00,
       -2.2660463e+00, -2.5108831e+00, -2.1195893e+00, -6.5264070e-01]

In [None]:
# Convert the four sample vectors to NumPy arrays so .reshape() is available on them.
mfcc_l6, mfcc_l7, mfcc_l8, mfcc_l9 = map(np.array, (mfcc_l6, mfcc_l7, mfcc_l8, mfcc_l9))

In [None]:
# Pool of sample vectors; CustomEnv.step() picks one of these at random on every call.
mfccs = [mfcc_l6, mfcc_l7, mfcc_l8, mfcc_l9]

In [None]:
def next_state_predictor_local(mfcc_scaled_features):
    """Run the locally loaded classifier on a single feature vector.

    Args:
        mfcc_scaled_features: NumPy array holding one sample; it is flattened
            into a single row of shape (1, n) before prediction.

    Returns:
        Whatever ``classifier.predict`` returns for that one-row batch.
    """
    single_row_batch = mfcc_scaled_features.reshape(1, -1)
    return classifier.predict(single_row_batch)
   

In [None]:
# Smoke test: index of the highest-scoring class for the level-6 sample.
res = np.argmax(next_state_predictor_local(mfcc_l6))
res

4

In [None]:
def next_state_predictor(mfcc_scaled_features, url=None, api_key=None):
    """Score one feature vector against the hosted classifier endpoint.

    Args:
        mfcc_scaled_features: NumPy array holding one sample; it is sent as a
            single row of shape (1, n).
        url: Scoring URL. Defaults to the deployment this notebook was
            written against.
        api_key: Bearer token for the endpoint. When omitted it is read from
            the ``LEVEL_ENDPOINT_API_KEY`` environment variable. The key is
            deliberately not stored in the notebook; the one that used to be
            hard-coded here should be treated as leaked and rotated.

    Returns:
        A float array of shape (1, n_scores), the same layout the caller's
        ``np.argmax(result, axis=1)[0]`` expects.

    Raises:
        RuntimeError: no API key was supplied or found in the environment.
        urllib.error.HTTPError: the endpoint answered with an error status.
            Diagnostics are printed before the error is re-raised, so callers
            never mistake response headers for a prediction.
    """
    payload = {"input_data": mfcc_scaled_features.reshape(1, -1).tolist()}
    body = json.dumps(payload).encode("utf-8")

    if url is None:
        url = 'https://level.eastus.inference.ml.azure.com/score'
    if api_key is None:
        api_key = os.environ.get("LEVEL_ENDPOINT_API_KEY")
    if not api_key:
        raise RuntimeError(
            "No API key for the scoring endpoint: pass api_key=... or set "
            "the LEVEL_ENDPOINT_API_KEY environment variable."
        )

    # The azureml-model-deployment header will force the request to go to a specific deployment.
    # Remove this header to have the request observe the endpoint traffic rules
    headers = {'Content-Type': 'application/json',
               'Authorization': ('Bearer ' + api_key), 'azureml-model-deployment': 'default'}

    req = urllib.request.Request(url, body, headers)

    try:
        # Context manager guarantees the HTTP connection is released.
        with urllib.request.urlopen(req) as response:
            raw = response.read()
    except urllib.error.HTTPError as error:
        print("The request failed with status code: " + str(error.code))

        print(error.info())
        print(error.read().decode("utf8", 'ignore'))

        raise

    # response.read() yields bytes; decode the JSON before building an array.
    scores = json.loads(raw)
    # NOTE(review): some scoring scripts return a JSON-encoded string that
    # itself contains JSON - unwrap once if so. Confirm the endpoint's schema.
    if isinstance(scores, str):
        scores = json.loads(scores)

    # assumes the payload is a flat or single-row list of class scores - TODO confirm
    return np.asarray(scores, dtype=float).reshape(1, -1)


## **Define Environment**

In [None]:
  # Custom ENV

  class CustomEnv:
    """Toy environment whose transitions come from classifying random samples.

    States are the four level names in ``state_space``. Each ``step`` picks a
    random vector from the module-level ``mfccs`` pool, classifies it with the
    local model to obtain the next state, and rewards the move relative to the
    state the agent was in before the step. The ``action`` argument is
    accepted but does not influence the transition.
    """

    def __init__(self):
      self.action_space = dummy_action_space
      self.state_space = ['level6', 'level7', 'level8', 'level9']
      # Rewards for: moved to a lower index, stayed, moved to a higher index.
      self.reward_space = [-10, 4, 10]
      # Episode state; filled in by reset() and advanced by step().
      self.current_state = None
      self.current_state_index = None
      self.next_state = None
      self.result = None

    def step(self, action):
      """Advance one step and return ``(next_state, reward, done)``.

      ``done`` is a coin flip, so episode length is random.

      Raises:
        RuntimeError: if called before ``reset()``.
      """
      if getattr(self, 'current_state_index', None) is None:
        raise RuntimeError("CustomEnv.step() called before reset()")

      sample = mfccs[random.randint(0, len(mfccs)-1)]
      self.next_state = self.get_next_state(sample)
      # Reward is judged against the state we are leaving ...
      reward = self.get_reward()
      # ... and only then does the environment move on. Without this the
      # reward of every later step would still be measured against the
      # state chosen by reset().
      self.current_state = self.next_state
      self.current_state_index = self.state_space.index(self.next_state)

      random_bool = bool(random.getrandbits(1))
      print("Random bool", random_bool)
      return self.next_state, reward, random_bool

    def get_response_from_classifier(self, mfccs):
      """Return the arg-max class index from the remote scoring endpoint."""
      result = next_state_predictor(mfccs)
      result = np.argmax(result, axis=1)[0]
      return result

    def get_response_from_classifier_local(self, mfccs):
      """Return the local classifier's arg-max class, shifted to a state index.

      NOTE(review): the shift is ``len(self.state_space)`` (4). That appears
      to work only because class 4 corresponds to 'level6'; a predicted
      class below 4 yields a negative index that silently wraps around
      ``state_space`` - confirm the classifier's label mapping.
      """
      result = next_state_predictor_local(mfccs)
      result = np.argmax(result)
      return result - len(self.state_space)

    def get_next_state(self, mfccs):
      """Classify ``mfccs`` and store/return the matching state name."""
      self.result = self.get_response_from_classifier_local(mfccs)
      self.next_state = self.state_space[self.result]
      return self.next_state

    def get_reward(self):
      """Compare the last classified index with the current state index."""
      if self.result == self.current_state_index:
        return self.reward_space[1]
      if self.result < self.current_state_index:
        return self.reward_space[0]
      return self.reward_space[2]

    def reset(self):
      """Start an episode from a uniformly random state and return its name."""
      self.current_state = self.state_space[random.randint(0, len(self.state_space)-1)]
      self.current_state_index = self.state_space.index(self.current_state)
      return self.current_state

    def close(self):
      """Nothing to release."""
      pass

    
    

In [None]:
# Create the environment. Call env.reset() before env.step(): the reward
# computation reads the state index that reset() assigns.
env = CustomEnv()

In [None]:
# reset() picks a random entry of state_space, stores it, and returns it.
a = env.reset()
a

'level9'

In [None]:
# The state stored on the instance by the most recent reset().
env.current_state

'level9'

In [None]:
# Environment Testing
action = env.action_space[random.randint(0, len(env.action_space)-1)]
temp_states = env.step(action)
temp_states

Random bool True


('level8', -10, True)

## **Q - Learning**

In [None]:
# Exclusive upper bound of the training loop's range(1, NUM_ITERATIONS),
# which gives 10000 training episodes.
NUM_ITERATIONS = 10001

In [None]:
"""Training the agent"""

import random
from IPython.display import clear_output

# Hyperparameters
alpha = 0.01    # learning rate
gamma = 0.6     # discount factor
epsilon = 0.1   # probability of taking a random (exploratory) action

# Flip to True to trace every single update. Off by default because printing
# the whole Q-table on every step floods the output and slows training.
VERBOSE = False

# For plotting metrics: one entry per episode.
all_epochs = []
all_penalties = []

# Snapshot of the Q-table taken at the start of every episode.
# NOTE: each snapshot is a full copy, so memory grows linearly with episodes.
history = []

# Only this many columns of the Q-table correspond to real actions.
num_env_actions = len(env.action_space)

for i in range(1, NUM_ITERATIONS):
    history.append(Q_table.copy())
    state = env.reset()
    state_index = env.state_space.index(state)

    epochs, penalties = 0, 0
    done = False

    while not done:
        if random.uniform(0, 1) < epsilon:
            # Explore: uniformly random action.
            action_index = random.randint(0, num_env_actions - 1)
            action = env.action_space[action_index]
            if VERBOSE:
                print("Action explore", action)
        else:
            # Exploit learned values. The action itself must be derived from
            # the chosen index; otherwise a stale `action` left over from an
            # earlier step (or an earlier cell) would be sent to the env.
            action_index = int(np.argmax(Q_table[state_index, :num_env_actions]))
            action = env.action_space[action_index]
            if VERBOSE:
                print("Action exploit", action)

        next_state, reward, done = env.step(action)
        next_state_index = env.state_space.index(next_state)

        if VERBOSE:
            print("Next values", next_state, reward, done)

        old_value = Q_table[state_index, action_index]
        if VERBOSE:
            print("Old value", old_value)

        next_max = np.max(Q_table[next_state_index])

        # Standard Q-learning update.
        new_value = ((1 - alpha) * old_value) + (alpha * (reward + gamma * next_max))
        if VERBOSE:
            print("New value", new_value)

        Q_table[state_index, action_index] = new_value

        if VERBOSE:
            print("Q table", Q_table)

        if reward == -10:
            penalties += 1

        # Move on: the row index has to follow the state, or every update in
        # the episode would keep landing on the initial state's row.
        state = next_state
        state_index = next_state_index
        epochs += 1

    all_epochs.append(epochs)
    all_penalties.append(penalties)

    if i % 100 == 0:
        clear_output(wait=True)
        print(f"Episode: {i}")

print("Training finished.\n")

Episode: 8500
Current State level9
Current state index 3
Action exploit [0.5, 0.3, 0.2]
Random bool False
Next values level6 -10 False
Old value -4.764068859399567
New value -4.757057270279869
Q table [[ 9.89515009e+00 -2.87063933e+04 -2.62330932e+04 ... -1.00000000e+05
  -1.00000000e+05 -1.00000000e+05]
 [-2.37937101e+04 -2.52379236e+04  5.04731328e+00 ... -1.00000000e+05
  -1.00000000e+05 -1.00000000e+05]
 [-2.89636325e+04 -2.49891930e+04  3.47975794e-01 ... -1.00000000e+05
  -1.00000000e+05 -1.00000000e+05]
 [-4.75705727e+00 -2.17694107e+04 -2.22893456e+04 ... -1.00000000e+05
  -1.00000000e+05 -1.00000000e+05]]
Action exploit [0.5, 0.3, 0.2]
Random bool True
Next values level8 -10 True
Old value -4.757057270279869
New value -4.807398842814337
Q table [[ 9.89515009e+00 -2.87063933e+04 -2.62330932e+04 ... -1.00000000e+05
  -1.00000000e+05 -1.00000000e+05]
 [-2.37937101e+04 -2.52379236e+04  5.04731328e+00 ... -1.00000000e+05
  -1.00000000e+05 -1.00000000e+05]
 [-2.89636325e+04 -2.49891

In [None]:
# Inspect the table after training.
Q_table

array([[ 9.70421109e+00, -2.18817366e+04, -2.06086955e+04, ...,
        -1.00000000e+05, -1.00000000e+05, -1.00000000e+05],
       [-1.74231216e+04, -2.00279874e+04,  4.49465142e+00, ...,
        -1.00000000e+05, -1.00000000e+05, -1.00000000e+05],
       [-2.18594335e+04, -1.81161936e+04, -3.40077477e-01, ...,
        -1.00000000e+05, -1.00000000e+05, -1.00000000e+05],
       [-5.93203752e+00, -1.65970602e+04, -1.86015781e+04, ...,
        -1.00000000e+05, -1.00000000e+05, -1.00000000e+05]])

In [None]:
# Should still be (NUM_STATES, NUM_ACTIONS).
Q_table.shape

(4, 3600)

In [None]:
# Column index of the highest value in each state row, i.e. the greedy action per state.
np.argmax(Q_table, axis=1) 

array([0, 2, 2, 0])

In [None]:
# Stack the per-episode snapshots into one array of shape
# (episodes, NUM_STATES, NUM_ACTIONS). With 10000 snapshots of a 4 x 3600
# float table that is roughly 1.15 GB, on top of the list it is built from.
history = np.array(history)

In [None]:
# Show the stacked snapshots.
history

In [None]:
# (number of snapshots, number of states, number of actions)
history.shape

In [None]:
# Value for state 0 / action 0 in the last snapshot. Snapshots are taken at
# the start of an episode, so this predates the final episode's updates.
history[-1, 0, 0]

In [None]:
# Value of Q(s_0, a_0) in the last recorded snapshot.
true_Q_value = history[-1, 0, 0]

# Evolution of Q(s_0, a_0) across the recorded snapshots.
q_trace = history[:, 0, 0]
# Take the x-range from the data itself so the plot still works if
# NUM_ITERATIONS was changed after training.
num_points = len(q_trace)

fig, axes = plt.subplots(1, 2, figsize=(10, 4), sharey=True)
axes[0].set_ylabel("Q-Value$(s_0, a_0)$", fontsize=14)
axes[1].set_title("Q-Learning", fontsize=14)

# Both panels show the same trace. The loop variable is deliberately not
# called `history`, so the global array is not rebound while plotting.
for ax in axes:
    ax.plot(np.arange(num_points), q_trace, "b-", linewidth=2)
    ax.set_xlabel("Iterations", fontsize=14)
    # The y-range is fixed to [0, 20]; values outside it (including the
    # -100000 initial fill) are clipped from view.
    ax.axis([0, num_points, 0, 20])

In [None]:
# Full-range view of Q(s_0, a_0) over the recorded snapshots (no y-axis clipping).
plt.plot(history[:, 0, 0], color='blue', linewidth=2)
plt.xlabel("Iterations")
plt.title("Q Learning", fontsize=16)
plt.ylabel("Q-Value$(s_0, a_0)$", fontsize=14)

## **Testing**


In [None]:
def combinationSum(arr, sum):
    """Return every combination of values from ``arr`` that adds up to ``sum``.

    A value may be used any number of times. Duplicates in ``arr`` are
    ignored, and each combination is listed in non-decreasing order.

    Args:
        arr: Iterable of candidate numbers. Zero and negative candidates are
            skipped: they never reduce the remaining total, so including them
            would make the search recurse without bound.
        sum: Target total. (The parameter name shadows the built-in inside
            this function; it is kept for compatibility with existing callers.)

    Returns:
        List of combinations (each a list), in depth-first order over the
        sorted candidates. A target of 0 yields ``[[]]``.
    """
    ans = []
    candidates = sorted(value for value in set(arr) if value > 0)
    findNumbers(ans, candidates, [], sum, 0)
    return ans


def findNumbers(ans, arr, temp, sum, index):
    """Depth-first helper for ``combinationSum``.

    Args:
        ans: Output list; completed combinations are appended to it.
        arr: Candidate values. Must be strictly positive for the recursion
            to terminate.
        temp: Combination under construction (mutated, then restored).
        sum: Amount still to be covered.
        index: Position in ``arr`` from which candidates may be taken, so
            that combinations are not repeated in a different order.
    """
    if sum == 0:
        ans.append(list(temp))
        return

    for i in range(index, len(arr)):
        if sum - arr[i] >= 0:
            temp.append(arr[i])
            findNumbers(ans, arr, temp, sum - arr[i], i)
            # Undo exactly the element pushed above. list.remove() would
            # delete the first equal element instead, which reorders temp
            # whenever an equal value sits earlier in it.
            temp.pop()

arr = [1, 2, 3, 4, 5, 6, 7, 8]
# Not named `sum`: assigning to that name at notebook scope would replace the
# built-in sum() for every cell that runs afterwards.
target_sum = 10
# Only combinations with exactly this many terms are kept.
combination_length = 6

ans = combinationSum(arr, target_sum)

# If result is empty, then
if not ans:
    print("empty")

matching_ans = [answer for answer in ans if len(answer) == combination_length]

print(matching_ans)

[[1, 1, 1, 1, 1, 5], [1, 1, 1, 1, 2, 4], [1, 1, 1, 1, 3, 3], [1, 1, 1, 2, 2, 3], [1, 1, 2, 2, 2, 2]]


In [None]:
"""Evaluate agent's performance after Q-learning"""

total_epochs, total_penalties = 0, 0
episodes = 100

for _ in range(episodes):
    state = env.reset()
    epochs, penalties, reward = 0, 0, 0

    done = False

    while not done:
        # Look up the row for the state we are actually in and act greedily.
        # Previously this relied on `state_index` and `action` left over
        # from the training cell, so the learned policy was never exercised.
        state_index = env.state_space.index(state)
        action_index = int(np.argmax(Q_table[state_index, :len(env.action_space)]))
        action = env.action_space[action_index]
        state, reward, done = env.step(action)

        if reward == -10:
            penalties += 1

        epochs += 1

    total_penalties += penalties
    total_epochs += epochs

print(f"Results after {episodes} episodes:")
print(f"Average timesteps per episode: {total_epochs / episodes}")
print(f"Average penalties per episode: {total_penalties / episodes}")