# Other source

source code for reference:
https://medium.com/data-science-in-your-pocket/contextual-bandits-in-reinforcement-learning-explained-with-example-and-codes-3c707142437b

https://medium.com/data-science/an-overview-of-contextual-bandits-53ac3aa45034

In [None]:
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow.keras.layers import *
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam
import copy 
from keras.callbacks import EarlyStopping

tf.config.run_functions_eagerly(True)

In [None]:
#states = random states generated for training, 
#total_states = possible states count
def ohe_generator(states,total_states):
    ohe = np.zeros((len(states),total_states))
    for index, array in enumerate(ohe):
        ohe[index][states[index]] = 1
    return ohe

TypeError: can't multiply sequence by non-int of type 'float'

In [None]:
class contextual_bandits:
    def __init__(self,states,actions):
        self.states = states
        self.actions = actions
    
    def reward(self,state,action):
        if (state*action)%2==1:
            return 0.5 + 0.05*((state+action)%10)+np.random.rand()*0.1
        else:
            return 0.9 - 0.1*((state+action)%10)+np.random.rand()*0.1
    
    def network(self):
        input_ = Input(shape=(self.states))
        dense1 = Dense(128,activation='relu')(input_)
        dropout1 = Dropout(0.1)(dense1)
        dense2 = Dense(64,activation='relu')(dropout1)
        dropout2 = Dropout(0.1)(dense2)
        dense3 = Dense(self.actions,activation='sigmoid')(dropout2)
        model = Model(input_,dense3)
        
        rms = Adam(learning_rate=0.0001, beta_1=0.9, beta_2=0.999, epsilon=None, decay=0.0, amsgrad=False)
        model.compile(loss="mean_absolute_error", optimizer=rms,metrics="mean_absolute_error")
        return model

In [None]:
batch_size = 128
states = 100
actions = 4

def training():
    cb = contextual_bandits(states,actions)
    model = cb.network()
    sample_states = np.random.choice(range(states),size=batch_size*100)
    state_ohe = ohe_generator(sample_states, states)
    actual_reward = [[cb.reward(x,y) for y in range(cb.actions)] for x in sample_states]
    actual_reward_matrix = np.zeros((len(state_ohe),cb.actions))
    for index,x in enumerate(actual_reward):
                    actual_reward_matrix[index]=np.array(x)
    model.fit(state_ohe,actual_reward_matrix,batch_size=batch_size,epochs=20) 
    return model

In [None]:
state_ohe = ohe_generator(np.array([x for x in range(100)]), states)
estimated_reward = model.predict(state_ohe)

print({x:np.argmax(y) for x,y in enumerate(estimated_reward)})

In [None]:
cb = contextual_bandits(100,4)
print('\nreward for state {}\n'.format(0))
for x in range(4):
    print(cb.reward(0,x))
    
print('\nreward for state {}\n'.format(93))
for x in range(4):
    print(cb.reward(93,x))

# My own code

## Import Libraries

In [96]:
import numpy as np
import pandas as pd

from sklearn.preprocessing import OneHotEncoder

## Hyperparameters

In [133]:
# The number of action/bandit can be taken
# In recommender system's context, it's the number of product/product category that is being put on advertisement
NUM_OF_ACTION = 10

# The number of context affecting users to pick an action
# In recommender system's context, it can be age, budget, country, favorite_brand.
# NUM_OF_CONTEXT = 5
NUM_OF_CONTEXT = 5

# Alpha
ALPHA = 2   # Exploration constant

NUM_OF_TRIALS = 10000

## Create our LinUCB Model class

$$R_t = x_t^⊤ \theta_a + \epsilon$$

In [134]:
class LinUCB_Bandit:
    

    def __init__ (self):
        # True Weight
        self.true_weight = np.random.randn(NUM_OF_CONTEXT)   # Theta_a

        self.feature_covariance_matrix = np.identity(NUM_OF_CONTEXT) # Identity matrix with a dimension of the number of action
        self.weighted_reward_matrix = np.zeros((NUM_OF_CONTEXT,1))

        self.estimated_weight = np.zeros(NUM_OF_CONTEXT)

        self.number_pulled = 0

    def return_reward(self, context = np.random.rand(NUM_OF_CONTEXT)):

        # True weight vector for a particular arm (unknown in real applications)
        # theta_a = np.array([0.5, -0.2, 0.8, 0.3, -0.5])

        # Noise (Gaussian)
        noise = np.random.normal(0, 0.1)  

        # Compute reward
        reward = np.dot(context, self.true_weight) + noise

        self.number_pulled += 1

        return context, reward

    def update_ucb_matrix(self, context, reward):
        
        # Resize context array
        context_resized = np.array([context])

        # Count feature covarianve matrix (A_b)
        self.feature_covariance_matrix += np.matmul(context_resized.transpose(), context_resized)

        # Count weight sum reward (b_b)
        self.weighted_reward_matrix += reward * context_resized.transpose()

        # UCB_value = self.feature_covariance_matrix

        return self.feature_covariance_matrix, self.weighted_reward_matrix
    
    def calculate_ucb_value(self, context):

        # Resize context array
        context_resized = np.array([context])

        # Inverted feature covariance
        inverted_feature_cov = np.linalg.inv(self.feature_covariance_matrix)

        # Count estimated weight
        self.estimated_weight = np.matmul(inverted_feature_cov, self.weighted_reward_matrix)

        expected_reward = np.dot(context_resized, self.estimated_weight)

        confidence_bound = np.sqrt(np.matmul(np.matmul(context_resized, inverted_feature_cov), context_resized.transpose()))
        

        upper_confidence_bound = expected_reward + (ALPHA * confidence_bound)

        return context, context_resized, inverted_feature_cov, self.estimated_weight, expected_reward, confidence_bound, upper_confidence_bound 
        # return upper_confidence_bound
        # np.linalg.inv(linUCB.feature_covariance_matrix)


## Create Linear UCB Bandit Agent 

In [135]:
bandit_list = []

for i in range(NUM_OF_ACTION):

    bandit_list.append(LinUCB_Bandit())

In [None]:
# Check whether the bandit class is created correctly
bandit_list

[<__main__.LinUCB_Bandit at 0x26687db06e0>,
 <__main__.LinUCB_Bandit at 0x26687ad3b10>,
 <__main__.LinUCB_Bandit at 0x26687df0050>,
 <__main__.LinUCB_Bandit at 0x26687ce7a80>,
 <__main__.LinUCB_Bandit at 0x26687ce7950>,
 <__main__.LinUCB_Bandit at 0x26685546570>,
 <__main__.LinUCB_Bandit at 0x266854f7f00>,
 <__main__.LinUCB_Bandit at 0x266854f4d10>,
 <__main__.LinUCB_Bandit at 0x26687d5be50>,
 <__main__.LinUCB_Bandit at 0x26687d5b750>]

In [137]:
for bandit in bandit_list:
    # DEBUG
    print(bandit.true_weight)
    print(bandit.feature_covariance_matrix)
    print(bandit.weighted_reward_matrix)

[-0.71120486  1.07790325  0.28744349  0.50407259 -0.2253527 ]
[[1. 0. 0. 0. 0.]
 [0. 1. 0. 0. 0.]
 [0. 0. 1. 0. 0.]
 [0. 0. 0. 1. 0.]
 [0. 0. 0. 0. 1.]]
[[0.]
 [0.]
 [0.]
 [0.]
 [0.]]
[-0.16799645 -0.2498843  -1.34311952 -2.19963433 -1.00578204]
[[1. 0. 0. 0. 0.]
 [0. 1. 0. 0. 0.]
 [0. 0. 1. 0. 0.]
 [0. 0. 0. 1. 0.]
 [0. 0. 0. 0. 1.]]
[[0.]
 [0.]
 [0.]
 [0.]
 [0.]]
[ 0.53519907 -1.25476722  1.57150327 -1.51930776  0.66302652]
[[1. 0. 0. 0. 0.]
 [0. 1. 0. 0. 0.]
 [0. 0. 1. 0. 0.]
 [0. 0. 0. 1. 0.]
 [0. 0. 0. 0. 1.]]
[[0.]
 [0.]
 [0.]
 [0.]
 [0.]]
[0.78793146 1.1110924  0.30249053 0.37448901 0.57843198]
[[1. 0. 0. 0. 0.]
 [0. 1. 0. 0. 0.]
 [0. 0. 1. 0. 0.]
 [0. 0. 0. 1. 0.]
 [0. 0. 0. 0. 1.]]
[[0.]
 [0.]
 [0.]
 [0.]
 [0.]]
[ 0.43930206 -0.65059402 -0.06399878 -1.30290346 -0.65720455]
[[1. 0. 0. 0. 0.]
 [0. 1. 0. 0. 0.]
 [0. 0. 1. 0. 0.]
 [0. 0. 0. 1. 0.]
 [0. 0. 0. 0. 1.]]
[[0.]
 [0.]
 [0.]
 [0.]
 [0.]]
[-0.10117836  0.56018956  0.39417326 -0.47450689  1.40911259]
[[1. 0. 0. 0. 0.]
 [0. 

## Pick the next agent to pull by calculating UCB Value

To calculate UCB Value, we have to specify a current context the user have

In [None]:
# Repeat with number of trials
for i in range(NUM_OF_TRIALS):

    # Step 1: Preparation

    # Generate a random context
    context = np.random.randn(NUM_OF_CONTEXT)
    print(f'Generated context: {context}')

    # For Hardcode (adjust with the number of context)
    # context = [0.5, 1.2]

    # Initialize empty bandit UCB value
    bandit_ucb_value = []


    # Step 2: Calculate UCB Value of each arm
    for bandit in bandit_list:
        
        context, context_resized, inverted_feature_cov, estimated_weight, expected_reward, confidence_bound, upper_confidence_bound  = bandit.calculate_ucb_value(context=context)

        # print(context)
        # print(context_resized)
        # print(inverted_feature_cov)
        # print(estimated_weight)
        # print(expected_reward)
        # print(confidence_bound)
        # print(upper_confidence_abound)

        bandit_ucb_value.append(upper_confidence_bound)

    # Step 3: Take the highest UCB value
    bandit_best_ucb = np.argmax(bandit_ucb_value)
    print(f'Bandit index to pull: {bandit_best_ucb}')

    # Step 4: Pull the arm
    context, reward = bandit_list[bandit_best_ucb].return_reward(context)
    print(f'Earned Reward: {reward}')
    print()

    # Step 5: Update the UCB Matrix of the pulled arm
    bandit_list[bandit_best_ucb].update_ucb_matrix(context, reward)

0
Generated context: [ 1.07918198 -1.04201821  0.25895396 -0.12932881 -1.95239677]
Earned: -1.4896196803644943
0
Generated context: [ 1.14733497  0.77197853 -0.52953343 -0.11474168  0.96721342]
Earned: -0.25684332518874875
1
Generated context: [ 1.41520371 -1.03008366  0.31329663 -0.24627983 -0.77889834]
Earned: 1.0055942674354785
2
Generated context: [-1.33732127  0.53498083  1.07986116 -0.56460952  0.69741679]
Earned: 1.499561095213744
2
Generated context: [-1.09205095  0.26097682 -0.28157475 -0.65780957 -0.30009551]
Earned: -0.5926653200160635
3
Generated context: [ 0.23898906 -1.40962545 -0.33874673  0.02334137 -0.04297938]
Earned: -1.476891778388723
3
Generated context: [-0.07899262  0.25166861 -0.29045273 -1.76785402 -0.65202432]
Earned: -0.9102020513780157
1
Generated context: [ 1.09802362 -1.02049522 -0.20691735  0.29541758  1.20412934]
Earned: -1.563120586262333
4
Generated context: [ 0.48414149 -1.15240815 -0.42954767 -0.95873649 -0.74027901]
Earned: 2.849996815341661
2
Gener

## Check result

Focus on the True weight and estimated weight

In [None]:
# Check the result after multiple iterations
print(f'context: {context}')
print(f'resized: {context_resized}')

for i, bandit in enumerate(bandit_list):
    context, context_resized, inverted_feature_cov, estimated_weight, expected_reward, confidence_bound, upper_confidence_bound  = bandit.calculate_ucb_value(context=context)
    print(f'============ Bandit-{i} ============== number of pull: {bandit.number_pulled}')
    print(f'Inverted Feature Cov (Ab): {inverted_feature_cov}')
    print(f'True weight: {bandit.true_weight}')
    print(f'Estimated weight: {estimated_weight}')
    print(f'Expected Reward: {expected_reward}')
    print(f'Confidence Bound: {confidence_bound}')
    print(f'UCB Value: {upper_confidence_bound}')
    print()
    # print(f'feature Cov Matrix: {bandit.feature_covariance_matrix}')
    # print(f'weighted_reward: {bandit.weighted_reward_matrix}')
    # print(f'estimated weight: {bandit.estimated_weight}')
    # print()

context: [ 0.08649361 -1.32328985  0.3124731  -1.55757708  0.80730519]
resized: [[ 0.08649361 -1.32328985  0.3124731  -1.55757708  0.80730519]]
Inverted Feature Cov (Ab): [[ 2.47791039e-03  1.08433608e-03  8.35111431e-04  3.62199105e-04
   2.12002437e-04]
 [ 1.08433608e-03  1.83288594e-03 -8.64375893e-05 -5.26158622e-04
   1.10383096e-03]
 [ 8.35111431e-04 -8.64375893e-05  2.54456049e-03 -3.32878328e-04
   5.77444352e-04]
 [ 3.62199105e-04 -5.26158622e-04 -3.32878328e-04  2.75899009e-03
  -1.09352622e-03]
 [ 2.12002437e-04  1.10383096e-03  5.77444352e-04 -1.09352622e-03
   5.00029607e-03]]
True weight: [-0.71120486  1.07790325  0.28744349  0.50407259 -0.2253527 ]
Estimated weight: [[-0.71608811]
 [ 1.08125796]
 [ 0.27855729]
 [ 0.4961495 ]
 [-0.21605503]]
Expected Reward: [[-2.35292651]]
Confidence Bound: [[0.1098515]]
UCB Value: [[-2.13322352]]

Inverted Feature Cov (Ab): [[ 7.20842716e-04 -1.12710749e-05  7.37594320e-05 -5.87457409e-05
  -6.49763871e-05]
 [-1.12710749e-05  6.69926021

Notes from ChatGPT:
1. Check whether the self.feature_covariance_matrix update is correct

Instead of using np.matmul like this
`self.feature_covariance_matrix += np.matmul(context_resized.transpose(), context_resized)`

Use this
`self.feature_covariance_matrix += np.outer(context, context)`

2. Fix UCB Calculation
from:
`confidence_bound = np.sqrt(np.matmul(np.matmul(context_resized, inverted_feature_cov), context_resized.transpose()))`

to:
`confidence_bound = np.sqrt(np.dot(context, np.dot(inverted_feature_cov, context)))`


The suggestion from ChatGPT might be wrong, since the code I created is able to estimate the context true weight quite close.

# Hidden code (useless)

In [88]:
# Pull the arm
context, reward = bandit_list[bandit_best_ucb].return_reward(context)
print(f'Generated context: {context}')
print(f'Earned: {reward}')

# Update the UCB Matrix of the pulled arm
bandit_list[bandit_best_ucb].update_ucb_matrix(context, reward)

Generated context: [0.5, 1.2]
Earned: -0.31201530096925756


(array([[1.25, 0.6 ],
        [0.6 , 2.44]]),
 array([[-0.15600765],
        [-0.37441836]]))

In [89]:
print(bandit_list[0].feature_covariance_matrix)
print(bandit_list[0].weighted_reward_matrix)


[[1.25 0.6 ]
 [0.6  2.44]]
[[-0.15600765]
 [-0.37441836]]


In [95]:
# DEBUGGG
print(f'context: {context}')
print(f'resized: {context_resized}')

for i, bandit in enumerate(bandit_list):
    context, context_resized, inverted_feature_cov, estimated_weight, expected_reward, confidence_bound, upper_confidence_bound  = bandit.calculate_ucb_value(context=context)
    print(f'============ Bandit-{i} ============== number of pull: {bandit.number_pulled}')
    print(f'Inverted Feature Cov (Ab): {inverted_feature_cov}')
    print(f'True weight: {bandit.true_weight}')
    print(f'Estimated weight: {estimated_weight}')
    print(f'Expected Reward: {expected_reward}')
    print(f'Confidence Bound: {confidence_bound}')
    print(f'UCB Value: {upper_confidence_bound}')
    print()
    # print(f'feature Cov Matrix: {bandit.feature_covariance_matrix}')
    # print(f'weighted_reward: {bandit.weighted_reward_matrix}')
    # print(f'estimated weight: {bandit.estimated_weight}')
    # print()

context: [1.59441619 0.43739919]
resized: [[1.59441619 0.43739919]]
Inverted Feature Cov (Ab): [[2.18040897e-04 4.76941667e-05]
 [4.76941667e-05 5.51056539e-04]]
True weight: [-1.42916069  0.52952878]
Estimated weight: [[-1.42837013]
 [ 0.52861006]]
Expected Reward: [[-2.04620285]]
Confidence Bound: [[0.02694895]]
UCB Value: [[-1.99230495]]

Inverted Feature Cov (Ab): [[ 0.27158334 -0.09528139]
 [-0.09528139  0.14801412]]
True weight: [ 0.3577058  -0.04912457]
Estimated weight: [[0.22776101]
 [0.06170951]]
Expected Reward: [[0.39013754]]
Confidence Bound: [[0.76539474]]
UCB Value: [[1.92092702]]

Inverted Feature Cov (Ab): [[ 0.37793991 -0.11087077]
 [-0.11087077  0.18691238]]
True weight: [0.99212952 0.03608986]
Estimated weight: [[0.5557568 ]
 [0.10314583]]
Expected Reward: [[0.93122354]]
Confidence Bound: [[0.91755263]]
UCB Value: [[2.76632879]]

Inverted Feature Cov (Ab): [[0.00191356 0.00878636]
 [0.00878636 0.0951716 ]]
True weight: [ 1.76727543 -0.35764693]
Estimated weight: [[ 

In [26]:
# DEBUG
linUCB.update_ucb_matrix(context=[0.5,1.2], reward=[1])

AttributeError: 'list' object has no attribute 'update_ucb_matrix'

In [155]:
# Pull an arm
context,reward = linUCB.return_reward()
print(f'Context: {context}')
print(f'Reward: {reward}')

Context: [0.93135123 0.51710428]
Reward: -0.17244695800782972


In [9]:
# Update feature covariance and weight reward matrix
linUCB.update_ucb_matrix(context=context, reward=reward)

NameError: name 'reward' is not defined

In [10]:
# DEBUG
context, context_resized, inverted_feature_cov, estimated_weight, expected_reward, confidence_bound, upper_confidence_bound  = linUCB.calculate_ucb_value(context=[1.0, 1.2])
print(context)
print(context_resized)
print(inverted_feature_cov)
print(estimated_weight)
print(expected_reward)
print(confidence_bound)
print(upper_confidence_bound)


[1.0, 1.2]
[[1.  1.2]]
[[ 0.9070632  -0.22304833]
 [-0.22304833  0.46468401]]
[[0.18587361]
 [0.44609665]]
[[0.72118959]]
[[1.02024124]]
[[2.76167208]]


In [11]:
# DEBUG
print(np.array([context]))
print(np.array([context]).transpose())

np.matmul(np.array([context]).transpose(), np.array([context]))
# print(np.array(np.newaxis, context).transpose())

[[1.  1.2]]
[[1. ]
 [1.2]]


array([[1.  , 1.2 ],
       [1.2 , 1.44]])

In [None]:
linUCB.feature_covariance_matrix = 

## Update Matrices

In [73]:
# For MAB it's
action_list = [] # Filled with the estimated reward

In [74]:
# For Contextual Bandit we're gonna have
# NxM matrix
# N = Number of context or the feature
# M = Number of actions can be taken (bandit)
action = np.array([[], []], np.int32)

In [89]:
sample_data = {'name': ['product A', 'product B', 'product C', 'product D'],
                'gender': pd.Series(['Male', 'Female', 'Male'], index=[0, 2, 3]),
                'location': pd.Series(['South', 'North', 'West', 'East'], index=[0, 1, 2, 3]),
               }

data = pd.DataFrame(data=sample_data, index=[0, 1, 2, 3])

# From
#         name	 gender	 location
# 0	 product A     Male	    South
# 1  product B	    NaN	    North
# 2	 product C	 Female	     West
# 3	 product D	   Male	     East

# To
#    Gender_Male  Gender_Female  Location_South  Location_North  Location_West  Location_East
# 0            1              0               1               0              0              0
# 1            0              0               0               1              0              0
# 2            0              1               0               0              1              0
# 3            1              0               0               0              0              1

In [90]:
data

Unnamed: 0,name,gender,location
0,product A,Male,South
1,product B,,North
2,product C,Female,West
3,product D,Male,East


In [91]:
# One hot encode the data

encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')

encoded_data = encoder.fit_transform(data[['gender', 'location']])
encoded_columns = encoder.get_feature_names_out(['gender', 'location'])

one_hot_data = pd.DataFrame(encoded_data, columns=encoded_columns)

In [83]:
data

Unnamed: 0,gender_Female,gender_Male,gender_nan,location_East,location_North,location_South,location_West
0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
1,0.0,0.0,1.0,0.0,1.0,0.0,0.0
2,1.0,0.0,0.0,0.0,0.0,0.0,1.0
3,0.0,1.0,0.0,1.0,0.0,0.0,0.0


In [None]:
class contextual_bandits:
    def __init__(self, states, actions):

array([[1., 0., 0., 0., 0., 1., 0., 0., 0., 1., 0.],
       [0., 1., 0., 0., 0., 0., 1., 0., 1., 0., 0.],
       [0., 0., 1., 0., 1., 0., 0., 0., 0., 0., 1.],
       [0., 0., 0., 1., 0., 1., 0., 1., 0., 0., 0.]])

In [None]:
data = [['tom', 10], ['nick', 15], ['juli', 14]]

df = pd.DataFrame(data, columns=[])

TypeError: object of type 'int' has no len()

In [29]:
print(data.shape)
print(data.columns)
print(data["gender"].unique())

# Loop through the number of entry in the data
for i in range(data.shape[0]):
    print(f"{i}2")

def one_hot_encode():
    return

(4, 3)
Index(['name', 'gender', 'location'], dtype='object')
['Male' nan 'Female']
02
12
22
32


In [130]:
A_cov = np.array([[0.0, 0.0],[0.0, 0.0]])
B_cov = np.array([[0.0, 0.0], [0.0, 0.0]])

In [131]:
# context = [0.5, 1.2]
# context = np.array([[0.5, 1.2]])
context = np.random.randn(2)

A_cov += np.matmul(context.transpose(), context)
B_cov += np.outer(context, context)


In [132]:

print(A_cov)
print(B_cov)

[[0.69287448 0.69287448]
 [0.69287448 0.69287448]]
[[0.06309534 0.19933923]
 [0.19933923 0.62977914]]
