# Contextual Bandit with Continuous Context and Reward

## Import Libraries

In [25]:
import numpy as np
import pandas as pd

# from sklearn.preprocessing import OneHotEncoder

## Hyperparameters

In creating a contextual bandit agent, we have some parameters that we need to degine, which are:
* `NUM_OF_ACTION` = number of bandit can be chosen
* `NUM_OF_CONTEXT` = number of feature affecting the reward
* `ALPHA` = coefficient that affects the bandit's tendency to explore (works by multiplying the confidence bound and increase its proportion compared to the expected reward)
* `NUM_OF_TRIALS` = how many times we take action for simulation to calculate the estimate of each bandit's return

In [26]:
NUM_OF_ACTION = 10
NUM_OF_CONTEXT = 5

ALPHA = 2

NUM_OF_TRIALS = 10000

In [None]:
# Logging for regret calculation

# Best expected reward for each round
best_expected_reward = []

# Actual reward
result_history = []

# Random Exploration
random_reward = []

## Create our LinUCB Model class

$$R_t = x_t^⊤ \theta_a + \epsilon$$

In [None]:
class LinUCB_Bandit:
    
    def __init__ (self):
        # True Weight (hidden from the user)
        self.true_weight = np.random.randn(NUM_OF_CONTEXT)   # Theta_a


        self.feature_covariance_matrix = np.identity(NUM_OF_CONTEXT) # Identity matrix with a dimension of the number of action
        self.weighted_reward_matrix = np.zeros((NUM_OF_CONTEXT,1))

        # Estimation of the true weight, should get closer to the true weight by the end of the trials
        self.estimated_weight = np.zeros(NUM_OF_CONTEXT)

        self.number_pulled = 0

    def return_reward(self, context = np.random.rand(NUM_OF_CONTEXT)):

        # True weight vector for a particular arm (unknown in real applications)
        # theta_a = np.array([0.5, -0.2, 0.8, 0.3, -0.5])

        # Noise (Gaussian)
        noise = np.random.normal(0, 0.1)  

        # Compute reward
        reward = np.dot(context, self.true_weight) + noise

        self.number_pulled += 1

        # REGRET
        result_history.append(reward)

        return context, reward

    def update_ucb_matrix(self, context, reward):
        
        # Resize context array
        context_resized = np.array([context])

        # Count feature covarianve matrix (A_b)
        self.feature_covariance_matrix += np.matmul(context_resized.transpose(), context_resized)

        # Count weight sum reward (b_b)
        self.weighted_reward_matrix += reward * context_resized.transpose()

        inverted_feature_cov = np.linalg.inv(self.feature_covariance_matrix)
        self.estimated_weight = np.matmul(inverted_feature_cov, self.weighted_reward_matrix)

        # return self.feature_covariance_matrix, self.weighted_reward_matrix
    
    def calculate_ucb_value(self, context):

        # Resize context array
        context_resized = np.array([context])

        # Inverted feature covariance
        # inverted_feature_cov = c

        # Count estimated weight
        # self.estimated_weight = np.matmul(inverted_feature_cov, self.weighted_reward_matrix)

        # Expected reward of the bandit with respect to the current context
        expected_reward = np.dot(context_resized, self.estimated_weight)

        # Confidence bound
        confidence_bound = np.sqrt(np.matmul(np.matmul(context_resized, np.linalg.inv(self.feature_covariance_matrix)), context_resized.transpose()))
        
        # Getting Upper confidence bound by adding the confidence bound times by alpha (multiplier)
        upper_confidence_bound = expected_reward + (ALPHA * confidence_bound)

        return context, context_resized, np.linalg.inv(self.feature_covariance_matrix), self.estimated_weight, expected_reward, confidence_bound, upper_confidence_bound 
        # return upper_confidence_bound
        # np.linalg.inv(linUCB.feature_covariance_matrix)

    def logging(context, bandit_choice, noise):
        # Noise (Gaussian)
        noise = np.random.normal(0, 0.1)  

        # Compute reward
        reward = np.dot(context, self.true_weight) + noise

        self.number_pulled += 1

        # REGRET
        result_history.append(reward)

        return context, reward


## Create Linear UCB Bandit Agent 

Create an empty list, and then loop to the number of action and append it to the list. The number of action could represent the number of product in a recommendation.

In [41]:
# Create a list to store our bandit
bandit_list = []

# Append n amount of bandits
for i in range(NUM_OF_ACTION):

    bandit_list.append(LinUCB_Bandit())

# Check whether the bandit class is created correctly
if len(bandit_list) == NUM_OF_ACTION:
    print('Bandit successfully created!')

Bandit successfully created!


## Debug (Check details of the created bandit, this shouldn't be run in real scenario)

In [42]:
for bandit in bandit_list:
    # DEBUG
    print(bandit.true_weight)
    print(bandit.feature_covariance_matrix)
    print(bandit.weighted_reward_matrix)

[-0.36484893 -0.35093028 -1.33302671  0.74494551  0.45361073]
[[1. 0. 0. 0. 0.]
 [0. 1. 0. 0. 0.]
 [0. 0. 1. 0. 0.]
 [0. 0. 0. 1. 0.]
 [0. 0. 0. 0. 1.]]
[[0.]
 [0.]
 [0.]
 [0.]
 [0.]]
[-0.18925154 -1.3298439  -0.79831899  1.35906858 -0.4382645 ]
[[1. 0. 0. 0. 0.]
 [0. 1. 0. 0. 0.]
 [0. 0. 1. 0. 0.]
 [0. 0. 0. 1. 0.]
 [0. 0. 0. 0. 1.]]
[[0.]
 [0.]
 [0.]
 [0.]
 [0.]]
[-0.03380492  0.07136319 -1.85953795  0.88013191 -1.07628729]
[[1. 0. 0. 0. 0.]
 [0. 1. 0. 0. 0.]
 [0. 0. 1. 0. 0.]
 [0. 0. 0. 1. 0.]
 [0. 0. 0. 0. 1.]]
[[0.]
 [0.]
 [0.]
 [0.]
 [0.]]
[ 1.63402143  0.08696553  0.13650421 -0.5799358  -0.05185159]
[[1. 0. 0. 0. 0.]
 [0. 1. 0. 0. 0.]
 [0. 0. 1. 0. 0.]
 [0. 0. 0. 1. 0.]
 [0. 0. 0. 0. 1.]]
[[0.]
 [0.]
 [0.]
 [0.]
 [0.]]
[-1.05727349  0.95323234 -0.85610838 -1.69826187  0.07903731]
[[1. 0. 0. 0. 0.]
 [0. 1. 0. 0. 0.]
 [0. 0. 1. 0. 0.]
 [0. 0. 0. 1. 0.]
 [0. 0. 0. 0. 1.]]
[[0.]
 [0.]
 [0.]
 [0.]
 [0.]]
[-0.78067437 -1.64404716  1.11199186 -0.93517228 -0.51306536]
[[1. 0. 0. 0. 0.]


## Example of Contextual Bandit iteration for estimation

### Step 1: Preparation

Let's generate a context randomly to specify current context. In real scenario, each index from the context array supposed to represent a feature from a recommendation (age, gender, income, etc.)

In [43]:
context = np.random.randn(NUM_OF_CONTEXT)

# For Hardcode (adjust with the number of context)
# context = [0.5, 1.2]

print(f'Generated context: {context}')

Generated context: [ 0.52750417 -1.44516739 -0.12291913 -0.81016292 -0.28087726]


### Step 2: Calculate UCB Value for each action

Based on the context, we are going to count the UCB_Value for each bandit. UCB value will balance between exploration and exploitation.

We start off by creating an empty array to store all calculated UCB. Then, we loop through all the bandits for the calculation.

In [44]:
bandit_ucb_value = []

for bandit in bandit_list:
        
    context, context_resized, inverted_feature_cov, estimated_weight, expected_reward, confidence_bound, upper_confidence_bound  = bandit.calculate_ucb_value(context=context)

    # print(context)
    # print(context_resized)
    # print(inverted_feature_cov)
    # print(estimated_weight)
    # print(expected_reward)
    # print(confidence_bound)
    # print(upper_confidence_abound)

    bandit_ucb_value.append(upper_confidence_bound)

### Step 3: Pick the best UCB and Pull the arm

Let's see the UCB value and extract the index of the best UCB.

In [45]:
print(bandit_ucb_value)

bandit_best_ucb = np.argmax(bandit_ucb_value)
print(f'Index of bandit with the highest UCB is {bandit_best_ucb}')

[array([[3.53108172]]), array([[3.53108172]]), array([[3.53108172]]), array([[3.53108172]]), array([[3.53108172]]), array([[3.53108172]]), array([[3.53108172]]), array([[3.53108172]]), array([[3.53108172]]), array([[3.53108172]])]
Index of bandit with the highest UCB is 0


In the first iteration, we don't have any knowledge about the environment, so we will have the same UCB value for each bandit. Meaning if we take the highest UCB, we'll immediately get the first index which is '0'

In [46]:
context, reward = bandit_list[bandit_best_ucb].return_reward(context)
print(f'Earned Reward: {reward}')
# print()

Earned Reward: -0.3220181046061935


### Step 4: Update UCB to renew the knowledge and estimated weight

In [47]:
bandit_list[bandit_best_ucb].update_ucb_matrix(context, reward)

for band

In [52]:
for i, bandit in enumerate(bandit_list):
    print(f'Bandit {i} estimated weight:')
    print(bandit.estimated_weight)
    print()

Bandit 0 estimated weight:
[[-0.04125828]
 [ 0.11303251]
 [ 0.00961401]
 [ 0.06336619]
 [ 0.02196857]]

Bandit 1 estimated weight:
[0. 0. 0. 0. 0.]

Bandit 2 estimated weight:
[0. 0. 0. 0. 0.]

Bandit 3 estimated weight:
[0. 0. 0. 0. 0.]

Bandit 4 estimated weight:
[0. 0. 0. 0. 0.]

Bandit 5 estimated weight:
[0. 0. 0. 0. 0.]

Bandit 6 estimated weight:
[0. 0. 0. 0. 0.]

Bandit 7 estimated weight:
[0. 0. 0. 0. 0.]

Bandit 8 estimated weight:
[0. 0. 0. 0. 0.]

Bandit 9 estimated weight:
[0. 0. 0. 0. 0.]



Notice that the bandit 0 estimated weight is now updated.

In [None]:
# Repeat with number of trials
for i in range(NUM_OF_TRIALS-1):    # -1 for our recent example trials

    # Step 1: Preparation
    context = np.random.randn(NUM_OF_CONTEXT)
    # print(f'Generated context: {context}')

    # Step 2: Calculate UCB Value of each arm
    bandit_ucb_value = []

    for bandit in bandit_list:
        
        context, context_resized, inverted_feature_cov, estimated_weight, expected_reward, confidence_bound, upper_confidence_bound  = bandit.calculate_ucb_value(context=context)

        bandit_ucb_value.append(upper_confidence_bound)

    # Step 3: Take the highest UCB value and pull the arm
    bandit_best_ucb = np.argmax(bandit_ucb_value)
    # print(f'Bandit index to pull: {bandit_best_ucb}')

    context, reward = bandit_list[bandit_best_ucb].return_reward(context)
    # print(f'Earned Reward: {reward}')
    # print()

    # Step 4: Update the UCB Matrix of the pulled arm
    bandit_list[bandit_best_ucb].update_ucb_matrix(context, reward)

    # Ignored Step
    logging(context, bandit_choice=np.random.randint(0,10), )

# Output
print(f'finish simulating for {NUM_OF_TRIALS-1} iterations!')

finish simulating for 9999 iterations!


## Check result

Focus on the True weight and estimated weight

In [56]:
# Check the result after multiple iterations
# print(f'context: {context}')
print(f'resized: {context_resized}')

for i, bandit in enumerate(bandit_list):
    context, context_resized, inverted_feature_cov, estimated_weight, expected_reward, confidence_bound, upper_confidence_bound  = bandit.calculate_ucb_value(context=context)
    print(f'============ Bandit-{i} ============== number of pull: {bandit.number_pulled}')
    # print(f'Inverted Feature Cov (Ab): {inverted_feature_cov}')
    print(f'True weight: {bandit.true_weight}')
    print(f'Estimated weight: {estimated_weight}')
    print(f'Expected Reward: {expected_reward}')
    # print(f'Confidence Bound: {confidence_bound}')
    # print(f'UCB Value: {upper_confidence_bound}')
    print()
    # print(f'feature Cov Matrix: {bandit.feature_covariance_matrix}')
    # print(f'weighted_reward: {bandit.weighted_reward_matrix}')
    # print(f'estimated weight: {bandit.estimated_weight}')
    # print()

resized: [[ 2.55686698 -1.16240465 -0.75582164  0.22185639  1.34489915]]
True weight: [-0.36484893 -0.35093028 -1.33302671  0.74494551  0.45361073]
Estimated weight: [[-0.36265786]
 [-0.36050419]
 [-1.32805709]
 [ 0.75319867]
 [ 0.45635988]]
Expected Reward: [[1.27641808]]

True weight: [-0.18925154 -1.3298439  -0.79831899  1.35906858 -0.4382645 ]
Estimated weight: [[-0.19073239]
 [-1.32689828]
 [-0.79559915]
 [ 1.35691109]
 [-0.43864687]]
Expected Reward: [[1.36715001]]

True weight: [-0.03380492  0.07136319 -1.85953795  0.88013191 -1.07628729]
Estimated weight: [[-0.03348485]
 [ 0.07144198]
 [-1.86073158]
 [ 0.87854324]
 [-1.07558632]]
Expected Reward: [[-0.01392431]]

True weight: [ 1.63402143  0.08696553  0.13650421 -0.5799358  -0.05185159]
Estimated weight: [[ 1.63144037]
 [ 0.08946864]
 [ 0.13327225]
 [-0.57974852]
 [-0.05993312]]
Expected Reward: [[3.75742228]]

True weight: [-1.05727349  0.95323234 -0.85610838 -1.69826187  0.07903731]
Estimated weight: [[-1.05949921]
 [ 0.94957

The true weight and the estimated rewards are very closed meaning that our estimation is working!

We can also calculate the expected reward based on current context!

Notes from ChatGPT:
1. Check whether the self.feature_covariance_matrix update is correct

Instead of using np.matmul like this
`self.feature_covariance_matrix += np.matmul(context_resized.transpose(), context_resized)`

Use this
`self.feature_covariance_matrix += np.outer(context, context)`

2. Fix UCB Calculation
from:
`confidence_bound = np.sqrt(np.matmul(np.matmul(context_resized, inverted_feature_cov), context_resized.transpose()))`

to:
`confidence_bound = np.sqrt(np.dot(context, np.dot(inverted_feature_cov, context)))`


The suggestion from ChatGPT might be wrong, since the code I created is able to estimate the context true weight quite close.

In [None]:
def return_reward(self, context = np.random.rand(NUM_OF_CONTEXT)):

        # True weight vector for a particular arm (unknown in real applications)
        # theta_a = np.array([0.5, -0.2, 0.8, 0.3, -0.5])

        # Noise (Gaussian)
        noise = np.random.normal(0, 0.1)  

        # Compute reward
        reward = np.dot(context, self.true_weight) + noise


        # REGRET
        result_history.append(reward)

        return context, reward

# Random exploration for metric purposes
for i in range(NUM_OF_TRIALS):
    random_index = np.random.randint(0,10)

    

In [None]:
# LinUCB Result
y1 = np.cumsum(result_history)

# Best possible reward
best_mean = np.argmax([bandit.mean for bandit in list_bandit])
y2 = [i*best_mean for i in range(0,current_round-1)]

# Simulated 100% Explore
y3 = np.cumsum(explore_results)

# Generate x values
x = range(len(y1))

# Plot the lines
plt.figure(figsize=(6, 4))
plt.plot(x, y1, label='Earned Reward', color='green', alpha=0.7)
plt.plot(x, y2, label='Maximum expected reward', color='grey', alpha=0.7)
plt.plot(x, y3, label='Random Explore', color='purple', alpha=0.7)

# Labels and title
plt.xlabel('Number of trials')
plt.ylabel('Rewards')
plt.title('Upper Confidence Bound 1 - Algorithm Performance')
plt.legend()
plt.grid()

# Show the plot
plt.show()

# Hidden code (useless)

In [88]:
# Pull the arm
context, reward = bandit_list[bandit_best_ucb].return_reward(context)
print(f'Generated context: {context}')
print(f'Earned: {reward}')

# Update the UCB Matrix of the pulled arm
bandit_list[bandit_best_ucb].update_ucb_matrix(context, reward)

Generated context: [0.5, 1.2]
Earned: -0.31201530096925756


(array([[1.25, 0.6 ],
        [0.6 , 2.44]]),
 array([[-0.15600765],
        [-0.37441836]]))

In [89]:
print(bandit_list[0].feature_covariance_matrix)
print(bandit_list[0].weighted_reward_matrix)


[[1.25 0.6 ]
 [0.6  2.44]]
[[-0.15600765]
 [-0.37441836]]


In [95]:
# DEBUGGG
print(f'context: {context}')
print(f'resized: {context_resized}')

for i, bandit in enumerate(bandit_list):
    context, context_resized, inverted_feature_cov, estimated_weight, expected_reward, confidence_bound, upper_confidence_bound  = bandit.calculate_ucb_value(context=context)
    print(f'============ Bandit-{i} ============== number of pull: {bandit.number_pulled}')
    print(f'Inverted Feature Cov (Ab): {inverted_feature_cov}')
    print(f'True weight: {bandit.true_weight}')
    print(f'Estimated weight: {estimated_weight}')
    print(f'Expected Reward: {expected_reward}')
    print(f'Confidence Bound: {confidence_bound}')
    print(f'UCB Value: {upper_confidence_bound}')
    print()
    # print(f'feature Cov Matrix: {bandit.feature_covariance_matrix}')
    # print(f'weighted_reward: {bandit.weighted_reward_matrix}')
    # print(f'estimated weight: {bandit.estimated_weight}')
    # print()

context: [1.59441619 0.43739919]
resized: [[1.59441619 0.43739919]]
Inverted Feature Cov (Ab): [[2.18040897e-04 4.76941667e-05]
 [4.76941667e-05 5.51056539e-04]]
True weight: [-1.42916069  0.52952878]
Estimated weight: [[-1.42837013]
 [ 0.52861006]]
Expected Reward: [[-2.04620285]]
Confidence Bound: [[0.02694895]]
UCB Value: [[-1.99230495]]

Inverted Feature Cov (Ab): [[ 0.27158334 -0.09528139]
 [-0.09528139  0.14801412]]
True weight: [ 0.3577058  -0.04912457]
Estimated weight: [[0.22776101]
 [0.06170951]]
Expected Reward: [[0.39013754]]
Confidence Bound: [[0.76539474]]
UCB Value: [[1.92092702]]

Inverted Feature Cov (Ab): [[ 0.37793991 -0.11087077]
 [-0.11087077  0.18691238]]
True weight: [0.99212952 0.03608986]
Estimated weight: [[0.5557568 ]
 [0.10314583]]
Expected Reward: [[0.93122354]]
Confidence Bound: [[0.91755263]]
UCB Value: [[2.76632879]]

Inverted Feature Cov (Ab): [[0.00191356 0.00878636]
 [0.00878636 0.0951716 ]]
True weight: [ 1.76727543 -0.35764693]
Estimated weight: [[ 

In [26]:
# DEBUG
linUCB.update_ucb_matrix(context=[0.5,1.2], reward=[1])

AttributeError: 'list' object has no attribute 'update_ucb_matrix'

In [155]:
# Pull an arm
context,reward = linUCB.return_reward()
print(f'Context: {context}')
print(f'Reward: {reward}')

Context: [0.93135123 0.51710428]
Reward: -0.17244695800782972


In [9]:
# Update feature covariance and weight reward matrix
linUCB.update_ucb_matrix(context=context, reward=reward)

NameError: name 'reward' is not defined

In [10]:
# DEBUG
context, context_resized, inverted_feature_cov, estimated_weight, expected_reward, confidence_bound, upper_confidence_bound  = linUCB.calculate_ucb_value(context=[1.0, 1.2])
print(context)
print(context_resized)
print(inverted_feature_cov)
print(estimated_weight)
print(expected_reward)
print(confidence_bound)
print(upper_confidence_bound)


[1.0, 1.2]
[[1.  1.2]]
[[ 0.9070632  -0.22304833]
 [-0.22304833  0.46468401]]
[[0.18587361]
 [0.44609665]]
[[0.72118959]]
[[1.02024124]]
[[2.76167208]]


In [11]:
# DEBUG
print(np.array([context]))
print(np.array([context]).transpose())

np.matmul(np.array([context]).transpose(), np.array([context]))
# print(np.array(np.newaxis, context).transpose())

[[1.  1.2]]
[[1. ]
 [1.2]]


array([[1.  , 1.2 ],
       [1.2 , 1.44]])

In [None]:
linUCB.feature_covariance_matrix = 

## Update Matrices

In [73]:
# For MAB it's
action_list = [] # Filled with the estimated reward

In [74]:
# For Contextual Bandit we're gonna have
# NxM matrix
# N = Number of context or the feature
# M = Number of actions can be taken (bandit)
action = np.array([[], []], np.int32)

In [89]:
sample_data = {'name': ['product A', 'product B', 'product C', 'product D'],
                'gender': pd.Series(['Male', 'Female', 'Male'], index=[0, 2, 3]),
                'location': pd.Series(['South', 'North', 'West', 'East'], index=[0, 1, 2, 3]),
               }

data = pd.DataFrame(data=sample_data, index=[0, 1, 2, 3])

# From
#         name	 gender	 location
# 0	 product A     Male	    South
# 1  product B	    NaN	    North
# 2	 product C	 Female	     West
# 3	 product D	   Male	     East

# To
#    Gender_Male  Gender_Female  Location_South  Location_North  Location_West  Location_East
# 0            1              0               1               0              0              0
# 1            0              0               0               1              0              0
# 2            0              1               0               0              1              0
# 3            1              0               0               0              0              1

In [90]:
data

Unnamed: 0,name,gender,location
0,product A,Male,South
1,product B,,North
2,product C,Female,West
3,product D,Male,East


In [91]:
# One hot encode the data

encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')

encoded_data = encoder.fit_transform(data[['gender', 'location']])
encoded_columns = encoder.get_feature_names_out(['gender', 'location'])

one_hot_data = pd.DataFrame(encoded_data, columns=encoded_columns)

In [83]:
data

Unnamed: 0,gender_Female,gender_Male,gender_nan,location_East,location_North,location_South,location_West
0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
1,0.0,0.0,1.0,0.0,1.0,0.0,0.0
2,1.0,0.0,0.0,0.0,0.0,0.0,1.0
3,0.0,1.0,0.0,1.0,0.0,0.0,0.0


In [None]:
class contextual_bandits:
    def __init__(self, states, actions):

array([[1., 0., 0., 0., 0., 1., 0., 0., 0., 1., 0.],
       [0., 1., 0., 0., 0., 0., 1., 0., 1., 0., 0.],
       [0., 0., 1., 0., 1., 0., 0., 0., 0., 0., 1.],
       [0., 0., 0., 1., 0., 1., 0., 1., 0., 0., 0.]])

In [None]:
data = [['tom', 10], ['nick', 15], ['juli', 14]]

df = pd.DataFrame(data, columns=[])

TypeError: object of type 'int' has no len()

In [29]:
print(data.shape)
print(data.columns)
print(data["gender"].unique())

# Loop through the number of entry in the data
for i in range(data.shape[0]):
    print(f"{i}2")

def one_hot_encode():
    return

(4, 3)
Index(['name', 'gender', 'location'], dtype='object')
['Male' nan 'Female']
02
12
22
32


In [130]:
A_cov = np.array([[0.0, 0.0],[0.0, 0.0]])
B_cov = np.array([[0.0, 0.0], [0.0, 0.0]])

In [131]:
# context = [0.5, 1.2]
# context = np.array([[0.5, 1.2]])
context = np.random.randn(2)

A_cov += np.matmul(context.transpose(), context)
B_cov += np.outer(context, context)


In [132]:

print(A_cov)
print(B_cov)

[[0.69287448 0.69287448]
 [0.69287448 0.69287448]]
[[0.06309534 0.19933923]
 [0.19933923 0.62977914]]
