source code for reference:
https://medium.com/data-science-in-your-pocket/contextual-bandits-in-reinforcement-learning-explained-with-example-and-codes-3c707142437b

https://medium.com/data-science/an-overview-of-contextual-bandits-53ac3aa45034

In [None]:
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow.keras.layers import *
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam
import copy 
from keras.callbacks import EarlyStopping

tf.config.run_functions_eagerly(True)

In [None]:
#states = random states generated for training, 
#total_states = possible states count
def ohe_generator(states,total_states):
    ohe = np.zeros((len(states),total_states))
    for index, array in enumerate(ohe):
        ohe[index][states[index]] = 1
    return ohe

TypeError: can't multiply sequence by non-int of type 'float'

In [None]:
class contextual_bandits:
    def __init__(self,states,actions):
        self.states = states
        self.actions = actions
    
    def reward(self,state,action):
        if (state*action)%2==1:
            return 0.5 + 0.05*((state+action)%10)+np.random.rand()*0.1
        else:
            return 0.9 - 0.1*((state+action)%10)+np.random.rand()*0.1
    
    def network(self):
        input_ = Input(shape=(self.states))
        dense1 = Dense(128,activation='relu')(input_)
        dropout1 = Dropout(0.1)(dense1)
        dense2 = Dense(64,activation='relu')(dropout1)
        dropout2 = Dropout(0.1)(dense2)
        dense3 = Dense(self.actions,activation='sigmoid')(dropout2)
        model = Model(input_,dense3)
        
        rms = Adam(learning_rate=0.0001, beta_1=0.9, beta_2=0.999, epsilon=None, decay=0.0, amsgrad=False)
        model.compile(loss="mean_absolute_error", optimizer=rms,metrics="mean_absolute_error")
        return model

In [None]:
batch_size = 128
states = 100
actions = 4

def training():
    cb = contextual_bandits(states,actions)
    model = cb.network()
    sample_states = np.random.choice(range(states),size=batch_size*100)
    state_ohe = ohe_generator(sample_states, states)
    actual_reward = [[cb.reward(x,y) for y in range(cb.actions)] for x in sample_states]
    actual_reward_matrix = np.zeros((len(state_ohe),cb.actions))
    for index,x in enumerate(actual_reward):
                    actual_reward_matrix[index]=np.array(x)
    model.fit(state_ohe,actual_reward_matrix,batch_size=batch_size,epochs=20) 
    return model

In [None]:
state_ohe = ohe_generator(np.array([x for x in range(100)]), states)
estimated_reward = model.predict(state_ohe)

print({x:np.argmax(y) for x,y in enumerate(estimated_reward)})

In [None]:
cb = contextual_bandits(100,4)
print('\nreward for state {}\n'.format(0))
for x in range(4):
    print(cb.reward(0,x))
    
print('\nreward for state {}\n'.format(93))
for x in range(4):
    print(cb.reward(93,x))

# Our own code

## Import Libraries

In [1]:
import numpy as np
import pandas as pd

from sklearn.preprocessing import OneHotEncoder

## Hyperparameters

In [126]:
# The number of action/bandit can be taken
# In recommender system's context, it's the number of product/product category that is being put on advertisement
NUM_OF_ACTION = 10

# The number of context affecting users to pick an action
# In recommender system's context, it can be age, budget, country, favorite_brand.
NUM_OF_CONTEXT = 5
NUM_OF_CONTEXT = 2

# Alpha
ALPHA = 2   # Exploration constant

## Create our LinUCB Model class

$$R_t = x_t^⊤ \theta_a + \epsilon$$

In [144]:
class LinUCB_Bandit:
    

    def __init__ (self):
        # True Weight
        self.true_weight = np.random.randn(NUM_OF_CONTEXT)   # Theta_a

        self.feature_covariance_matrix = np.identity(NUM_OF_CONTEXT) # Identity matrix with a dimension of the number of action
        self.weighted_reward_matrix = np.zeros((NUM_OF_CONTEXT,1))

        self.estimated_weight = np.zeros(NUM_OF_CONTEXT)

    def return_reward(self):
        # Number of features
        d = NUM_OF_CONTEXT

        # Simulated context vector (randomly generated for each round)
        context = np.random.rand(d)

        # True weight vector for a particular arm (unknown in real applications)
        # theta_a = np.array([0.5, -0.2, 0.8, 0.3, -0.5])

        # Noise (Gaussian)
        noise = np.random.normal(0, 0.1)  

        # Compute reward
        reward = np.dot(context, self.true_weight) + noise

        return context, reward

    def update_ucb_matrix(self, context, reward):
        
        # Resize context array
        context_resized = np.array([context])

        # Count feature covarianve matrix (A_b)
        self.feature_covariance_matrix += np.matmul(context_resized.transpose(), context_resized)

        # Count weight sum reward (b_b)
        self.weighted_reward_matrix += reward * context_resized.transpose()

        # UCB_value = self.feature_covariance_matrix

        return self.feature_covariance_matrix, self.weighted_reward_matrix
    
    def calculate_ucb_value(self, context):

        # Resize context array
        context_resized = np.array([context])

        # Inverted feature covariance
        inverted_feature_cov = np.linalg.inv(self.feature_covariance_matrix)

        # Count estimated weight
        self.estimated_weight = np.matmul(inverted_feature_cov, self.weighted_reward_matrix)

        expected_reward = np.dot(context_resized, self.estimated_weight)

        confidence_bound = np.sqrt(np.matmul(np.matmul(context_resized, inverted_feature_cov), context_resized.transpose()))
        

        upper_confidence_bound = expected_reward + (ALPHA * confidence_bound)

        return context, context_resized, inverted_feature_cov, self.estimated_weight, expected_reward, confidence_bound, upper_confidence_bound 
        # return upper_confidence_bound
        # np.linalg.inv(linUCB.feature_covariance_matrix)


## Create Linear UCB Bandit Agent 

In [156]:
linUCB = LinUCB_Bandit()

In [157]:
# DEBUG
print(linUCB.true_weight)
print(linUCB.feature_covariance_matrix)
print(linUCB.weighted_reward_matrix)

[ 0.09871353 -1.11638224]
[[1. 0.]
 [0. 1.]]
[[0.]
 [0.]]


## Pick the next agent to pull by calculating UCB Value

To calculate UCB Value, we have to specify a current context the user have

In [159]:
context = [1.0, 1.2]

context, context_resized, inverted_feature_cov, estimated_weight, expected_reward, confidence_bound, upper_confidence_bound  = linUCB.calculate_ucb_value(context=[1.0, 1.2])

print(context)
print(context_resized)
print(inverted_feature_cov)
print(estimated_weight)
print(expected_reward)
print(confidence_bound)
print(upper_confidence_bound)

[1.0, 1.2]
[[1.  1.2]]
[[ 0.9070632  -0.22304833]
 [-0.22304833  0.46468401]]
[[0.18587361]
 [0.44609665]]
[[0.72118959]]
[[1.02024124]]
[[2.76167208]]


In [158]:
# DEBUG
linUCB.update_ucb_matrix(context=[0.5,1.2], reward=[1])

(array([[1.25, 0.6 ],
        [0.6 , 2.44]]),
 array([[0.5],
        [1.2]]))

In [155]:
# Pull an arm
context,reward = linUCB.return_reward()
print(f'Context: {context}')
print(f'Reward: {reward}')

Context: [0.93135123 0.51710428]
Reward: -0.17244695800782972


In [150]:
# Update feature covariance and weight reward matrix
linUCB.update_ucb_matrix(context=context, reward=reward)

(array([[2.25, 1.8 ],
        [1.8 , 3.88]]),
 array([[0.72905086],
        [1.47486103]]))

In [125]:
# DEBUG
context, context_resized, inverted_feature_cov, estimated_weight, expected_reward, confidence_bound, upper_confidence_bound  = linUCB.calculate_ucb_value(context=[1.0, 1.2])
print(context)
print(context_resized)
print(inverted_feature_cov)
print(estimated_weight)
print(expected_reward)
print(confidence_bound)
print(upper_confidence_bound)


[1.0, 1.2]
[[1.  1.2]]
[[ 0.9070632  -0.22304833]
 [-0.22304833  0.46468401]]
[[0.18587361]
 [0.44609665]]
[[0.72118959]]
[[1.02024124]]
[[1.74143083]]


In [None]:
# DEBUG
print(np.array([context]))
print(np.array([context]).transpose())

np.matmul(np.array([context]).transpose(), np.array([context]))
# print(np.array(np.newaxis, context).transpose())

[[0.96440216 0.87245005]]
[[0.96440216]
 [0.87245005]]


array([[0.93007153, 0.84139272],
       [0.84139272, 0.7611691 ]])

In [None]:
linUCB.feature_covariance_matrix = 

## Update Matrices

In [73]:
# For MAB it's
action_list = [] # Filled with the estimated reward

In [74]:
# For Contextual Bandit we're gonna have
# NxM matrix
# N = Number of context or the feature
# M = Number of actions can be taken (bandit)
action = np.array([[], []], np.int32)

In [89]:
sample_data = {'name': ['product A', 'product B', 'product C', 'product D'],
                'gender': pd.Series(['Male', 'Female', 'Male'], index=[0, 2, 3]),
                'location': pd.Series(['South', 'North', 'West', 'East'], index=[0, 1, 2, 3]),
               }

data = pd.DataFrame(data=sample_data, index=[0, 1, 2, 3])

# From
#         name	 gender	 location
# 0	 product A     Male	    South
# 1  product B	    NaN	    North
# 2	 product C	 Female	     West
# 3	 product D	   Male	     East

# To
#    Gender_Male  Gender_Female  Location_South  Location_North  Location_West  Location_East
# 0            1              0               1               0              0              0
# 1            0              0               0               1              0              0
# 2            0              1               0               0              1              0
# 3            1              0               0               0              0              1

In [90]:
data

Unnamed: 0,name,gender,location
0,product A,Male,South
1,product B,,North
2,product C,Female,West
3,product D,Male,East


In [91]:
# One hot encode the data

encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')

encoded_data = encoder.fit_transform(data[['gender', 'location']])
encoded_columns = encoder.get_feature_names_out(['gender', 'location'])

one_hot_data = pd.DataFrame(encoded_data, columns=encoded_columns)

In [83]:
data

Unnamed: 0,gender_Female,gender_Male,gender_nan,location_East,location_North,location_South,location_West
0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
1,0.0,0.0,1.0,0.0,1.0,0.0,0.0
2,1.0,0.0,0.0,0.0,0.0,0.0,1.0
3,0.0,1.0,0.0,1.0,0.0,0.0,0.0


In [None]:
class contextual_bandits:
    def __init__(self, states, actions):

array([[1., 0., 0., 0., 0., 1., 0., 0., 0., 1., 0.],
       [0., 1., 0., 0., 0., 0., 1., 0., 1., 0., 0.],
       [0., 0., 1., 0., 1., 0., 0., 0., 0., 0., 1.],
       [0., 0., 0., 1., 0., 1., 0., 1., 0., 0., 0.]])

In [None]:
data = [['tom', 10], ['nick', 15], ['juli', 14]]

df = pd.DataFrame(data, columns=[])

TypeError: object of type 'int' has no len()

In [29]:
print(data.shape)
print(data.columns)
print(data["gender"].unique())

# Loop through the number of entry in the data
for i in range(data.shape[0]):
    print(f"{i}2")

def one_hot_encode():
    return

(4, 3)
Index(['name', 'gender', 'location'], dtype='object')
['Male' nan 'Female']
02
12
22
32
