<a href="https://colab.research.google.com/github/Tdas-christ/Reinforcement_Learning/blob/main/2348569_RL_Lab2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import numpy as np
import random

class NonStationaryBandit:
    """ε-greedy multi-armed bandit agent with sliding-window value estimates.

    Each arm's value is estimated as the mean of its most recent
    ``window_size`` rewards, so older observations are forgotten and the
    estimate can follow reward distributions that change over time.
    """

    def __init__(self, n_arms, window_size, epsilon=0.1):
        """Create an agent with empty reward histories.

        Args:
            n_arms: Number of arms; must be at least 1.
            window_size: Maximum number of recent rewards kept per arm.
            epsilon: Probability of choosing a uniformly random arm instead
                of the arm with the best current estimate.

        Raises:
            ValueError: If ``n_arms`` is less than 1.
        """
        # Fail fast: with no arms, choose_arm() would raise a less helpful
        # ValueError from random.randint / np.argmax on the first round.
        if n_arms < 1:
            raise ValueError("n_arms must be at least 1")
        self.n_arms = n_arms
        self.window_size = window_size  # Sliding window for nonstationary environment
        self.epsilon = epsilon
        # Per-arm list of the most recent rewards, oldest first.
        self.rewards_history = [[] for _ in range(n_arms)]
        # Total number of pulls per arm (not limited by the window).
        self.arm_counts = [0] * n_arms

    def pull(self, arm, reward):
        """Record ``reward`` for ``arm`` and trim its history to the window size.

        Args:
            arm: Index of the arm that was pulled.
            reward: Reward observed for that pull.
        """
        history = self.rewards_history[arm]
        history.append(reward)
        if len(history) > self.window_size:
            history.pop(0)  # Remove oldest reward (sliding window)
        self.arm_counts[arm] += 1

    def choose_arm(self):
        """Choose an arm using the ε-greedy strategy.

        Returns:
            int: Index of the chosen arm. With probability ``epsilon`` this
            is a uniformly random arm; otherwise it is the arm with the
            highest windowed mean reward (the lowest index wins ties, and an
            arm with an empty window counts as 0).
        """
        if random.random() < self.epsilon:
            # Exploration: choose a random arm
            return random.randint(0, self.n_arms - 1)
        # Exploitation: choose the arm with the highest average reward
        avg_rewards = [np.mean(history) if history else 0 for history in self.rewards_history]
        # np.argmax returns a NumPy integer; convert so both branches return
        # a plain Python int.
        return int(np.argmax(avg_rewards))

    def simulate(self, reward_functions, n_rounds):
        """Run the agent for ``n_rounds`` rounds and return the summed reward.

        Args:
            reward_functions: Sequence of zero-argument callables, indexed by
                arm; ``reward_functions[arm]()`` yields the reward for one pull.
            n_rounds: Number of rounds to play.

        Returns:
            The sum of all rewards received (``0`` when ``n_rounds`` is 0).
        """
        total_rewards = 0
        for _ in range(n_rounds):
            chosen_arm = self.choose_arm()
            # The reward function is called anew each round, so it is free to
            # return values from a distribution that changes between calls.
            reward = reward_functions[chosen_arm]()
            self.pull(chosen_arm, reward)
            total_rewards += reward
        return total_rewards

# Define nonstationary reward functions for each arm
def reward_arm_1():
    """Sample a noisy reward for arm 1.

    The mean is ``sin(pi * u)`` for a fresh ``u`` drawn uniformly from
    [0, 1) on every call; the reward is then drawn from a normal
    distribution with that mean and a standard deviation of 1.
    """
    phase = np.pi * np.random.random()
    mean = np.sin(phase)
    return np.random.normal(loc=mean, scale=1)

def reward_arm_2():
    """Sample a noisy reward for arm 2.

    The mean is ``cos(pi * u)`` for a fresh ``u`` drawn uniformly from
    [0, 1) on every call; the reward is then drawn from a normal
    distribution with that mean and a standard deviation of 1.
    """
    phase = np.pi * np.random.random()
    mean = np.cos(phase)
    return np.random.normal(loc=mean, scale=1)

# Reward function for each arm; the list index is the arm index.
reward_functions = [reward_arm_1, reward_arm_2]

# Number of rounds to simulate. Used for both the simulation and the report
# line below so the printed count always matches what was actually run.
N_ROUNDS = 1000

# ε-greedy agent with one arm per reward function and a sliding window of the
# 50 most recent rewards per arm.
bandit = NonStationaryBandit(n_arms=len(reward_functions), window_size=50, epsilon=0.1)

# Run the simulation and report the summed reward.
total_rewards = bandit.simulate(reward_functions, n_rounds=N_ROUNDS)

print("Total rewards after", N_ROUNDS, "rounds:", total_rewards)



\\(3\mathrm{\, N\cdot s}\\)


\\(3\,\frac{\text{m}}{\text{s}}\\)


\\(3\text{ m/s}\\)


\\(3\,\text{N}\cdot\text{s}\\)


\\(3\mathrm{m/s}\\)

