<a href="https://colab.research.google.com/github/SrinathMLOps/Dissertation/blob/main/Algorithms.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Necessary Library Imports

We need gymnasium library for building the PPO model. All the necessary libraries are imported below

In [1]:
!pip install stable_baselines3 gymnasium
import gymnasium as gym
from gymnasium import spaces
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from stable_baselines3 import PPO
from stable_baselines3.common.env_checker import check_env
from stable_baselines3.common.env_util import make_vec_env
from stable_baselines3.common.callbacks import EvalCallback

Collecting stable_baselines3
  Downloading stable_baselines3-2.6.0-py3-none-any.whl.metadata (4.8 kB)
Collecting gymnasium
  Downloading gymnasium-1.1.1-py3-none-any.whl.metadata (9.4 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch<3.0,>=2.3->stable_baselines3)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch<3.0,>=2.3->stable_baselines3)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch<3.0,>=2.3->stable_baselines3)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch<3.0,>=2.3->stable_baselines3)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch<3.0,>=2.3->stable_baselines3)


In [2]:
# reading the dataset
df = pd.read_csv('roulette_spin_data_range1.csv')

In [3]:
# checking into first five rows of the dataset to check if the data has been loaded successfully
df.head()

Unnamed: 0,Spin,Number,Color
0,1,6,black
1,2,32,red
2,3,11,black
3,4,4,black
4,5,6,black


In above data, the column 'Color' is in text. we will convert it to equivalent numeric form. like Black will be converted to 0, red to 1 and green to 2.

In [4]:
# Define a function to convert color to a number
def color_to_number(color):
    if color == 'red':
        return 1
    elif color == 'black':
        return 0
    elif color == 'green':
        return 2
    else:
        return -1  # in case of unexpected value

In [5]:
# Convert color to numeric format
df['Color'] = df['Color'].apply(lambda x: color_to_number(x))

## Preparing Training and Testing Dataset

The total dataset will be divided into training and testing set for model training and evaluation respectively.


In [6]:
# divide the datset into trin and test set in ratio 7:3. 70% for training and remaining for testing
train_df, test_df = train_test_split(df, test_size=0.3, shuffle=False) #setting suffle false to kee the sequence intact

In [7]:
class RouletteFullEnv(gym.Env):
    def __init__(self, df: pd.DataFrame):
        super().__init__()

        self.numbers = list(range(37))  # Numbers 0-36 in European roulette

        # Define color groups
        self.reds = {
            1, 3, 5, 7, 9, 12, 14, 16, 18,
            19, 21, 23, 25, 27, 30, 32, 34, 36
        }
        self.blacks = {
            2, 4, 6, 8, 10, 11, 13, 15, 17,
            20, 22, 24, 26, 28, 29, 31, 33, 35
        }

        # Define action types
        self.action_types = {
            0: "straight",       # Bet on a specific number (0-36)
            1: "color_red",
            2: "color_black",
            3: "odd",
            4: "even",
            5: "low",            # 1-18
            6: "high",           # 19-36
            7: "dozen1",         # 1-12
            8: "dozen2",         # 13-24
            9: "dozen3",         # 25-36
            10: "column1",       # 1st column (1,4,7,...)
            11: "column2",       # 2nd column (2,5,8,...)
            12: "column3",       # 3rd column (3,6,9,...)
            13: "split",         # Placeholder: not implemented
            14: "street",        # Placeholder: not implemented
            15: "corner",        # Placeholder: not implemented
            16: "six_line",      # Placeholder: not implemented
            17: "trio_0_1_2",    # 0-1-2 trio
            18: "trio_0_2_3",    # 0-2-3 trio
            19: "basket",        # 0-1-2-3
            20: "color_green"    # ✅ Bet on green (number 0)
        }

        self.action_space = spaces.MultiDiscrete([len(self.action_types), 37])
        self.observation_space = spaces.Discrete(37)

        self.df = df.reset_index(drop=True)
        self.current_step = 0
        self.max_steps = len(self.df)

    def reset(self, seed=None, options=None):
        super().reset(seed=seed)
        self.current_step = 0
        obs = self.df.loc[self.current_step, "Number"]
        return obs, {}

    def step(self, action):
        bet_type_idx, bet_number = action
        bet_type = self.action_types[bet_type_idx]

        result = self.df.loc[self.current_step, "Number"]
        reward = self._compute_payout(bet_type, bet_number, result)

        self.current_step += 1
        terminated = self.current_step >= self.max_steps
        truncated = False

        obs = self.df.loc[self.current_step, "Number"] if not terminated else 0

        return obs, reward, terminated, truncated, {}

    def _compute_payout(self, bet_type, bet_number, result):
        if bet_type == "straight":
            return 35 if bet_number == result else -1
        elif bet_type == "color_red":
            return 1 if result in self.reds else -1
        elif bet_type == "color_black":
            return 1 if result in self.blacks else -1
        elif bet_type == "color_green":
            return 35 if result == 0 else -1
        elif bet_type == "odd":
            return 1 if result != 0 and result % 2 == 1 else -1
        elif bet_type == "even":
            return 1 if result != 0 and result % 2 == 0 else -1
        elif bet_type == "low":
            return 1 if 1 <= result <= 18 else -1
        elif bet_type == "high":
            return 1 if 19 <= result <= 36 else -1
        elif bet_type == "dozen1":
            return 2 if 1 <= result <= 12 else -1
        elif bet_type == "dozen2":
            return 2 if 13 <= result <= 24 else -1
        elif bet_type == "dozen3":
            return 2 if 25 <= result <= 36 else -1
        elif bet_type == "column1":
            return 2 if result in {1,4,7,10,13,16,19,22,25,28,31,34} else -1
        elif bet_type == "column2":
            return 2 if result in {2,5,8,11,14,17,20,23,26,29,32,35} else -1
        elif bet_type == "column3":
            return 2 if result in {3,6,9,12,15,18,21,24,27,30,33,36} else -1
        elif bet_type == "trio_0_1_2":
            return 11 if result in {0, 1, 2} else -1
        elif bet_type == "trio_0_2_3":
            return 11 if result in {0, 2, 3} else -1
        elif bet_type == "basket":
            return 8 if result in {0, 1, 2, 3} else -1
        else:
            # Unimplemented types like "split", "street", etc.
            return -1


In [8]:
# 1. Create environment for training
train_env = make_vec_env(lambda: RouletteFullEnv(train_df), n_envs=1)

In [9]:
# 2. Create PPO model
model = PPO("MlpPolicy", train_env, verbose=1)

Using cpu device


In [10]:
# Optional: Setup callback for eval during training
eval_env = RouletteFullEnv(test_df)
eval_callback = EvalCallback(eval_env, best_model_save_path="./ppo_roulette/",
                             log_path="./ppo_roulette/logs/", eval_freq=500,
                             deterministic=True, render=False)

In [11]:
# 3. Train the model
model.learn(total_timesteps=10000)

-----------------------------
| time/              |      |
|    fps             | 713  |
|    iterations      | 1    |
|    time_elapsed    | 2    |
|    total_timesteps | 2048 |
-----------------------------
-----------------------------------------
| time/                   |             |
|    fps                  | 553         |
|    iterations           | 2           |
|    time_elapsed         | 7           |
|    total_timesteps      | 4096        |
| train/                  |             |
|    approx_kl            | 0.007790075 |
|    clip_fraction        | 0.0281      |
|    clip_range           | 0.2         |
|    entropy_loss         | -6.65       |
|    explained_variance   | 0.000802    |
|    learning_rate        | 0.0003      |
|    loss                 | 12.2        |
|    n_updates            | 10          |
|    policy_gradient_loss | -0.0336     |
|    value_loss           | 40.3        |
-----------------------------------------
----------------------------------

<stable_baselines3.ppo.ppo.PPO at 0x7be720b2d550>

## PPO Training Summary

| Metric | Interpretation |
|--------|----------------|
| **Total Timesteps** | 10,240 steps across 5 iterations of PPO training |
| **FPS (Speed)** | Training was fast, ranging from 682 to 991 frames per second |
| **Policy Gradient Loss** | Stable and negative (-0.03 to -0.04), indicating effective gradient updates |
| **Entropy Loss** | Around -6.6 consistently, suggesting the agent maintained exploration |
| **Approx. KL Divergence** | Low values (~0.005–0.01), meaning stable policy updates without big shifts |
| **Clip Fraction** | Ranged from 1.8% to 7.6%, within expected range for PPO clipping behavior |
| **Explained Variance** | Near zero throughout training, indicating the value function struggled to predict rewards accurately (common in high-variance games like roulette) |
| **Value Loss** | Fluctuated (27–64), likely due to unpredictable nature of the roulette environment |

> Overall, training was stable and efficient. However, low explained variance suggests that predicting future rewards is difficult — which is expected in a highly stochastic environment like roulette. More data or reward shaping might help.


In [12]:
# Reload best model (optional)
# from stable_baselines3 import PPO
# model = PPO.load("./ppo_roulette/best_model")

# Evaluate on test set
env = RouletteFullEnv(test_df)
obs, _ = env.reset()

total_reward = 0
step_count = 0


while True:
    action, _states = model.predict(obs, deterministic=True)
    obs, reward, terminated, truncated, _ = env.step(action)
    total_reward += reward
    step_count += 1
    if terminated or truncated:
        break

print(f"Total Reward on test set: {total_reward} in {step_count} spins.")


Total Reward on test set: 81087 in 30000 spins.


In [13]:
import numpy as np
from stable_baselines3 import PPO

def evaluate_agent(env, model, n_episodes=1):
    total_rewards = []

    for _ in range(n_episodes):
        obs, _ = env.reset()
        done = False
        episode_reward = 0

        while not done:
            action, _ = model.predict(obs, deterministic=True)
            obs, reward, done, truncated, _ = env.step(action)
            episode_reward += reward

        total_rewards.append(episode_reward)

    avg_reward = np.mean(total_rewards)
    return avg_reward

def evaluate_random(env, n_episodes=1):
    total_rewards = []

    for _ in range(n_episodes):
        obs, _ = env.reset()
        done = False
        episode_reward = 0

        while not done:
            # Randomly choose action_type and bet number
            action_type = env.action_space[0].sample()
            bet_number = env.action_space[1].sample()
            action = (action_type, bet_number)

            obs, reward, done, truncated, _ = env.step(action)
            episode_reward += reward

        total_rewards.append(episode_reward)

    avg_reward = np.mean(total_rewards)
    return avg_reward




In [14]:
# # Create test environment using test_df
# test_env = RouletteFullEnv(test_df)

# # Load trained model (if already saved) OR use your trained PPO object directly
# # model = PPO.load("ppo_roulette")
# # OR use: model = trained_ppo_model
# trained_ppo_model = model
# # Evaluate PPO
# ppo_avg_reward = evaluate_agent(test_env, model=trained_ppo_model, n_episodes=10)

# Evaluate Random
test_env = RouletteFullEnv(test_df)  # reinitialize to avoid state issues
random_avg_reward = evaluate_random(test_env, n_episodes=10)

# Print comparison
# print(f"PPO Avg Reward on Test Set: {ppo_avg_reward:.2f}")
print(f"Random Avg Reward on Test Set: {random_avg_reward:.2f}")

# if ppo_avg_reward > random_avg_reward:
#     print("✅ PPO outperforms random betting!")
# else:
#     print("❌ PPO does not outperform random betting.")

Random Avg Reward on Test Set: -840.00
