Q-Learning

In [None]:
import numpy as np

# Define the environment (grid world)
# 'S' denotes the start position
# 'G' denotes the goal position
# 'H' denotes a hazard (negative reward)
# '.' denotes a normal cell (no reward)
# The agent can move in four directions: up (0), down (1), left (2), right (3)

environment = np.array([
    ['S', '.', '.', '.'],
    ['.', 'H', '.', 'H'],
    ['.', '.', '.', 'H'],
    ['H', '.', '.', 'G']
])

# Define the rewards for each cell
# 'G' (goal) has a high positive reward
# 'H' (hazard) has a high negative reward
# '.' (normal cell) has a neutral reward

reward_map = {
    'S': 0,  # start
    'G': 10,  # goal
    'H': -10,  # hazard
    '.': 0  # normal cell
}

# Define Q-learning parameters
num_episodes = 1000
learning_rate = 0.1
discount_factor = 0.9
epsilon = 0.1  # exploration-exploitation tradeoff parameter

# Initialize Q-table
num_states = np.prod(environment.shape)
num_actions = 4  # up, down, left, right
Q = np.zeros((num_states, num_actions))

# Helper function to convert state coordinates to index
def state_to_index(state):
    return state[0] * environment.shape[1] + state[1]

# Helper function to choose an action using epsilon-greedy policy
def choose_action(state):
    if np.random.rand() < epsilon:
        return np.random.randint(num_actions)  # explore
    else:
        return np.argmax(Q[state_to_index(state), :])  # exploit

# Q-learning algorithm
for episode in range(num_episodes):
    state = (0, 0)  # start at the top-left corner
    total_reward = 0

    while True:
        action = choose_action(state)
        next_state = None

        # Determine next state based on action
        if action == 0:  # up
            next_state = (max(state[0] - 1, 0), state[1])
        elif action == 1:  # down
            next_state = (min(state[0] + 1, environment.shape[0] - 1), state[1])
        elif action == 2:  # left
            next_state = (state[0], max(state[1] - 1, 0))
        elif action == 3:  # right
            next_state = (state[0], min(state[1] + 1, environment.shape[1] - 1))

        # Get reward for the next state
        reward = reward_map[environment[next_state]]

        # Update Q-value using Bellman equation
        current_index = state_to_index(state)
        next_index = state_to_index(next_state)
        Q[current_index, action] = (1 - learning_rate) * Q[current_index, action] \
                                   + learning_rate * (reward + discount_factor * np.max(Q[next_index, :]))

        # Move to the next state
        state = next_state
        total_reward += reward

        # Break if goal is reached
        if environment[next_state] == 'G':
            break

    # Print episode information
    if (episode + 1) % 100 == 0:
        print(f"Episode {episode + 1}/{num_episodes}, Total Reward: {total_reward}")

# Test the learned policy
state = (0, 0)
path = [state]

while True:
    action = np.argmax(Q[state_to_index(state), :])
    next_state = None

    if action == 0:
        next_state = (max(state[0] - 1, 0), state[1])
    elif action == 1:
        next_state = (min(state[0] + 1, environment.shape[0] - 1), state[1])
    elif action == 2:
        next_state = (state[0], max(state[1] - 1, 0))
    elif action == 3:
        next_state = (state[0], min(state[1] + 1, environment.shape[1] - 1))

    path.append(next_state)
    state = next_state

    if environment[next_state] == 'G':
        break

# Print the path taken by the agent
print("Path taken by the agent:")
for row in environment:
    print(' '.join(row))
print("Agent's path:")
for step in path:
    environment[step] = '*'
    for row in environment:
        print(' '.join(row))
    environment[step] = '.'


Episode 100/1000, Total Reward: -720
Episode 200/1000, Total Reward: -270
Episode 300/1000, Total Reward: 10
Episode 400/1000, Total Reward: 10
Episode 500/1000, Total Reward: 0
Episode 600/1000, Total Reward: 10
Episode 700/1000, Total Reward: 10
Episode 800/1000, Total Reward: 10
Episode 900/1000, Total Reward: 10
Episode 1000/1000, Total Reward: 10
Path taken by the agent:
S . . .
. H . H
. . . H
H . . G
Agent's path:
* . . .
. H . H
. . . H
H . . G
. * . .
. H . H
. . . H
H . . G
. . * .
. H . H
. . . H
H . . G
. . . .
. H * H
. . . H
H . . G
. . . .
. H . H
. . * H
H . . G
. . . .
. H . H
. . . H
H . * G
. . . .
. H . H
. . . H
H . . *


In [13]:
import numpy as np
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score

# Load the Iris dataset
iris = load_iris()
X = iris.data
y = iris.target

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Standardize the features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Q-learning parameters
num_states = X_train_scaled.shape[0]  # Number of training instances
num_actions = len(np.unique(y_train))  # Number of unique classes in y_train

# Initialize Q-table
Q = np.zeros((num_states, num_actions))

# Q-learning hyperparameters
alpha = 0.5  # Learning rate
gamma = 0.9  # Discount factor
epsilon = 1.0  # Exploration rate
epsilon_decay = 0.995  # Exploration decay rate
num_episodes = 1000

# Q-learning algorithm
for episode in range(num_episodes):
    state = np.random.randint(0, num_states)  # Start from a random state

    while True:
        # Choose action (class prediction) using epsilon-greedy policy
        if np.random.rand() < epsilon:
            action = np.random.randint(0, num_actions)  # Explore
        else:
            action = np.argmax(Q[state])  # Exploit

        # Simulate taking the action and observe the next state and reward
        next_state = state
        reward = 1 if y_train[state] == action else -1

        # Update Q-value using the Bellman equation
        Q[state, action] = Q[state, action] + alpha * (reward + gamma * np.max(Q[next_state]) - Q[state, action])

        state = next_state  # Move to the next state

        # Decay epsilon to reduce exploration over time
        epsilon *= epsilon_decay

        if np.random.rand() < 0.1:
            break  # Break episode with a small probability to explore new episodes

# Function to predict classes using the learned Q-table
def predict_classes(X, Q):
    y_pred = []
    for i in range(X.shape[0]):
        state = i % num_states  # Use modulo to handle cases where X is larger than Q-table
        action = np.argmax(Q[state])  # Choose the action with the highest Q-value
        y_pred.append(action)
    return np.array(y_pred)

# Make predictions on the test set using the learned Q-table
y_pred = predict_classes(X_test_scaled, Q)

# Evaluate accuracy of the predictions
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy using Q-learning:", accuracy)

Accuracy using Q-learning: 0.4


In [15]:
import numpy as np
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score

# Load the Iris dataset
iris = load_iris()
X = iris.data
y = iris.target

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Standardize the features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Initialize and train a Support Vector Machine (SVM) classifier
svm = SVC(kernel='rbf', random_state=42)
svm.fit(X_train_scaled, y_train)

# Make predictions on the test set
y_pred = svm.predict(X_test_scaled)

# Calculate accuracy
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)


Accuracy: 1.0
