# In [5]:  (Jupyter notebook cell prompt)
import numpy as np
from build_data import build_data

class QLearning:
    """Tabular Q-learning agent backed by a ``(num_states, num_actions)`` array.

    A raw state is an iterable of numbers.  ``discretize_state`` converts
    every component into a row index of ``q_table``, so a state with several
    components addresses several rows at once (NumPy integer-array indexing)
    instead of a single row.  The value of an action in such a state is taken
    to be the largest entry in that action's column among the addressed rows;
    ``choose_action`` and ``update_q_table`` both follow that rule.

    Attributes are plain copies of the constructor arguments, plus
    ``q_table`` which starts as all zeros.
    """

    def __init__(self, num_states, num_actions, learning_rate=0.1, discount_factor=0.9, exploration_prob=0.2):
        """Store the hyper-parameters and allocate a zero-filled Q-table.

        Args:
            num_states: Number of rows in the Q-table; also the exclusive
                upper bound for every discretized state component.
            num_actions: Number of columns in the Q-table.
            learning_rate: Step size applied to the TD error.
            discount_factor: Weight of the best next-state value.
            exploration_prob: Probability of picking a uniformly random
                action in ``choose_action``.
        """
        self.num_states = num_states
        self.num_actions = num_actions
        self.learning_rate = learning_rate
        self.discount_factor = discount_factor
        self.exploration_prob = exploration_prob
        self.q_table = np.zeros((num_states, num_actions))

    def discretize_state(self, state):
        """Map each component of ``state`` to an integer row index.

        Every value is multiplied by 100, clipped to
        ``[0, num_states - 1]`` and truncated to ``int``.

        Returns:
            A tuple with one row index per component of ``state``.
        """
        discretized_state = tuple(int(np.clip(val * 100, 0, self.num_states - 1)) for val in state)
        return discretized_state

    def choose_action(self, state):
        """Pick an action for ``state`` with an epsilon-greedy rule.

        With probability ``exploration_prob`` a uniformly random action is
        returned; otherwise the action with the highest value is returned
        (ties resolve to the lowest action index).

        Returns:
            An integer in ``[0, num_actions)``.
        """
        rows = self.discretize_state(state)
        if np.random.rand() < self.exploration_prob:
            return np.random.randint(self.num_actions)

        # ``q_table[rows, :]`` holds one row per state component.  A bare
        # ``np.argmax`` would index the *flattened* selection and could yield
        # a value >= num_actions, so collapse the rows first and take the
        # argmax along the action axis only.
        action_values = np.atleast_2d(self.q_table[rows, :]).max(axis=0)
        return int(np.argmax(action_values))

    def update_q_table(self, state, action, reward, next_state):
        """Apply one Q-learning update for the observed transition.

        The target is ``reward + discount_factor * best_next`` where
        ``best_next`` is the largest Q-value among all rows addressed by
        ``next_state``.  Every row addressed by ``state`` is moved toward
        that target in column ``action``.
        """
        rows = self.discretize_state(state)
        next_rows = self.discretize_state(next_state)

        best_next_action_value = np.max(self.q_table[next_rows, :])
        # ``td_error`` has one entry per addressed row.
        td_error = reward + self.discount_factor * best_next_action_value - self.q_table[rows, action]
        self.q_table[rows, action] += self.learning_rate * td_error


def distance(station1, station2):
    """Return the straight-line (Euclidean) distance between two stations.

    Only the first two entries of each station's ``position`` are read.
    """
    x1, y1 = station1.position[0], station1.position[1]
    x2, y2 = station2.position[0], station2.position[1]
    delta_x = x1 - x2
    delta_y = y1 - y2
    return (delta_x ** 2 + delta_y ** 2) ** 0.5

def get_cost(state, action, rl_model):
    """Return the cost of ``action`` in ``state`` as the negated Q-value.

    ``state`` and ``action`` index ``rl_model.q_table`` directly; ``state``
    is not passed through ``rl_model.discretize_state`` here.
    """
    q_value = rl_model.q_table[state, action]
    return -q_value

def train_rl_model(map, rl_model, num_episodes=1000, max_steps_per_episode=1000):
    """Train ``rl_model`` on random start/goal station pairs drawn from ``map``.

    Each episode picks a random start and goal station and then repeatedly
    asks the model for an action until the agent reaches the goal or the
    step budget is used up.  Action ``0`` moves to a uniformly random linked
    station (reward: minus the travelled distance); any other action stays
    in place (reward: ``0``).

    Args:
        map: Mapping whose ``values()`` are the stations.  Each station must
            expose ``position`` (indexable at ``0`` and ``1``) and ``links``
            (an iterable of neighbouring stations).  The parameter name
            shadows the builtin and is kept for caller compatibility.
        rl_model: Object providing ``choose_action(state)`` and
            ``update_q_table(state, action, reward, next_state)``.
        num_episodes: Number of start/goal pairs to train on.
        max_steps_per_episode: Upper bound on the steps of a single episode,
            so that a policy which keeps choosing "stay" cannot loop
            forever.  Pass ``None`` to disable the limit.
    """
    # The station list does not change during training; build it once.
    station_pool = list(map.values())

    for _ in range(num_episodes):
        start_station = np.random.choice(station_pool)
        end_station = np.random.choice(station_pool)
        state = (start_station.position[0], start_station.position[1],
                 end_station.position[0], end_station.position[1])

        steps_taken = 0
        while start_station != end_station:
            if max_steps_per_episode is not None and steps_taken >= max_steps_per_episode:
                break
            steps_taken += 1

            action = rl_model.choose_action(state)

            if action == 0:  # move to a neighboring station
                next_station = np.random.choice(list(start_station.links))
                next_state = (next_station.position[0], next_station.position[1],
                              end_station.position[0], end_station.position[1])
                reward = -distance(start_station, next_station)
            else:  # stay at the current station
                # Bind next_station explicitly: leaving it untouched would
                # either raise UnboundLocalError or reuse a station from an
                # earlier step/episode.
                next_station = start_station
                next_state = state
                reward = 0

            rl_model.update_q_table(state, action, reward, next_state)
            state = next_state
            start_station = next_station

# Usage example: runs as soon as this cell/module is executed.
# build_data() returns a pair; only the first element is kept. It is passed
# to train_rl_model below, which calls .values() on it to get the stations.
stations, _ = build_data()

# Q-table dimensions: num_states is the number of rows (every discretized
# state component is clipped to [0, num_states - 1]); num_actions is the
# number of columns (train_rl_model treats action 0 as "move" and any other
# action as "stay").
# NOTE(review): 4 equals the length of the state tuple built in
# train_rl_model rather than a count of distinct states - confirm intended.
num_states = 4
num_actions = 2

# Initialize the Q-learning model with its default hyper-parameters
rl_model = QLearning(num_states, num_actions)

# Train the model with train_rl_model's default number of episodes
train_rl_model(stations, rl_model)

# rl_model can now be passed to get_cost() to estimate costs in the A* algorithm


# [notebook output] KeyboardInterrupt: