In [22]:
import gym
from gym import spaces
import pandas as pd
from datetime import datetime, timedelta
import random
import numpy as np

gym.logger.set_level(40)

class ChargingEnv():
    """
    ### 动作空间

    动作是形状为`(1,)`的`ndarray`, 表示该小时内汽车充放电的功率, 正数代表充电, 复数代表放电.

    | 序号 | 动作     | 值 |
    |-----|----------|--------|
    | 0   | discharge| 20.0   |
    | 1   | stay     | 20.0   |
    | 2   | charge   | 20     |


    ### 观察空间

    观察是形状为`(2,)`的`ndarray`, 表示汽车的当前电量和当前电价。

    | 序号 | 观察                | 最小值 | 最大值 |
    |-----|--------------------|--------|--------|
    | 0   | SOC                | 0      | 77     |
    | 1   | e_price            | 30     | 120    |

    ### 奖励

    奖励函数定义为：

    *r = -(power * e_price)*

    ### 起始状态

    起始状态是SOC为77(即满电状态), 电价取决于开始时的电价数据。
    """
    def __init__(self, penalty_factor=0.1):
        #环境参数
        # 生成上班时间，范围在七点到九点
        self.start_time = self.generate_random_time(7, 9)
        # 生成下班时间，范围在四点到六点
        self.end_time = self.generate_random_time(16, 18)
        #计算出实际情况下的开始充电和停止充电时间(整点)
        self.real_start_time, self.real_end_time = self.calculate_real_time(self.start_time, self.end_time)
        self.battery_capacity = 77
        self.SOC = 77
        self.soc_min = 0.0
        self.soc_max = 77.0
        self.e_price_min = 0.0
        self.e_price_max = 200.0
        self.power_max = 20.0
        self.current_step = 0
        self.df_prices = pd.read_csv("GR-data-11-20.csv", header=None, names=["DateTime", "ElectricityPrice"])
        self.penalty_factor = penalty_factor

        print("length of prices", len(self.df_prices))

        # 观察空间和动作空间的定义
        # 定义观察空间
        self.observation_space = spaces.Box(low=np.array([self.soc_min, self.e_price_min]),
                                            high=np.array([self.soc_max, self.e_price_max]),
                                            dtype=np.float32)       

        # 定义动作空间
        self.action_space = spaces.Discrete(3)

    def action_sample():
        return random.randint(0, 2)
        
    def generate_random_time(self, start_hour, end_hour):
        hour = random.randint(start_hour, end_hour)
        minute = random.randint(0, 59)
        second = random.randint(0, 59)
        return datetime.now().replace(hour=hour, minute=minute, second=second)
    
    def calculate_real_time(self, start_time, end_time):
        real_start_time = (start_time + timedelta(hours = 1))
        real_start_time = real_start_time.replace(minute=0, second=0)
        real_end_time = (end_time - timedelta(hours = 0))
        real_end_time = real_end_time.replace(minute=0, second=0)
        return real_start_time, real_end_time
        
    def read_e_price(self,index):  
        # Ensure the index is within the range of the dataframe
        if 0 <= index < len(self.df_prices):
            one_price = self.df_prices["ElectricityPrice"].iloc[index] / 1000
            return one_price
        else:
            # Handle the case where the index is out of range
            print("Index out of range.", index)
            return None
        
    def step(self, action):
        """
        在环境中执行一步动作，并返回新的观察、奖励等信息。

        参数：
        - `power`：该小时内汽车充电的功率。

        返回：
        - `observation`：新的观察。
        - `reward`：当前步的奖励。
        - `done`：标志是否完成（截断剧集）。
        - `info`：其他信息（空字典）。
        """
        SOC, e_price = self.state  

        # 计算新的SOC
        newSOC = SOC + 20 * (action - 1)
        newSOC = np.clip(newSOC, self.soc_min, self.soc_max)
        self.SOC = newSOC

        # 计算成本，根据功率和电价
        costs = (newSOC - SOC) * e_price
        penalty = self.penalty_factor * min(0, 20 * (action-1)) * e_price
        costs -= penalty

        if (self.current_step + 1) % 24 == self.real_start_time.hour or (self.current_step + 1) % 24 == self.real_end_time.hour:
            if (self.SOC < 10):
                costs += 500
                print("SOC is less than 10%: ", self.SOC)
            
        #取出新的电价
        newe_price = self.read_e_price(self.current_step+1)
        newe_price = np.clip(newe_price, self.e_price_min, self.e_price_max)
        self.current_step += 1

        self.state = np.array([newSOC, newe_price])

        # 返回新的观察、奖励、是否完成、其他信息
        return self._get_obs(), -costs, False, False, {}
    
    def reset(self):
        """
        重置环境到初始状态。

        返回：
        - `observation`：初始观察。
        - `info`：空字典。
        """
        # 恢复起始状态
        # self.current_step = 0
        self.state = np.array([77, self.read_e_price(self.current_step)])
        
        # 返回初始观察和空字典
        return self._get_obs(), {}
    
    def _get_obs(self):
        """
        返回当前观察。

        返回：
        - `observation`：当前观察。
        """
        SOC, e_price = self.state
        return np.array([SOC,e_price], dtype=np.float32)

In [23]:
# 初始化
num_soc_bins = 77
num_e_price_bins = 200
max_iteration = 100  # 最大迭代次数
initial_learning_rate = 1.0  # 初始学习率
min_learning_rate = 0.005  # 最小学习率
max_step = 168  # 最大步数

# Q-learning参数
epsilon = 0.05  # epsilon-greedy策略中的探索率
gamma = 1.0  # 折扣因子

def observation_to_state(environment, observation):
    # 获取观察空间的最低值
    environment_low = np.array([environment.soc_min, environment.e_price_min], dtype=np.float32)
    # 获取观察空间的最高值
    environment_high = np.array([environment.soc_max, environment.e_price_max], dtype=np.float32)
    # 计算在每个维度上的离散步长
    environment_dx = (environment_high - environment_low) / number_states

    # observation[0]:SOC ;  observation[1]: e_price
    soc = int((observation[0] - environment_low[0])/environment_dx[0])
    e_price = int((observation[1] - environment_low[1])/environment_dx[1])
    # soc:SOC, e_price:e_price
    return soc, e_price


# 模拟一个回合的函数
def episode_simulation(environment, policy=None):
    observation, others = environment.reset()
    total_reward = 0
    step_count = 0
    for _ in range(max_step):
        if policy is None:
            action = environment.action_sample()
        else:
            soc, e_price = observation_to_state(environment, observation)
            action = policy[soc][e_price]
        observation, reward, done, trun, _ = environment.step(action)
        total_reward += gamma ** step_count * reward
        step_count += 1
        if done:
            break
    return total_reward

if __name__ == '__main__':
    environment = ChargingEnv()
    np.random.seed(0)

    # 创建Q表，并初始化为零
    # 3个动作: 0: 左推, 1: 不推, 2: 右推
    q_table = np.zeros((num_soc_bins, num_e_price_bins, 3))

    # 训练max_iteration次
    for i in range(max_iteration):
        observation, others = environment.reset()
        total_reward = 0
        eta = max(min_learning_rate, initial_learning_rate * (0.85 ** (i // 100)))
        for j in range(max_step):
            soc, e_price = observation_to_state(environment, observation)
            if np.random.uniform(0, 1) < epsilon:
                action = np.random.choice(environment.action_space.n)
            else:
                logits = q_table[soc][e_price]
                logits_exp = np.exp(logits)
                probabilities = logits_exp / np.sum(logits_exp)
                action = np.random.choice(environment.action_space.n, p=probabilities)
                observation, reward, done, trun, _ = environment.step(action)

            total_reward += reward
            soc_, e_price_ = observation_to_state(environment, observation)
            q_table[soc][e_price][action] = q_table[soc][e_price][action] + eta * (
                    reward + gamma * np.max(q_table[soc_][e_price_]) - q_table[soc][e_price][action])
            if done:
                break

    # 获取最优策略
    solution_policy = np.argmax(q_table, axis=2)
    solution_policy_scores = [episode_simulation(environment, solution_policy) for _ in range(100)]
    print("平均分数 : ", round(np.mean(solution_policy_scores), 2))

length of prices 44160
SOC is less than 10%:  0.0
SOC is less than 10%:  0.0
SOC is less than 10%:  0.0
SOC is less than 10%:  0.0
SOC is less than 10%:  0.0
SOC is less than 10%:  0.0
SOC is less than 10%:  0.0
SOC is less than 10%:  0.0
SOC is less than 10%:  0.0
SOC is less than 10%:  0.0
SOC is less than 10%:  0.0
SOC is less than 10%:  0.0
平均分数 :  1.34
