In [1]:
import gym
from gym import spaces
import pandas as pd
from datetime import datetime, timedelta
import random
import numpy as np

# Raise gym's logger threshold to 40 so lower-severity messages are hidden.
# NOTE(review): 40 presumably corresponds to gym.logger.ERROR — confirm
# against the installed gym version.
gym.logger.set_level(40)

class ChargingEnv():
    """
    Hourly charging/discharging environment for a single electric vehicle.

    ### Action space

    The action is an `ndarray` of shape `(1,)` (a bare scalar is accepted too)
    holding the power applied during the current step. Positive values charge
    the battery, negative values discharge it.

    | Index | Action | Min   | Max  |
    |-------|--------|-------|------|
    | 0     | power  | -20.0 | 20.0 |


    ### Observation space

    The observation is an `ndarray` of shape `(2,)` holding the current state
    of charge and the current electricity price (table value divided by 1000).

    | Index | Observation | Min | Max   |
    |-------|-------------|-----|-------|
    | 0     | SOC         | 0.0 | 77.0  |
    | 1     | e_price     | 0.0 | 200.0 |

    ### Reward

    The reward is the negated cost of the step:

    *costs = (new_SOC - SOC) * e_price - penalty_factor * min(0, power) * e_price*

    *r = -costs*

    An extra cost of 500 is added when the SOC is below 10 on a step whose
    following hour-of-day equals the real start hour or the real end hour.

    ### Starting state

    SOC starts at 77 (full battery); the price is the table row at
    `current_step`. `reset()` does not rewind `current_step`, so consecutive
    episodes keep walking forward through the price series. Once the series
    is exhausted, `step()` reports `truncated=True` and the next `reset()`
    starts again from row 0.
    """
    def __init__(self, penalty_factor=0.1, price_file="GR-data-11-20.csv"):
        """
        Args:
            penalty_factor: weight of the extra cost charged for discharging.
            price_file: header-less CSV with two columns (timestamp, price).
        """
        # Environment parameters
        # Random start-of-work time, hour drawn from 7..9 (inclusive)
        self.start_time = self.generate_random_time(7, 9)
        # Random end-of-work time, hour drawn from 16..18 (inclusive)
        self.end_time = self.generate_random_time(16, 18)
        # Whole-hour start/stop times derived from the two random times above
        self.real_start_time, self.real_end_time = self.calculate_real_time(self.start_time, self.end_time)
        self.battery_capacity = 77
        self.SOC = 77
        self.soc_min = 0.0
        self.soc_max = 77.0
        self.e_price_min = 0.0
        self.e_price_max = 200.0
        self.power_max = 20.0
        self.current_step = 0
        self.df_prices = pd.read_csv(price_file, header=None, names=["DateTime", "ElectricityPrice"])
        self.penalty_factor = penalty_factor

        print("length of prices", len(self.df_prices))

        # Observation space: [SOC, e_price]. The bounds are created as float32
        # so they match the declared dtype of the Box.
        self.observation_space = spaces.Box(low=np.array([self.soc_min, self.e_price_min], dtype=np.float32),
                                            high=np.array([self.soc_max, self.e_price_max], dtype=np.float32),
                                            dtype=np.float32)

        # Action space: [power]
        self.action_space = spaces.Box(low=np.array([-self.power_max], dtype=np.float32),
                                       high=np.array([self.power_max], dtype=np.float32),
                                       dtype=np.float32)

    def generate_random_time(self, start_hour, end_hour):
        """Return today's date with a random time of day.

        The hour is drawn uniformly from `start_hour..end_hour` (both ends
        included); minute and second are drawn from 0..59.
        """
        hour = random.randint(start_hour, end_hour)
        minute = random.randint(0, 59)
        second = random.randint(0, 59)
        return datetime.now().replace(hour=hour, minute=minute, second=second)

    def calculate_real_time(self, start_time, end_time):
        """Snap the two times to whole hours.

        The start is moved one hour later and then truncated to the hour; the
        end is truncated to the hour.

        Returns:
            `(real_start_time, real_end_time)` as `datetime` objects.
        """
        real_start_time = (start_time + timedelta(hours=1)).replace(minute=0, second=0, microsecond=0)
        real_end_time = end_time.replace(minute=0, second=0, microsecond=0)
        return real_start_time, real_end_time

    def read_e_price(self, index):
        """Return the price stored at row `index`, divided by 1000.

        Returns `None` (after printing a message) when `index` is outside the
        price table.
        """
        # Ensure the index is within the range of the dataframe
        if 0 <= index < len(self.df_prices):
            one_price = self.df_prices["ElectricityPrice"].iloc[index] / 1000
            return one_price
        else:
            # Handle the case where the index is out of range
            print("Index out of range.")
            return None

    def step(self, power):
        """
        Apply one action and advance the environment by one step.

        Args:
            power: requested charging power for this step; a scalar or an
                array-like whose first element is used. Clipped to
                `[-power_max, power_max]`.

        Returns:
            `(observation, reward, terminated, truncated, info)`.
            `terminated` is always False. `truncated` becomes True once there
            is no further row in the price table; the last known price is then
            kept in the observation. `info` is an empty dict.
        """
        SOC, e_price = self.state

        # Accept both a bare scalar and an array-like action, then clip it
        # into the admissible power range.
        requested = float(np.asarray(power, dtype=np.float64).reshape(-1)[0])
        power = float(np.clip(requested, -self.power_max, self.power_max))

        # New state of charge, limited to the battery bounds
        newSOC = np.clip(SOC + power, self.soc_min, self.soc_max)
        self.SOC = newSOC

        # Cost of the energy actually moved, plus a penalty on discharging.
        # min(0, power) is <= 0, so subtracting the term raises the cost.
        costs = (newSOC - SOC) * e_price
        costs -= self.penalty_factor * min(0, power) * e_price

        # Large extra cost when the battery is nearly empty right before the
        # real start/end hour.
        # NOTE(review): the threshold is compared against SOC directly (same
        # unit as soc_max), while the message below says "10%" — confirm intent.
        next_hour = (self.current_step + 1) % 24
        if next_hour in (self.real_start_time.hour, self.real_end_time.hour):
            if self.SOC < 10:
                costs += 500
                print("SOC is less than 10%: ", self.SOC)

        # Fetch the next price. When the table is exhausted, keep the current
        # price and flag truncation instead of passing None to np.clip.
        next_price = self.read_e_price(self.current_step + 1)
        truncated = next_price is None
        if truncated:
            next_price = e_price
        newe_price = np.clip(next_price, self.e_price_min, self.e_price_max)
        self.current_step += 1

        self.state = np.array([newSOC, newe_price])

        # Observation, reward, terminated, truncated, info
        return self._get_obs(), -costs, False, truncated, {}

    def reset(self):
        """
        Start a new episode with a full battery.

        `current_step` is deliberately not rewound, so each episode continues
        in the price series where the previous one stopped. Only when the
        position has run past the end of the table is it moved back to row 0.

        Returns:
            `(observation, info)` with `info` an empty dict.

        Raises:
            ValueError: if the price table has no rows.
        """
        if not 0 <= self.current_step < len(self.df_prices):
            self.current_step = 0

        start_price = self.read_e_price(self.current_step)
        if start_price is None:
            raise ValueError("price table is empty; cannot reset the environment")

        # Keep the SOC attribute in sync with the state vector, and clip the
        # price the same way step() does.
        self.SOC = self.soc_max
        start_price = np.clip(start_price, self.e_price_min, self.e_price_max)
        self.state = np.array([self.soc_max, start_price])

        return self._get_obs(), {}

    def _get_obs(self):
        """Return the current `[SOC, e_price]` state as a float32 array."""
        SOC, e_price = self.state
        return np.array([SOC, e_price], dtype=np.float32)

In [3]:
# Agent

import torch
import torch.nn as nn
import torch.optim as optim
from collections import deque

# Run on the GPU when PyTorch can see one, otherwise stay on the CPU
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
print("Device type: ", device)

# Hyperparameters
LR_ACTOR = 0.0001      # learning rate of the actor's Adam optimizer
LR_CRITIC = 0.001      # learning rate of the critic's Adam optimizer
GAMMA = 0.99           # discount applied to the bootstrapped target Q-value
MEMORY_SIZE = 100_000  # maximum number of transitions kept in the replay buffer
BATCH_SIZE = 64        # number of transitions sampled per update
TAU = 0.005            # mixing weight used in the soft target-network update

class Actor(nn.Module):
    """Deterministic policy network mapping a state to a bounded action.

    Two ReLU hidden layers are followed by a tanh output layer whose result is
    multiplied by `max_action`, so every action component lies in
    `[-max_action, max_action]`.

    Args:
        state_dim: size of the state vector.
        action_dim: size of the action vector.
        hidden_dim: width of both hidden layers.
        max_action: scale applied to the tanh output. Defaults to 20.0, the
            value that was previously hard-coded in `forward`.
    """
    def __init__(self, state_dim, action_dim, hidden_dim=64, max_action=20.0):
        super(Actor, self).__init__()
        self.fc1 = nn.Linear(state_dim, hidden_dim)
        self.fc2 = nn.Linear(hidden_dim, hidden_dim)
        self.fc3 = nn.Linear(hidden_dim, action_dim)
        # Stored as a plain attribute (not a parameter or buffer) so the
        # state_dict keys stay the same and saved weights remain loadable.
        self.max_action = max_action

    def forward(self, x):
        """Return the action for a batch of states `x`."""
        x = torch.relu(self.fc1(x))
        x = torch.relu(self.fc2(x))
        # tanh squashes to [-1, 1]; the scale stretches it to the action range
        return torch.tanh(self.fc3(x)) * self.max_action
    
class Critic(nn.Module):
    """Q-network that scores a (state, action) pair with a single value.

    The state and action are concatenated along the feature axis and passed
    through two ReLU hidden layers and a linear output layer of width 1.
    """
    def __init__(self, state_dim, action_dim, hidden_dim=64):
        super().__init__()
        joint_dim = state_dim + action_dim
        self.fc1 = nn.Linear(joint_dim, hidden_dim)
        self.fc2 = nn.Linear(hidden_dim, hidden_dim)
        self.fc3 = nn.Linear(hidden_dim, 1)

    def forward(self, x, a):
        """Return the Q-value for states `x` and actions `a` (joined on dim 1)."""
        joint = torch.cat((x, a), dim=1)
        hidden = self.fc1(joint).relu()
        hidden = self.fc2(hidden).relu()
        q_value = self.fc3(hidden)
        return q_value
    
class ReplayMemory:
    """Bounded store of transitions with uniform random sampling."""
    def __init__(self, capacity):
        # A deque with maxlen drops its oldest entry once `capacity` is reached
        self.buffer = deque(maxlen=capacity)

    def add_memo(self, state, action, reward, next_state, done):
        """Append one transition to the buffer.

        `state` and `next_state` get a leading axis of length 1 so that
        `sample` can concatenate them into a batch; the other fields are
        stored as given.
        """
        transition = (
            np.expand_dims(state, axis=0),
            action,
            reward,
            np.expand_dims(next_state, axis=0),
            done,
        )
        self.buffer.append(transition)

    def sample(self, batch_size):
        """Draw `batch_size` transitions without replacement.

        Returns:
            `(states, actions, rewards, next_states, dones)` where `states`
            and `next_states` are concatenated arrays and the remaining three
            are tuples with one entry per transition.
        """
        picked = random.sample(self.buffer, batch_size)
        states, actions, rewards, next_states, dones = zip(*picked)
        return np.concatenate(states), actions, rewards, np.concatenate(next_states), dones

    def __len__(self):
        """Number of transitions currently stored."""
        return len(self.buffer)

class DDPGAgent:
    """DDPG agent: actor and critic networks, their target copies, and a replay buffer."""
    def __init__(self, state_dim, action_dim):
        # Online actor plus a target copy that starts with identical weights
        self.actor = Actor(state_dim, action_dim).to(device)
        self.actor_target = Actor(state_dim, action_dim).to(device)
        self.actor_target.load_state_dict(self.actor.state_dict())
        self.actor_optimizer = optim.Adam(self.actor.parameters(), lr=LR_ACTOR)

        # Online critic plus a target copy that starts with identical weights
        self.critic = Critic(state_dim, action_dim).to(device)
        self.critic_target = Critic(state_dim, action_dim).to(device)
        self.critic_target.load_state_dict(self.critic.state_dict())
        self.critic_optimizer = optim.Adam(self.critic.parameters(), lr=LR_CRITIC)

        self.replay_buffer = ReplayMemory(MEMORY_SIZE)

    def get_action(self, state):
        """Return the actor's action for a single state as a NumPy array."""
        state = torch.FloatTensor(state).unsqueeze(0).to(device)
        # Inference only: no autograd graph is needed
        with torch.no_grad():
            action = self.actor(state)
        return action.cpu().numpy()[0]

    @staticmethod
    def _soft_update(target_net, online_net):
        """Move each target parameter a fraction TAU towards its online counterpart."""
        with torch.no_grad():
            for target_param, param in zip(target_net.parameters(), online_net.parameters()):
                target_param.copy_(TAU * param + (1 - TAU) * target_param)

    def update(self):
        """Run one critic step, one actor step and a soft target update.

        Does nothing until the replay buffer holds at least BATCH_SIZE
        transitions.
        """
        if len(self.replay_buffer) < BATCH_SIZE:
            return

        states, actions, rewards, next_states, dones = self.replay_buffer.sample(BATCH_SIZE)
        states = torch.FloatTensor(states).to(device)
        next_states = torch.FloatTensor(next_states).to(device)
        batch = states.shape[0]
        # The buffer returns actions/rewards/dones as tuples. Stack them once
        # and give rewards and dones an explicit column axis: the critic
        # outputs shape (batch, 1), and combining that with 1-D (batch,)
        # tensors would broadcast the TD target to (batch, batch).
        actions = torch.as_tensor(np.asarray(actions, dtype=np.float32), device=device).reshape(batch, -1)
        rewards = torch.as_tensor(np.asarray(rewards, dtype=np.float32), device=device).reshape(-1, 1)
        dones = torch.as_tensor(np.asarray(dones, dtype=np.float32), device=device).reshape(-1, 1)

        # Update critic: regress Q(s, a) onto r + GAMMA * Q'(s', mu'(s')).
        # The target is computed without gradients so nothing flows back into
        # the target networks.
        with torch.no_grad():
            next_actions = self.actor_target(next_states)
            target_Q = self.critic_target(next_states, next_actions)
            target_Q = rewards + GAMMA * target_Q * (1 - dones)
        current_Q = self.critic(states, actions)
        critic_loss = nn.MSELoss()(current_Q, target_Q)
        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        self.critic_optimizer.step()

        # Update actor: maximise the critic's value of the actor's own actions
        actor_loss = -self.critic(states, self.actor(states)).mean()
        self.actor_optimizer.zero_grad()
        actor_loss.backward()
        self.actor_optimizer.step()

        # Update target networks of critic and actor
        self._soft_update(self.critic_target, self.critic)
        self._soft_update(self.actor_target, self.actor)


  from .autonotebook import tqdm as notebook_tqdm


Device type:  cpu


In [2]:
# Train

import os

# Initialize the environment and the agent
env = ChargingEnv()
STATE_DIM = env.observation_space.shape[0]
ACTION_DIM = env.action_space.shape[0]

agent = DDPGAgent(STATE_DIM, ACTION_DIM)

# Hyperparameters
NUM_EPISODE = 100
NUM_STEP = 168        # steps per episode (168 = 7 * 24)
EPSILON_START = 1.0
EPSILON_END = 0.02
EPSILON_DECAY = 8400  # global step at which epsilon reaches EPSILON_END

REWARD_BUFFER = np.empty(shape=NUM_EPISODE)

best_reward = float('-inf')  # best episode reward seen so far

# Directory for model checkpoints, created once up front
current_path = os.getcwd()
model_path = os.path.join(current_path, "models")
os.makedirs(model_path, exist_ok=True)

for episode_i in range(NUM_EPISODE):
    state, others = env.reset()
    episode_reward = 0

    for step_i in range(NUM_STEP):
        # Linearly anneal epsilon over the first EPSILON_DECAY global steps;
        # np.interp holds it at EPSILON_END afterwards.
        epsilon = np.interp(episode_i * NUM_STEP + step_i, [0, EPSILON_DECAY],
                            [EPSILON_START, EPSILON_END])
        if random.random() <= epsilon:
            # Explore: uniform random power within the environment's own limits
            action = np.random.uniform(low=-env.power_max, high=env.power_max, size=ACTION_DIM)
        else:
            action = agent.get_action(state)
        next_state, reward, done, truncation, info = env.step(action)
        agent.replay_buffer.add_memo(state, action, reward, next_state, done)
        state = next_state
        episode_reward += reward

        agent.update()
        # Stop the episode on either end-of-episode signal from the environment
        if done or truncation:
            break
    REWARD_BUFFER[episode_i] = episode_reward

    # Keep a checkpoint of the networks from the best-scoring episode
    if episode_reward > best_reward:
        best_reward = episode_reward
        torch.save(agent.actor.state_dict(), os.path.join(model_path, "best_actor_model.pth"))
        torch.save(agent.critic.state_dict(), os.path.join(model_path, "best_critic_model.pth"))

    print(f"Episode: {episode_i + 1}, Reward: {round(episode_reward, 2)}, Best Reward: {round(best_reward, 2)}")

# Save the final networks as well
torch.save(agent.actor.state_dict(), os.path.join(model_path, "ddpg_actor.pth"))
torch.save(agent.critic.state_dict(), os.path.join(model_path, "ddpg_critic.pth"))



length of prices 17664


NameError: name 'DDPGAgent' is not defined

In [4]:
# Test
import torch
import os
import torch.nn as nn
import numpy as np

# Pick the compute device: GPU if PyTorch can see one, else CPU
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
print("Device type: ", device)

# Build the environment and read the network input/output sizes from its spaces
env = ChargingEnv()
STATE_DIM = env.observation_space.shape[0]
ACTION_DIM = env.action_space.shape[0]

# Location of the saved actor weights
current_path = os.getcwd()
model = f"{current_path}/models/"
actor_path = f"{model}best_actor_model.pth"
# To evaluate the other saved actor instead, use the file name "ddpg_actor.pth"

class Actor(nn.Module):
    """Deterministic policy network mapping a state to a bounded action.

    Two ReLU hidden layers are followed by a tanh output layer whose result is
    multiplied by `max_action`, so every action component lies in
    `[-max_action, max_action]`.

    Args:
        state_dim: size of the state vector.
        action_dim: size of the action vector.
        hidden_dim: width of both hidden layers.
        max_action: scale applied to the tanh output. Defaults to 20.0, the
            value that was previously hard-coded in `forward`.
    """
    def __init__(self, state_dim, action_dim, hidden_dim=64, max_action=20.0):
        super(Actor, self).__init__()
        self.fc1 = nn.Linear(state_dim, hidden_dim)
        self.fc2 = nn.Linear(hidden_dim, hidden_dim)
        self.fc3 = nn.Linear(hidden_dim, action_dim)
        # Stored as a plain attribute (not a parameter or buffer) so the
        # state_dict keys stay the same and saved weights remain loadable.
        self.max_action = max_action

    def forward(self, x):
        """Return the action for a batch of states `x`."""
        x = torch.relu(self.fc1(x))
        x = torch.relu(self.fc2(x))
        # tanh squashes to [-1, 1]; the scale stretches it to the action range
        return torch.tanh(self.fc3(x)) * self.max_action
    
actor = Actor(STATE_DIM, ACTION_DIM).to(device)
# map_location remaps the stored tensors onto the current device, so weights
# saved on a GPU machine can still be loaded on a CPU-only one.
actor.load_state_dict(torch.load(actor_path, map_location=device))
actor.eval()

# Hyperparameters
NUM_EPISODE = 100
NUM_STEP = 168

# List to store episode rewards
episode_rewards = []

for episode_i in range(NUM_EPISODE):
    state, others = env.reset()
    episode_reward = 0

    for step_i in range(NUM_STEP):
        # Policy action without exploration noise; no gradients needed
        with torch.no_grad():
            action = actor(torch.FloatTensor(state).unsqueeze(0).to(device)).cpu().numpy()[0]
        next_state, reward, done, truncation, info = env.step(action)
        state = next_state
        episode_reward += reward
        # Stop the episode on either end-of-episode signal from the environment
        if done or truncation:
            break

    episode_rewards.append(episode_reward)
    print(f"Episode: {episode_i + 1}, Reward: {round(episode_reward, 2)}")

# Calculate average reward
average_reward = np.mean(episode_rewards)
print(f"Average Reward over {NUM_EPISODE} episodes: {round(average_reward, 2)}")


Device type:  cpu
length of prices 17664
Episode: 1, Reward: 2.88
Episode: 2, Reward: 1.38
Episode: 3, Reward: 2.14
Episode: 4, Reward: 1.51
Episode: 5, Reward: 2.36
Episode: 6, Reward: 2.13
Episode: 7, Reward: 1.83
Episode: 8, Reward: 1.9
Episode: 9, Reward: 1.33
Episode: 10, Reward: 1.79
Episode: 11, Reward: 2.6
Episode: 12, Reward: 1.86
Episode: 13, Reward: 1.64
Episode: 14, Reward: 1.91
Episode: 15, Reward: 1.26
Episode: 16, Reward: 1.81
Episode: 17, Reward: 2.21
Episode: 18, Reward: 1.99
Episode: 19, Reward: 2.04
Episode: 20, Reward: 2.47
Episode: 21, Reward: 2.63
Episode: 22, Reward: 3.14
Episode: 23, Reward: 2.17
Episode: 24, Reward: 2.91
Episode: 25, Reward: 3.38
Episode: 26, Reward: 2.84
Episode: 27, Reward: 2.56
Episode: 28, Reward: 1.71
Episode: 29, Reward: 2.58
Episode: 30, Reward: 1.59
Episode: 31, Reward: 2.55
Episode: 32, Reward: 2.43
Episode: 33, Reward: 2.23
Episode: 34, Reward: 2.94
Episode: 35, Reward: 1.93
Episode: 36, Reward: 2.17
Episode: 37, Reward: 1.65
Episode: