<a href="https://colab.research.google.com/github/Papa-Panda/Paper_reading/blob/main/Deepseek_r1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
from collections import deque

import deepspeed
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim

# Define the grid environment
class GridEnvironment:
    """Square grid world with an agent that walks from (0, 0) to the far corner.

    The state is an ``(x, y)`` tuple. The goal cell is
    ``(size - 1, size - 1)``. Actions are integer codes: 0 = up (x - 1),
    1 = down (x + 1), 2 = left (y - 1), 3 = right (y + 1).
    """

    def __init__(self, size=5):
        """Create a ``size`` x ``size`` grid with the agent at the origin."""
        self.size = size
        self.goal = (size - 1, size - 1)
        self.state = (0, 0)

    def reset(self):
        """Move the agent back to (0, 0) and return that state."""
        self.state = (0, 0)
        return self.state

    def step(self, action):
        """Apply ``action`` and return ``(state, reward, done)``.

        A move that would leave the grid, or an action code outside 0-3,
        leaves the position unchanged. The reward is 1 when the resulting
        state is the goal and -0.1 otherwise; ``done`` is True exactly when
        the goal is reached.
        """
        row, col = self.state
        last = self.size - 1

        if action == 0:  # Up
            if row > 0:
                row -= 1
        elif action == 1:  # Down
            if row < last:
                row += 1
        elif action == 2:  # Left
            if col > 0:
                col -= 1
        elif action == 3:  # Right
            if col < last:
                col += 1

        self.state = (row, col)
        reached_goal = self.state == self.goal
        reward = 1 if reached_goal else -0.1
        return self.state, reward, reached_goal

# Define the DQN model
class DQN(nn.Module):
    """Fully connected network mapping a state vector to one value per action.

    Architecture: Linear(input_dim, 128) -> ReLU -> Linear(128, 128) -> ReLU
    -> Linear(128, output_dim).
    """

    def __init__(self, input_dim, output_dim):
        """Build the layer stack.

        Args:
            input_dim: Number of features in each input state.
            output_dim: Number of outputs (one per action).
        """
        super().__init__()
        hidden_units = 128
        # Layers are created in forward order and stored under ``fc`` so the
        # parameter names (fc.0, fc.2, fc.4) stay stable.
        layers = [
            nn.Linear(input_dim, hidden_units),
            nn.ReLU(),
            nn.Linear(hidden_units, hidden_units),
            nn.ReLU(),
            nn.Linear(hidden_units, output_dim),
        ]
        self.fc = nn.Sequential(*layers)

    def forward(self, x):
        """Return the network output for input batch ``x``."""
        outputs = self.fc(x)
        return outputs

# Hyperparameters
# Environment / network dimensions
GRID_SIZE = 5        # side length of the square grid
STATE_DIM = 2        # network input size; a state is an (x, y) pair
ACTION_DIM = 4       # up, down, left, right

# Training schedule
EPISODES = 500
GAMMA = 0.99         # discount factor applied to the bootstrapped next-state value

# Epsilon-greedy exploration; EPSILON is the starting value and is rebound
# (decayed) by the training loop after every episode.
EPSILON = 1.0
EPSILON_DECAY = 0.995
MIN_EPSILON = 0.1

# Replay and optimisation
BATCH_SIZE = 32
MEMORY_SIZE = 10_000
LEARNING_RATE = 1e-3

# Replay buffer
class ReplayBuffer:
    """Fixed-capacity FIFO store of transitions with uniform random sampling.

    Args:
        size: Maximum number of transitions kept; must be a positive integer.

    Raises:
        ValueError: If ``size`` is not positive.
    """

    def __init__(self, size):
        if size <= 0:
            # A zero-capacity buffer previously failed later, on the first
            # add(), with an IndexError from popping an empty list.
            raise ValueError(f"size must be positive, got {size}")
        self.size = size
        # deque(maxlen=...) evicts the oldest entry in O(1) when full;
        # list.pop(0) shifted every remaining element on each insert.
        self.buffer = deque(maxlen=size)

    def add(self, transition):
        """Append ``transition``, dropping the oldest entry once the buffer is full."""
        self.buffer.append(transition)

    def sample(self, batch_size):
        """Return ``batch_size`` distinct stored transitions chosen uniformly.

        Args:
            batch_size: Number of transitions to draw without replacement.

        Returns:
            A list of the selected transitions.

        Raises:
            ValueError: Raised by ``np.random.choice`` when ``batch_size``
                exceeds the number of stored transitions.
        """
        indices = np.random.choice(len(self.buffer), batch_size, replace=False)
        # tolist() yields plain Python ints for indexing into the deque.
        return [self.buffer[i] for i in indices.tolist()]

# Initialize environment, model, and buffer
# Keyword arguments make explicit which hyperparameter feeds which component.
env = GridEnvironment(size=GRID_SIZE)
model = DQN(input_dim=STATE_DIM, output_dim=ACTION_DIM)
buffer = ReplayBuffer(size=MEMORY_SIZE)

# DeepSpeed configuration
deepspeed_config = {
    "train_batch_size": BATCH_SIZE,
    "gradient_accumulation_steps": 1,
    # fp16 is disabled: with it enabled DeepSpeed runs the module in half
    # precision, while the training loop feeds float32 tensors
    # (torch.FloatTensor), which is a dtype mismatch. fp16 also requires a
    # GPU, and this small network gains nothing from mixed precision.
    "fp16": {
        "enabled": False
    },
    "optimizer": {
        "type": "Adam",
        "params": {
            "lr": LEARNING_RATE
        }
    }
}

# Initialize DeepSpeed: wraps the model in an engine that owns the optimizer
# built from the "optimizer" section of the config above.
model_engine, optimizer, _, _ = deepspeed.initialize(
    model=model,
    model_parameters=model.parameters(),
    config=deepspeed_config,
)

# Training loop
# The loss module holds no state, so build it once instead of once per update.
loss_fn = nn.MSELoss()
device = model_engine.device

for episode in range(EPISODES):
    state = torch.FloatTensor(env.reset()).unsqueeze(0).to(device)
    total_reward = 0

    while True:
        # Epsilon-greedy action selection.
        if np.random.rand() < EPSILON:
            action = np.random.randint(ACTION_DIM)
        else:
            # Acting needs no gradients; avoid building an unused autograd graph.
            with torch.no_grad():
                action = torch.argmax(model_engine(state)).item()

        next_state, reward, done = env.step(action)
        next_state = torch.FloatTensor(next_state).unsqueeze(0).to(device)
        buffer.add((state, action, reward, next_state, done))

        state = next_state
        total_reward += reward

        # Learn only once enough transitions exist to fill a batch.
        if len(buffer.buffer) >= BATCH_SIZE:
            batch = buffer.sample(BATCH_SIZE)
            states, actions, rewards, next_states, dones = zip(*batch)

            states = torch.cat(states).to(device)
            actions = torch.LongTensor(actions).unsqueeze(1).to(device)
            rewards = torch.FloatTensor(rewards).unsqueeze(1).to(device)
            next_states = torch.cat(next_states).to(device)
            dones = torch.FloatTensor(dones).unsqueeze(1).to(device)

            # The TD target is treated as a constant: computing it under
            # no_grad keeps gradients from flowing through the bootstrapped
            # next-state value, so only Q(s, a) is pushed toward the target.
            with torch.no_grad():
                next_q_values = model_engine(next_states).max(1)[0].unsqueeze(1)
                # (1 - dones) zeroes the bootstrap term for terminal transitions.
                target_q_values = rewards + GAMMA * next_q_values * (1 - dones)

            q_values = model_engine(states).gather(1, actions)

            loss = loss_fn(q_values, target_q_values)
            model_engine.backward(loss)
            model_engine.step()

        if done:
            break

    # Decay exploration once per episode, never dropping below MIN_EPSILON.
    EPSILON = max(EPSILON * EPSILON_DECAY, MIN_EPSILON)
    print(f"Episode {episode + 1}, Total Reward: {total_reward}, Epsilon: {EPSILON:.2f}")

print("Training complete!")


ModuleNotFoundError: No module named 'deepspeed'