In [None]:
import pandas as pd
df=pd.read_csv('sudoku.csv')
df.head()

Unnamed: 0,quizzes,solutions
0,0043002090050090010700600430060020871900074000...,8643712593258497619712658434361925871986574322...
1,0401000501070039605200080000000000170009068008...,3461792581875239645296483719658324174729168358...
2,6001203840084590720000060050002640300700800069...,6951273841384596727248369158512647392739815469...
3,4972000001004000050000160986203000403009000000...,4972583161864397252537164986293815473759641828...
4,0059103080094030600275001000300002010008200070...,4659123781894735623275681497386452919548216372...


In [None]:
!pip install stable-baselines3



In [None]:
!pip install torch



In [None]:
# Install requirements
!pip install gymnasium stable-baselines3 pandas --quiet

# Imports
import numpy as np
import pandas as pd
import gymnasium as gym
from gymnasium import spaces
from stable_baselines3 import DQN
from stable_baselines3.common.vec_env import DummyVecEnv
from stable_baselines3.common.monitor import Monitor
import torch

# Helper to convert string puzzle to 9x9 grid
def str_to_grid(s):
    return np.array([int(c) for c in s]).reshape((9, 9))

# Load dataset
df = pd.read_csv("/content/sudoku.csv")
puzzles = df["quizzes"].values
solutions = df["solutions"].values

# Define Sudoku Environment
class SudokuEnv(gym.Env):
    def __init__(self, puzzles, solutions):
        super(SudokuEnv, self).__init__()
        self.puzzles = puzzles
        self.solutions = solutions
        self.action_space = spaces.Discrete(81 * 9)
        self.observation_space = spaces.Box(low=0, high=9, shape=(9, 9), dtype=np.int32)
        self.reset()

    def reset(self, seed=None, options=None):
        super().reset(seed=seed)
        self.index = np.random.randint(0, len(self.puzzles))
        self.grid = str_to_grid(self.puzzles[self.index])
        self.solution = str_to_grid(self.solutions[self.index])
        return self.grid.copy(), {}

    def is_valid(self, row, col, num):
        if num in self.grid[row] or num in self.grid[:, col]: return False
        box = self.grid[row//3*3:row//3*3+3, col//3*3:col//3*3+3]
        return num not in box

    def get_valid_actions(self):
        mask = np.zeros(81 * 9, dtype=np.int32)
        for i in range(9):
            for j in range(9):
                if self.grid[i, j] == 0:
                    for num in range(1, 10):
                        if self.is_valid(i, j, num):
                            mask[(i * 9 + j) * 9 + (num - 1)] = 1
        return mask

    def step(self, action):
        cell = action // 9
        number = (action % 9) + 1
        row, col = divmod(cell, 9)
        reward = 0
        terminated = False
        truncated = False

        if self.grid[row, col] != 0 or not self.is_valid(row, col, number):
            reward = -1
        else:
            self.grid[row, col] = number
            reward = 1 if number == self.solution[row, col] else -0.5

        if np.array_equal(self.grid, self.solution):
            reward = 10
            terminated = True

        return self.grid.copy(), reward, terminated, truncated, {}

# Initialize environment
base_env = SudokuEnv(puzzles, solutions)
monitored_env = Monitor(base_env)
vec_env = DummyVecEnv([lambda: monitored_env])

# Auto-select CPU or GPU
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Using device: {device}")

# Train DQN model
model = DQN("MlpPolicy", vec_env, verbose=1, device=device)
model.learn(total_timesteps=50000)  # You can lower this to 10000 if short on time

# Heuristic fallback if DQN fails
def heuristic_solve(grid):
    def is_valid(board, r, c, num):
        if num in board[r] or num in board[:, c]: return False
        if num in board[r//3*3:r//3*3+3, c//3*3:c//3*3+3]: return False
        return True

    def solve():
        for i in range(9):
            for j in range(9):
                if grid[i, j] == 0:
                    for n in range(1, 10):
                        if is_valid(grid, i, j, n):
                            grid[i, j] = n
                            if solve():
                                return True
                            grid[i, j] = 0
                    return False
        return True

    solve()
    return grid

# Custom solve function
def solve_sudoku_custom(puzzle_str):
    puzzle = str_to_grid(puzzle_str)
    base_env.grid = puzzle.copy()
    obs, _ = base_env.reset()
    success = False

    for _ in range(81):
        valid = base_env.get_valid_actions()
        obs_tensor = torch.tensor(obs, dtype=torch.float32).unsqueeze(0)  # <<< CPU-safe tensor
        q_values = model.q_net(obs_tensor).detach().numpy()[0]
        masked_q = np.where(valid == 1, q_values, -np.inf)

        if np.all(masked_q == -np.inf):
            break

        action = int(np.argmax(masked_q))
        obs, reward, terminated, truncated, _ = base_env.step(action)
        if terminated:
            success = True
            break

    if success and np.all(base_env.grid != 0):
        print("\n✅ Solved using DQN:")
        print(base_env.grid)
    else:
        print("\n⚠ DQN failed. Solving with heuristic...")
        solved = heuristic_solve(puzzle.copy())
        print("✅ Solved with heuristic:")
        print(solved)

# Example puzzle
custom_puzzle = "003020600900305001001806400008102900700000008006708200002609500800203009005010300"
solve_sudoku_custom(custom_puzzle)


Using device: cuda
Using cuda device


RuntimeError: Expected all tensors to be on the same device, but found at least two devices, cuda:0 and cpu! (when checking argument for argument mat1 in method wrapper_CUDA_addmm)

In [None]:
!pip install stable-baselines3[extra]
!pip install wandb
!pip install stable-baselines3


In [None]:
!pip install stable-baselines3[extra]

In [None]:
!pip install stable-baselines3

In [None]:
# Imports
import numpy as np
import pandas as pd
import gymnasium as gym
from gymnasium import spaces
from stable_baselines3 import DQN
from stable_baselines3.common.vec_env import DummyVecEnv
from stable_baselines3.common.monitor import Monitor
import torch
import random

# Helper to convert string puzzle to 9x9 grid
def str_to_grid(s):
    if isinstance(s, (np.ndarray, list)):
        s = s[0]
    return np.array(list(map(int, list(str(s))))).reshape((9, 9))

# Load dataset
df = pd.read_csv("/content/sudoku.csv")
puzzles = df["quizzes"].values
solutions = df["solutions"].values

# Define Sudoku Environment
class SudokuEnv(gym.Env):
    def __init__(self, puzzles, solutions):
        super(SudokuEnv, self).__init__()
        self.puzzles = puzzles
        self.solutions = solutions
        self.action_space = spaces.Discrete(81 * 9)
        self.observation_space = spaces.Box(low=0, high=9, shape=(9, 9), dtype=np.int32)
        self.reset()

    def reset(self, seed=None, options=None):
        super().reset(seed=seed)
        self.index = np.random.randint(0, len(self.puzzles))
        self.grid = str_to_grid(self.puzzles[self.index])
        self.solution = str_to_grid(self.solutions[self.index])
        return self.grid.copy(), {}

    def is_valid(self, row, col, num):
        if num in self.grid[row] or num in self.grid[:, col]: return False
        box = self.grid[row//3*3:row//3*3+3, col//3*3:col//3*3+3]
        return num not in box

    def get_valid_actions(self):
        mask = np.zeros(81 * 9, dtype=np.int32)
        for i in range(9):
            for j in range(9):
                if self.grid[i, j] == 0:
                    for num in range(1, 10):
                        if self.is_valid(i, j, num):
                            mask[(i * 9 + j) * 9 + (num - 1)] = 1
        return mask

    def step(self, action):
        cell = action // 9
        number = (action % 9) + 1
        row, col = divmod(cell, 9)
        reward = 0
        terminated = False
        truncated = False

        if self.grid[row, col] != 0 or not self.is_valid(row, col, number):
            reward = -1
        else:
            self.grid[row, col] = number
            reward = 1 if number == self.solution[row, col] else -0.5

        if np.array_equal(self.grid, self.solution):
            reward = 10
            terminated = True

        return self.grid.copy(), reward, terminated, truncated, {}

# Heuristic solver to provide fallback solutions
def heuristic_solve(grid):
    def is_valid(board, r, c, num):
        if num in board[r] or num in board[:, c]: return False
        if num in board[r//3*3:r//3*3+3, c//3*3:c//3*3+3]: return False
        return True

    def solve():
        for i in range(9):
            for j in range(9):
                if grid[i, j] == 0:
                    for n in range(1, 10):
                        if is_valid(grid, i, j, n):
                            grid[i, j] = n
                            if solve():
                                return True
                            grid[i, j] = 0
                    return False
        return True

    solve()
    return grid

# Function to store heuristic experience and improve DQN
def learn_from_heuristic(puzzle_str, heuristic_solution, rewards, model, env):
    grid = str_to_grid(puzzle_str)
    for idx, value in enumerate(heuristic_solution.flatten()):
        if grid.flatten()[idx] == 0:  # Only learn from filled cells
            action = idx * 9 + (value - 1)
            model.replay_buffer.add(grid.copy(), action, 1, grid.copy(), False)

# Custom solve function

def solve_sudoku_custom(puzzle_str, model, env, device):
    puzzle = str_to_grid(puzzle_str)
    env.grid = puzzle.copy()
    obs, _ = env.reset()
    success = False
    episode_rewards = []

    for _ in range(81):
        valid = env.get_valid_actions()
        obs_tensor = torch.tensor(obs, dtype=torch.float32).unsqueeze(0).to(device)
        q_values = model.q_net(obs_tensor).detach().cpu().numpy()[0]
        masked_q = np.where(valid == 1, q_values, -np.inf)

        if np.all(masked_q == -np.inf):
            break

        action = int(np.argmax(masked_q))
        obs, reward, terminated, truncated, _ = env.step(action)
        episode_rewards.append(reward)

        if terminated:
            success = True
            break

    if not success:
        print("\n\u26A0 DQN failed. Solving with heuristic...")
        heuristic_solution = heuristic_solve(puzzle.copy())
        print("\u2705 Solved with heuristic: ")
        print(heuristic_solution)
        learn_from_heuristic(puzzle_str, heuristic_solution, episode_rewards, model, env)

    return success, puzzle

# Initialize environment
base_env = SudokuEnv(puzzles, solutions)
monitored_env = Monitor(base_env)
vec_env = DummyVecEnv([lambda: monitored_env])

# Device auto-selection
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")l

# Train DQN model
model = DQN("MlpPolicy", vec_env, verbose=1, device=device)
model.learn(total_timesteps=50000)

# Solve custom puzzle
easiest_puzzle = "534678912672195348198342567859761423426853791713924856961537284287419635345286170"
solve_sudoku_custom(custom_puzzle, model, base_env, device)


Using device: cuda
Using cuda device

⚠ DQN failed. Solving with heuristic...
✅ Solved with heuristic: 
[[5 3 4 6 7 8 9 1 2]
 [6 7 2 1 9 5 3 4 8]
 [1 9 8 3 4 2 5 6 7]
 [8 5 9 7 6 1 4 2 3]
 [4 2 6 8 5 3 7 9 1]
 [7 1 3 9 2 4 8 5 6]
 [9 6 1 5 3 7 2 8 4]
 [2 8 7 4 1 9 6 3 5]
 [3 4 5 2 8 6 1 7 9]]


TypeError: ReplayBuffer.add() missing 1 required positional argument: 'infos'