### IMPORTS

In [None]:
import gym
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline 

import torch
import torch.nn as nn 
import torch.nn.functional as F
import torch.optim as optim
from torch.distributions import Categorical
from collections import deque 

In [None]:
# Run on the first CUDA GPU when PyTorch can see one; otherwise stay on the CPU.
device = torch.device("cuda:0") if torch.cuda.is_available() else torch.device("cpu")

### ENVIRONMENT 

In [None]:
# Two separate instances of the same task: one to train on, one reserved for evaluation.
env_id = "CartPole-v1"
env, eval_env = gym.make(env_id), gym.make(env_id)

# Sizes used for the policy network: number of discrete actions (output width)
# and length of the observation vector (input width).
actionSpaceSize = env.action_space.n
stateSpaceSize = env.observation_space.shape[0]

  deprecation(
  deprecation(


In [None]:
# Report the observation size together with one randomly drawn observation.
sample_observation = env.observation_space.sample()
print("The State Space is: ", stateSpaceSize)
print("Sample observation", sample_observation)

The State Space is:  4
Sample observation [2.8456905e+00 1.9619315e+38 2.7711576e-01 3.2348458e+38]


In [None]:
# Report the number of actions together with one randomly drawn action.
sample_action = env.action_space.sample()
print("The Action Space is: ", actionSpaceSize)
print("Action Space Sample", sample_action)

The Action Space is:  2
Action Space Sample 0


### REINFORCE ARCHITECTURE

In [None]:
class Policy(nn.Module):
  """Softmax policy: one hidden ReLU layer followed by a linear action head.

  Args:
    stateSpaceSize: Number of input features (length of a state vector).
    actionSpaceSize: Number of discrete actions (width of the output).
    h_size: Width of the hidden layer.
  """

  def __init__(self,stateSpaceSize,actionSpaceSize,h_size):
    super().__init__()

    self.fc1 = nn.Linear(stateSpaceSize,h_size)
    self.fc2 = nn.Linear(h_size,actionSpaceSize)

  def forward(self,x):
    """Return action probabilities for ``x``; the last axis sums to 1."""
    hidden = F.relu(self.fc1(x))
    logits = self.fc2(hidden)
    # Normalize over the action axis, which is always the last one coming out of fc2.
    # (Identical to dim=1 for the 2-D batches this notebook feeds in.)
    return F.softmax(logits, dim=-1)

  def act(self,state):
    """Sample an action for a single state.

    Args:
      state: NumPy array holding one state (no batch axis).

    Returns:
      Tuple ``(action, log_prob)``: the sampled action as a Python int and its
      log-probability as a differentiable tensor of shape ``(1,)``.
    """
    # Follow the device the weights actually live on instead of a notebook-level
    # global, so the policy keeps working wherever it has been moved with .to().
    model_device = next(self.parameters()).device
    state = torch.from_numpy(state).float().unsqueeze(0).to(model_device)
    probs = self(state)
    # Categorical distribution over the actions, parameterized by the probabilities.
    m = Categorical(probs)
    action = m.sample()

    return action.item(), m.log_prob(action)


In [None]:
def reinforce(policy, optimizer, n_training_episodes, max_t, gamma, print_every):
    """Train ``policy`` with the REINFORCE (Monte-Carlo policy-gradient) update.

    Every iteration rolls out one episode, computes the discounted return of each
    step, standardizes those returns and takes a single optimizer step on
    ``sum(-log_prob_t * return_t)``.

    Note: the episode is collected from the notebook-level ``env`` variable, which
    is not a parameter of this function.

    Args:
        policy: Object exposing ``act(state) -> (action, log_prob)`` where
            ``log_prob`` is a differentiable tensor with at least one dimension.
        optimizer: Optimizer over the policy's parameters.
        n_training_episodes: Number of episodes to roll out and train on.
        max_t: Maximum number of steps per episode.
        gamma: Discount factor applied to future rewards.
        print_every: Print the running average score every this many episodes.

    Returns:
        list: Undiscounted total reward of each training episode, in order.
    """
    # Sliding window over the most recent 100 episode scores (progress report only).
    scores_deque = deque(maxlen=100)
    scores = []
    # float32 machine epsilon: keeps the division finite when all returns of an
    # episode are equal (std == 0).
    eps = np.finfo(np.float32).eps.item()

    for i_episode in range(1, n_training_episodes + 1):
        saved_log_probs = []
        rewards = []
        state = env.reset()
        for _step in range(max_t):
            action, log_prob = policy.act(state)
            saved_log_probs.append(log_prob)
            state, reward, done, _ = env.step(action)
            rewards.append(reward)
            if done:
                break
        episode_score = sum(rewards)
        scores_deque.append(episode_score)
        scores.append(episode_score)

        # Discounted returns built back to front: G_t = r_t + gamma * G_{t+1}.
        discounted = []
        running_return = 0
        for step_reward in reversed(rewards):
            running_return = gamma * running_return + step_reward
            discounted.append(running_return)
        discounted.reverse()

        returns = torch.tensor(discounted, dtype=torch.float32)
        # Standardize the returns to stabilize training. This needs at least two
        # samples: the sample standard deviation of a single value is NaN, which
        # would turn the loss, and after the step every parameter, into NaN.
        if returns.numel() > 1:
            returns = (returns - returns.mean()) / (returns.std() + eps)

        policy_loss = torch.cat(
            [-log_prob * disc_return
             for log_prob, disc_return in zip(saved_log_probs, returns)]
        ).sum()

        optimizer.zero_grad()
        policy_loss.backward()
        optimizer.step()

        if i_episode % print_every == 0:
            print('Episode {}\tAverage Score: {:.2f}'.format(i_episode, np.mean(scores_deque)))

    return scores

### HYPERPARAMS

In [None]:
# All settings for the CartPole run, collected in a single mapping.
cartpole_hyperparameters = dict(
    h_size=16,                      # hidden-layer width handed to Policy
    n_training_episodes=500,        # episodes used for training
    n_evaluation_episodes=10,       # episodes used for evaluation
    max_t=500,                      # per-episode step cap
    gamma=1.0,                      # discount factor
    lr=1e-2,                        # optimizer learning rate
    env_id=env_id,
    state_space=stateSpaceSize,
    action_space=actionSpaceSize,
)

In [None]:
# Build the policy network from the configured sizes, then move it to the chosen device.
cartpole_policy = Policy(
    cartpole_hyperparameters["state_space"],
    cartpole_hyperparameters["action_space"],
    cartpole_hyperparameters["h_size"],
)
cartpole_policy = cartpole_policy.to(device)

# Adam over the policy's weights, using the configured learning rate.
cartpole_optimizer = optim.Adam(
    cartpole_policy.parameters(),
    lr=cartpole_hyperparameters["lr"],
)

In [None]:
# Train the policy; a progress line is printed every 50 episodes.
scores = reinforce(
    policy=cartpole_policy,
    optimizer=cartpole_optimizer,
    n_training_episodes=cartpole_hyperparameters["n_training_episodes"],
    max_t=cartpole_hyperparameters["max_t"],
    gamma=cartpole_hyperparameters["gamma"],
    print_every=50,
)

Episode 50	Average Score: 22.18
Episode 100	Average Score: 24.93
Episode 150	Average Score: 50.39
Episode 200	Average Score: 120.94
Episode 250	Average Score: 196.34
Episode 300	Average Score: 241.34
Episode 350	Average Score: 227.81
Episode 400	Average Score: 318.54
Episode 450	Average Score: 389.79
Episode 500	Average Score: 419.12


### Evaluating agent 

In [None]:
def evaluate_agent(env, max_steps, n_eval_episodes, policy):
  """Roll out ``policy`` for several episodes and summarize the rewards collected.

  Args:
    env: Environment exposing ``reset()`` and ``step(action)``; ``step`` must
      return a ``(state, reward, done, info)`` 4-tuple.
    max_steps: Upper bound on the number of steps taken in each episode.
    n_eval_episodes: How many episodes to run.
    policy: Object whose ``act(state)`` returns a pair; only its first item
      (the action) is used.

  Returns:
    Tuple ``(mean_reward, std_reward)`` computed with NumPy over the per-episode
    reward totals.
  """
  totals = []
  for _episode in range(n_eval_episodes):
    observation = env.reset()
    episode_total = 0
    steps_taken = 0

    # Step until the environment signals termination or the step budget runs out.
    while steps_taken < max_steps:
      chosen_action, _ = policy.act(observation)
      observation, reward, finished, _info = env.step(chosen_action)
      episode_total += reward
      steps_taken += 1
      if finished:
        break

    totals.append(episode_total)

  return np.mean(totals), np.std(totals)

In [None]:
# Score the trained policy on the held-out environment; the cell displays (mean, std).
evaluate_agent(
    env=eval_env,
    max_steps=cartpole_hyperparameters["max_t"],
    n_eval_episodes=cartpole_hyperparameters["n_evaluation_episodes"],
    policy=cartpole_policy,
)

(500.0, 0.0)

### PUSH TO HUB

In [None]:
%%capture
!pip install huggingface_hub
!pip install git+https://github.com/ntasfi/PyGame-Learning-Environment.git
!pip install git+https://github.com/qlan3/gym-games.git
!apt install python-opengl
!pip install imageio-ffmpeg
!apt install ffmpeg
!apt install xvfb
!pip install pyyaml==6.0 # avoid key error metadata
!pip install pyglet #
from huggingface_hub import HfApi, HfFolder, Repository
from huggingface_hub.repocard import metadata_eval_result, metadata_save

from pathlib import Path
import datetime
import json
import gym_pygame
import imageio

In [None]:
def record_video(env, policy, out_directory, fps=30):
  """Play one episode with ``policy`` and write the rendered frames to a video file.

  Args:
    env: Environment exposing ``reset()``, ``step(action)`` (4-tuple result) and
      ``render(mode='rgb_array')``.
    policy: Object whose ``act(state)`` returns a pair; only the action is used.
    out_directory: Destination handed to ``imageio.mimsave`` (the output file path).
    fps: Frame rate of the written video.
  """
  state = env.reset()
  # The first frame shows the freshly reset environment, before any action is taken.
  frames = [env.render(mode='rgb_array')]

  finished = False
  while not finished:
    # Ask the policy for an action and advance the environment by one step.
    action, _ = policy.act(state)
    state, _reward, finished, _info = env.step(action)
    frames.append(env.render(mode='rgb_array'))

  imageio.mimsave(out_directory, [np.array(frame) for frame in frames], fps=fps)

In [None]:
import os
def package_to_hub(repo_id, 
                model,
                hyperparameters,
                eval_env,
                video_fps=30,
                local_repo_path="hub",
                commit_message="Push Reinforce agent to the Hub",
                token= None
                ):
  """Evaluate ``model``, assemble a model repository around it and push it to the Hub.

  The local clone ends up containing ``model.pt``, ``hyperparameters.json``,
  ``results.json``, ``README.md`` (with evaluation metadata) and ``replay.mp4``.

  Args:
    repo_id: Repository id of the form ``"<namespace>/<repo-name>"``; it must
      contain exactly one ``"/"``.
    model: Policy to save, evaluate and record.
    hyperparameters: Dict that is dumped to JSON as a whole; the keys ``"max_t"``,
      ``"n_evaluation_episodes"`` and ``"env_id"`` are read here.
    eval_env: Environment used both for the evaluation and for the replay video.
    video_fps: Frame rate of the replay video.
    local_repo_path: Directory under which the repository is cloned.
    commit_message: Commit message used for the push.
    token: Optional access token. It is passed to ``create_repo`` and, when given,
      also to the git clone/push; when ``None`` the clone/push is created with
      ``use_auth_token=True`` as before.
  """
  _, repo_name = repo_id.split("/")

  # Step 1: Create the repo on the Hub (no-op if it already exists) and clone it
  api = HfApi()

  repo_url = api.create_repo(
        repo_id=repo_id,
        token=token,
        private=False,
        exist_ok=True,)

  repo_local_path = Path(local_repo_path) / repo_name
  # Use the caller's token for git operations too, so create/clone/push all
  # authenticate the same way.
  repo = Repository(repo_local_path,
                    clone_from=repo_url,
                    use_auth_token=token if token is not None else True)
  repo.git_pull()

  # Videos go through git-lfs
  repo.lfs_track(["*.mp4"])

  # Step 2: Save the model (the whole module object, not only its state_dict)
  torch.save(model, repo_local_path / "model.pt")

  # Step 3: Save the hyperparameters to JSON
  with open(repo_local_path / "hyperparameters.json", "w") as outfile:
    json.dump(hyperparameters, outfile)

  # Step 4: Evaluate the model and write the results to JSON
  mean_reward, std_reward = evaluate_agent(eval_env, 
                                           hyperparameters["max_t"],
                                           hyperparameters["n_evaluation_episodes"], 
                                           model)

  eval_form_datetime = datetime.datetime.now().isoformat()

  evaluate_data = {
        "env_id": hyperparameters["env_id"], 
        "mean_reward": mean_reward,
        "n_evaluation_episodes": hyperparameters["n_evaluation_episodes"],
        "eval_datetime": eval_form_datetime,
  }
  with open(repo_local_path / "results.json", "w") as outfile:
      json.dump(evaluate_data, outfile)

  # Step 5: Create the model card
  env_name = hyperparameters["env_id"]

  metadata = {}
  metadata["tags"] = [
        env_name,
        "reinforce",
        "reinforcement-learning",
        "custom-implementation",
        "deep-rl-class"
    ]

  # Evaluation metrics (named eval_metadata so the builtin `eval` is not shadowed)
  eval_metadata = metadata_eval_result(
      model_pretty_name=repo_name,
      task_pretty_name="reinforcement-learning",
      task_id="reinforcement-learning",
      metrics_pretty_name="mean_reward",
      metrics_id="mean_reward",
      metrics_value=f"{mean_reward:.2f} +/- {std_reward:.2f}",
      dataset_pretty_name=env_name,
      dataset_id=env_name,
    )

  # Merges both dictionaries
  metadata = {**metadata, **eval_metadata}

  # Describe the environment this function was actually given (env_name), not
  # whatever notebook-level `env_id` happens to be defined.
  model_card = f"""
  # **Reinforce** Agent playing **{env_name}**
  This is a trained model of a **Reinforce** agent playing **{env_name}** .
  To learn to use this model and train yours check Unit 5 of the Deep Reinforcement Learning Class: https://github.com/huggingface/deep-rl-class/tree/main/unit5
  """

  # Keep an existing README as is; only a missing one is seeded with the model card.
  readme_path = repo_local_path / "README.md"
  readme = ""
  if readme_path.exists():
      with readme_path.open("r", encoding="utf8") as f:
        readme = f.read()
  else:
    readme = model_card

  with readme_path.open("w", encoding="utf-8") as f:
    f.write(readme)

  # Save our metrics to Readme metadata
  metadata_save(readme_path, metadata)

  # Step 6: Record a video with the environment passed in by the caller
  video_path =  repo_local_path / "replay.mp4"
  record_video(eval_env, model, video_path, video_fps)

  # Step 7: Push everything to the Hub
  print(f"Pushing repo {repo_name} to the Hugging Face Hub")
  repo.push_to_hub(commit_message=commit_message)

  print(f"Your model is pushed to the hub. You can view your model here: {repo_url}")

In [None]:
# Authenticate this notebook session with the Hugging Face Hub before pushing.
# NOTE(review): presumably this stores the access token that the later clone/push relies on — confirm.
from huggingface_hub import notebook_login
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [None]:
# Target repository, written as "<namespace>/<repo-name>" (package_to_hub splits it on "/").
repo_id = "Vis03al/Reinforce-cartpole"
package_to_hub(
    repo_id=repo_id,
    model=cartpole_policy,  # The model we want to save
    hyperparameters=cartpole_hyperparameters,  # Hyperparameters
    eval_env=eval_env,  # Evaluation environment
    video_fps=30,
    local_repo_path="hub",
    commit_message="Push Reinforce agent to the Hub",
)

See here for more information: https://www.gymlibrary.ml/content/api/[0m
  deprecation(


Pushing repo Reinforce-cartpole to the Hugging Face Hub


Upload file replay.mp4:  80%|#######9  | 32.0k/40.1k [00:00<?, ?B/s]

Upload file model.pt: 100%|##########| 2.52k/2.52k [00:00<?, ?B/s]

remote: LFS file scan complete.        
To https://huggingface.co/Vis03al/Reinforce-cartpole
   ac87a3f..10256bc  main -> main



Your model is pushed to the hub. You can view your model here: https://huggingface.co/Vis03al/Reinforce-cartpole
