## 1. Install necessary Libraries

In [5]:
# ! pip install transformers
# ! pip install tiktoken matplotlib
# ! pip install einops transformers_stream_generator
# ! pip install accelerate
# ! pip install peft

##  2. All Imports

In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F

from transformers import AutoModelForCausalLM, AutoTokenizer
from transformers.generation import GenerationConfig
import torch
torch.manual_seed(1234)

import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F

import sys
import os
# Replace the path below with the local directory where Hugging Face cached the Qwen-VL-Chat remote-code modules
sys.path.append(os.path.abspath("/root/.cache/huggingface/modules/transformers_modules/Qwen/Qwen-VL-Chat/37124e214e313d50125bad97b23270b64acb012c/"))
from qwen_generation_utils import make_context, decode_tokens

from peft import LoraConfig, get_peft_model, TaskType

from dataclasses import dataclass, field

##  3. PEFT and LoRA Config

In [2]:
# LoRA adapter settings; PPO_Network passes this object to `get_peft_model`.
lora_config = LoraConfig(
    task_type="CAUSAL_LM",
    # Names of the sub-modules that receive LoRA adapters.
    target_modules=["c_attn", "c_proj", "w1", "w2"],
    r=8,            # original inline note listed 64 as an alternative value
    lora_alpha=4,   # original inline note listed 16 as an alternative value
    lora_dropout=0.05,
    bias="none",
)

def print_number_of_trainable_model_parameters(model):
    """Summarise how many of a model's parameters are trainable.

    Despite the name, nothing is printed: the summary is returned as a string
    so the caller can decide how to display it.

    Args:
        model: Any object exposing ``named_parameters()`` (e.g. an ``nn.Module``).

    Returns:
        A three-line string with the number of trainable parameters, the total
        number of parameters, and the trainable percentage (two decimals).
        A model without parameters reports ``0.00`` percent instead of raising
        ``ZeroDivisionError``.
    """
    trainable_model_params = 0
    all_model_params = 0
    for _, param in model.named_parameters():
        count = param.numel()
        all_model_params += count
        if param.requires_grad:
            trainable_model_params += count

    # Guard the ratio so parameter-free modules do not divide by zero.
    if all_model_params:
        percentage = 100 * trainable_model_params / all_model_params
    else:
        percentage = 0.0

    return (
        f"trainable model parameters: {trainable_model_params}\n"
        f"all model parameters: {all_model_params}\n"
        f"percentage of trainable model parameters: {percentage:.2f}"
    )

## 3. Create a Policy Network Using Qwen-VL

In [3]:
class PPO_Network(nn.Module):
    """Qwen-VL-Chat with LoRA adapters plus a small value head.

    The language model provides the logits that are turned into the "policy";
    a separate MLP on the mean-pooled last hidden state provides the value
    estimate.
    """

    def __init__(self, num_actions):
        """Load tokenizer, model and generation config, attach LoRA and build the value head.

        Args:
            num_actions: Accepted for interface compatibility; it is not used
                anywhere in this class.
        """
        super().__init__()

        cache_dir = "./model_cache"

        self.tokenizer = AutoTokenizer.from_pretrained(
            "Qwen/Qwen-VL-Chat", trust_remote_code=True, cache_dir=cache_dir
        )
        self.model = AutoModelForCausalLM.from_pretrained(
            "Qwen/Qwen-VL-Chat", device_map="cuda", trust_remote_code=True, cache_dir=cache_dir
        )

        # Report the parameter counts before and after wrapping with LoRA.
        print(print_number_of_trainable_model_parameters(self.model))
        self.model = get_peft_model(self.model, lora_config)
        print(print_number_of_trainable_model_parameters(self.model))

        self.generation_config = GenerationConfig.from_pretrained("Qwen/Qwen-VL-Chat", trust_remote_code=True)

        # Width of the features fed to the LM head; reused as the value head input width.
        n_embd = self.model.lm_head.in_features

        # Value head: one scalar output per pooled embedding.
        num_labels = 1
        self.value_head = nn.Sequential(
            nn.LayerNorm(n_embd),
            nn.GELU(),
            nn.Linear(n_embd, n_embd // 2),
            nn.GELU(),
            nn.Linear(n_embd // 2, num_labels),
        ).to(torch.bfloat16).to(self.model.device)

        self.logit_head = self.model.get_output_embeddings()
        # NOTE(review): dim=1 normalises over the second axis of whatever is passed in.
        # forward() applies this to the LM-head output, which presumably is
        # (batch, sequence, vocab), so this would normalise across sequence positions
        # rather than the vocabulary. Downstream cells also treat dim 1 as the action
        # axis, so it is left unchanged here - confirm the intended policy shape.
        self.softmax = nn.Softmax(dim=1)

    def query_to_input_ids(self, query, tokenizer):
        """Turn a text query into model input ids using Qwen's chat template.

        Args:
            query: The user prompt.
            tokenizer: Tokenizer handed to ``make_context``; it must expose
                ``im_end_id`` and ``im_start_id``.

        Returns:
            Tuple ``(input_ids, raw_text, context_tokens, stop_words_ids)`` where
            ``input_ids`` is an int32 tensor holding ``context_tokens`` with one
            leading batch dimension, and ``stop_words_ids`` is
            ``[[im_end_id], [im_start_id]]``.
        """
        history = []  # every query is formatted as the first turn of a conversation
        stop_words_ids = []

        max_window_size = self.generation_config.max_window_size
        raw_text, context_tokens = make_context(
            tokenizer,
            query,
            history=history,
            system="You are a helpful assistant.",
            max_window_size=max_window_size,
            chat_format=self.generation_config.chat_format,
        )

        stop_words_ids.extend([[tokenizer.im_end_id], [tokenizer.im_start_id]])

        input_ids = torch.tensor([context_tokens]).to(torch.int32)

        return input_ids, raw_text, context_tokens, stop_words_ids

    def generate(self, input_ids,
                 stop_words_ids,
                 return_dict_in_generate,
                 **kwargs):
        """Forward a generation request to the wrapped model.

        All arguments are passed on by keyword. Passing them positionally would
        place ``stop_words_ids`` and ``return_dict_in_generate`` into the second
        and third positional slots of the underlying ``generate`` (which are not
        these options), and some PEFT wrappers accept keyword arguments only.

        Args:
            input_ids: Prompt token ids.
            stop_words_ids: Token-id sequences that end generation.
            return_dict_in_generate: Forwarded unchanged to the model.
            **kwargs: Any further generation options.

        Returns:
            Whatever ``self.model.generate`` returns.
        """
        return self.model.generate(input_ids=input_ids,
                                   stop_words_ids=stop_words_ids,
                                   return_dict_in_generate=return_dict_in_generate,
                                   **kwargs)

    def forward(self, input_ids, attention_mask=None, only_forward=False):
        """Run the model once and derive policy, value and raw logits.

        Args:
            input_ids: Token ids passed to the wrapped model.
            attention_mask: Optional mask forwarded to the wrapped model.
            only_forward: Unused; kept for interface compatibility.

        Returns:
            Tuple ``(policy, value, lm_logits)``: the softmax (over dim 1) of the
            LM-head output, the value-head output with its last dimension
            squeezed, and the LM-head output itself.
        """
        outputs = self.model(input_ids,
                             attention_mask=attention_mask,
                             output_hidden_states=True)
        last_hidden_state = outputs.hidden_states[-1]

        # Mean over dim 1 (presumably the sequence axis) gives one embedding per sample.
        aggregated_embedding = last_hidden_state.mean(dim=1)

        lm_logits = self.logit_head(last_hidden_state)
        value = self.value_head(aggregated_embedding).squeeze(-1)
        policy = self.softmax(lm_logits)

        return policy, value, lm_logits

## 4. Wrap the Policy Network and make it PPO Agent

In [4]:
class PPO_Agent(nn.Module):
    """Agent wrapper that owns a single ``PPO_Network`` and delegates to it.

    NOTE(review): ``Env_Runner.run`` calls ``agent.select_action(...)``, which is
    not defined in this class as shown - confirm where it is supposed to come from.
    """

    def __init__(self, num_actions):
        """Build the underlying network; ``num_actions`` is passed straight through."""
        super().__init__()

        self.network = PPO_Network(num_actions)

    def forward(self, x):
        """Evaluate the network on ``x`` and return ``(policy, value, lm_logits)``."""
        action_probs, state_value, token_logits = self.network(x)
        return (action_probs, state_value, token_logits)

## 5. Creating a dataset loader for CARLA rollouts (Note: you need to be connected to the CARLA simulator to generate the dataset)

In [7]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F

class Batch_DataSet(torch.utils.data.Dataset):
    """Index-aligned view over the five collections gathered for one PPO update."""

    def __init__(self, obs, actions, adv, v_t, old_action_prob):
        """Keep references to the five parallel collections; nothing is copied."""
        super().__init__()
        self.obs, self.actions = obs, actions
        self.adv, self.v_t = adv, v_t
        self.old_action_prob = old_action_prob

    def __len__(self):
        """Number of samples, taken from the first dimension of ``obs``."""
        sample_count = self.obs.shape[0]
        return sample_count

    def __getitem__(self, i):
        """Return ``(obs, action, advantage, value_target, old_action_prob)`` at index ``i``."""
        columns = (self.obs, self.actions, self.adv, self.v_t, self.old_action_prob)
        return tuple(column[i] for column in columns)

## 6. Generating parallel environments to run multiple rollouts and collect more data in parallel

In [8]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F

# Target device (first CUDA GPU) and floating-point type shared by the cells below.
device = torch.device("cuda", 0)
dtype = torch.bfloat16

class Logger:
    """Minimal line-oriented logger that writes to ``<filename>.csv``."""

    def __init__(self, filename):
        """Remember the base path and create (or truncate) the log file.

        Args:
            filename: Path without extension; ``.csv`` is appended.
        """
        self.filename = filename

        # Opening in "w" mode is enough to create/empty the file; the context
        # manager guarantees the handle is released.
        with open(f"{self.filename}.csv", "w"):
            pass

    def log(self, msg):
        """Append ``msg`` followed by a newline to the log file.

        The file is reopened on every call, and the handle is closed even if
        the write raises.
        """
        with open(f"{self.filename}.csv", "a") as f:
            f.write(f"{msg}\n")

# Global count of environment steps taken so far; advanced by Env_Runner.run.
cur_step = 0


class Env_Runner:
    """Collects rollouts from one environment using the given agent."""

    def __init__(self, env, agent, logger_folder):
        """Store env/agent, start the return log and reset the environment.

        Args:
            env: Environment whose ``reset()`` returns a pair (second item is the
                observation) and whose ``step()`` returns a 6-tuple.
            agent: Object providing ``network.query_to_input_ids``,
                ``network.tokenizer``, ``select_action`` and a callable forward pass.
            logger_folder: Existing directory that receives ``training_info.csv``.
        """
        super().__init__()

        self.env = env
        self.agent = agent

        self.logger = Logger(f'{logger_folder}/training_info')
        self.logger.log("training_step, return")

        _, self.ob = self.env.reset()

    def run(self, steps):
        """Interact with the environment for ``steps`` steps.

        Args:
            steps: Number of environment steps to take.

        Returns:
            ``[obs, actions, rewards, dones, values, action_prob]`` - six lists
            with one entry per step.

        Side effects:
            Advances the module-level ``cur_step`` by ``steps`` and logs the
            episode return whenever the environment reports ``done`` and
            ``info`` contains ``"return"``.
        """
        global cur_step

        obs = []
        actions = []
        rewards = []
        dones = []
        values = []
        action_prob = []

        for step in range(steps):

            # self.ob is indexed 0..3; each entry is embedded in an <img> tag of the prompt.
            query = f'Given 4 consecutive frames in CARLA: <img>{self.ob[0]}</img>, Picture 2: <img>{self.ob[1]}</img>, Picture 3: <img>{self.ob[2]}</img>, Picture 4: <img>{self.ob[3]}</img>, Predict throttle(range 0 to 1), steer (range -1 to 1), brake (range 0 to 1) values in form a json output'
            # Use this runner's own agent for the tokenizer instead of relying on
            # a notebook-global `agent` that may not exist or may be a different object.
            input_ids, raw_text, context_tokens, stop_words_ids = self.agent.network.query_to_input_ids(
                query, self.agent.network.tokenizer
            )

            with torch.no_grad():
                policy, value, logits = self.agent(input_ids.to(device))

            # Work in float32 and renormalise so the whole tensor sums to 1.
            policy = policy.to(torch.float32)
            policy = policy / policy.sum()

            # NOTE(review): `select_action` is not defined on PPO_Agent in the
            # visible notebook - confirm where it comes from.
            action = self.agent.select_action(policy.detach().cpu().numpy()[0])

            obs.append(input_ids.squeeze(0))
            actions.append(action)
            values.append(value.detach())
            action_prob.append(policy[0, action].detach())

            _, self.ob, r, done, info, additional_done = self.env.step(action)

            if done:  # real environment reset, other add_dones are for learning purposes
                _, self.ob = self.env.reset()
                if "return" in info:
                    self.logger.log(f'{cur_step+step},{info["return"]}')

            rewards.append(r)
            dones.append(done or additional_done)

        cur_step += steps

        return [obs, actions, rewards, dones, values, action_prob]

## 7. Computing Advantage estimates necessary for PPO training

In [10]:
import numpy as np
import argparse
import gym
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import time
import random
import os

from torch.utils.data import DataLoader
from torch.utils.data import Dataset

# Target device (first CUDA GPU) and floating-point type used from this cell onwards.
device = torch.device("cuda", 0)
dtype = torch.bfloat16

def compute_advantage_and_value_targets(rewards, values, dones, gamma, lam):
    """Compute advantage estimates and discounted value targets for a rollout.

    The rollout is walked backwards from the second-to-last step; the last
    entry of ``values`` only serves as the bootstrap value, so the returned
    lists have ``len(rewards) - 1`` elements.

    Args:
        rewards: Per-step rewards.
        values: Per-step value estimates; each must be a tensor with at least
            one dimension (element ``[0]`` of every result is taken).
        dones: Per-step termination flags.
        gamma: Discount factor.
        lam: Decay factor applied, together with ``gamma``, to the running advantage.

    Returns:
        ``(advantage_values, value_targets)`` - two lists of tensors in
        chronological order.
    """
    # Take device and dtype from the inputs instead of a module-level CUDA/bfloat16
    # global, so the running advantage always lives where the values live.
    zero_advantage = torch.zeros((), device=values[-1].device, dtype=values[-1].dtype)

    advantage_values = []
    old_adv_t = zero_advantage

    value_targets = []
    old_value_target = values[-1]

    for t in reversed(range(len(rewards) - 1)):

        # Restart the advantage accumulation when step t is flagged as done.
        if dones[t]:
            old_adv_t = zero_advantage

        # NOTE(review): the next-step terms are masked with dones[t+1], not dones[t].
        # Whether that matches how the runner records `dones` cannot be verified
        # from this cell - confirm against the environment wrapper.
        next_not_done = int(not dones[t + 1])

        # ADV
        delta_t = rewards[t] + (gamma * values[t + 1] * next_not_done) - values[t]
        old_adv_t = delta_t + gamma * lam * old_adv_t
        advantage_values.append(old_adv_t[0])

        # VALUE TARGET
        value_target = rewards[t] + gamma * old_value_target * next_not_done
        value_targets.append(value_target[0])
        old_value_target = value_target

    advantage_values.reverse()
    value_targets.reverse()

    return advantage_values, value_targets

## 8. Setting up hyper-parameters

In [11]:
# Output folder for this run, named after the current UTC time.
folder_name = time.asctime(time.gmtime()).replace(" ","_").replace(":","_")
# exist_ok makes re-running the cell within the same second harmless.
os.makedirs(folder_name, exist_ok=True)

# arguments
env_name = 'Carla'
num_stacked_frames = 4
start_lr = 1e-4
gamma = 0.99
lam = 0.95
minibatch_size = 1
T = 129
c1 = 1.0
c2 = 0.01
actors = 1
start_eps = 0.1
epochs = 3
total_steps = 10000000
save_model_steps = 1000000

# save the hyperparameters in a file (one `name,value` line each); the context
# manager makes sure the handle is flushed and closed.
hyperparameters = {
    'env_name': env_name,
    'num_stacked_frames': num_stacked_frames,
    'start_lr': start_lr,
    'gamma': gamma,
    'lam': lam,
    'minibatch_size': minibatch_size,
    'T': T,
    'c1': c1,
    'c2': c2,
    'actors': actors,
    'start_eps': start_eps,
    'epochs': epochs,
    'total_steps': total_steps,
    'save_model_steps': save_model_steps,
}
with open(f'{folder_name}/args.txt','w') as f:
    for name, value in hyperparameters.items():
        f.write(f'{name},{value}\n')

num_actions = gym.make(env_name).env.action_space.n

agent = PPO_Agent(num_actions).to(device)
optimizer = optim.Adam(agent.parameters(), lr=start_lr)

# One Env_Runner per actor, each with its own wrapped environment.
env_runners = []
for actor in range(actors):

    raw_env = gym.make(env_name)
    # NOTE(review): Carla_Wrapper is not defined in the visible part of the notebook.
    env = Carla_Wrapper(raw_env, env_name, num_stacked_frames, use_add_done=True)

    env_runners.append(Env_Runner(env, agent, folder_name))

num_model_updates = 0

A.L.E: Arcade Learning Environment (version 0.8.1+53f58b7)
[Powered by Stella]


Loading checkpoint shards:   0%|          | 0/10 [00:00<?, ?it/s]

trainable model parameters: 9656935168
all model parameters: 9656935168
percentage of trainable model parameters: 100.00
trainable model parameters: 21676032
all model parameters: 9678611200
percentage of trainable model parameters: 0.22


## 9. Training Qwen-VL model using it as a policy network in Proximal Policy Optimization

In [None]:
start_time = time.time()
while cur_step < total_steps:
    # change lr and eps over time (both decay linearly with the fraction of steps done)
    alpha = 1 - (cur_step / total_steps)
    current_lr = start_lr * alpha
    current_eps = start_eps * alpha

    # set lr
    for g in optimizer.param_groups:
        g['lr'] = current_lr

    # Drop the references to the previous iteration's batch tensors, then ask
    # PyTorch to release cached CUDA memory. Rebinding to None replaces the old
    # try/del/bare-except block, which relied on a swallowed NameError during
    # the first iteration.
    batch_obs, batch_actions, batch_adv, batch_v_t, batch_old_action_prob = None, None, None, None, None
    torch.cuda.empty_cache()

    # get data
    for env_runner in env_runners:
        obs, actions, rewards, dones, values, old_action_prob = env_runner.run(T)
        adv, v_t = compute_advantage_and_value_targets(rewards, values, dones, gamma, lam)

        # assemble data from the different runners; the last step of each rollout
        # is dropped because advantages/targets exist only for the first T-1 steps.
        # `is None` is used because `tensor == None` is not an identity check.
        batch_obs = torch.stack(obs[:-1]) if batch_obs is None else torch.cat([batch_obs,torch.stack(obs[:-1])])
        batch_actions = np.stack(actions[:-1]) if batch_actions is None else np.concatenate([batch_actions,np.stack(actions[:-1])])
        batch_adv = torch.stack(adv) if batch_adv is None else torch.cat([batch_adv,torch.stack(adv)])
        batch_v_t = torch.stack(v_t) if batch_v_t is None else torch.cat([batch_v_t,torch.stack(v_t)])
        batch_old_action_prob = torch.stack(old_action_prob[:-1]) if batch_old_action_prob is None else torch.cat([batch_old_action_prob,torch.stack(old_action_prob[:-1])])

    # load into dataset/loader
    dataset = Batch_DataSet(batch_obs,batch_actions,batch_adv,batch_v_t,batch_old_action_prob)
    dataloader = DataLoader(dataset, batch_size=minibatch_size, num_workers=0, shuffle=True)

    for epoch in range(epochs):

        # sample minibatches
        for i, batch in enumerate(dataloader):
            optimizer.zero_grad()

            # get data
            obs, actions, adv, v_target, old_action_prob = batch

            # NOTE: advantage normalisation is currently disabled:
            # adv = ( adv - torch.mean(adv) ) / ( torch.std(adv) + 1e-1)

            # get policy actions probs for prob ratio & value prediction
            policy, v, logits = agent(obs.to(device))
            # get the correct policy actions; index with the actual batch size so a
            # short final minibatch does not produce an index mismatch
            pi = policy[range(actions.shape[0]),actions.long()]

            # probability ratio r_t(theta)
            probability_ratio = pi / (old_action_prob + 1e-8)

            # compute CPI
            CPI = probability_ratio * adv
            # compute clip*A_t
            clip = torch.clamp(probability_ratio,1-current_eps,1+current_eps) * adv

            # policy loss | take minimum
            L_CLIP = torch.mean(torch.min(CPI, clip))

            # value loss | mse
            L_VF = torch.mean(torch.pow(v - v_target,2))

            # policy entropy loss
            S = torch.mean( - torch.sum(policy * torch.log(policy + 1e-8),dim=1))

            loss = - L_CLIP + c1 * L_VF - c2 * S
            print(cur_step,"loss",loss.item())
            loss.backward()
            optimizer.step()

            num_model_updates += 1

            # print time
            if cur_step%5000 < T*actors:
                end_time = time.time()
                print(f'*** total steps: {cur_step} | time(50K): {end_time - start_time} ***')
                start_time = time.time()

            # # save the network after some time
            # if runner.cur_step%save_model_steps < T*actors:
            #     torch.save(agent,f'{folder_name}/{env_name}-{runner.cur_step}.pt')

    # env.close()

    

  if not isinstance(terminated, (bool, np.bool8)):


129 loss -0.014458715915679932
129 loss 1.887658715248108
129 loss 0.2943359315395355
129 loss 0.013155924156308174
129 loss 1.0896799564361572
129 loss 1.02349853515625
129 loss 0.4081787168979645
129 loss 0.15238097310066223
129 loss 0.014452103525400162
129 loss 0.19776611030101776
129 loss 0.13263800740242004
129 loss 0.300332635641098
129 loss 0.3104700446128845
129 loss 0.09183349460363388
129 loss 0.01649780198931694
129 loss -0.019371073693037033
129 loss -0.025072749704122543
129 loss -0.058364350348711014
129 loss 0.00604248046875
129 loss 0.033913105726242065
129 loss 0.02454833686351776
129 loss 0.04424537718296051
129 loss 0.034984707832336426
129 loss 0.04547271877527237
129 loss -0.013701923191547394
129 loss -0.006305846385657787
129 loss -0.044036865234375
129 loss -0.04411010816693306
129 loss -0.02636115998029709
129 loss -0.02316274121403694
129 loss -0.05706787109375
129 loss -0.02850158140063286
129 loss -0.0032595116645097733
129 loss -0.04174904525279999
129 los