In [None]:
# Install dependencies
!pip install textrl
!pip install transformers
!pip install gym==0.22

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
# Import libraries
import logging
import sys

import pfrl
import torch
from textrl import TextRLActor, TextRLEnv
from transformers import (
    AutoModelForCausalLM,
    AutoModelForTokenClassification,
    AutoModelWithLMHead,
    AutoTokenizer,
    pipeline,
)

# Send INFO-and-above log records to stdout so they show up in the cell output.
# The empty format string is kept exactly as written.
logging.basicConfig(
    stream=sys.stdout,
    format='',
    level=logging.INFO,
)




In [None]:
# Pre-trained Model
# Single source of truth for the checkpoint so tokenizer and model cannot drift apart.
MODEL_NAME = "gpt2-large"

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
# AutoModelWithLMHead is deprecated in transformers; AutoModelForCausalLM is the
# documented replacement for decoder-only checkpoints and loads the same
# GPT2LMHeadModel class for GPT-2.
model = AutoModelForCausalLM.from_pretrained(MODEL_NAME)
model.eval()   # evaluation mode (dropout layers become no-ops)
model.cuda();  # move to the GPU; the trailing ';' hides the very long module repr

Downloading:   0%|          | 0.00/666 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/3.25G [00:00<?, ?B/s]

GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(50257, 1280)
    (wpe): Embedding(1024, 1280)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0): GPT2Block(
        (ln_1): LayerNorm((1280,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((1280,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D()
          (c_proj): Conv1D()
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
      (1): GPT2Block(
        (ln_1): LayerNorm((1280,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout)

In [None]:
# Sentiment Classifier for RL Reward
# The same checkpoint supplies both the classifier weights and its tokenizer.
# device=0 places the pipeline on the first CUDA device; return_all_scores=True
# makes each call return a score for every label, which the reward code indexes
# as result[0][0]['score'].
sentiment = pipeline(
    'sentiment-analysis',
    model="cardiffnlp/twitter-roberta-base-sentiment",
    tokenizer="cardiffnlp/twitter-roberta-base-sentiment",
    device=0,
    return_all_scores=True,
)

Downloading:   0%|          | 0.00/747 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/499M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/899k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/150 [00:00<?, ?B/s]



In [None]:
# Quiet the 'transformers' logger before the reward (inverse perplexity +
# sentiment classifier) is defined and used: only CRITICAL records pass through.
transformers_logger = logging.getLogger('transformers')
transformers_logger.setLevel(level=logging.CRITICAL)

In [None]:
class MyRLEnv(TextRLEnv):
    """TextRL environment rewarding fluent text plus a sentiment-classifier score.

    Uses the notebook-level ``tokenizer``, ``model`` and ``sentiment`` objects
    created in earlier cells.
    """

    # A reward is only granted when the generation length lies strictly between these bounds.
    MIN_TOKENS_EXCLUSIVE = 1
    MAX_TOKENS_EXCLUSIVE = 50

    def get_reward(self, input_text, predicted_list, finish):
        """Return the reward for the current generation.

        Args:
            input_text: the prompt. It is concatenated with the decoded continuation,
                so it is assumed to be a ``str`` -- TODO confirm against the
                installed textrl version.
            predicted_list: list of predicted tokens.
            finish: truthy once generation has ended; no reward is given before that.

        Returns:
            ``0`` unless ``finish`` is truthy and ``1 < len(predicted_list) < 50``;
            otherwise ``1 / perplexity(input_text + continuation)`` plus
            ``sentiment(continuation)[0][0]['score']``.
        """
        reward = 0
        if not finish:
            return reward
        if not (self.MIN_TOKENS_EXCLUSIVE < len(predicted_list) < self.MAX_TOKENS_EXCLUSIVE):
            return reward

        predicted_text = tokenizer.convert_tokens_to_string(predicted_list)

        # Inverse perplexity of prompt + continuation under the language model.
        # Follow the model's device instead of hard-coding 'cuda'.
        inputs = tokenizer(input_text + predicted_text, return_tensors='pt').to(model.device)
        # Only the scalar loss value is needed, so skip building an autograd graph.
        with torch.no_grad():
            lm_loss = model(**inputs, labels=inputs["input_ids"]).loss
        reward += 1 / torch.exp(lm_loss).mean().item()

        # Sentiment classifier: score of the first label returned for the continuation.
        # NOTE(review): for this checkpoint index 0 is presumably the negative class --
        # confirm that rewarding it is the intended objective.
        reward += sentiment(predicted_text)[0][0]['score']
        return reward

In [None]:
# Prompts the environment draws from during training.
# NOTE(review): the original cell referenced `observaton_list`, which no visible cell
# defines, so it raised NameError on a fresh kernel. The entry below is reconstructed
# from the prompt used elsewhere in this notebook. Plain strings are assumed because
# get_reward concatenates the prompt with a str; the exact element format expected by
# `observation_input` depends on the installed textrl version -- confirm before training.
observation_list = [
    'elon musk wants to buy twitter',
]

env = MyRLEnv(model, tokenizer, observation_input=observation_list)
actor = TextRLActor(env, model, tokenizer)
# PPO agent; the numeric settings are kept exactly as in the original cell.
agent = actor.agent_ppo(update_interval=10, minibatch_size=2000, epochs=20)

In [None]:
# Result: generate a continuation for this prompt; the returned string is the cell output.
# NOTE(review): this cell sits before the training cell, so it presumably shows the
# un-tuned baseline -- confirm the execution order.
actor.predict('elon musk wants to buy twitter')

" for $100M, but he doesn't know how much he will get for free."

In [None]:
# Training The Model
# Runs PFRL's train-with-periodic-evaluation loop on the PPO agent and the reward
# environment. The call's return value (displayed as the cell output) is a tuple of
# the agent and a list of per-evaluation statistics dicts.
# NOTE(review): parameter meanings follow PFRL's documentation -- confirm against
# the installed version.
pfrl.experiments.train_agent_with_evaluation(
    agent,
    env,
    # training budget
    steps=500,
    train_max_episode_len=100,
    # evaluation schedule: one episode every 10 steps, no step-based evaluation
    eval_interval=10,
    eval_n_episodes=1,
    eval_n_steps=None,
    # output directory
    outdir='text_gen',
)

(<pfrl.agents.ppo.PPO at 0x7f02a37b1850>,
 [{'average_value': 0.35306314,
   'average_entropy': 71307.92,
   'average_value_loss': 0.5147798976115883,
   'average_policy_loss': 0.0007381601726592635,
   'n_updates': 204,
   'explained_variance': -792.5462700702748,
   'eval_score': 0.32121001530070853},
  {'average_value': 0.35419896,
   'average_entropy': 71307.9,
   'average_value_loss': 0.5183060767315328,
   'average_policy_loss': 0.0007371426608369802,
   'n_updates': 205,
   'explained_variance': -1.1150793962482402,
   'eval_score': 0.3523232704871164},
  {'average_value': 0.35246137,
   'average_entropy': 71307.84,
   'average_value_loss': 0.521216240581125,
   'average_policy_loss': 0.0009359557649077033,
   'n_updates': 207,
   'explained_variance': -1128.2076307043005,
   'eval_score': 0.0},
  {'average_value': 0.3963704,
   'average_entropy': 71307.62,
   'average_value_loss': 0.5208337689004838,
   'average_policy_loss': 0.0010009418000481675,
   'n_updates': 217,
   'expl

In [None]:
# Output 1: continuation generated for this prompt (cell placed after the training cell).
actor.predict('Elon Musk tried to buy Twitter')

" for $1.25M, but the latter was withdrawn without compensation, because he refused to pay $1M of Tesla's $1B valuation. Tesla has also refused to pay $1M of Tesla's $1B valuation, because of the nature of the offer. Tesla has also refused to pay $1M of Tesla's $1B valuation, because of the nature of the offer. Tesla has also refused to pay $1M of Tesla's $1B valuation, because of"

In [None]:
# Output 2: continuation generated for a second prompt (cell placed after the training cell).
# The prompt text, including its spacing, is part of the model input and is left as written.
actor.predict('Elon Musk  face deposition by Twitter lawyers ahead of trial')

', but he refused to defend the company, so he won�t defend me. If he does, he will defend me, but he wonv lose his case.'

In [None]:
# Output 3: continuation generated for a prompt on a different topic from the Elon Musk prompts above.
actor.predict('Vistara is a joint venture of Tata Sons and Singapore Airlines')

', with the goal of maximizing the safety of the public while maximizing the value of the company.'