In [None]:
%%time
%%capture
!pip install transformers
!pip install wandb

CPU times: user 59.6 ms, sys: 31 ms, total: 90.7 ms
Wall time: 8.2 s


In [None]:
from torch.utils.data import Dataset
from transformers import AutoTokenizer, AutoModelForPreTraining ,AutoConfig
from transformers import TrainingArguments,Trainer
import torch
import wandb
import math

In [None]:
from google.colab import drive
drive.mount('/content/drive')

In [None]:
!nvidia-smi


Mon Sep 13 09:30:24 2021       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 470.63.01    Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla K80           Off  | 00000000:00:04.0 Off |                    0 |
| N/A   39C    P8    28W / 149W |      0MiB / 11441MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [None]:
print(f"PyTorch version: {torch.__version__}")

PyTorch version: 1.9.0+cu102


In [None]:
import json,random

sample_file = '/content/drive/MyDrive/Colab Notebook/reviewSelected100.json'

# data importing and formatting
def process_raw_data(data_in_weird_format_file):
    """Load every JSON value stored back-to-back in a text file.

    The file is not one valid JSON document: it is a sequence of JSON objects
    separated only by whitespace (typically one object per line). The values
    are decoded one after another with ``JSONDecoder.raw_decode`` so that a
    missing final newline, blank lines between records or nested objects do
    not break parsing.

    Args:
        data_in_weird_format_file: Path of the file to read (opened as latin-1).

    Returns:
        list: The decoded JSON values, in file order. Empty for an empty file.

    Raises:
        json.JSONDecodeError: If the file contains text that is not valid JSON.
    """
    with open(data_in_weird_format_file, 'r', encoding='latin-1') as f:
        raw_data = f.read()

    decoder = json.JSONDecoder()
    json_whitespace = ' \t\n\r'
    records = []
    pos, end = 0, len(raw_data)
    while pos < end:
        # raw_decode() does not skip leading whitespace, so step over the
        # separators between records ourselves.
        if raw_data[pos] in json_whitespace:
            pos += 1
            continue
        record, pos = decoder.raw_decode(raw_data, pos)
        records.append(record)
    return records



# Parse the review file named by `sample_file` and report how many records were read.
data = process_raw_data(sample_file)
print(f'Processed {len(data)} lines of data')

In [None]:
# Start a Weights & Biases run; the hyper-parameters set on `config` below are
# attached to that run.
wandb.init(project='CE4045-Review-Generator',entity='groupx')

config = wandb.config

# Marker strings registered with the tokenizer and used to frame each sample
# (begin / separator between rating and text / end / padding / unknown).
SPECIAL_TOKENS  = { "bos_token": "<|BOS|>",
                    "eos_token": "<|EOS|>",
                    "unk_token": "<|UNK|>",                    
                    "pad_token": "<|PAD|>",
                    "sep_token": "<|SEP|>"}

# Seed Python's and PyTorch's global generators so the shuffled train/validation
# split and any random weight initialisation repeat across notebook runs.
config.SEED = 42
random.seed(config.SEED)
torch.manual_seed(config.SEED)

config.TRAIN_DEV_SPLIT = 0.9        # fraction of the records used for training
config.UNFREEZE_LAST_N = 7          # number of final transformer blocks left trainable
config.MAXLEN = 256                 # intended maximum token length of one sample
config.EPOCHS = 5
config.TRAIN_BATCHSIZE = 4
config.LR              = 5e-4
config.EPS             = 1e-8
config.WARMUP_STEPS    = 100        # a step count, so keep it an int (was the float 1e2)

# Chosen because it is a causal LM pre-trained on next-word prediction.
config.checkpoint = 'distilgpt2'

In [None]:
class myDataset(Dataset):
    """Torch dataset that turns review records into causal-LM training samples.

    Every sample is the string
    ``bos_token + rating words + sep_token + review text + eos_token``
    (marker strings taken from the module-level ``SPECIAL_TOKENS``), tokenized
    and padded/truncated to ``max_length`` tokens.
    """

    def __init__(self, data, tokenizer, randomize=True, max_length=256):
        """Collect the rating description and text of every record.

        Args:
            data: Iterable of dicts. Each must provide the keys "cool",
                "funny", "useful", "stars" and "text".
            tokenizer: Callable used by ``__getitem__`` as
                ``tokenizer(text, truncation=..., max_length=..., padding=...)``;
                it must return a mapping with "input_ids" and "attention_mask".
            randomize: Stored on the instance; only relevant to the (currently
                unused) keyword prompt built by ``join_keywords``.
            max_length: Number of tokens every sample is padded/truncated to.
        """
        #initializing from list of json 
        rating, text = [], []
        for _json in data:
            rating.append(myDataset.rating_score_to_words(_json))
            text.append(_json['text'])

        self.randomize  = randomize
        self.tokenizer  = tokenizer
        self.max_length = max_length
        self.rating     = rating
        self.text       = text

    @staticmethod
    def _vote_bucket(votes, top_bucket=4):
        """Map a vote count to a bucket index in ``0..top_bucket``.

        The bucket is ``int(log(votes + 1))``. Negative counts are treated as
        0 (``math.log`` would otherwise raise) and very large counts are
        clamped to the top bucket instead of falling outside the word tables.
        """
        return min(int(math.log(max(votes, 0) + 1)), top_bucket)

    @staticmethod
    def rating_score_to_words(_json):
        """Describe a record's cool/funny/useful votes and star rating in words.

        Args:
            _json: Dict with numeric "cool", "funny", "useful" and "stars".

        Returns:
            str: Four comma-separated phrases, e.g.
            ``"boring, not funny, useful, good"``. An unknown star value is
            described as "ok".
        """
        cool_words = ("boring", "so so", "cool", "very cool", "extremely cool")
        fun_words = ("not funny", "kind of funny", "funny", "very funny", "extremely fun")
        useful_words = ("not useful", "kind of useful", "useful", "very useful", "extremely useful")
        quality_map = {1:"horrible",2:"bad",3:"ok",4:"good",5:"excellent"}

        cool_rating = cool_words[myDataset._vote_bucket(_json["cool"])]
        fun_rating = fun_words[myDataset._vote_bucket(_json["funny"])]
        useful_rating = useful_words[myDataset._vote_bucket(_json["useful"])]
        quality_rating = quality_map.get(_json["stars"],"ok")
        return ', '.join([cool_rating,fun_rating,useful_rating,quality_rating])
        
    #possible to use for keywords prompt
    @staticmethod
    def join_keywords(keywords, randomize=True):
        """Join keywords with spaces, optionally using a random shuffled prefix.

        With ``randomize`` a random number (0..len) of leading keywords is
        kept and shuffled. The slice is a copy, so for a list argument the
        caller's list is left untouched.
        """
        N = len(keywords)

        #random sampling and shuffle
        if randomize: 
            M = random.choice(range(N+1))
            keywords = keywords[:M]
            random.shuffle(keywords)

        return ' '.join(keywords)

    def __len__(self):
        """Return the number of samples."""
        return len(self.text)

    
    def __getitem__(self, i):
        """Build, tokenize and return sample ``i``.

        Returns:
            dict: Tensors "input_ids", "attention_mask" and "label", each of
            length ``max_length``. "label" is a copy of "input_ids" in which
            padding positions are set to -100.
        """
        #kw = self.join_keywords(keywords, self.randomize)

        _input = (SPECIAL_TOKENS['bos_token']
                  + self.rating[i] + SPECIAL_TOKENS['sep_token']
                  + self.text[i] + SPECIAL_TOKENS['eos_token'])

        # Use the tokenizer given to the constructor, not a notebook global.
        encodings_dict = self.tokenizer(_input,
                                        truncation=True,
                                        max_length=self.max_length,
                                        padding="max_length")

        input_ids = torch.tensor(encodings_dict['input_ids'])
        attention_mask = torch.tensor(encodings_dict['attention_mask'])

        # Labels equal the inputs; the GPT-2 LM head shifts them by one for
        # next-token prediction. Padding positions get -100, the index the
        # cross-entropy loss ignores, so the model is not trained (or scored)
        # on predicting pad tokens.
        labels = input_ids.clone()
        labels[attention_mask == 0] = -100

        return {'label': labels,
                'input_ids': input_ids,
                'attention_mask': attention_mask}

    
# It's a bit of a waste to use a dedicated function for this.
    
def split_data(data, S=0.8, seed=None):
    """Shuffle the records and split them into a training and a validation part.

    The input sequence itself is left untouched; a shuffled copy is split.

    Args:
        data: Sequence of records.
        S: Fraction of the records that goes into the training part.
        seed: Optional seed for a private random generator, making the split
            reproducible. When None the module-level ``random`` state is used.

    Returns:
        tuple: ``(train, validation)`` lists; the first holds
        ``int(S * len(data))`` records, the second the remainder.
    """
    # Shuffle a copy so re-running the cell does not keep reshuffling `data`.
    shuffled = list(data)
    if seed is None:
        random.shuffle(shuffled)
    else:
        random.Random(seed).shuffle(shuffled)

    # Split into training and validation sets
    train_size = int(S * len(shuffled))

    return shuffled[:train_size], shuffled[train_size:]

In [None]:
#set up tokenizer
tokenizer = AutoTokenizer.from_pretrained(config.checkpoint) 
tokenizer.add_special_tokens(SPECIAL_TOKENS)

#set up data
train_data, val_data = split_data(data,config.TRAIN_DEV_SPLIT)

train_dataset = myDataset(train_data, tokenizer)
val_dataset = myDataset(val_data, tokenizer)

print(f'There are {len(train_dataset) :,} samples for training, and {len(val_dataset) :,} samples for validation testing')

#set up model
# Tell the model config which ids the newly registered special tokens received.
model_config = AutoConfig.from_pretrained(config.checkpoint, 
                                    bos_token_id=tokenizer.bos_token_id,
                                    eos_token_id=tokenizer.eos_token_id,
                                    sep_token_id=tokenizer.sep_token_id,
                                    pad_token_id=tokenizer.pad_token_id,
                                    output_hidden_states=False)

model = AutoModelForPreTraining.from_pretrained(config.checkpoint, config=model_config)

# The tokenizer grew by the special tokens, so the embedding matrix must grow too.
model.resize_token_embeddings(len(tokenizer))

# - Freeze all layers except last n:

for parameter in model.parameters():
    parameter.requires_grad = False

# Take the block count from the loaded model instead of assuming 12 layers:
# with a shallower checkpoint a fixed 12 unfreezes fewer blocks than
# UNFREEZE_LAST_N asks for. A request larger than the model depth unfreezes
# every block.
n_blocks = len(model.transformer.h)
first_trainable_block = max(n_blocks - config.UNFREEZE_LAST_N, 0)

for i, m in enumerate(model.transformer.h):        
    #Only un-freeze the last n transformer blocks
    if i >= first_trainable_block:
        for parameter in m.parameters():
            parameter.requires_grad = True 

# The final layer norm and the LM head are always trained.
for parameter in model.transformer.ln_f.parameters():        
    parameter.requires_grad = True

for parameter in model.lm_head.parameters():        
    parameter.requires_grad = True

There are 13,770 samples for training, and 1,530 samples for validation testing


In [None]:
# Training setup: the validation set is evaluated once per epoch and the
# schedule/optimizer hyper-parameters are read from the W&B config.
training_args = TrainingArguments(
    output_dir="./output/",
    evaluation_strategy="epoch",
    num_train_epochs=config.EPOCHS,
    learning_rate=config.LR,
    adam_epsilon=config.EPS,
    warmup_steps=config.WARMUP_STEPS,
    per_device_train_batch_size=config.TRAIN_BATCHSIZE,
    per_device_eval_batch_size=config.TRAIN_BATCHSIZE,   # evaluation reuses the training batch size
)

trainer = Trainer(model=model,
                  args=training_args,
                  tokenizer=tokenizer,
                  train_dataset=train_dataset,
                  eval_dataset=val_dataset)

# Fine-tune, then write the final model into output_dir.
trainer.train()
trainer.save_model()

# Additionally keep the raw weights in a file named after the W&B run.
torch.save(model.state_dict(),f'./{wandb.run.name}.pt')

***** Running training *****
  Num examples = 13770
  Num Epochs = 5
  Instantaneous batch size per device = 4
  Total train batch size (w. parallel, distributed & accumulation) = 4
  Gradient Accumulation steps = 1
  Total optimization steps = 17215
Automatic Weights & Biases logging enabled, to disable set os.environ["WANDB_DISABLED"] = "true"


Epoch,Training Loss,Validation Loss
1,1.7291,1.719679
2,1.6052,1.697389
3,1.4956,1.700388
4,1.4261,1.717995
5,1.3837,1.740404


Saving model checkpoint to ./output/checkpoint-500
Configuration saved in ./output/checkpoint-500/config.json
Model weights saved in ./output/checkpoint-500/pytorch_model.bin
tokenizer config file saved in ./output/checkpoint-500/tokenizer_config.json
Special tokens file saved in ./output/checkpoint-500/special_tokens_map.json
Saving model checkpoint to ./output/checkpoint-1000
Configuration saved in ./output/checkpoint-1000/config.json
Model weights saved in ./output/checkpoint-1000/pytorch_model.bin
tokenizer config file saved in ./output/checkpoint-1000/tokenizer_config.json
Special tokens file saved in ./output/checkpoint-1000/special_tokens_map.json
Saving model checkpoint to ./output/checkpoint-1500
Configuration saved in ./output/checkpoint-1500/config.json
Model weights saved in ./output/checkpoint-1500/pytorch_model.bin
tokenizer config file saved in ./output/checkpoint-1500/tokenizer_config.json
Special tokens file saved in ./output/checkpoint-1500/special_tokens_map.json
Sav

In [None]:


# Use the GPU when PyTorch can see one, otherwise fall back to the CPU so the
# notebook also runs on a CPU-only runtime.
device = 'cuda' if torch.cuda.is_available() else 'cpu'


def gen_suggestions(rating,model):
    """Sample review texts conditioned on a rating description and print them.

    The prompt is ``bos_token + rating + sep_token`` (markers from the
    module-level ``SPECIAL_TOKENS``), encoded with the module-level
    ``tokenizer``. Three sequences are sampled and printed, numbered from 1,
    with the leading rating text stripped.

    Args:
        rating: Rating description, e.g. ``'cool, funny, useful, excellent'``.
        model: Model exposing ``parameters()``, ``eval()`` and ``generate()``.

    Returns:
        None. The generated texts are printed.
    """
    prompt = SPECIAL_TOKENS['bos_token'] + rating + SPECIAL_TOKENS['sep_token']

    # Put the prompt on the device the model actually lives on instead of
    # relying on a separately maintained global device string.
    model_device = next(model.parameters()).device
    generated = torch.tensor(tokenizer.encode(prompt)).unsqueeze(0).to(model_device)

    model.eval()

    # Top-k / top-p (nucleus) sampling, 3 samples:
    sample_outputs = model.generate(generated,
                                    do_sample=True,
                                    min_length=50,
                                    max_length=256,
                                    top_k=30,
                                    top_p=0.7,
                                    temperature=0.9,
                                    repetition_penalty=2.0,
                                    num_return_sequences=3
                                    )

    for i, sample_output in enumerate(sample_outputs):
        text = tokenizer.decode(sample_output, skip_special_tokens=True)

        # With the special tokens skipped the decoded text starts with the
        # rating words; cut them off so only the review is shown.
        print("{}: {}\n\n".format(i+1,  text[len(rating):]))

In [None]:
gen_suggestions('boring, not funny, not useful, horrible',model)

1: I'm used to seeing the reviews but this place is terrible. The food here was so bad I went to a different location and ordered a chicken breast kabob plate and the chicken breast kabob came with white rice instead of salad (which they charge). When my husband received his plate it looked like he had an excess fatty liver inside which was completely unacceptable. My friend also ordered the "beef" and said that the waiter didn't even apologize for doing what we were told... I left him hungry and hungry and I will never go back.


2: I was told this place is closed. The girl that works there did my nails and she kept screaming at me while I waited for a few minutes, then when someone finally came in to fix it the manager said no one greeted him or let her know. She basically walked away and was not friendly enough!


3: I've been to a lot of dueling piano bars and this one was the most disappointing.  The bartenders were really rude - but it's still very loud with my obnoxious female b

In [None]:
gen_suggestions('cool, funny, useful, excellent',model)

1: We had an amazing experience with OnQ Property Management.  They are very friendly and professional.  I highly recommend them to anyone looking for a property management experience!  We were in the market at about 6:30pm on our second day of renting out home (which was later confirmed).
When we arrived they called me immediately and explained that it would be done within 45 minutes, so when there wasn't any waiting time or even more people came over after us until 2PM!! After confirming my receipt online please ask if you want another appointment ASAP.


2: We have been looking for a place to get good BBQ food and this is one of those places.
It's located in Richlane Mall on the corner from East Washington E & Hwy 7. It has several options:  Chicken Pho (sweet bean) with rice or salad along side, Beef Noodle Soup ($8). The staff are very friendly, which is nice because they do offer free hot dishes! We also got two types - Hot and Sour Pork Rice($12)! They were pretty quick but I ha

In [None]:
! rm -rf wandb