## Reference
article: https://towardsdatascience.com/conditional-text-generation-by-fine-tuning-gpt-2-11c1a9fc639d \
colab: https://colab.research.google.com/drive/1vnpMoZoenRrWeaxMyfYK4DDbtlBu-M8V?usp=sharing#scrollTo=H1ag9Z0iZbzG 

In [13]:
import json,random

sample_file = './reviewSelected100.json'

#data importing and formatting
def process_raw_data(data_in_weird_format_file):
    """Load every JSON value stored back-to-back in a text file.

    The file is expected to contain JSON objects written one after another,
    separated only by whitespace (one per line, or pretty-printed over
    several lines). It is not a single valid JSON document.

    Args:
        data_in_weird_format_file: path of the file to read.

    Returns:
        A list with one decoded Python object per JSON value, in file order.

    Raises:
        json.JSONDecodeError: if the file contains text that is not valid JSON.
    """
    # JSON text is UTF-8 by specification; do not rely on the platform default.
    with open(data_in_weird_format_file, 'r', encoding='utf-8') as f:
        raw_data = f.read()

    # raw_decode parses one value and reports where it stopped, so records are
    # found by the JSON grammar itself rather than by splitting on '}\n'
    # (which breaks on nested objects, blank lines or a missing final newline).
    decoder = json.JSONDecoder()
    records = []
    pos, end = 0, len(raw_data)
    while True:
        # raw_decode does not skip leading whitespace, so do it here.
        while pos < end and raw_data[pos].isspace():
            pos += 1
        if pos >= end:
            break
        record, pos = decoder.raw_decode(raw_data, pos)
        records.append(record)
    return records



# Parse the sample file once and report how many records were loaded.
data = process_raw_data(sample_file)
print('Processed {} lines of data'.format(len(data)))


Processed 15300 lines of data


In [14]:
#IMPORTS AND HYPER PARAMS
from torch.utils.data import Dataset
from transformers import AutoTokenizer, AutoModelForPreTraining ,AutoConfig
from transformers import TrainingArguments,Trainer
import torch

# Extra tokens registered on the tokenizer; bos/sep/eos also frame each sample.
SPECIAL_TOKENS = dict(
    bos_token="<|BOS|>",
    eos_token="<|EOS|>",
    unk_token="<|UNK|>",
    pad_token="<|PAD|>",
    sep_token="<|SEP|>",
)

# Data
TRAIN_DEV_SPLIT = 0.8    # fraction of the records used for training
MAXLEN = 256             # max length of the tensor

# Model
UNFREEZE_LAST_N = 1      # number of trailing transformer blocks left trainable

# Optimisation
EPOCHS = 4
TRAIN_BATCHSIZE = 8      # also used as the evaluation batch size
LR = 5e-4
EPS = 1e-8               # passed to the trainer as adam_epsilon
WARMUP_STEPS = 1e2


#wandb environment set up
%env WANDB_PROJECT=test_NLP_PROJ  

env: WANDB_PROJECT=test_NLP_PROJ


In [15]:
#Define the custom dataset
"""
This dataset will allow us to control how much input we want to give to the model, and what output text we will get
"""

class myDataset(Dataset):
    """Dataset that turns review records into conditional text-generation samples.

    Every sample is the string ``<bos> + rating + <sep> + text + <eos>``,
    tokenized and padded/truncated to a fixed length. ``rating`` is a short
    prompt built from the record's ``cool``, ``funny``, ``useful`` and
    ``stars`` fields.

    NOTE(review): padded positions are kept in the labels, so the loss
    presumably also covers the pad token; consider masking them (-100).
    NOTE(review): with truncation enabled, long reviews presumably lose their
    trailing <eos> token. Confirm whether that matters for generation.
    """

    def __init__(self, data, tokenizer, randomize=True, max_length=None):
        """Build the rating prompts and keep the raw review texts.

        Args:
            data: iterable of dicts with the keys ``cool``, ``funny``,
                ``useful``, ``stars`` and ``text``.
            tokenizer: callable used by ``__getitem__`` to encode a sample.
            randomize: stored on the instance; only meaningful for
                ``join_keywords``-style prompts.
            max_length: token length of every sample. ``None`` (default)
                falls back to the notebook-wide ``MAXLEN`` setting.
        """
        rating, text = [], []
        for _json in data:
            rating.append(f'cool: {_json["cool"]}; funny: {_json["funny"]}; useful: {_json["useful"]}; stars: {_json["stars"]};')
            text.append(_json['text'])

        self.randomize = randomize
        self.tokenizer = tokenizer
        # Only look up the global default when the caller gave no explicit length.
        self.max_length = MAXLEN if max_length is None else max_length
        self.rating    = rating
        self.text      = text

    # possible to use for keywords prompt
    @staticmethod
    def join_keywords(keywords, randomize=True):
        """Join keywords with commas, optionally on a random, shuffled prefix.

        With ``randomize`` a prefix of random length (0..len) is taken and
        shuffled; the caller's list is never modified.
        """
        if randomize:
            keep = random.randint(0, len(keywords))
            keywords = keywords[:keep]      # slicing copies, so the shuffle is local
            random.shuffle(keywords)

        return ','.join(keywords)

    def __len__(self):
        """Number of samples (one per review)."""
        return len(self.text)

    def __getitem__(self, i):
        """Encode sample ``i`` and return the tensors for a training step.

        Returns:
            dict with ``label``, ``input_ids`` and ``attention_mask`` tensors.
        """
        _input = (SPECIAL_TOKENS['bos_token']
                  + self.rating[i] + SPECIAL_TOKENS['sep_token']
                  + self.text[i] + SPECIAL_TOKENS['eos_token'])

        # Use the tokenizer given to this dataset, not whatever global happens
        # to be named `tokenizer` when the item is fetched.
        encodings_dict = self.tokenizer(_input,
                                        truncation=True,
                                        max_length=self.max_length,
                                        padding="max_length")

        input_ids = encodings_dict['input_ids']
        attention_mask = encodings_dict['attention_mask']

        # label is a copy of input_ids; per the original note the GPT-2 model
        # shifts it by one position itself for next-word prediction.
        return {'label': torch.tensor(input_ids),
                'input_ids': torch.tensor(input_ids),
                'attention_mask': torch.tensor(attention_mask)}

    
#NOTE: a dedicated function is arguably overkill for this simple shuffle-and-split
    
def split_data(data, S=0.8, seed=None):
    """Shuffle the records and split them into training and validation lists.

    The input is left untouched: shuffling happens on a copy, so calling this
    again (e.g. re-running the cell) does not reorder the caller's data.

    Args:
        data: sequence of records.
        S: fraction of the records that goes into the training list.
        seed: optional seed for a reproducible split. ``None`` uses the
            global ``random`` state, as before.

    Returns:
        Tuple ``(train, validation)`` of two new lists.
    """
    shuffled = list(data)
    if seed is None:
        random.shuffle(shuffled)
    else:
        # A private generator keeps the split reproducible without reseeding
        # the global random state.
        random.Random(seed).shuffle(shuffled)

    train_size = int(S * len(shuffled))

    return shuffled[:train_size], shuffled[train_size:]

In [16]:
# The datasets need a tokenizer, but on a fresh kernel `tokenizer` is only
# created in the model cell further down, so this cell used to fail with a
# NameError under Restart & Run All. Build it here.
# Keep the checkpoint name in sync with `checkpoint` in the model cell.
tokenizer = AutoTokenizer.from_pretrained('gpt2')
tokenizer.add_special_tokens(SPECIAL_TOKENS)

train_data, val_data = split_data(data,TRAIN_DEV_SPLIT)

train_dataset = myDataset(train_data, tokenizer)
val_dataset = myDataset(val_data, tokenizer)

f'There are {len(train_dataset) :,} samples for training, and {len(val_dataset) :,} samples for validation testing'

'There are 12,240 samples for training, and 3,060 samples for validation testing'

In [17]:
#Define our model

checkpoint = 'gpt2'   #choose this because it was causal LM and trained on next word predictor

# Tokenizer with the extra special tokens used to frame every sample.
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
tokenizer.add_special_tokens(SPECIAL_TOKENS)

# Tell the model config which ids the new special tokens received.
config = AutoConfig.from_pretrained(checkpoint,
                                    bos_token_id=tokenizer.bos_token_id,
                                    eos_token_id=tokenizer.eos_token_id,
                                    sep_token_id=tokenizer.sep_token_id,
                                    pad_token_id=tokenizer.pad_token_id,
                                    output_hidden_states=False)

model = AutoModelForPreTraining.from_pretrained(checkpoint, config=config)

# The vocabulary grew by the special tokens, so the embedding table must too.
model.resize_token_embeddings(len(tokenizer))

# - Freeze all layers except last n:

for parameter in model.parameters():
    parameter.requires_grad = False

# Derive the block count from the model instead of hard-coding 12, so the
# same cell works for checkpoints with a different depth.
n_blocks = len(model.transformer.h)
first_trainable = max(0, n_blocks - UNFREEZE_LAST_N)

for i, m in enumerate(model.transformer.h):
    #Only un-freeze the last n transformer blocks
    if i >= first_trainable:
        for parameter in m.parameters():
            parameter.requires_grad = True

# The final layer norm and the LM head are always trained.
for parameter in model.transformer.ln_f.parameters():
    parameter.requires_grad = True

# NOTE(review): GPT-2 presumably ties lm_head to the token embedding, so this
# would also make the resized embedding matrix trainable. Confirm.
for parameter in model.lm_head.parameters():
    parameter.requires_grad = True

Could not locate the tokenizer configuration file, will try to use the model config instead.
loading configuration file https://huggingface.co/gpt2/resolve/main/config.json from cache at C:\Users\hoang/.cache\huggingface\transformers\fc674cd6907b4c9e933cb42d67662436b89fa9540a1f40d7c919d0109289ad01.7d2e0efa5ca20cef4fb199382111e9d3ad96fd77b849e1d4bed13a66e1336f51
Model config GPT2Config {
  "activation_function": "gelu_new",
  "architectures": [
    "GPT2LMHeadModel"
  ],
  "attn_pdrop": 0.1,
  "bos_token_id": 50256,
  "embd_pdrop": 0.1,
  "eos_token_id": 50256,
  "gradient_checkpointing": false,
  "initializer_range": 0.02,
  "layer_norm_epsilon": 1e-05,
  "model_type": "gpt2",
  "n_ctx": 1024,
  "n_embd": 768,
  "n_head": 12,
  "n_inner": null,
  "n_layer": 12,
  "n_positions": 1024,
  "resid_pdrop": 0.1,
  "scale_attn_weights": true,
  "summary_activation": null,
  "summary_first_dropout": 0.1,
  "summary_proj_to_labels": true,
  "summary_type": "cls_index",
  "summary_use_proj": true

In [18]:
#Define the training loop

# Trainer configuration: evaluate once per epoch, same batch size for
# training and evaluation.
training_args = TrainingArguments(
    output_dir="./output/",
    num_train_epochs=EPOCHS,
    per_device_train_batch_size=TRAIN_BATCHSIZE,
    per_device_eval_batch_size=TRAIN_BATCHSIZE,
    evaluation_strategy="epoch",
    no_cuda=True,                                            #disable this  if u want to use CUDA
    warmup_steps=int(WARMUP_STEPS),                          # a step count must be an int (the setting is written as 1e2)
    learning_rate=LR,
    adam_epsilon=EPS,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    tokenizer=tokenizer
)


# Fine-tune, then write the final model to output_dir.
trainer.train()
trainer.save_model()

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).
***** Running training *****
  Num examples = 12240
  Num Epochs = 4
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 6120
Automatic Weights & Biases logging enabled, to disable set os.environ["WANDB_DISABLED"] = "true"


Epoch,Training Loss,Validation Loss


KeyboardInterrupt: 

In [11]:
#### Testing
# Condition the generation on a rating prompt, framed exactly like the
# training samples: <bos> rating <sep>
rating = 'cool: 0; funny: 0; useful: 0; stars: 5.0;'

prompt = SPECIAL_TOKENS['bos_token'] + rating + SPECIAL_TOKENS['sep_token']

# Add a batch dimension and put the prompt on the same device as the model,
# so the cell also works when training ran on a GPU.
generated = torch.tensor(tokenizer.encode(prompt)).unsqueeze(0).to(model.device)

model.eval();

# Top-k / top-p (nucleus) sampling (5 samples):
sample_outputs = model.generate(generated,
                                do_sample=True,
                                min_length=50,
                                max_length=MAXLEN,     # same length limit as the training samples
                                top_k=30,
                                top_p=0.7,
                                temperature=0.9,
                                repetition_penalty=2.0,
                                num_return_sequences=5
                                )

for i, sample_output in enumerate(sample_outputs):
    text = tokenizer.decode(sample_output, skip_special_tokens=True)
    print("{}: {}\n\n".format(i+1,  text))

1: cool: 0; funny: 0; useful: 0; stars: 5.0; •
•

1TheI'm the /My/ II, myAmisthemy MyOdiSiTilMimEIISinmDiamandAndWIIsitIShR meInr mOniondiaLerIImp sJedjKjW\sISimilhI \o \t and (or) "f"I amanI isAniStlApiCn $MyNelMicPlyHgwI iIeSiI eIIIIWeII II SII S -F3 H- E5G4 S6 D2S8 6S9 7B aI*a


2: cool: 0; funny: 0; useful: 0; stars: 5.0; reddit_ _
 / �• •

• // The world is and the universeis http://wwwandtheworldsmanlycom / www/aboutTheWorldSleepingPics - You're a self-proclaimed "true" true person, you are an atheist who has been yourself since day 1st of July,You have come to believe in God but also be afraidofYou were born on JanuaryYouYou


3: cool: 0; funny: 0; useful: 0; stars: 5.0;
 //////////////////////////////////////////////////////////////// /
 /////////////////////////////////// – The, theThe (the)Iand my I-iAnd aDot andaSillyMyPonyCookieB


4: cool: 0; funny: 0; useful: 0; stars: 5.0;
 // /r/, the #reddit_subreddits subreddit// A, a\sonethe \wTheWiccaDotcomiR TheCit TheS1AofThingGia