## 1. Import libraries, check versions and set variables

In [None]:
import csv
import io
import json
import numpy as np
import os
import pandas as pd
import random
import requests
import spacy
import spacy_transformers
import sys
import time
import torch

from keybert import KeyBERT
from keyphrase_vectorizers import KeyphraseCountVectorizer, KeyphraseTfidfVectorizer
from numpy import NaN
from os.path import exists as file_exists
from sentence_transformers import SentenceTransformer
from torch.utils.data import Dataset
from transformers import AutoTokenizer, AutoConfig, AutoModelForPreTraining, TrainingArguments, Trainer

from functions import *

In [None]:
print(f"Python version: {sys.version}")

print(f"PyTorch version: {torch.__version__}")

# Remember once whether CUDA is usable; later cells read this flag.
cuda_avbl = torch.cuda.is_available()
if not cuda_avbl:
    print("Cuda not available")
else:
    print("Cuda available")
    spacy.require_gpu()                                                                                             # let spaCy use the GPU as well

In [None]:
# Language of the training texts; "English" is the alternative.
LANGUAGE        = "German"

# Base checkpoint for the chosen language
# (English alternatives: gpt2, gpt2-medium, gpt2-large, gpt2-xl).
MODEL           = 'dbmdz/german-gpt2' if LANGUAGE == "German" else 'gpt2'

# Whether Wikipedia summaries of the keywords are added to the training data.
USE_WIKI        = False

# Extra tokens that structure one training sample.
SPECIAL_TOKENS  = dict(
    bos_token="<|BOS|>",                                                                                            # beginning of a sequence
    eos_token="<|EOS|>",                                                                                            # end of a sequence
    unk_token="<|UNK|>",                                                                                            # placeholder for unknown tokens
    pad_token="<|PAD|>",                                                                                            # filler for sequences shorter than MAXLEN
    sep_token="<|SEP|>",                                                                                            # separates title, keywords and text
)

# Maximum token count for GPT-2.
MAXLEN          = 1024

# Ratio for splitting the data into training and validation parts.
TRAIN_SIZE      = 0.8

# Number of trailing transformer layers to un-freeze for training (6 -> half of all layers).
UNFREEZE_LAST_N = 6

<br>

## 2. Loading and preprocessing data

Load the raw descriptions from an Excel file. <br>
Extract keywords from the description texts with KeyBERT. <br>
Save all information in a dictionary. <br>
 <br>
(To save time, the complete dictionary can be loaded from a JSON file.)

In [None]:
%%time

if not file_exists(""):                                                                                             # check if file with all needed data already exists
    df = get_raw_data("") #not included in appendix                                                                 # execute function for getting origin data
    # Build {id: [product name, description]}. Name and description are taken by
    # position (second and third column) through .iloc: plain row[1] / row[2] on a
    # label-indexed Series relies on pandas' deprecated positional fallback.
    # NOTE(review): assumes the columns are not labelled with the integers 1 and 2 - confirm.
    dic = {row["id"]: [row.iloc[1], row.iloc[2]] for _, row in df.iterrows()}
    data = create_keywords_bert(dic, cuda_avbl)                                                                     # execute function for extracting keywords

    # The handles are not called `input` / `output`, so the builtin input() stays usable in the notebook.
    with open("", "w") as json_out:
        json.dump(data, json_out)                                                                                   # save dict with all needed information as json

# Always read the data back from the JSON file, also right after creating it (saves time on re-runs).
with open('') as json_in:
    data = json.load(json_in)

Only needed if additional Wikipedia data should be used. <br>
<br>
All keywords extracted from the descriptions are checked for a Wikipedia page. <br>
If one exists, the word and the Wikipedia summary are saved as a new entry in the dictionary. <br>
Keywords are then extracted from the summaries as well. <br>
<br>
(The complete result is saved in a JSON file.)

In [None]:
%%time
if USE_WIKI:
    if not file_exists(""):                                                                                         # combined data (descriptions + wiki) not cached yet

        if not file_exists(""):
            # Imported only for its side effects when the second file is missing.
            # NOTE(review): presumably this script produces the wiki summary file read below - confirm.
            import get_wiki_smry

        df_wiki = get_raw_data("")
        # Build {id: [title, summary]} by column position; .iloc avoids pandas' deprecated
        # positional fallback of row[1] / row[2] on a label-indexed Series.
        # NOTE(review): assumes the same column layout as the product data - confirm.
        dic_wiki = {row["id"]: [row.iloc[1], row.iloc[2]] for _, row in df_wiki.iterrows()}
        data_wiki = create_keywords_bert(dic_wiki, cuda_avbl)                                                       # extract keywords from the wiki texts
        data.update(data_wiki)                                                                                      # merge wiki entries into the product data

        # The handles are not called `input` / `output`, so the builtin input() stays usable in the notebook.
        with open("", "w") as json_out:
            json.dump(data, json_out)

    # Always read the combined data back from the JSON file.
    with open('') as json_in:
        data = json.load(json_in)

<br>

## 3. Training Model

First, a basic pretrained model is loaded.

In [None]:
%%time

# Create the tokenizer for the chosen base checkpoint; the custom special tokens are handed
# to the helper from `functions` (presumably so they get registered - confirm there).
tokenizer = get_tokenier(MODEL, special_tokens=SPECIAL_TOKENS)
# Load the base model through the helper from `functions`. No load_model_path is passed here,
# so this is presumably the plain pretrained checkpoint - confirm in the helper.
model = get_model(MODEL, cuda_avbl, tokenizer, special_tokens=SPECIAL_TOKENS)

Define a dataset class for the training and validation data.

In [None]:
class myDataset(Dataset):
    """Dataset that turns (title, text, keywords) records into tokenized samples.

    Every sample is the string ``BOS + title + SEP + keywords + SEP + text + EOS``
    (tokens taken from the module-level ``SPECIAL_TOKENS``), encoded with the
    tokenizer given to the constructor and padded / truncated to ``MAXLEN``.

    Args:
        data: Mapping whose values are indexable as ``[title, text, keywords]``;
            the keys are not used. ``keywords`` must offer ``.copy()`` (e.g. a list).
        tokenizer: Callable used to encode a sample. It is called with the sample
            string plus ``truncation``, ``max_length`` and ``padding`` keyword
            arguments and must return a mapping with ``"input_ids"`` and
            ``"attention_mask"``.
        randomize: If True, each access uses a random-length, shuffled prefix of
            the keywords instead of the full list.
    """

    def __init__(self, data, tokenizer, randomize=True):
        records = list(data.values())

        self.randomize = randomize
        self.tokenizer = tokenizer
        self.title     = [record[0] for record in records]
        self.text      = [record[1] for record in records]
        self.keywords  = [record[2] for record in records]


    @staticmethod
    def join_keywords(keywords, randomize=True):
        """Join keywords with commas.

        With ``randomize`` a random number M in ``0..len(keywords)`` is drawn, only
        the first M keywords are kept and their order is shuffled. The slice is a
        new list, so the caller's list is left untouched.
        """
        if randomize:
            keep = random.choice(range(len(keywords) + 1))
            keywords = keywords[:keep]
            random.shuffle(keywords)

        return ','.join(keywords)


    def __len__(self):
        """Number of samples."""
        return len(self.text)


    def __getitem__(self, i):
        """Return the i-th sample as a dict of tensors: label, input_ids, attention_mask."""
        keywords = self.keywords[i].copy()
        kw = self.join_keywords(keywords, self.randomize)

        # Not named `input`, so the builtin is not shadowed.
        sample = SPECIAL_TOKENS["bos_token"] + self.title[i] + \
                 SPECIAL_TOKENS["sep_token"] + kw + SPECIAL_TOKENS["sep_token"] + \
                 self.text[i] + SPECIAL_TOKENS["eos_token"]

        # Use the tokenizer stored on the instance. The previous code called the
        # notebook-global `tokenizer`, which ignored the constructor argument.
        encodings_dict = self.tokenizer(sample,
                                        truncation=True,
                                        max_length=MAXLEN,
                                        padding="max_length")

        input_ids = encodings_dict["input_ids"]
        attention_mask = encodings_dict["attention_mask"]

        # The label is a copy of the input ids.
        return {"label": torch.tensor(input_ids),
                "input_ids": torch.tensor(input_ids),
                "attention_mask": torch.tensor(attention_mask)}

Create the training and validation datasets in the predefined ratio with the custom dataset class.

In [None]:
# Split the loaded records by the configured ratio (helper from `functions`).
train_data, val_data = split_data(data, TRAIN_SIZE)

# Wrap both parts in the dataset class; only the training set gets randomized keywords.
train_dataset, val_dataset = (
    myDataset(train_data, tokenizer),
    myDataset(val_data, tokenizer, randomize=False),
)

print(f"There are {len(train_dataset) :,} samples for training, and {len(val_dataset) :,} samples for validation testing")

Freeze n layers of the model so that they are not retrained with our own data and custom configuration. <br>
This reduces the training duration and retains the general language information gained from pretraining.

In [None]:
# Start from a fully frozen model ...
for parameter in model.parameters():
    parameter.requires_grad = False

# ... then un-freeze only the last UNFREEZE_LAST_N transformer blocks.
# The block count is read from the model instead of being hard-coded as 12, so the
# selection stays correct for checkpoints with more blocks (e.g. the gpt2-medium /
# gpt2-large / gpt2-xl options named in the configuration cell).
n_blocks = len(model.transformer.h)
for i, m in enumerate(model.transformer.h):
    if i >= n_blocks - UNFREEZE_LAST_N:
        for parameter in m.parameters():
            parameter.requires_grad = True

# The final layer norm and the language-model head are always trained.
for parameter in model.transformer.ln_f.parameters():
    parameter.requires_grad = True

for parameter in model.lm_head.parameters():
    parameter.requires_grad = True

Set training configurations such as epochs, batch size and learning rate. <br>
The training arguments are passed to the model trainer. <br>
<br>
!!! Runtime can range from several hours to days, depending on the available hardware !!! <br>
If CUDA is available, the duration can be reduced to under one hour.


In [None]:
%%time

os.environ["WANDB_DISABLED"] = "true"                                                                               # switch off Weights & Biases logging

# ---- training hyper-parameters ----
OUT_PATH        = "model_with_wiki/" if USE_WIKI else "model/"                                                      # folder the model is saved to
EPOCHS          = 4
TRAIN_BATCHSIZE = 4
BATCH_UPDATE    = 16
STRATEGY        = "epoch" #{"steps"}
USE_FP16        = True if cuda_avbl else False                                                                      # 16-bit training only with CUDA
WARMUP_STEPS    = 1e2
LR              = 5e-4
EPS             = 1e-8


# NOTE(review): newer transformers releases rename `evaluation_strategy` to `eval_strategy`
# - confirm against the installed version.
training_args = TrainingArguments(
    # where and how often to save / evaluate
    output_dir=OUT_PATH,
    evaluation_strategy=STRATEGY,                                                                                   # when the model is evaluated
    save_strategy=STRATEGY,                                                                                         # when the model is saved
    save_total_limit=1,                                                                                             # maximum number of saved models
    load_best_model_at_end=True,
    # amount of training
    num_train_epochs=EPOCHS,                                                                                        # number of training epochs
    per_device_train_batch_size=TRAIN_BATCHSIZE,                                                                    # batch size per GPU/CPU core
    per_device_eval_batch_size=TRAIN_BATCHSIZE,
    gradient_accumulation_steps=BATCH_UPDATE,                                                                       # number of steps to accumulate the gradients
    # precision
    fp16=USE_FP16,                                                                                                  # use 16-bit precision training or not
    fp16_opt_level="O1",
    # optimizer
    warmup_steps=WARMUP_STEPS,                                                                                      # steps from 0 up to the learning rate
    learning_rate=LR,                                                                                               # step size at each iteration
    adam_epsilon=EPS,                                                                                               # guards the adaptive learning rate against division by zero
    weight_decay=0.01,                                                                                              # regularization parameter to shrink model weights
    # display
    disable_tqdm=False,                                                                                             # keep the progress bar while training
)


# trainer = model + arguments + data + tokenizer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    tokenizer=tokenizer,
)


trainer.train()
trainer.save_model()

<br>

## 4. Generating Text

Loading custom trained model.

In [None]:
# Load the fine-tuned weights from the folder the training cell writes to:
# "model_with_wiki" when wiki data is used, otherwise "model".
# os.path.join builds the path portably; the previous hard-coded backslash path
# only resolved on Windows and always pointed at "model", even with USE_WIKI set.
model_dir = "model_with_wiki" if USE_WIKI else "model"
tokenizer = get_tokenier(MODEL, special_tokens=SPECIAL_TOKENS)
model = get_model(MODEL, cuda_avbl, tokenizer, special_tokens=SPECIAL_TOKENS,
                  load_model_path=os.path.join(model_dir, "pytorch_model.bin"))

Set product title and additional keywords.

In [None]:
title = "L'Oréal Age Perfect Rosé Tagescreme"
keywords = ["Tagescreme", "Reinigung", "Gesicht"]
# randomize=False: the keywords are joined with commas in the given order, nothing is dropped or shuffled.
kw = myDataset.join_keywords(keywords, randomize=False)

# Same layout as the training samples, stopping after the second separator:
# BOS + title + SEP + keywords + SEP
prompt = "".join([
    SPECIAL_TOKENS["bos_token"], title,
    SPECIAL_TOKENS["sep_token"], kw,
    SPECIAL_TOKENS["sep_token"],
])

# Encode the prompt and add a leading batch dimension.
generated = torch.tensor(tokenizer.encode(prompt)).unsqueeze(0)
if cuda_avbl:
    device = torch.device("cuda")
    generated = generated.to(device)                                                                                # move the prompt to the GPU

model.eval()
print("Generator ready")

Generate multiple sample texts. <br>
Beam search: keeps several (`num_beams`) candidate sequences at each step instead of only the single "best" token. <br>
After generating further steps, it compares the candidate phrases, which can differ in length and tokens.

In [None]:
# Number of prompt tokens; `generated` has shape (1, prompt_length).
prompt_len = generated.shape[1]

sample_outputs = model.generate(inputs=generated,
                                max_length=MAXLEN,                                                                  # max length of generated text
                                min_length=100,                                                                     # min length of generated text
                                do_sample=True,                                                                     # sample instead of always taking the most probable token
                                early_stopping=True,                                                                # stop beam search once num_beams sentences are finished
                                num_beams=5,                                                                        # number of candidates beam search keeps
                                temperature=0.9,                                                                    # scales probabilities: lower = more conservative, higher = more diverse
                                top_k=50,                                                                           # number of most probable tokens to keep
                                top_p=0.7,                                                                          # keep only the most probable tokens up to this cumulative probability
                                repetition_penalty=5.0,                                                             # discourage sentences that repeat themselves
                                num_return_sequences=2                                                              # number of returned descriptions
                                )

for i, sample_output in enumerate(sample_outputs):
    # Each returned sequence starts with the prompt tokens. Drop them by token count
    # instead of cutting len(title) + len(keywords) characters off the decoded string,
    # which silently breaks when decoding does not reproduce the prompt text exactly.
    text = tokenizer.decode(sample_output[prompt_len:], skip_special_tokens=True)
    print("{}: {}\n\n".format(i+1,  text))