In [None]:
from transformers import GPT2LMHeadModel, GPT2TokenizerFast
from fastai.text.all import *
import pandas as pd
import numpy as np
import torch

# Data

In [None]:
# Forward slashes resolve on Windows, Linux and macOS alike; the previous
# backslash-separated raw string only resolved on Windows.
path_data = 'shopee_data/new_shopee_1.csv'
df = pd.read_csv(path_data)
# Drop the unused `father_name` column and expose the review text under the
# `reviewText` name that the downstream cells read.
df = df.drop(columns=['father_name']).rename(columns={'comment':'reviewText'})
# Rows without review text cannot be tokenized (pandas loads them as NaN, not str),
# so remove them here and renumber the rows so the index stays 0..n-1.
df = df.dropna(subset=['reviewText']).reset_index(drop=True)

In [None]:
from sklearn.model_selection import train_test_split

# Hold out a fraction of the rows for validation; the fixed seed makes the
# split reproducible across runs.
holdout_fraction, split_seed = 0.1, 2021
df_train, df_valid = train_test_split(df, test_size=holdout_fraction, random_state=split_seed)

In [None]:
# Stack the review texts into one array: training rows first, validation rows after.
train_texts = df_train["reviewText"].values
valid_texts = df_valid["reviewText"].values
all_texts = np.concatenate((train_texts, valid_texts))

In [None]:
# Sanity check: number of texts in the combined train + valid array.
all_texts.shape

# Model

In [None]:
# Identifier of the pretrained Vietnamese GPT-2 checkpoint; the language model
# and its tokenizer are both loaded from this same identifier.
pretrained_weights = 'NlpHUST/gpt2-vietnamese'
model = GPT2LMHeadModel.from_pretrained(pretrained_weights)
tokenizer = GPT2TokenizerFast.from_pretrained(pretrained_weights)

In [None]:
class TransformersTokenizer(Transform):
    "Transform that maps raw text to a tensor of token ids and back, using a wrapped tokenizer."

    def __init__(self, tokenizer):
        # Tokenizer object providing `tokenize`, `convert_tokens_to_ids` and `decode`.
        self.tokenizer = tokenizer

    def encodes(self, x):
        "Tokenize the text `x` and return its token ids as a tensor."
        pieces = self.tokenizer.tokenize(x)
        token_ids = self.tokenizer.convert_tokens_to_ids(pieces)
        return tensor(token_ids)

    def decodes(self, x):
        "Turn a tensor of token ids `x` back into text, wrapped as a `TitledStr`."
        ids_on_cpu = x.cpu().numpy()
        return TitledStr(self.tokenizer.decode(ids_on_cpu))

In [None]:
# `all_texts` is laid out as [training texts..., validation texts...], so the
# splits must be *positions* in that array. The index labels of `df_train` /
# `df_valid` are the shuffled row labels of the original frame and do not line
# up with those positions; using them would make the train/valid subsets of
# `tls` differ from `df_train` / `df_valid`.
n_train = len(df_train)
splits = [list(range(n_train)), list(range(n_train, len(all_texts)))]
tls = TfmdLists(all_texts, TransformersTokenizer(tokenizer), splits=splits, dl_type=LMDataLoader)

In [None]:
# Inspect the first item of the training subset (presumably the token-id tensor
# produced by TransformersTokenizer — confirm from the cell output).
tls.train[0]

In [None]:
# Same inspection for the first item of the validation subset.
tls.valid[0]

In [None]:
# Look at the first underlying item of the training subset via `.items`
# (presumably the raw, untokenized review text — confirm from the cell output).
tls.train.items[0]

In [None]:
# Apply the transform pipeline by hand to the first training item and the first
# validation item, and show the shape of each result as a pair.
tuple(tls.tfms(subset.items[0]).shape for subset in (tls.train, tls.valid))

In [None]:
# Display item 0 of the training subset in decoded (human-readable) form.
show_at(tls.train, 0)

In [None]:
# Display item 0 of the validation subset in decoded (human-readable) form.
show_at(tls.valid, 0)

In [None]:
# Batch size and sequence length passed to the language-model dataloaders.
bs = 4
sl = 256
dls = tls.dataloaders(seq_len=sl, bs=bs)

In [None]:
# Preview at most two samples from a batch to check the dataloaders visually.
dls.show_batch(max_n=2)

#### Fine-tuning the model

In [None]:
class DropOutput(Callback):
    "Callback that keeps only the first element of the model output as the prediction."

    def after_pred(self):
        # The model output is indexable; element 0 is what the loss function
        # should receive (presumably the logits — confirm against the model's
        # output type).
        first_output = self.pred[0]
        self.learn.pred = first_output

In [None]:
# Build the learner: flattened cross-entropy loss, perplexity as the reported
# metric, DropOutput to reduce the model output to a single prediction, and
# mixed-precision training enabled through `to_fp16()`.
learn = Learner(
    dls,
    model,
    loss_func=CrossEntropyLossFlat(),
    metrics=Perplexity(),
    cbs=[DropOutput],
).to_fp16()

In [None]:
# Baseline: evaluate the pretrained model on the validation set before any
# fine-tuning, to compare against the numbers reported after training.
learn.validate()

In [None]:
# Run the learning-rate finder to help pick the rate used for fine-tuning below.
learn.lr_find()

In [None]:
# Fine-tune for one epoch with the one-cycle schedule at a maximum learning rate of 1e-4.
learn.fit_one_cycle(1, 1e-4)

# Test model

In [None]:
# Persist the fine-tuned learner under the name "gpt2-finetuned-shopee" so it can be reloaded later.
learn.save("gpt2-finetuned-shopee")

In [None]:
# Show the first validation row (presumably the source of the prompt used in the next cell).
df_valid.head(1)

In [None]:
prompt = "Đặt 6 túi thì 1 túi 14 inch tốt, 5 túi kia cầm ọp ẹp, đáy thì chả có tí gì để gọi là chống sock k biết shop có gửi nhầm k nhưng thôi 23k thì cũng k nên đòi hỏi mua tạm và k có lần sau."

prompt_ids = tokenizer.encode(prompt)
# Place the prompt on whatever device the model currently lives on instead of
# hard-coding CUDA, so the cell also runs on CPU-only machines and never
# produces a device mismatch with the model.
model_device = next(learn.model.parameters()).device
# `[None]` adds the leading batch dimension expected by `generate`.
inp = tensor(prompt_ids)[None].to(model_device)
inp.shape

In [None]:
# `max_length` counts the prompt tokens as well, so a fixed value of 100 leaves
# little or no room to generate when the prompt is long. Size it relative to
# the prompt instead, allowing up to `max_new_tokens` generated tokens.
max_new_tokens = 100
# `temperature` was dropped: it only affects sampling (`do_sample=True`) and is
# ignored by plain beam search, so passing it here was misleading.
preds = learn.model.generate(
    inp,
    max_length=inp.shape[1] + max_new_tokens,
    num_beams=5,
    early_stopping=True,
    no_repeat_ngram_size=2,
)
# Decode the best beam (prompt followed by its continuation) back to text.
tokenizer.decode(preds[0].cpu().numpy())