In [2]:
import torch
from transformers import GPT2LMHeadModel, GPT2TokenizerFast
from fastai.text.all import *
import pandas as pd
import gc
import numpy as np
from sklearn.model_selection import train_test_split
from tqdm import tqdm
from IPython.display import clear_output
from transformers.utils import logging
logging.set_verbosity_error()
import warnings
warnings.filterwarnings(action="ignore")

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")  # Select the GPU if one is available, otherwise fall back to the CPU

In [3]:
# Identifier of the pretrained Vietnamese GPT-2 checkpoint; the tokenizer and
# the language model below are both loaded from it.
pretrained_weights = 'NlpHUST/gpt2-vietnamese'
tokenizer = GPT2TokenizerFast.from_pretrained(pretrained_weights)
# The model is moved to `device` (GPU when available) right after loading.
model = GPT2LMHeadModel.from_pretrained(pretrained_weights).to(device) 

In [4]:
def random_seed(seed_value, use_cuda):
    """Seed every random number generator this notebook relies on.

    Seeds Python's ``random`` module, NumPy's global generator and PyTorch's
    CPU generator. When ``use_cuda`` is true, the CUDA generators are seeded
    as well and cuDNN is switched to deterministic, non-benchmarking mode.

    Args:
        seed_value: Integer seed applied to all generators.
        use_cuda: Whether to also seed CUDA and configure cuDNN.

    Note:
        Remember to use ``num_workers=0`` when creating the DataBunch.
    """
    # Imported here so the function does not depend on `random` leaking into
    # the namespace through `from fastai.text.all import *`.
    import random

    random.seed(seed_value)  # Python
    np.random.seed(seed_value)  # NumPy (CPU)
    torch.manual_seed(seed_value)  # PyTorch (CPU)
    if use_cuda:
        torch.cuda.manual_seed(seed_value)
        torch.cuda.manual_seed_all(seed_value)  # every GPU
        torch.backends.cudnn.deterministic = True  # needed for reproducibility
        torch.backends.cudnn.benchmark = False

In [5]:
# NOTE(review): `path` is not referenced by any cell visible here - confirm before removing.
path = ''
# Load the raw review data; the CSV location is relative to the notebook's working directory.
df = pd.read_csv(r'shopee_data/new_shopee_1.csv')
# Display the column names as the cell output.
df.columns

Index(['comment', 'father_name'], dtype='object')

In [6]:
# Rename the raw columns, attach the token count of every review (as measured
# by the GPT-2 tokenizer) and keep only the three columns used downstream.
sample_df = (
    df.rename(columns={'father_name': 'category', 'comment': 'reviewText'})
    .assign(
        sentenceLength=lambda frame: frame["reviewText"].apply(
            lambda review: len(tokenizer.encode(review))
        )
    )
    [['category', 'reviewText', 'sentenceLength']]
)
sample_df.head()

model.safetensors:   0%|          | 0.00/510M [00:00<?, ?B/s]

Unnamed: 0,category,reviewText,sentenceLength
0,Balo & T√∫i V√≠ Nam,Hskbiwnsbjsbsnnx ƒëj xbxjdnsnksksnsnjxjzbsjwmlanbnnnbbh j HD sjsnsndjsj,43
1,Balo & T√∫i V√≠ Nam,Ch·∫•t li·ªáu:ch·∫Øc ch·∫Øn V·∫£i\nM√†u s·∫Øc:ƒëen\nƒê√∫ng v·ªõi m√¥ t·∫£:ƒë√∫ng\n\nM√¨nh th√¥ng c·∫£m cho vi·ªác giao h√†ng l√¢u v√≠ c·∫ßm tay OK ƒë·∫πp l·∫Øm nhoa ü•π ƒë√∫ng v·ªõi h√¨nh ·∫£nh c·ªßa shop mong shop c·ªë g·∫Øng ch·ªó ƒëo·∫°n kh√¢u ch·ªâ v√≠ m√¨nh b·ªã bung m·ªôt √≠t nh∆∞ng ko sao ƒë·∫πp l·∫Øm nhoa khuy√™n n√™n mua nhoa gi√° ti·ªÅn h·ª£p l√≠ =]],79
2,Balo & T√∫i V√≠ Nam,T∆∞∆°ng t·ª± nh∆∞ ad: C√≥\nCh·∫•t l∆∞·ª£ng: t·ªët\nK√≠ch th∆∞·ªõc: t·ªët\n\nNh∆∞ng nh·ªØng g√¨ t√¥i mong ƒë·ª£i l√† n∆°i ƒë·ªÉ ƒë·∫∑t t√†i li·ªáu v√† n√≥ kh√¥ng c√≥!,35
3,Balo & T√∫i V√≠ Nam,ƒê√∫ng v·ªõi m√¥ t·∫£:s·∫£n ph·∫©m ƒë√∫ng nh∆∞ m√¥ t·∫£\nM√†u s·∫Øc:ƒë√∫ng m√†u ƒë√£ ƒë·∫∑t\nCh·∫•t li·ªáu:ok,24
4,Balo & T√∫i V√≠ Nam,"giao h√†ng nhanh m√¨nh ƒë·∫∑t 2 ng√†y l√† t·ªõi √πi\nƒë√≥ng g√≥i kƒ© c√†ng, t·∫∑ng sticker d·ªÖ th∆∞∆°ng\nbalo ch·ªëng n∆∞·ªõc, may ƒë·∫πp k c√≥ ch·ªâ th·ª´a",31


In [7]:
# Force a garbage-collection pass; the value shown below the cell is what gc.collect() returned.
gc.collect()

20

In [8]:
class DropOutput(Callback):
    """Callback that keeps only the first element of the model's prediction.

    NOTE(review): presumably the wrapped model returns a tuple-like output
    whose first item is the logits the loss function expects - confirm.
    """

    def after_pred(self):
        """Replace ``learn.pred`` with its first element after the forward pass."""
        first_output = self.pred[0]
        self.learn.pred = first_output

In [None]:
# Wrap the GPT-2 model in a fastai Learner. No DataLoaders are supplied
# (dls=None); in the cells visible here the Learner is only used to load
# weights and to expose `learn.model`.
learn = Learner(
    dls=None,
    model=model,
    loss_func=CrossEntropyLossFlat(),
    cbs=[DropOutput],
    metrics=Perplexity(),
).to_fp16()

In [11]:
# NOTE(review): absolute, machine-specific location of the fine-tuned weights;
# it has to be adjusted before the notebook can run on another machine.
model_finetuned_path = r"D:\DAP\code\gpt2-finetuned-shopee"

# Load the fine-tuned weights into the Learner's model. The model is then
# placed on `device` rather than via an unconditional `.cuda()`, so the cell
# honours the CPU fallback chosen when `device` was created.
learn.load(model_finetuned_path)
learn.model.to(device)

GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(50257, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.0, inplace=False)
    (h): ModuleList(
      (0-11): 12 x GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D(nf=2304, nx=768)
          (c_proj): Conv1D(nf=768, nx=768)
          (attn_dropout): Dropout(p=0.0, inplace=False)
          (resid_dropout): Dropout(p=0.0, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D(nf=3072, nx=768)
          (c_proj): Conv1D(nf=768, nx=3072)
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.0, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
  )
  (lm_head): Linear(in_features=768, out_features=50257, bias=False)
)

In [12]:
# Number of leading words of a real review that are used as the generation prompt.
N_INITIAL_WORDS = 5
# Overall budget of reviews; each length bin receives a share proportional to its size.
N_REVIEWS = 36000
# Total number of reviews available for sampling.
LEN_SAMPLE_DF = len(sample_df)
# Edges of the token-length bins: consecutive integers 5, 6, ..., 299.
BINS = [*range(5, 300)]

In [None]:
# Build the dataset of original ("OR") and computer-generated ("CG") reviews.
# For every token-length bin the real reviews are split in half: one half is
# kept as-is, the other half only supplies prompts for the fine-tuned model.
generated_df_rows = []   # rows of [category, text, label]
generation_errors = []   # (bin lower bound, row index or None, error) for every failure

# Sampling settings shared by every generate() call.
generation_kwargs = dict(do_sample=True, top_k=0, top_p=0.92, temperature=0.7)


def _generate_review(text, max_len):
    """Generate one review from the first N_INITIAL_WORDS words of `text`.

    Args:
        text: A real review whose leading words are used as the prompt.
        max_len: Lower edge of the current length bin; ``max_len + 1`` is
            passed to ``generate`` as ``max_length``.

    Returns:
        The decoded generated text with special tokens removed.
    """
    prompt = " ".join(text.strip().split(" ")[:N_INITIAL_WORDS])
    prompt_ids = tokenizer.encode(prompt)
    # Add a leading batch dimension and create the tensor directly on `device`.
    inp = torch.tensor(prompt_ids, device=device)[None]

    # Reseed before every call, exactly as before; only seed CUDA when the
    # model actually lives on a CUDA device.
    random_seed(seed_value=2021, use_cuda=device.type == "cuda")

    preds = learn.model.generate(inp, max_length=max_len + 1, **generation_kwargs)
    return tokenizer.decode(preds[0], skip_special_tokens=True)


for i in tqdm(range(len(BINS) - 1)):
    clear_output(wait=True)
    sample_row_df = sample_df[
        (sample_df["sentenceLength"] >= BINS[i]) & (sample_df["sentenceLength"] < BINS[i + 1])
    ]
    # At least two rows are needed so that both halves of the split are non-empty.
    if len(sample_row_df) < 2:
        continue

    bin_proportion = sample_row_df.shape[0] / float(LEN_SAMPLE_DF)
    n_reviews_to_generate = int(bin_proportion * N_REVIEWS) + 1

    try:
        or_df, cg_df = train_test_split(sample_row_df, test_size=0.5, random_state=2021)
        sample_reviews_df = cg_df.sample(n=min(n_reviews_to_generate, len(cg_df)), random_state=2021)
        or_sample_reviews_df = or_df.sample(n=min(n_reviews_to_generate, len(or_df)), random_state=2021)
    except Exception as e:
        generation_errors.append((BINS[i], None, repr(e)))
        continue

    max_len = BINS[i]

    for index, row in sample_reviews_df.iterrows():
        # Catch failures per prompt: one bad prompt must not discard the
        # remaining prompts or the original reviews of the same bin.
        try:
            decoded_preds = _generate_review(row["reviewText"], max_len)
        except Exception as e:
            generation_errors.append((BINS[i], index, repr(e)))
            continue
        generated_df_rows.append([row["category"], decoded_preds, "CG"])

    for _, row in or_sample_reviews_df.iterrows():
        generated_df_rows.append([row["category"], row["reviewText"], "OR"])

# Report failures once, after the loop, so clear_output() cannot erase them.
if generation_errors:
    print(f"{len(generation_errors)} failure(s) during generation; first entries:")
    for failure in generation_errors[:10]:
        print(failure)

In [None]:
# Collect the accumulated rows into a DataFrame; `label` holds "CG" for generated and "OR" for original reviews.
generated_df = pd.DataFrame(generated_df_rows, columns=["category", "text", "label"])

In [None]:
# Show (number of rows, number of columns) of the assembled dataset.
generated_df.shape

In [None]:
# Inspect up to 20 random rows. The size is capped at the number of available
# rows so the cell does not raise on a small result, and the draw is seeded so
# the displayed rows are the same on every run.
generated_df.sample(n=min(20, len(generated_df)), random_state=2021)

In [None]:
# Show the first rows of the assembled dataset.
generated_df.head()

In [None]:
# Write the dataset to disk without the row index (`index` is documented as a bool).
generated_df.to_csv("Generated_Sentences_1.csv", index=False)