In [43]:
# Mount Google Drive at /content/drive so the data directory configured below is readable.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [44]:
import pandas as pd
import numpy as np
import torch
from tqdm.auto import tqdm
import random
import os

def reset_seeds(seed):
    """Seed every random number generator used in this notebook.

    Seeds Python's `random`, NumPy's legacy global generator and PyTorch
    (CPU and all CUDA devices), and configures cuDNN for repeatable runs.

    Args:
        seed: integer seed applied to every generator.
    """
    random.seed(seed)
    # Only affects interpreters started after this point; the hash seed of the
    # running process is fixed at startup.
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    # Seed every visible GPU, not just the current one (no-op without CUDA).
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    # The cuDNN autotuner may select different kernels between runs, which
    # defeats the deterministic flag above, so switch it off.
    torch.backends.cudnn.benchmark = False

# Directory on the mounted Drive that holds the CSV files read below.
DATA_PATH = "/content/drive/MyDrive/Hanso_deco/data/"
# Seed passed to reset_seeds() before training.
SEED = 42

# Use the GPU when PyTorch can see one, otherwise fall back to the CPU.
device = 'cuda' if torch.cuda.is_available() else 'cpu'
device

'cuda'

In [45]:
# Load the training CSV; ChatDataset reads its "질문" and "답변" columns.
train_ft = pd.read_csv(f'{DATA_PATH}train_data.csv')
train_ft.shape

(6440, 2)

In [46]:
# Load the test CSV; chatbot() reads its "질문" column and writes a "답변" column into it.
test_ft = pd.read_csv(f'{DATA_PATH}test.csv')
test_ft.shape

(130, 2)

In [47]:
from transformers import AutoTokenizer, AutoModelForCausalLM

In [48]:
# Checkpoint name shared by the model and the tokenizer.
# Alternative checkpoint, currently disabled:
# model_name = 'beomi/llama-2-ko-7b'
model_name = 'skt/kogpt2-base-v2'
# NOTE(review): this instance is not moved to `device`; the training cell
# loads a fresh copy and rebinds `model`, so this load may be redundant.
model = AutoModelForCausalLM.from_pretrained(model_name)

In [49]:
# Load the tokenizer for the same checkpoint and set its special tokens
# explicitly; bos and eos are both mapped to '</s>'.
# NOTE(review): `max_len` looks like the legacy spelling of `model_max_length`
# — confirm the installed transformers version still honors it.
tokenizer = AutoTokenizer.from_pretrained(model_name,
                              bos_token='</s>',  # start token
                              eos_token='</s>',  # end token
                              unk_token='<unk>',
                              pad_token='<pad>',
                              mask_token='<mask>',
                              max_len = 512)

In [50]:
class ChatDataset(torch.utils.data.Dataset):
    """Dataset that renders each question/answer row as a single training string."""

    def __init__(self, df):
        # Copy both text columns out of the frame once, as plain Python lists.
        self.question, self.answer = df["질문"].tolist(), df["답변"].tolist()

    def __len__(self):
        # One sample per question row.
        return len(self.question)

    def __getitem__(self, idx):
        # Layout: <q>{question}</s><a>{answer}</s>
        q_text = self.question[idx]
        a_text = self.answer[idx]
        return "".join(("<q>", q_text, "</s><a>", a_text, "</s>"))

In [51]:
class CollateFN:
    """Collate a list of strings into one padded tokenizer batch.

    Args:
        tokenizer: callable invoked as
            ``tokenizer(batch, return_tensors="pt", padding=True, ...)``.
        max_length: optional cap on the number of tokens per sequence. When
            set, the tokenizer is also asked to truncate to this length, which
            bounds the memory used per batch. ``None`` (the default) makes the
            same tokenizer call as before: padding only, no truncation.
    """

    def __init__(self, tokenizer, max_length=None):
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __call__(self, batch):
        """Tokenize `batch` and return it as ``{"x": <tokenizer output>}``."""
        tokenizer_kwargs = {"return_tensors": "pt", "padding": True}
        if self.max_length is not None:
            # Only request truncation when a cap was explicitly configured.
            tokenizer_kwargs.update(truncation=True, max_length=self.max_length)
        x = self.tokenizer(batch, **tokenizer_kwargs)
        return {"x" : x}

In [52]:
# Shared collate function passed to the DataLoaders below.
collate_fn = CollateFN(tokenizer)

In [53]:
# Sanity check: draw one shuffled batch of two samples and display the tokenizer output.
dt = ChatDataset(train_ft)
dl = torch.utils.data.DataLoader(dt, batch_size=2, shuffle=True, collate_fn=collate_fn)
batch = next(iter(dl))
batch

{'x': {'input_ids': tensor([[ 9724,   455,   405,  8168,  7433,  8137, 43519,  9068, 11200,  8705,
          15333, 24488,  8084,   406,     1,  9724,   439,   405,  8168,  7433,
           8137, 10246, 17947, 11357,  9774,  9340, 11200,  8705, 12425, 32987,
          10635,   387,  9110,  7433, 15813, 19449, 14579, 10635, 11343, 16977,
          28112,  9855,  9133, 30470,  9199,  9677, 16691,  9394, 43519, 10679,
           9306, 10829, 12972, 29247, 37428,  9746, 10247,  6824, 11039, 12987,
          30470,  9199, 11350, 37194,  9625, 10679, 19466, 33204,  9359,  7991,
          11584, 42229,  6859,  9185,  9025,  6855,  9915, 15309, 10146, 18566,
          10599, 10586,  9597, 21154, 10783, 46394, 14653,  9185,  9110,  7433,
           8143, 41653, 27117, 14611, 13784, 41248, 49421,     1,     3,     3,
              3,     3,     3,     3,     3,     3,     3,     3,     3,     3,
              3,     3,     3,     3,     3,     3,     3,     3,     3,     3,
              3,     

In [54]:
def train_loop(dataloader, model, loss_fn, optimizer, device, pad_token_id=3):
    """Run one epoch of next-token training and return the mean batch loss.

    Args:
        dataloader: sized iterable yielding dicts whose "x" entry is the
            tokenizer output; it must support ``.to(device)``, ``**``
            unpacking into the model and ``["input_ids"]``.
        model: model whose forward pass returns an object with ``.logits``.
        loss_fn: callable ``loss_fn(pred, trg)`` returning a scalar loss tensor.
        optimizer: optimizer updating the model's parameters.
        device: device each batch is moved to.
        pad_token_id: token id excluded from the loss. Defaults to 3, the id
            that fills the padded tail in this notebook's sample batch output.

    Returns:
        Sum of the per-batch losses divided by ``len(dataloader)``.
    """
    epoch_loss = 0
    model.train()
    for batch in tqdm(dataloader):
        x = batch["x"].to(device)
        pred = model(**x).logits  # predictions: batch, seq, n_class
        n_class = pred.shape[-1]  # number of target classes
        # Position t predicts token t+1: drop the last prediction ...
        pred = pred[:, :-1].reshape(-1, n_class)  # batch x seq, n_class

        # ... and the first token, so predictions and targets line up.
        trg = x["input_ids"][:, 1:].flatten()  # batch x seq

        # Keep only positions whose target is a real (non-padding) token.
        mask = trg != pad_token_id
        trg = trg[mask]
        pred = pred[mask]

        loss = loss_fn(pred, trg)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        epoch_loss += loss.item()

    epoch_loss /= len(dataloader)

    return epoch_loss

In [55]:
# Training hyperparameters used by the training cell below.
# Samples per training batch.
batch_size = 14
loss_fn = torch.nn.CrossEntropyLoss()
# Number of passes over the training set.
epochs = 15

In [56]:
# Re-seed, then load a fresh copy of the pretrained model onto `device` and
# create its optimizer.
reset_seeds(SEED)
model = AutoModelForCausalLM.from_pretrained(model_name).to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=3e-5)

train_dt = ChatDataset(train_ft)
train_dl = torch.utils.data.DataLoader(train_dt, batch_size=batch_size, shuffle=True, collate_fn=collate_fn)

# NOTE(review): the saved output of this cell is a CUDA out-of-memory error.
# Things to check: a smaller batch_size, truncating long sequences in the
# collate step, and GPU memory still held by earlier runs in the same session.
# Train for `epochs` epochs, printing the mean loss after each one.
for i in range(epochs):
    loss = train_loop(train_dl, model, loss_fn, optimizer, device)
    print(i, "번째: ", loss)

OutOfMemoryError: CUDA out of memory. Tried to allocate 150.00 MiB. GPU 0 has a total capacty of 14.75 GiB of which 137.06 MiB is free. Process 8041 has 14.61 GiB memory in use. Of the allocated memory 14.01 GiB is allocated by PyTorch, and 477.55 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF

In [19]:
def chatbot(model, tokenizer, max_len, device, test_ft):
    """Generate an answer for every question in `test_ft`.

    Each question is wrapped as ``<q>{question}</s><a>`` (the prefix layout
    used for training samples) and the model continues from there.

    Args:
        model: model exposing ``eval()`` and ``generate()``.
        tokenizer: tokenizer exposing ``encode()`` and ``decode()``.
        max_len: passed to ``generate`` as ``max_length``.
        device: device the encoded prompt is moved to.
        test_ft: DataFrame with a "질문" column. It is modified in place: a
            "답변" column holding the generated answers is added or overwritten.

    Returns:
        The same `test_ft` object, now containing the "답변" column.
    """
    model.eval()
    results = []
    for text in '<q>' + test_ft['질문'] + '</s><a>':
        x = tokenizer.encode(text, return_tensors='pt').to(device)
        # NOTE(review): temperature/top_k normally only matter when sampling
        # is enabled; `do_sample` is not set here — confirm the intended
        # decoding strategy.
        output = model.generate(x,
                                max_length=max_len,
                                use_cache=True,
                                repetition_penalty=4.8,
                                temperature = 0.1,
                                top_k = 40,
                                )
        # Slice by token count rather than by characters of the decoded text:
        # decoding need not reproduce the prompt character-for-character, and
        # the output only ends in '</s>' when generation stopped on EOS.
        prompt_len = x.shape[-1]
        answer_ids = output[0][prompt_len:]
        answer = tokenizer.decode(answer_ids, skip_special_tokens=True)
        results.append(answer.strip())

    test_ft['답변'] = results
    return test_ft

In [27]:
# Generate answers for every test question with max_len=256 and check the shape.
# NOTE(review): this cell's execution count is lower than the training cell's,
# so `model` here may not be the one trained above — confirm with
# Restart & Run All.
result_df = chatbot(model, tokenizer, 256, device, test_ft)
result_df.shape



(130, 3)

In [38]:
# Write the generated answers to result_df.csv in the working directory, without the index column.
result_df.to_csv('result_df.csv',index = False)