In [3]:
import numpy as np
import pandas as pd
import torch
from datasets import Dataset
from sklearn.model_selection import train_test_split
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments

In [4]:
df = pd.read_csv("C:\\Users\\Leo\\code\\ch04_LoRA\\data\\review_data.csv", encoding="cp949")

In [5]:
df

Unnamed: 0,text,labels
0,배우들 연기도 너무 좋았어요.,1
1,스토리가 탄탄하고 연출도 훌륭했어요.,1
2,정말 감동적인 영화였습니다. 눈물이 멈추질 않았어요.,1
3,끝까지 집중해서 봤습니다. 완전 추천해요!,1
4,감성과 메시지가 모두 살아있는 영화였어요.,1
5,오랜만에 이런 좋은 영화를 봐서 기분이 좋아요.,1
6,"음악, 연출, 연기 모두 완벽했어요.",1
7,시간 가는 줄 몰랐어요. 최고의 영화!,1
8,스토리 전개가 매끄럽고 감동적이었어요.,1
9,친구들에게 꼭 추천하고 싶은 영화예요.,1


In [7]:
train_df, test_df = train_test_split(df, test_size=0.2, random_state=0, stratify=df['labels'])
train_dataset = Dataset.from_pandas(train_df)
test_dataset = Dataset.from_pandas(test_df)

In [8]:
train_dataset

Dataset({
    features: ['text', 'labels', '__index_level_0__'],
    num_rows: 16
})

In [7]:
train_dataset[0]

{'text': '정말 감동적인 영화였습니다. 눈물이 멈추질 않았어요.', 'labels': 1, '__index_level_0__': 2}

In [9]:
model_name = "beomi/kcbert-base"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2)


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at beomi/kcbert-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [20]:
def preprocess(data):
    res =  tokenizer(data['text'], padding='max_length', truncation=True, max_length=64)
    return res

train_dataset = train_dataset.map(preprocess, batched=True)
test_dataset = test_dataset.map(preprocess, batched=True)

Map:   0%|          | 0/16 [00:00<?, ? examples/s]

Map:   0%|          | 0/4 [00:00<?, ? examples/s]

In [11]:
train_dataset[0]

{'text': '정말 감동적인 영화였습니다. 눈물이 멈추질 않았어요.',
 'labels': 1,
 '__index_level_0__': 2,
 'input_ids': [2,
  8050,
  13912,
  8097,
  9376,
  18624,
  17,
  15037,
  17696,
  4098,
  8841,
  8186,
  17,
  3,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0],
 'token_type_ids': [0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0],
 'attention_mask': [1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  

In [12]:
train_args = TrainingArguments(
    output_dir="./saved_models/basic_sentiment1", 
    eval_strategy="epoch",
    save_strategy="no",
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    num_train_epochs=3,
    logging_dir="./logs",
    logging_strategy="epoch",
    use_cpu= True)



In [13]:
def accuracy_score(pred):
    preds = np.argmax(pred.predictions, axis=1)
    acc = (preds == pred.label_ids).mean()
    return {'accuracy': acc}

trainer = Trainer(
    model=model,
    args=train_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    compute_metrics=accuracy_score,
)

In [21]:
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy
1,0.1765,0.397223,0.75
2,0.1832,0.599156,0.75
3,0.0651,0.27244,0.75


TrainOutput(global_step=12, training_loss=0.1416113649805387, metrics={'train_runtime': 12.269, 'train_samples_per_second': 3.912, 'train_steps_per_second': 0.978, 'total_flos': 1578666332160.0, 'train_loss': 0.1416113649805387, 'epoch': 3.0})

In [22]:
test_texts = ["이 제품 너무 좋아요!", "별로예요. 추천 안함."]
inputs = tokenizer(test_texts, padding=True, truncation=True, max_length=64, return_tensors="pt")
model.eval()
with torch.no_grad():
    outputs = model(**inputs)
    predictions = torch.argmax(outputs.logits, dim=1)
    print("결과 :", predictions.tolist())  # 1은 긍정, 0은 부정
        

결과 : [0, 0]


In [18]:
vars(outputs)

{'loss': None,
 'logits': tensor([[ 0.1866, -0.3116],
         [ 0.8813, -0.9767]]),
 'hidden_states': None,
 'attentions': None}

In [19]:
trainer.save_model("./saved_models/basic_sentiment1")
tokenizer.save_pretrained("./saved_models/basic_sentiment1")  

from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch

model = AutoModelForSequenceClassification.from_pretrained("./saved_models/basic_sentiment1")
tokenizer = AutoTokenizer.from_pretrained("./saved_models/basic_sentiment1")

test_texts = ["이 제품 너무 좋아요!", "별로예요. 추천 안함."]
inputs = tokenizer(test_texts, padding=True, truncation=True, max_length=64,return_tensors="pt")
model.eval()
with torch.no_grad():
    outputs = model(**inputs)
    
preds = torch.argmax(outputs.logits, dim=1)
print("결과 :", preds.tolist())  # 1은 긍정, 0은 부정

결과 : [0, 0]
