In [1]:
!pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.25.1-py3-none-any.whl (5.8 MB)
[K     |████████████████████████████████| 5.8 MB 27.9 MB/s 
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1
  Downloading tokenizers-0.13.2-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.6 MB)
[K     |████████████████████████████████| 7.6 MB 55.1 MB/s 
Collecting huggingface-hub<1.0,>=0.10.0
  Downloading huggingface_hub-0.11.1-py3-none-any.whl (182 kB)
[K     |████████████████████████████████| 182 kB 63.5 MB/s 
Installing collected packages: tokenizers, huggingface-hub, transformers
Successfully installed huggingface-hub-0.11.1 tokenizers-0.13.2 transformers-4.25.1


In [2]:
import pandas as pd
import numpy as np
import urllib.request
import os
from tqdm import tqdm
import tensorflow as tf
from transformers import BertTokenizer, TFBertModel
from sklearn.model_selection import train_test_split

In [3]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [7]:
# 데이터 불러오기
data = pd.read_csv('/content/drive/MyDrive/공장/train_fff.csv')
# data = data.drop('Unnamed: 0', axis=1)
data = data.sample(frac=1)
data['label'].value_counts()

2    700
1    700
0    700
Name: label, dtype: int64

In [5]:
# data.loc[data['label']==0]

In [8]:
train_data, test_data = train_test_split(data, test_size=.2)

print('훈련용 리뷰 개수 :', len(train_data))
print('테스트용 리뷰 개수 :', len(test_data))

훈련용 리뷰 개수 : 1680
테스트용 리뷰 개수 : 420


---

In [7]:
# # GPU 환경 설정
# # assert tf.test.is_gpu_available() == True, 'GPU 설정을 확인하세요.'
# print(tf.config.list_physical_devices('GPU'))
# print(tf.config.list_logical_devices('GPU'))

In [9]:
tokenizer = BertTokenizer.from_pretrained("bert-base-multilingual-cased")

Downloading:   0%|          | 0.00/996k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/29.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/625 [00:00<?, ?B/s]

In [10]:
def create_examples(df):
    example = []
    label = []
    for (i, row) in enumerate(df.values):
        example.append(row[0])
        label.append(row[1:])
    
    return example, label

In [11]:
train_example, train_label = create_examples(train_data)
test_example, test_label = create_examples(test_data)

In [12]:
def convert_examples_to_features(examples, labels, max_seq_len, tokenizer):
    
    input_ids, attention_masks, token_type_ids, data_labels = [], [], [], []
    
    for example, label in tqdm(zip(examples, labels), total=len(examples)):
        input_id = tokenizer.encode(example, max_length=max_seq_len, pad_to_max_length=True)
        padding_count = input_id.count(tokenizer.pad_token_id)
        attention_mask = [1] * (max_seq_len - padding_count) + [0] * padding_count
        token_type_id = [0] * max_seq_len

        assert len(input_id) == max_seq_len, "Error with input length {} vs {}".format(len(input_id), max_seq_len)
        assert len(attention_mask) == max_seq_len, "Error with attention mask length {} vs {}".format(len(attention_mask), max_seq_len)
        assert len(token_type_id) == max_seq_len, "Error with token type length {} vs {}".format(len(token_type_id), max_seq_len)

        input_ids.append(input_id)
        attention_masks.append(attention_mask)
        token_type_ids.append(token_type_id)
        data_labels.append(label)

    input_ids = np.array(input_ids, dtype=int)
    attention_masks = np.array(attention_masks, dtype=int)
    token_type_ids = np.array(token_type_ids, dtype=int)

    data_labels = np.asarray(data_labels, dtype=np.int32)

    return (input_ids, attention_masks, token_type_ids), data_labels

In [13]:
max_seq_len = 256

In [14]:
train_X, train_y = convert_examples_to_features(train_example, train_label, max_seq_len=max_seq_len, tokenizer=tokenizer)

  0%|          | 0/1680 [00:00<?, ?it/s]Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
100%|██████████| 1680/1680 [00:00<00:00, 4647.48it/s]


In [15]:
test_X, test_y = convert_examples_to_features(test_example, test_label, max_seq_len=max_seq_len, tokenizer=tokenizer)

100%|██████████| 420/420 [00:00<00:00, 4625.32it/s]


In [16]:
model = TFBertModel.from_pretrained("bert-base-multilingual-cased", from_pt=True)

Downloading:   0%|          | 0.00/714M [00:00<?, ?B/s]

Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFBertModel: ['cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing TFBertModel from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
All the weights of TFBertModel were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already

In [17]:
input_ids_layer = tf.keras.layers.Input(shape=(max_seq_len,), dtype=tf.int32)
attention_masks_layer = tf.keras.layers.Input(shape=(max_seq_len,), dtype=tf.int32)
token_type_ids_layer = tf.keras.layers.Input(shape=(max_seq_len,), dtype=tf.int32)

outputs = model([input_ids_layer, attention_masks_layer, token_type_ids_layer])

In [18]:
class TFBertForSequenceClassification(tf.keras.Model):
    def __init__(self, model_name):
        super(TFBertForSequenceClassification, self).__init__()
        self.bert = TFBertModel.from_pretrained(model_name, from_pt=True)
        self.classifier = tf.keras.layers.Dense(3,
                                                kernel_initializer=tf.keras.initializers.TruncatedNormal(0.02),
                                                activation='softmax',
                                                name='classifier')

    def call(self, inputs):
        input_ids, attention_mask, token_type_ids = inputs
        outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids)
        cls_token = outputs[1]
        prediction = self.classifier(cls_token)

        return prediction

In [19]:
model = TFBertForSequenceClassification("bert-base-multilingual-cased")
optimizer = tf.keras.optimizers.Adam(learning_rate=5e-5)
# loss = tf.keras.losses.BinaryCrossentropy()
model.compile(optimizer=optimizer,  loss='sparse_categorical_crossentropy', metrics = ['accuracy'])

Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFBertModel: ['cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing TFBertModel from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
All the weights of TFBertModel were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already

In [20]:
model.fit(train_X, train_y, epochs=5, batch_size=16, validation_split=0.2)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x7f21741e1220>

In [21]:
results = model.evaluate(test_X, test_y, batch_size=8)
print("test loss, test acc: ", results)

test loss, test acc:  [1.015960693359375, 0.6833333373069763]


In [22]:
model.predict(test_X)



array([[0.03202765, 0.02396674, 0.94400567],
       [0.01495154, 0.9783894 , 0.00665906],
       [0.00148419, 0.9972523 , 0.00126352],
       ...,
       [0.9237603 , 0.00544156, 0.0707982 ],
       [0.08900312, 0.0052427 , 0.9057542 ],
       [0.09128083, 0.01531983, 0.89339936]], dtype=float32)

In [24]:
test_set = pd.read_csv('/content/drive/MyDrive/공장/test_fff.csv')
# test_set = test_set[['reviews']]
# test_set['y'] = 0
test_set = test_set.dropna()
test_set.tail()

Unnamed: 0,reviews,label
8592,아메만 먹었는데 다음에 갈 땐 명란 감자 바게트도 같이 먹어 볼라구요,1
8593,주택을 개조한 카페라 그런지 찾기 어려웠어요,0
8594,그래도 음료는 맛있었습니다,1
8595,더워서 툇마루 이용을 못 해서 아쉬웠지만,2
8596,곳곳에 포토존도 있고 이뻐요,1


In [25]:
test_example, test_label = create_examples(test_set)
test_X, _ = convert_examples_to_features(test_example, test_label, max_seq_len=max_seq_len, tokenizer=tokenizer)

100%|██████████| 8597/8597 [00:02<00:00, 4201.63it/s]


In [26]:
pred = model.predict(test_X)



In [27]:
len(pred)

8597

In [28]:
predict = pred.copy()

In [29]:
predict = np.argmax(predict, axis=1)

In [30]:
len(predict)

8597

In [31]:
test_set['pred'] = predict

In [32]:
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

In [33]:
test_set.loc[test_set['pred']==0]

Unnamed: 0,reviews,label,pred
2,술집 같은 데 커피와 음료도 팔고 있음 인테리어도 이쁘고 나쁘지 않았음,1,0
7,알바생 분인지 모르겠지만,1,0
29,뜨거운 것도 메뉴 개발 중이시라니 기대할게요,1,0
32,1 층은 좀 시끄러움,1,0
53,인기가 좋은 이유가 있어요,1,0
56,음료도 멋져요,1,0
60,1 층 자리는 거의 다 찼더라고요.,1,0
62,분위기는 좋은데,1,0
63,커피 맛은 별루,1,0
74,주말엔 사람이 많아서 사람 소리가 많이 들리지만,1,0


In [34]:
from sklearn.metrics import classification_report

print(classification_report(test_set['label'], test_set['pred']) )

              precision    recall  f1-score   support

           0       0.16      0.67      0.26       549
           1       0.99      0.50      0.67      7739
           2       0.07      0.57      0.13       309

    accuracy                           0.52      8597
   macro avg       0.41      0.58      0.35      8597
weighted avg       0.90      0.52      0.62      8597

