In [1]:
!pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [2]:
import pandas as pd
import numpy as np
import urllib.request
import os
from tqdm import tqdm
import tensorflow as tf
from transformers import BertTokenizer, TFBertModel
from sklearn.model_selection import train_test_split

In [3]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [4]:
# 데이터 불러오기
data = pd.read_csv('/content/drive/MyDrive/공장/review_data_f.csv')
data = data.drop('Unnamed: 0', axis=1)
data

Unnamed: 0,review,y
0,3대째 이어오고 있는 춘천시 소양로 2가에 위치한 막국수 전문점. 숨은 맛 집답게 ...,1
1,공간이 많이 좁았지만 햇살이 잘 들어와서 너무 이뻤던 카페입니다 레몬 빵이 시큼하고...,1
2,비싸고 맛없어요!,0
3,아기자기하고 이쁜 곳! 맛도 좋다!! 배도 부르다! 일요일 점심 무더위에도 웨이팅을...,1
4,막걸리 비싸고 주인장의 고압적인 응대에 위축되고 테이블은 다닥다닥 붙어있고 술 마시...,0
...,...,...
2062,웬만하면 리뷰 안다는데 개만 없고 개비 싸고 개불 친절함,0
2063,양념이 좀 많이 달아요. 육수는 조미료 맛 많이 나서 별로예요. 직접 면을 뽑는 ...,0
2064,근처 맛 집 검색해서 찾아가 봤어요 음식이 전체적으로 다 맛있네요 깔끔한 내부도 마...,1
2065,최악 여길 왜가는지 모르겠음. 마늘 손질하던 아줌마가 카드 계산하고 다시 가서 마늘...,0


In [5]:
train_data, test_data = train_test_split(data, test_size=.2)

print('훈련용 리뷰 개수 :', len(train_data))
print('테스트용 리뷰 개수 :', len(test_data))

훈련용 리뷰 개수 : 1653
테스트용 리뷰 개수 : 414


---

In [6]:
# # GPU 환경 설정
# # assert tf.test.is_gpu_available() == True, 'GPU 설정을 확인하세요.'
# print(tf.config.list_physical_devices('GPU'))
# print(tf.config.list_logical_devices('GPU'))

In [14]:
tokenizer = BertTokenizer.from_pretrained("bert-base-multilingual-cased")

Downloading:   0%|          | 0.00/996k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/29.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/625 [00:00<?, ?B/s]

In [15]:
def create_examples(df):
    example = []
    label = []
    for (i, row) in enumerate(df.values):
        example.append(row[0])
        label.append(row[1:])
    
    return example, label

In [16]:
train_example, train_label = create_examples(train_data)
test_example, test_label = create_examples(test_data)

In [17]:
def convert_examples_to_features(examples, labels, max_seq_len, tokenizer):
    
    input_ids, attention_masks, token_type_ids, data_labels = [], [], [], []
    
    for example, label in tqdm(zip(examples, labels), total=len(examples)):
        input_id = tokenizer.encode(example, max_length=max_seq_len, pad_to_max_length=True)
        padding_count = input_id.count(tokenizer.pad_token_id)
        attention_mask = [1] * (max_seq_len - padding_count) + [0] * padding_count
        token_type_id = [0] * max_seq_len

        assert len(input_id) == max_seq_len, "Error with input length {} vs {}".format(len(input_id), max_seq_len)
        assert len(attention_mask) == max_seq_len, "Error with attention mask length {} vs {}".format(len(attention_mask), max_seq_len)
        assert len(token_type_id) == max_seq_len, "Error with token type length {} vs {}".format(len(token_type_id), max_seq_len)

        input_ids.append(input_id)
        attention_masks.append(attention_mask)
        token_type_ids.append(token_type_id)
        data_labels.append(label)

    input_ids = np.array(input_ids, dtype=int)
    attention_masks = np.array(attention_masks, dtype=int)
    token_type_ids = np.array(token_type_ids, dtype=int)

    data_labels = np.asarray(data_labels, dtype=np.int32)

    return (input_ids, attention_masks, token_type_ids), data_labels

In [18]:
max_seq_len = 256

In [19]:
train_X, train_y = convert_examples_to_features(train_example, train_label, max_seq_len=max_seq_len, tokenizer=tokenizer)

  0%|          | 0/1653 [00:00<?, ?it/s]Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
100%|██████████| 1653/1653 [00:02<00:00, 572.52it/s]


In [20]:
test_X, test_y = convert_examples_to_features(test_example, test_label, max_seq_len=max_seq_len, tokenizer=tokenizer)

100%|██████████| 414/414 [00:00<00:00, 687.32it/s]


In [21]:
model = TFBertModel.from_pretrained("bert-base-multilingual-cased", from_pt=True)

Downloading:   0%|          | 0.00/714M [00:00<?, ?B/s]

Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFBertModel: ['cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight']
- This IS expected if you are initializing TFBertModel from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
All the weights of TFBertModel were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already

In [22]:
input_ids_layer = tf.keras.layers.Input(shape=(max_seq_len,), dtype=tf.int32)
attention_masks_layer = tf.keras.layers.Input(shape=(max_seq_len,), dtype=tf.int32)
token_type_ids_layer = tf.keras.layers.Input(shape=(max_seq_len,), dtype=tf.int32)

outputs = model([input_ids_layer, attention_masks_layer, token_type_ids_layer])

In [23]:
class TFBertForSequenceClassification(tf.keras.Model):
    def __init__(self, model_name):
        super(TFBertForSequenceClassification, self).__init__()
        self.bert = TFBertModel.from_pretrained(model_name, from_pt=True)
        self.classifier = tf.keras.layers.Dense(1,
                                                kernel_initializer=tf.keras.initializers.TruncatedNormal(0.02),
                                                activation='sigmoid',
                                                name='classifier')

    def call(self, inputs):
        input_ids, attention_mask, token_type_ids = inputs
        outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids)
        cls_token = outputs[1]
        prediction = self.classifier(cls_token)

        return prediction

In [26]:
model = TFBertForSequenceClassification("bert-base-multilingual-cased")
optimizer = tf.keras.optimizers.Adam(learning_rate=5e-5)
loss = tf.keras.losses.BinaryCrossentropy()
model.compile(optimizer=optimizer, loss=loss, metrics = ['accuracy'])

Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFBertModel: ['cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight']
- This IS expected if you are initializing TFBertModel from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
All the weights of TFBertModel were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already

In [27]:
model.fit(train_X, train_y, epochs=2, batch_size=16, validation_split=0.2)

Epoch 1/2
Epoch 2/2


<keras.callbacks.History at 0x7f2f6a3a8610>

In [28]:
results = model.evaluate(test_X, test_y, batch_size=8)
print("test loss, test acc: ", results)

test loss, test acc:  [0.3951782286167145, 0.8550724387168884]


In [29]:
model.predict(test_X)



array([[0.16288337],
       [0.44585213],
       [0.17821367],
       [0.35550883],
       [0.1694465 ],
       [0.30632037],
       [0.7358334 ],
       [0.74116963],
       [0.3596613 ],
       [0.14470717],
       [0.9220794 ],
       [0.7518225 ],
       [0.9257964 ],
       [0.16095056],
       [0.18653691],
       [0.6277997 ],
       [0.1561538 ],
       [0.16824354],
       [0.5189366 ],
       [0.26733488],
       [0.22684239],
       [0.81952596],
       [0.2456496 ],
       [0.25609198],
       [0.74400204],
       [0.14239325],
       [0.55602235],
       [0.76397645],
       [0.41133386],
       [0.2723984 ],
       [0.8473135 ],
       [0.51643234],
       [0.54700756],
       [0.31236407],
       [0.45479637],
       [0.47686064],
       [0.7300312 ],
       [0.8250188 ],
       [0.23485823],
       [0.15882139],
       [0.41404524],
       [0.2860511 ],
       [0.46493283],
       [0.38194266],
       [0.9251263 ],
       [0.20356207],
       [0.9113754 ],
       [0.212

In [30]:
test_set = pd.read_csv('/content/drive/MyDrive/공장/train_x_total.csv')
test_set = test_set[['reviews']]
test_set['y'] = 0
test_set = test_set.dropna()
test_set.head()

Unnamed: 0,reviews,y
0,만족도 최상의 커피 맛집,0
1,찐 커피 맛집,0
2,로마노 진짜 달달하고 맛있고,0
3,아이스크림 라떼도 꼬소하고 달달하고 진짜 맛있어요!!,0
4,마리토쪼도 크림 부드럽고,0


In [31]:
test_example, test_label = create_examples(test_set)
test_X, _ = convert_examples_to_features(test_example, test_label, max_seq_len=max_seq_len, tokenizer=tokenizer)

100%|██████████| 10292/10292 [00:03<00:00, 2594.59it/s]


In [32]:
pred = model.predict(test_X)



In [33]:
pred

array([[0.5736812 ],
       [0.81933594],
       [0.6300249 ],
       ...,
       [0.76487994],
       [0.27208036],
       [0.6189232 ]], dtype=float32)

In [38]:
pred = np.where(pred>0.5, 1, 0).reshape(-1)

In [40]:
test_set['pred'] = pred

In [44]:
test_set.loc[test_set['pred']==1]

Unnamed: 0,reviews,y,pred
0,만족도 최상의 커피 맛집,0,1
1,찐 커피 맛집,0,1
2,로마노 진짜 달달하고 맛있고,0,1
3,아이스크림 라떼도 꼬소하고 달달하고 진짜 맛있어요!!,0,1
4,마리토쪼도 크림 부드럽고,0,1
...,...,...,...
10287,사진에서 보던 느낌 그대로 고즈넉하고 조용했어요 딱 제가 좋아하는 느낌,0,1
10288,강릉 오면 꼭 추천!!,0,1
10290,아메만 먹었는데 다음에 갈 땐 명란 감자 바게트도 같이 먹어 볼라구요,0,1
10292,그래도 음료는 맛있었습니다,0,1
