In [1]:
!pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [2]:
import pandas as pd
import numpy as np
import urllib.request
import os
from tqdm import tqdm
import tensorflow as tf
from transformers import BertTokenizer, TFBertModel
from sklearn.model_selection import train_test_split

In [3]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [4]:
# 데이터 불러오기
data = pd.read_csv('/content/drive/MyDrive/공장/train_fff.csv')

idx = data[data['label']==2].index
data.drop(idx , inplace=True)

data = data.sample(frac=1)

data['label'].value_counts()

0    700
1    700
Name: label, dtype: int64

In [5]:
train_data, test_data = train_test_split(data, test_size=.2, random_state=2022)

print('훈련용 리뷰 개수 :', len(train_data))
print('테스트용 리뷰 개수 :', len(test_data))

훈련용 리뷰 개수 : 1120
테스트용 리뷰 개수 : 280


---

In [6]:
tokenizer = BertTokenizer.from_pretrained("klue/bert-base")

In [7]:
def create_examples(df):
    example = []
    label = []
    for (i, row) in enumerate(df.values):
        example.append(row[0])
        label.append(row[1:])
    
    return example, label

In [8]:
train_example, train_label = create_examples(train_data)
test_example, test_label = create_examples(test_data)

In [9]:
def convert_examples_to_features(examples, labels, max_seq_len, tokenizer):
    
    input_ids, attention_masks, token_type_ids, data_labels = [], [], [], []
    
    for example, label in tqdm(zip(examples, labels), total=len(examples)):
        input_id = tokenizer.encode(example, max_length=max_seq_len, pad_to_max_length=True)
        padding_count = input_id.count(tokenizer.pad_token_id)
        attention_mask = [1] * (max_seq_len - padding_count) + [0] * padding_count
        token_type_id = [0] * max_seq_len

        assert len(input_id) == max_seq_len, "Error with input length {} vs {}".format(len(input_id), max_seq_len)
        assert len(attention_mask) == max_seq_len, "Error with attention mask length {} vs {}".format(len(attention_mask), max_seq_len)
        assert len(token_type_id) == max_seq_len, "Error with token type length {} vs {}".format(len(token_type_id), max_seq_len)

        input_ids.append(input_id)
        attention_masks.append(attention_mask)
        token_type_ids.append(token_type_id)
        data_labels.append(label)

    input_ids = np.array(input_ids, dtype=int)
    attention_masks = np.array(attention_masks, dtype=int)
    token_type_ids = np.array(token_type_ids, dtype=int)

    data_labels = np.asarray(data_labels, dtype=np.int32)

    return (input_ids, attention_masks, token_type_ids), data_labels

In [10]:
max_seq_len = 128

In [11]:
train_X, train_y = convert_examples_to_features(train_example, train_label, max_seq_len=max_seq_len, tokenizer=tokenizer)
test_X, test_y = convert_examples_to_features(test_example, test_label, max_seq_len=max_seq_len, tokenizer=tokenizer)

  0%|          | 0/1120 [00:00<?, ?it/s]Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
100%|██████████| 1120/1120 [00:00<00:00, 4986.82it/s]
100%|██████████| 280/280 [00:00<00:00, 5234.12it/s]


In [12]:
base_model = TFBertModel.from_pretrained("klue/bert-base", from_pt=True)

Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFBertModel: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'bert.embeddings.position_ids', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias', 'cls.predictions.bias', 'cls.predictions.decoder.bias']
- This IS expected if you are initializing TFBertModel from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
All the weights of TFBertModel were initialized from the PyTorch model.
If your task is similar to the 

In [13]:
input_ids_layer = tf.keras.layers.Input(shape=(max_seq_len,), dtype=tf.int32)
attention_masks_layer = tf.keras.layers.Input(shape=(max_seq_len,), dtype=tf.int32)
token_type_ids_layer = tf.keras.layers.Input(shape=(max_seq_len,), dtype=tf.int32)

x = base_model.bert([input_ids_layer, attention_masks_layer, token_type_ids_layer])[1]
outputs = tf.keras.layers.Dense(1, kernel_initializer=tf.keras.initializers.TruncatedNormal(0.02), activation='sigmoid', name='classifier')(x)

In [14]:
class TFBertForSequenceClassification(tf.keras.Model):
    def __init__(self, model_name):
        super(TFBertForSequenceClassification, self).__init__()
        self.bert = TFBertModel.from_pretrained(model_name, from_pt=True)
        self.classifier = tf.keras.layers.Dense(1,
                                                kernel_initializer=tf.keras.initializers.TruncatedNormal(0.02),
                                                activation='sigmoid',
                                                name='classifier')

    def call(self, inputs):
        input_ids, attention_mask, token_type_ids = inputs
        outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids)
        cls_token = outputs[1]
        prediction = self.classifier(cls_token)

        return prediction

In [15]:
model = tf.keras.models.Model(inputs=[input_ids_layer, attention_masks_layer, token_type_ids_layer], outputs=outputs)
optimizer = tf.keras.optimizers.Adam(learning_rate=5e-5)
loss = tf.keras.losses.BinaryCrossentropy()
model.compile(optimizer=optimizer,  loss=loss, metrics = ['accuracy'])

In [16]:
model.fit(train_X, train_y, epochs=5, batch_size=16, validation_split=0.2)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x7f1e31689970>

In [17]:
results = model.evaluate(test_X, test_y, batch_size=8)
print("test loss, test acc: ", results)

test loss, test acc:  [0.43404513597488403, 0.8785714507102966]


In [18]:
model.predict(test_X)



array([[5.7624775e-04],
       [4.3686056e-03],
       [9.8941642e-01],
       [9.9917573e-01],
       [9.9906021e-01],
       [9.9801755e-01],
       [1.1070370e-02],
       [6.0794955e-01],
       [9.9879253e-01],
       [7.0077246e-01],
       [4.6862548e-04],
       [4.8948936e-03],
       [4.0427604e-04],
       [9.9758577e-01],
       [1.9313545e-04],
       [9.9961603e-01],
       [9.9341488e-01],
       [9.9981540e-01],
       [1.1259083e-04],
       [8.8930305e-04],
       [2.9319723e-04],
       [9.9944240e-01],
       [5.0472573e-04],
       [9.9902904e-01],
       [9.1650498e-01],
       [3.2678616e-01],
       [7.9625010e-05],
       [9.7904086e-01],
       [4.0084790e-03],
       [1.3684400e-04],
       [5.0249521e-04],
       [9.9941730e-01],
       [1.1830962e-02],
       [9.8212665e-01],
       [2.1684734e-04],
       [9.9929917e-01],
       [9.9922442e-01],
       [9.9896061e-01],
       [8.3175766e-01],
       [9.2622991e-05],
       [8.8892025e-01],
       [7.934001

In [19]:
model.save("/content/drive/MyDrive/공장/SA_kobert_model.")



In [20]:
test_set = pd.read_csv('/content/drive/MyDrive/공장/test_fff.csv')
# test_set = test_set[['reviews']]
# test_set['y'] = 0
test_set = test_set.dropna()
idx = test_set[test_set['label']==2].index
test_set.drop(idx , inplace=True)
test_set = test_set.sample(frac=1)

test_set = test_set.sample(500)

test_set['label'].value_counts()

1    467
0     33
Name: label, dtype: int64

In [21]:
test_example, test_label = create_examples(test_set)
test_X, _ = convert_examples_to_features(test_example, test_label, max_seq_len=max_seq_len, tokenizer=tokenizer)

100%|██████████| 500/500 [00:00<00:00, 5206.27it/s]


In [22]:
pred = model.predict(test_X)



In [23]:
len(pred)

500

In [24]:
predict = pred.copy()

In [25]:
# predict = np.argmax(predict, axis=1)
predict = np.where(pred>0.5, 1, 0).reshape(-1)

In [26]:
len(predict)

500

In [27]:
test_set['pred'] = predict

In [28]:
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

In [29]:
# test_set.loc[test_set['pred']==0]
test_set

Unnamed: 0,reviews,label,pred
5541,역대 급 인생 도너츠..,1,1
5836,늦은 저녁이나 사람 많은 시간 피해 가시길...,0,0
987,근데 초코파이 종류는 똑같아요,1,0
5659,시그니처 메뉴 꼭 드셔 보세요,1,1
2762,사진 찍기 좋아요,1,1
2081,분위기 좋고,1,1
4149,무엇 보다 생 애플 망고 빙수가 너무 맛있었어요,1,1
787,소문 듣고 찾아온 제주도 빵집!!,1,1
5513,메뉴가 다 빠지고 없어서 우도 땅콩 도너츠만 샀는데 너무 맛있어서 그 다음 날 또 갔어요,1,1
7227,토요일인데도 운이 좋아서 웨이팅 짧아서 완전 좋았어요!,1,1


In [30]:
from sklearn.metrics import classification_report

print(classification_report(test_set['label'], test_set['pred']) )

              precision    recall  f1-score   support

           0       0.51      0.85      0.64        33
           1       0.99      0.94      0.96       467

    accuracy                           0.94       500
   macro avg       0.75      0.90      0.80       500
weighted avg       0.96      0.94      0.94       500

