In [1]:
!pip install transformers
import pandas as pd
import numpy as np
import urllib.request
import os
from tqdm import tqdm
import tensorflow as tf
from transformers import BertTokenizer, TFBertModel, BertTokenizerFast, TFBertForSequenceClassification, RobertaTokenizer, TFRobertaForSequenceClassification
from tensorflow.keras.callbacks import EarlyStopping
from sklearn.model_selection import train_test_split


from google.colab import drive
drive.mount('/content/drive')

Collecting transformers
  Downloading transformers-4.34.0-py3-none-any.whl (7.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.7/7.7 MB[0m [31m29.5 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.16.4 (from transformers)
  Downloading huggingface_hub-0.18.0-py3-none-any.whl (301 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m302.0/302.0 kB[0m [31m37.5 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers<0.15,>=0.14 (from transformers)
  Downloading tokenizers-0.14.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.8/3.8 MB[0m [31m38.3 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting safetensors>=0.3.1 (from transformers)
  Downloading safetensors-0.4.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m84.1 MB/s[0m eta [36m0:00:00[0m
Col

In [2]:
traindata = pd.read_csv('/content/train.csv')
testdata = pd.read_csv('/content/test.csv')

print("훈련용 개수:", len(traindata))
print("테스트용 개수:", len(testdata))


훈련용 개수: 32000
테스트용 개수: 48000


In [3]:
# 중복 데이터, 결측값 제거

# text 열 중복 제거
traindata.drop_duplicates(subset=['text'], inplace=True)

# null 값 존재하는 행 제거
traindata = traindata.dropna(how='any')
testdata = testdata.dropna(how = 'any')
print('훈련 데이터 수:', len(traindata))
print('테스트 데이터 수:', len(testdata))


훈련 데이터 수: 31972
테스트 데이터 수: 48000


In [6]:
# 토크나이저 이용한 정수 인코딩
tokenizer = RobertaTokenizer.from_pretrained('textattack/roberta-base-MNLI')

X_train_list = traindata['text'].tolist()
X_test_list = testdata['text'].tolist()
y_train = traindata['sentiment'].tolist()

# 훈련 데이터를 훈련 세트와 검증 세트로 분할
train_texts, val_texts, train_labels, val_labels = train_test_split(X_train_list, y_train, test_size=.2)
# 토크나이징과 데이터셋 준비
train_encodings = tokenizer(train_texts, truncation=True, padding=True)
val_encodings = tokenizer(val_texts, truncation=True, padding=True)

# 토큰화된 첫 번째 결과 출력
print(train_encodings['input_ids'][0]) # 정수 인코딩 결과
print(train_encodings['attention_mask'][0]) # 어텐션 마스크


"""
# 소문자 변환 모델 토크나이저
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
print(tokenizer.tokenize())

model. = TFBertModel.from_pretrained('bert-base-uncased', from_pt=True)
"""

Downloading (…)okenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/150 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/678 [00:00<?, ?B/s]

[0, 1039, 25158, 47143, 8303, 787, 100, 3145, 219, 133, 495, 1517, 8939, 91, 21, 1828, 25, 7105, 350, 4, 9427, 77, 37, 702, 19, 10, 3187, 865, 116, 10915, 1071, 56, 10, 5006, 869, 18, 14014, 4, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,

"\n# 소문자 변환 모델 토크나이저\ntokenizer = BertTokenizer.from_pretrained('bert-base-uncased')\nprint(tokenizer.tokenize())\n\nmodel. = TFBertModel.from_pretrained('bert-base-uncased', from_pt=True)\n"

In [7]:
# 데이터셋 생성 및 모델 학습

# 데이터를 텐서플로우의 데이터셋 형태로 변환
train_dataset = tf.data.Dataset.from_tensor_slices((
    dict(train_encodings),
    train_labels
))
val_dataset = tf.data.Dataset.from_tensor_slices((
    dict(val_encodings),
    val_labels
))

model = TFRobertaForSequenceClassification.from_pretrained(
    "textattack/roberta-base-MNLI",
    num_labels=3,
    from_pt = True,
)

optimizer = tf.keras.optimizers.Adam(learning_rate=2e-5)

test_encodings = tokenizer(X_test_list, truncation=True, padding=True)
test_dataset = tf.data.Dataset.from_tensor_slices((dict(test_encodings)))


Downloading pytorch_model.bin:   0%|          | 0.00/501M [00:00<?, ?B/s]

All PyTorch model weights were used when initializing TFRobertaForSequenceClassification.

All the weights of TFRobertaForSequenceClassification were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFRobertaForSequenceClassification for predictions without further training.


In [9]:
model.compile(
    optimizer=optimizer,
    loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
    metrics=['accuracy'],
)


early_stopping = EarlyStopping(
    monitor = 'val_accuracy',
    patience=3,
    restore_best_weights = True,
)

model.fit(
    train_dataset.shuffle(10000).batch(16),
    epochs=10,
    batch_size=16,
    validation_data = val_dataset.shuffle(10000).batch(16),
    callbacks = [early_stopping],
)

predictions = model.predict(test_dataset.batch(16))


Epoch 1/10
 188/1599 [==>...........................] - ETA: 9:40 - loss: 0.0865 - accuracy: 0.9707

KeyboardInterrupt: ignored

In [10]:
# 예측 결과 처리 및 클래스에 매핑
predicted_classes = np.argmax(predictions.logits, axis=1)

# 제출 파일 생성
submission = pd.DataFrame({
    "id": testdata['id'],
    "sentiment": predicted_classes
})

# 데이터프레임을 CSV 파일로 저장
submission.to_csv('/content/roberta2.csv', index=False)

- hf_compute_loss : 다중 클래스 분류에서 사용하던 손실함수 크로스 엔트로피 매핑되어있음