In [1]:
import zipfile

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import platform
%matplotlib inline
import matplotlib
import tensorflow as tf
from sklearn.metrics import roc_curve,confusion_matrix,auc,  roc_auc_score
# Mini-batch size. The kernel kept dying with out-of-memory errors whenever
# the batch size was larger than 6, so it is capped here.
BATCH_SIZE = 6

# Presumably the maximum token sequence length.
# NOTE(review): no other cell visible in this notebook references MAX_SEQ_LEN — confirm it is still needed.
MAX_SEQ_LEN = 512

import os
import warnings

# GPU settings: enumerate CUDA devices in PCI bus order and expose only device "0".
os.environ.update({
    "CUDA_DEVICE_ORDER": "PCI_BUS_ID",
    "CUDA_VISIBLE_DEVICES": "0",
})

# Ignore all warning messages from here on.
warnings.filterwarnings(action="ignore")

# install transformers
# !pip install transformers
import transformers

# model import
from transformers import DistilBertConfig, TFDistilBertForSequenceClassification, DistilBertTokenizer
from sklearn.model_selection import train_test_split

# Korean font setup for matplotlib: AppleGothic on macOS ('Darwin'),
# NanumGothicCoding on every other platform (Windows or Linux).
plt.rc(
    'font',
    family='AppleGothic' if platform.system() == 'Darwin' else 'NanumGothicCoding',
)

In [2]:
# Extract dataset.zip into ./dataset, but only when that directory is missing or empty.
# os.listdir() raises FileNotFoundError for a directory that does not exist, so the
# existence check has to come first; otherwise a fresh checkout fails before extraction.
if not os.path.isdir("./dataset") or not os.listdir("./dataset"):

    data_set = "./dataset.zip"
    # The context manager closes the archive even if extraction raises.
    with zipfile.ZipFile(data_set, 'r') as zip_ref:
        zip_ref.extractall('./dataset')

In [3]:
# Load the labelled training reviews from the extracted archive and display the frame.
train = pd.read_csv(
    "./dataset/dataset/train.csv",
)
train

Unnamed: 0,id,document,label
0,1,영상이나 음악이 이쁘다 해도 미화시킨 불륜일뿐,0
1,2,히치콕이 이 영화를 봤다면 분명 박수를 쳤을듯...,1
2,3,괜찮은 음악영화가 또 나왔군요!!! 따뜻한 겨울이 될 것 같아요~,1
3,4,아무래도 20년도지난작품이라 지금보기는너무유치하다,0
4,5,지금까지의 영화들이 그랬듯. 이 영화역시 일본에 대한 미화는 여전하다.,0
...,...,...,...
4995,4996,좋은 배우들로 3류영화를 찍은 안타까운 영화,0
4996,4997,진짜 드럽게 재미없다 에드워드 호퍼 그림에 배경 빼고는 볼게 아닌영화,0
4997,4998,가장 실망스러운 영화.. 지금까지 본영화중..,0
4998,4999,"이런 평점 테러, 네이버에서 좀 막아야 하는 것 아닌가?",1


In [4]:
# Count missing values in each column of the training frame.
train.isna().sum()

id          0
document    0
label       0
dtype: int64

In [5]:
# Pretrained checkpoint name used to load the tokenizer below.
# NOTE(review): the original author tagged this checkpoint "DOES NOT WORK", yet the outputs
# recorded in this notebook show it being loaded — confirm whether that tag is stale.
MODEL_NAME = "distilbert-base-multilingual-cased"
tokenizer = DistilBertTokenizer.from_pretrained(
    MODEL_NAME,
)

In [6]:
# Load the pretrained checkpoint into a sequence classifier configured for two labels.
model = TFDistilBertForSequenceClassification.from_pretrained(
    MODEL_NAME,
    num_labels=2,
)

Some layers from the model checkpoint at distilbert-base-multilingual-cased were not used when initializing TFDistilBertForSequenceClassification: ['activation_13', 'vocab_layer_norm', 'vocab_transform', 'vocab_projector']
- This IS expected if you are initializing TFDistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFDistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some layers of TFDistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-multilingual-cased and are newly initialized: ['pre_classifier', 'classifier', 'dropout_19']
You should probably TRAIN this model on a down-stream ta

In [7]:
# Pull the review texts (X) and their labels (y) out of the frame as plain Python lists.
X, y = (train[column].to_list() for column in ("document", "label"))

In [8]:
# Hold out 20% of the samples for validation; the fixed random_state keeps the split repeatable.
X_train, x_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [9]:
# Tokenize the training and validation texts with truncation and padding enabled.
train_encoding = tokenizer(X_train, padding=True, truncation=True)
val_encoding = tokenizer(x_val, padding=True, truncation=True)

In [10]:
# Convert the encodings into tf.data datasets.
# Author's note: the train and validation batch sizes must be kept identical.

# Training set: shuffled with a buffer spanning all training samples, repeated
# indefinitely, then batched.
train_dataset = (
    tf.data.Dataset
    .from_tensor_slices((dict(train_encoding), y_train))
    .shuffle(len(X_train))
    .repeat()
    .batch(BATCH_SIZE)
)

# Validation set: a single finite pass in the original order, batched.
val_dataset = (
    tf.data.Dataset
    .from_tensor_slices((dict(val_encoding), y_val))
    .batch(BATCH_SIZE)
)

In [11]:
# Compile the classifier: Adam optimizer, sparse categorical cross-entropy computed on
# raw logits, and sparse categorical accuracy reported as 'accuracy'.
opt = tf.keras.optimizers.Adam(learning_rate=3e-5, epsilon=1e-08)
loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
metric = tf.keras.metrics.SparseCategoricalAccuracy('accuracy')
model.compile(optimizer=opt, loss=loss, metrics=[metric])

# Author's note: early stopping showed that training peaks at epoch 4, hence epochs=4.
# NOTE(review): no EarlyStopping callback is attached in this cell.
#
# train_dataset repeats forever, so steps_per_epoch must bound each epoch.
# val_dataset is finite, so validation_steps is intentionally omitted: Keras then runs
# validation until the dataset is exhausted. The previous
# validation_steps=len(x_val)//BATCH_SIZE floored the batch count and skipped the final
# partial batch whenever len(x_val) was not a multiple of BATCH_SIZE.
history = model.fit(
    train_dataset,
    epochs=4,
    steps_per_epoch=len(X_train) // BATCH_SIZE,
    validation_data=val_dataset,
    verbose=1,
)


Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4


In [12]:
# Read the test-set file.
test = pd.read_csv(
    "./dataset/dataset/test.csv",
)

In [13]:
# Review texts of the test set as a plain Python list.
document_column = "document"
X_test = test[document_column].to_list()
del document_column  # keep the notebook namespace unchanged

In [14]:
# Tokenize the test texts with truncation and padding enabled.
test_encoding = tokenizer(X_test, padding=True, truncation=True)

In [15]:
# Batched inference dataset. Each element is a 1-tuple holding the feature dict
# (there are no labels for the test set).
test_dataset = (
    tf.data.Dataset
    .from_tensor_slices((dict(test_encoding),))
    .batch(BATCH_SIZE)
)

In [16]:
# Run the fine-tuned model over the batched test dataset, showing a progress bar.
prediction = model.predict(
    test_dataset,
    verbose=1,
)



In [17]:
# Load the sample submission file and display it.
submission = pd.read_csv(
    "./dataset/dataset/sample_submission.csv",
)
submission

Unnamed: 0,id,label
0,1,0
1,2,0
2,3,0
3,4,0
4,5,0
...,...,...
4995,4996,0
4996,4997,0
4997,4998,0
4998,4999,0


In [18]:
# Predicted class for each test sample = index of the largest logit in its row.
# A single vectorized argmax along axis 1 replaces the per-row Python loop.
answer = np.argmax(prediction["logits"], axis=1)
# Overwrite the "label" column of the submission frame with the predictions.
submission["label"] = answer

In [19]:
# Display the submission frame to inspect its current "label" column.
submission

Unnamed: 0,id,label
0,1,0
1,2,1
2,3,1
3,4,1
4,5,1
...,...,...
4995,4996,1
4996,4997,0
4997,4998,1
4998,4999,0


In [20]:
# Write the submission frame to CSV without the DataFrame index column.
submission.to_csv(
    "./dataset/dataset/submission.csv",
    index=False,
)