## Import Libraries

In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

from transformers import AutoTokenizer, AutoModel, TFAutoModel, ElectraModel, ElectraTokenizer
import tensorflow as tf

from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

## Load Dataset

In [None]:
# Read the three competition CSVs in one pass; every file uses the CP949 encoding.
train, test, sub = (
    pd.read_csv(csv_name, encoding='CP949')
    for csv_name in ("train.csv", "test.csv", "sample.csv")
)

## EDA

### .head() 함수로 데이터셋에 대한 기본 정보 확인

In [None]:
# Preview the first rows of the training set.
train.head()

In [None]:
# Preview the first rows of the test set.
test.head()

In [None]:
# Preview the first rows of the sample submission file.
sub.head()

train 19996개, test 5000개

In [None]:
# Report the (rows, columns) shape of the train and test frames.
print(f"train shape => {train.shape} \ntest shape => {test.shape}")

In [None]:
# List the column names of the training set.
train.columns

### LabelEncoder를 활용해 categorical features를 encode함
contradiction => 0  
entailment => 1  
neutral => 2

In [None]:
# Columns whose string categories are replaced in place by integer class ids.
categorical_cols = ['label']
for col in categorical_cols:
    # A fresh encoder per column: learn the classes first, then map the column.
    le = LabelEncoder()
    le.fit(train[col])
    train[col] = le.transform(train[col])

In [None]:
# Preview the training set again to check the label column.
train.head()

In [None]:
# Inspect the label column as a raw array.
train['label'].values

### 라벨값이 balanced 한지 imbalanced 한지 확인하는 visualization
만약 학습 데이터가 imbalanced 하다면, 추가적인 고민이 필요하지만  
시각화 결과 클래스 값들이 고르게 분포되어 있음.

In [None]:
# Bar chart of how many training rows carry each label value.
sns.set_theme(style="darkgrid")
ax = sns.countplot(x="label", data=train)

In [None]:
# Same class balance as a pie chart: count each distinct label, then plot the shares.
labels, frequencies = np.unique(train.label.values, return_counts=True)
plt.figure(figsize=(5,5))
# autopct prints each wedge's percentage with one decimal place.
plt.pie(frequencies, labels = labels, autopct= '%1.1f%%')
plt.show()

### 훈련 데이터 결측치 계산  
없음

In [None]:
# Count missing values per column of the training set.
train.isnull().sum()

## Modeling - KoELECTRA (BERT 계열 모델)

In [None]:
# Hugging Face checkpoint id, named once so the tokenizer is loaded from the same id.
model_name = "monologg/koelectra-base-v3-discriminator"
tokenizer = ElectraTokenizer.from_pretrained(model_name)

In [None]:
def encode_sentence(s):
    """Return the vocabulary ids of sentence `s`, terminated by the id of '[SEP]'.

    Uses the module-level `tokenizer`; no '[CLS]' token is added here.
    """
    # Tokenize, then close the segment with the separator before id lookup.
    pieces = [*tokenizer.tokenize(s), '[SEP]']
    return tokenizer.convert_tokens_to_ids(pieces)

In [None]:
# Show the tokens the tokenizer produces for a sample sentence.
tokenizer.tokenize("안녕하세요. 인코딩 테스트 문장입니다.")

In [None]:
# Show the ids for the same sample sentence; the last id is the appended '[SEP]'.
encode_sentence("안녕하세요. 인코딩 테스트 문장입니다.")

In [None]:
def bert_encode(hypotheses, premises, tokenizer, max_len=None):
    """Encode sentence pairs into the three input tensors used by the model.

    Every row is laid out as ``[CLS] <first sentence> [SEP] <second sentence> [SEP]``
    where the first sentence comes from `hypotheses` and the second from
    `premises` (row i of one is paired with row i of the other).

    Args:
        hypotheses: sequence of strings placed in the first segment.
        premises: sequence of strings placed in the second segment.
        tokenizer: tokenizer providing `tokenize` and `convert_tokens_to_ids`.
            It is used for every token, not only for '[CLS]'.
        max_len: optional fixed width for the returned tensors. Rows are
            zero-padded or truncated on the right to this length (truncation
            can drop the final '[SEP]'). With the default ``None`` the width
            is the longest row in this batch, as before.

    Returns:
        dict with dense integer tensors, one row per pair:
            'input_word_ids': token ids, padded with 0.
            'input_mask': 1 for real tokens, 0 for padding.
            'input_type_ids': 0 for '[CLS]' and the first segment,
                1 for the second segment, 0 for padding.
    """

    def _encode(sentence):
        # Same scheme as encode_sentence, but with the tokenizer that was
        # passed in, so the argument is honoured for the sentence text too.
        tokens = list(tokenizer.tokenize(sentence))
        tokens.append('[SEP]')
        return tokenizer.convert_tokens_to_ids(tokens)

    # Rows have different lengths, hence ragged tensors until the final padding.
    sentence1 = tf.ragged.constant([
        _encode(s)
        for s in np.array(hypotheses)])
    sentence2 = tf.ragged.constant([
        _encode(s)
        for s in np.array(premises)])

    # One single-element row holding the '[CLS]' id per input pair.
    cls = [tokenizer.convert_tokens_to_ids(['[CLS]'])]*sentence1.shape[0]
    input_word_ids = tf.concat([cls, sentence1, sentence2], axis=-1)

    # None keeps the bounding-box width; [None, max_len] pads/truncates columns.
    dense_shape = None if max_len is None else [None, max_len]

    # Ones over the ragged rows become 1 = token / 0 = padding once densified.
    input_mask = tf.ones_like(input_word_ids).to_tensor(shape=dense_shape)

    type_cls = tf.zeros_like(cls)
    type_s1 = tf.zeros_like(sentence1)
    type_s2 = tf.ones_like(sentence2)
    input_type_ids = tf.concat(
        [type_cls, type_s1, type_s2], axis=-1).to_tensor(shape=dense_shape)

    inputs = {
        'input_word_ids': input_word_ids.to_tensor(shape=dense_shape),
        'input_mask': input_mask,
        'input_type_ids': input_type_ids}

    return inputs

In [None]:
# Inspect the premise sentences of the training set.
train.premise.values

In [None]:
# Inspect the hypothesis sentences of the training set.
train.hypothesis.values

In [None]:
# Encode every training pair; the hypothesis is passed first and the premise second.
train_input = bert_encode(train.hypothesis.values, train.premise.values, tokenizer)

In [None]:
# Inspect the encoded dict (input_word_ids / input_mask / input_type_ids).
train_input

In [None]:
from transformers import TFElectraModel

# Fixed sequence length that the three Keras inputs are declared with.
max_len = 98


def build_model():
    """Build and compile a 3-class classifier on top of the KoELECTRA encoder.

    The model takes three int32 inputs of length `max_len`
    ('input_word_ids', 'input_mask', 'input_type_ids'), runs them through the
    pretrained encoder, and feeds the hidden state at position 0 into a
    3-unit softmax layer.

    Returns:
        A compiled `tf.keras.Model` (Adam, learning rate 1e-5,
        sparse categorical cross-entropy, accuracy metric).
    """
    # from_pt=True converts the checkpoint's PyTorch weights to TensorFlow.
    bert_encoder = TFElectraModel.from_pretrained("monologg/koelectra-base-v3-discriminator", from_pt=True)
    input_word_ids = tf.keras.Input(shape=(max_len,), dtype=tf.int32, name="input_word_ids")
    input_mask = tf.keras.Input(shape=(max_len,), dtype=tf.int32, name="input_mask")
    input_type_ids = tf.keras.Input(shape=(max_len,), dtype=tf.int32, name="input_type_ids")

    # First element of the encoder output: the per-token hidden states.
    embedding = bert_encoder([input_word_ids, input_mask, input_type_ids])[0]
    # Classify from the hidden state of the first token of every sequence.
    output = tf.keras.layers.Dense(3, activation='softmax')(embedding[:,0,:])

    model = tf.keras.Model(inputs=[input_word_ids, input_mask, input_type_ids], outputs=output)
    # `learning_rate` replaces the deprecated `lr` alias, which newer Keras rejects.
    model.compile(
        tf.keras.optimizers.Adam(learning_rate=1e-5),
        loss='sparse_categorical_crossentropy',
        metrics=['accuracy'])

    return model

In [None]:
# Instantiate the classifier and print its layer/parameter summary.
model = build_model()
model.summary()

In [None]:
# Fine-tune for 2 epochs with batch size 32; validation_split = 0.2 makes Keras
# hold out 20% of the training samples for validation.
model.fit(train_input, train.label.values, epochs = 2, verbose = 1, batch_size = 32, validation_split = 0.2)

In [None]:
# Encode the test pairs in the same order used for training (hypothesis first,
# premise second) so sentence order and segment ids match what the model was fit on.
test_input = bert_encode(test.hypothesis.values, test.premise.values, tokenizer)
# Pick the highest-probability class index for every row in one vectorised call.
predictions = list(np.argmax(model.predict(test_input), axis=1))

In [None]:
# Start from a copy of the id column kept as a one-column frame, then attach
# the predicted class index for each row.
submission = test[['id']].copy()
submission['prediction'] = predictions

In [None]:
# Preview the first rows of the submission frame.
submission.head()