<a href="https://colab.research.google.com/github/SeHongPark-96/NLP_final_project/blob/main/Kor_text_classification_final.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## 필요 라이브러리 호출

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
%matplotlib inline
import os
import re
from tqdm import tqdm
from tqdm import tqdm_notebook
from matplotlib import rcParams
from collections import Counter
from sklearn.model_selection import train_test_split
from warnings import filterwarnings
filterwarnings('ignore')

import tensorflow as tf
import tensorflow.keras.layers as tfl
from tensorflow.keras import Model
from keras.utils import np_utils
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

import gensim
tf.random.set_seed(100)

# os.environ['CUDA_LAUNCH_BLOCKING'] = "1"
# os.environ["CUDA_VISIBLE_DEVICES"] = "0"

In [None]:
os.chdir('/content/drive/MyDrive/자연어처리/실습/기말고사_대체과제')

In [None]:
# Load the training data. header=1 makes pandas take the column names from the
# second line of the CSV; the line before it is not parsed as data.
# NOTE(review): presumably ttrain.csv carries one extra leading line — confirm.
df = pd.read_csv('/content/drive/MyDrive/자연어처리/실습/기말고사_대체과제/ttrain.csv', header=1)

df.head()

## 데이터 파악

In [None]:
# 상대적으로 사회 topic이 적음
print('topic : ', df['label'].unique())
df.groupby(df['label']).size()

In [None]:
# Character-length distribution of the raw titles plus a few summary numbers.
# The lengths are computed once and reused for the plot and the statistics.
title_lengths = [len(title) for title in df['title']]

plt.hist(title_lengths)
print('뉴스 갯수 : ', len(df))
print('뉴스 제목 최대 길이 : ', max(title_lengths))
print('뉴스 제목 평균 길이 : ', sum(title_lengths) / len(title_lengths))

In [None]:
print('title null값 : ', sum(df['title'].isnull()))
print('label null값 : ', sum(df['label'].isnull()))

In [None]:
# 한글 외의 단어들 파악 - 주로 특수기호, 한자 혹은 영어로 된 기업명 등의 고유명사들
def get_non_ko(title_list):
    """Collect what is left of each title once Korean text is stripped out.

    Spaces, Hangul (jamo and syllables), ASCII digits and the punctuation
    characters ``.``, ``…``, ``·`` and ``+`` are deleted from every title.
    Titles that still contain something afterwards contribute that residue
    to the result; titles that end up empty are skipped.

    Args:
        title_list: iterable of title strings.

    Returns:
        list of non-empty residue strings, in input order.
    """
    # Despite the function name, this pattern matches the characters to DROP.
    korean_and_noise = re.compile('[ ㄱ-ㅣ가-힣0-9.…·+]')

    residues = (korean_and_noise.sub('', title) for title in title_list)
    return [residue for residue in residues if residue]

In [None]:
get_non_ko(df['title'])

## 텍스트 전처리

#### 1. 정규표현식으로 기본적인 전처리

In [None]:
# Quick check of the amount pattern on one sample sentence: a run of digits
# and/or Hangul syllables that ends in '원' is replaced by the token '수치'.
percent = '오늘은 증시가 510123만원 더 올랐다'
numeric_value = re.compile(r'[0-9가-힣]*원')
print(numeric_value.sub('수치', percent))

In [None]:
# 특수문자 제거
def get_clean_words(title_list):
    """Normalise raw titles into Hangul-only text with numeric placeholders.

    Steps applied to every title, in this order:
      1. percentages (digits followed by ``%``) become `` 수치``;
      2. money amounts (a number, optionally mixed with the unit characters
         십/백/천/만/억/조, followed by ``원``) become `` 금액``;
      3. every remaining character that is neither a space nor Hangul is
         replaced by a space.

    Args:
        title_list: iterable of title strings.

    Returns:
        list of cleaned strings, one per input title, in input order.
    """
    # Digits + % can hint at the topic, so keep a placeholder instead of dropping them.
    numeric_value = re.compile(r'[0-9]*%')
    # An amount must START with a digit. The previous pattern, [0-9가-힣]*원,
    # also swallowed ordinary words that merely end in 원 (의원, 병원, 공무원, ...)
    # and turned them into the 금액 placeholder.
    currency = re.compile(r'[0-9][0-9,.십백천만억조]*원')
    non_word = re.compile('[^ ㄱ-ㅣ가-힣]')

    clean_words = []
    for title in title_list:
        word = numeric_value.sub(' 수치', title)
        word = currency.sub(' 금액', word)
        word = non_word.sub(' ', word)
        clean_words.append(word)

    return clean_words

In [None]:
df['cleaned_title'] = get_clean_words(df['title'])
df

#### Mecab 활용

In [None]:
# !git clone https://github.com/SOMJANG/Mecab-ko-for-Google-Colab.git # 필요한 파일 clone
# os.chdir('./Mecab-ko-for-Google-Colab')

# ! bash install_mecab-ko_on_colab190912.sh
# os.chdir('../')


In [None]:
# from konlpy.tag import Mecab
# mecab = Mecab()

In [None]:
# stop_words_mecab = ['아니', '이달', '계속', '내달', '따라', '내년','올해', 
#               '주말', '내일', '주년', '개월', '오늘', '위해', '오후', '위한', 
#               '다음', '일부', '이후', '대신', '만나', '이번', '하루', 
#               '앞둔', '만들', '첫날', '천만', '아냐', '누구', '사실', 
#               '오전', '천억', '지난해', '잇단' ]

In [None]:
# def extract_nv_mecab(title):
#     clean_title = []
#     result = mecab.pos(title)
#     for word, tag in result:
#         if (tag.startswith('NNG') or tag.startswith('SL')) and len(word)>1 and word not in stop_words_mecab:
#             clean_title.append(word)

#     return " ".join(clean_title)

In [None]:
# def print_input_output(title):
#     print('기존 제목 : ', title)
#     print('mecab 적용 후 : ', extract_nv_mecab(title))

In [None]:
# mecab.pos(df['cleaned_title'][3])

In [None]:
# title_tokenized = []

# for title in tqdm(df['cleaned_title']):
#     try:
#         title_tokenized.append(extract_nv_mecab(title))

#     except:
#         title_tokenized.append(title)
#         print(sent)

# df['tokenized_title_mecab'] = title_tokenized

# df


In [None]:
# plt.hist([len(title) for title in df['tokenized_title_mecab']], bins=5)
# print('뉴스 갯수 : ', len(df))
# print('전처리 이후 뉴스 제목 최대 길이 : ', max(len(t) for t in df['tokenized_title_mecab']))
# print('전처리 이후 뉴스 제목 평균 길이 : ', sum(map(len, df['tokenized_title_mecab']))/ len(df['tokenized_title_mecab']))

In [None]:
# len(df[[len(x)>=42 for x in df['tokenized_title_mecab']]])

#### Okt 활용

In [None]:
!pip install konlpy

In [None]:
from konlpy.tag import Okt
okt = Okt()

In [None]:
def okt_result(title):
    """Print a title followed by its Okt part-of-speech analysis.

    The analysis is requested with normalisation and stemming switched on,
    the same options extract_nv_okt uses.

    Args:
        title: the (already cleaned) title string to analyse.
    """
    print('기존 제목 : ', title)
    tagged = okt.pos(title, norm=True, stem=True)
    print('Okt 적용 결과 : ', tagged)

In [None]:
okt_result(df['cleaned_title'][0])

In [None]:
# Stop words for the Okt pipeline: extract_nv_okt drops any token whose text
# (as returned by okt.pos with stemming on) appears in this list.
stop_words_okt = ['하다', '으로', '되다', '만에', '없다', '에서', '까지', 
                  '부터', '올해', '대다', '있다', '작년', '않다', '돼다', 
                  '에도', '내년', '맞다', '오늘', '내일', '주말', '이다', 
                  '내달', '주년', '번째', '개월', '위해', '에게', '오후', 
                  '다시', '함께', '아니다', '하고', '이후', '이틀', '대신', 
                  '내다', '일부', '없이', '싶다', '첫날', '처럼', '오전',
                  '멀리', '가장', '종합']

In [None]:
def extract_nv_okt(title):
    """Reduce a title to its content-bearing Okt tokens.

    The title is analysed with okt.pos (normalised and stemmed). A token is
    kept when its tag is one of Noun / Adjective / Verb / KoreanParticle /
    Adverb, it is longer than one character, and it is not listed in
    stop_words_okt.

    Args:
        title: cleaned title string.

    Returns:
        The kept tokens joined by single spaces ('' if none survive).
    """
    kept_tags = ('Noun', 'Adjective', 'Verb', 'KoreanParticle', 'Adverb')

    return ' '.join(
        word
        for word, tag in okt.pos(title, norm=True, stem=True)
        if tag in kept_tags and len(word) > 1 and word not in stop_words_okt
    )

In [None]:
# Run the Okt filter over every cleaned training title (tqdm shows progress)
# and store the result as a new column.
okt_title_train = [extract_nv_okt(cleaned) for cleaned in tqdm(df['cleaned_title'])]

df['tokenized_title_okt'] = okt_title_train
df

In [None]:
plt.hist([len(title) for title in df['tokenized_title_okt']], bins=5)
print('뉴스 갯수 : ', len(df))
print('전처리 이후 뉴스 제목 최대 길이 : ', max(len(t) for t in df['tokenized_title_okt']))
print('전처리 이후 뉴스 제목 평균 길이 : ', sum(map(len, df['tokenized_title_okt']))/ len(df['tokenized_title_okt']))

### 텍스트로 되어있는 label int로 인코딩

In [None]:
# Encode the topic names as the strings '0'..'6'. Values that are not keys of
# `topics` (including labels that were already encoded) pass through unchanged.
topic_names = ['IT과학', '생활문화', '스포츠', '사회', '세계', '정치', '경제']
topics = {name: str(code) for code, name in enumerate(topic_names)}

df['label'] = df['label'].map(lambda name: topics.get(name, name))
df

### 형태소 분석 결과 확인

In [None]:
# # mecab 결과

# # counter = Counter(word for title in df['tokenized_title_mecab'] for word in title.split())
# # counter = {word : frequency for word, frequency in counter.items() if frequency >= 10}
# print(sorted(counter.items(), key=lambda item:item[1], reverse=True))
# print(len(counter))

In [None]:
# Okt result: token frequencies over all tokenised titles.
# Only tokens seen at least MIN_FREQUENCY times are kept; how many survive
# becomes num_words, the vocabulary size used by the cells below.
MIN_FREQUENCY = 10

token_counts = Counter()
for tokenized in df['tokenized_title_okt']:
    token_counts.update(tokenized.split())

counter = {word: frequency for word, frequency in token_counts.items()
           if frequency >= MIN_FREQUENCY}

ranked = sorted(counter.items(), key=lambda item: item[1], reverse=True)
print(ranked)
print(len(counter))

num_words = len(counter)

### 데이터 분리

In [None]:
from tensorflow.keras.utils import to_categorical
train_label = to_categorical(df['label'])
train_label

In [None]:
# Split the data: 15% of the titles are held out for validation.
# The one-hot label matrix is used both as the target and as the
# stratification key; random_state=0 keeps the split reproducible.
# max_length = 41
# df['label'] = np_utils.to_categorical(df['label'])
training_titles, validation_titles, training_labels , validation_labels = train_test_split(df['tokenized_title_okt'], train_label,
                                                                                           stratify = train_label, shuffle=True,
                                                                                           test_size=0.15, random_state=0)


print(len(training_titles))

## 모델링 및 학습

### 1. keras 활용 DNN (okt)

#### 단어 기반

In [None]:
# num_words = 1000

In [None]:
word_tokenizer = Tokenizer(num_words=num_words, oov_token='<OOV>')
word_tokenizer.fit_on_texts(training_titles)
word_index = word_tokenizer.word_index

print(len(word_index))

In [None]:
# max(([len(x.split()) for x in df['tokenized_title_mecab']]))

In [None]:
# Turn the tokenised titles into fixed-length index sequences.
# Both splits share the same padding rules: pad and truncate at the end.
max_length = 13
pad_options = dict(maxlen=max_length, padding='post', truncating='post')

word_training_sequences = word_tokenizer.texts_to_sequences(training_titles)
word_validation_sequences = word_tokenizer.texts_to_sequences(validation_titles)

word_training_padded = pad_sequences(word_training_sequences, **pad_options)
word_validation_padded = pad_sequences(word_validation_sequences, **pad_options)

In [None]:
print(word_training_padded.shape)
print(word_validation_padded.shape)

#### 단어 기반 모델 model

In [None]:
# Conv1D
# Word-level CNN classifier: embedding -> dropout -> 1D convolution ->
# global max pooling -> dense -> dropout -> 7-way softmax (one unit per topic).
word_dim = 32

word_model = tf.keras.Sequential([
                             tfl.Embedding(input_dim=num_words, output_dim=word_dim, input_length=max_length),
                             tfl.Dropout(0.2),
                             tfl.Conv1D(32, 3, padding='same', activation='relu'),
                             tfl.GlobalMaxPooling1D(),
                             tfl.Dense(250, activation='relu'),
                             tfl.Dropout(0.2),
                             tfl.Dense(7, activation='softmax')
])


word_model.summary()

In [None]:
# RNN
# Word-level recurrent classifier: embedding -> bidirectional GRU (full
# sequence output) -> global max pooling -> dropout -> 7-way softmax.
# This rebinds word_model, so if the Conv1D cell was run before it, the
# compile/fit cell that follows trains this model instead.
word_dim = 32

word_model = tf.keras.Sequential([
                             tfl.Embedding(input_dim=num_words, output_dim=word_dim, input_length=max_length),
                            #  tfl.GRU(64, return_sequences=True),
                             tfl.Bidirectional(tfl.GRU(32, return_sequences=True)),
                             tfl.GlobalMaxPooling1D(),
                             tfl.Dropout(0.4),
                            #  tfl.Dense(16, activation='relu'),
                            #  tfl.Dropout(0.3),
                             tfl.Dense(7, activation='softmax')
])


word_model.summary()

In [None]:
# Learning rate starts at 1e-3 and is multiplied by 0.96 every 100 optimizer
# steps (staircase=True applies the decay in discrete jumps).
lr_schedule = tf.keras.optimizers.schedules.ExponentialDecay(
    initial_learning_rate = 0.001,
    decay_steps = 100,
    decay_rate = 0.96,
    staircase=True
)

word_model.compile(loss='categorical_crossentropy', optimizer=tf.keras.optimizers.Adam(lr_schedule), metrics=['accuracy'])

# Stop once val_loss has not improved for 5 epochs and restore the best weights.
early_stopping = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=5, 
                                                  restore_best_weights=True, verbose=1)

# Train on the padded training split; the held-out split drives early stopping.
history = word_model.fit(word_training_padded, training_labels,
                    epochs=100, verbose=2, batch_size=256, shuffle=True,
                    validation_data=(word_validation_padded, validation_labels),
                    callbacks = [early_stopping])

### 2. Tf-Idf

#### tfidf 벡터화

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Unigram + bigram TF-IDF. Terms must occur in at least 5 documents and at
# most num_words features are kept.
vectorizer = TfidfVectorizer(min_df=5, ngram_range=(1,2),
                             max_features=num_words)

# Fit on the training split only, then reuse the same vocabulary for the
# validation split. .todense() converts the sparse result to a dense matrix.
tfidf_train = vectorizer.fit_transform(training_titles).todense()
tfidf_validation = vectorizer.transform(validation_titles).todense()

#### 모델링

In [None]:
# Feed-forward classifier on TF-IDF features:
# dense(256) -> relu -> dropout -> dense(64) -> relu -> dropout -> 7-way softmax.
tf_idf_model = tf.keras.Sequential([
                             # Size the input from the matrix itself: the vectorizer yields
                             # at most num_words columns, and fewer when the vocabulary is smaller.
                             tfl.Dense(256, input_shape=(tfidf_train.shape[1],)),
                            #  tfl.BatchNormalization(),
                             tfl.Activation('relu'),
                             tfl.Dropout(0.5),

                            #  tfl.Dense(128),
                            #  tfl.BatchNormalization(),
                            #  tfl.Activation('relu'),
                            #  tfl.Dropout(0.3),

                             tfl.Dense(64),
                            #  tfl.BatchNormalization(),
                             tfl.Activation('relu'),
                             tfl.Dropout(0.3),

                             tfl.Dense(7, activation='softmax')
])

# Dedicated schedule for this model (starts at 1e-2, x0.96 every 100 steps).
# It used to be bound to the misspelled name `lr_shedule`, so the optimizer
# silently picked up the word model's `lr_schedule` (1e-3) instead.
tfidf_lr_schedule = tf.keras.optimizers.schedules.ExponentialDecay(
    initial_learning_rate = 1e-2,
    decay_steps = 100,
    decay_rate = 0.96,
    staircase=True
)
tf_idf_model.compile(loss='categorical_crossentropy',
                     optimizer=tf.keras.optimizers.Adam(tfidf_lr_schedule),
                     metrics=['accuracy'])

# Stop once val_loss has not improved for 5 epochs and restore the best weights.
early_stopping = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=5,
                                                  restore_best_weights=True, verbose=1)

history = tf_idf_model.fit(tfidf_train, training_labels,
                    epochs=100, verbose=2, batch_size=128,
                    validation_data=(tfidf_validation, validation_labels),
                    callbacks = [early_stopping])

## 평가 
- ttrain을 train / validation으로 나누어 진행했을 때는 약 80%의 정확도를 보였지만, ttrain으로 학습한 뒤 ttest에 적용했을 때는 60%대 후반의 정확도밖에 나오지 않음. ttest의 단어가 ttrain에 없는 경우가 상당히 많음.
- 제목이 적은 수의 단어로 이루어진 경우가 있고 학습 데이터도 충분하지 않아 성능에 한계가 보임.
- 추가적인 데이터 수집이 필요해 보임.

### Test 데이터 불러오기

In [None]:
df_test = pd.read_csv('/content/drive/MyDrive/자연어처리/실습/기말고사_대체과제/ttest.csv', header=1)
df_test.head()

In [None]:
df_test['cleaned_title'] = get_clean_words(df_test['title'])
df_test

In [None]:
# test_title_tokenized = []

# for title in tqdm(df_test['cleaned_title']):
#     try:
#         test_title_tokenized.append(extract_nv_mecab(title))

#     except:
#         test_title_tokenized.append(title)
#         print(title)

# df_test['tokenized_title_mecab'] = test_title_tokenized

# df_test

In [None]:
# Apply the same Okt filter to every cleaned test title (tqdm shows progress).
okt_title_test = [extract_nv_okt(cleaned) for cleaned in tqdm(df_test['cleaned_title'])]

df_test['tokenized_title_okt'] = okt_title_test
# df_test

In [None]:
df_test['label'] = df_test['label'].map(lambda x: topics.get(x,x))
df_test

#### keras

In [None]:
# Unlike the earlier train/validation split, all training titles are used here for the test prediction
max_length = 14
# No num_words cap is given: every token in the training titles gets an index.
all_word_tokenizer = Tokenizer(oov_token='<OOV>')
all_word_tokenizer.fit_on_texts(df['tokenized_title_okt'])
all_word_sequences = all_word_tokenizer.texts_to_sequences(df['tokenized_title_okt'])
# Pad / truncate at the end so every sequence has exactly max_length entries.
all_word_padded = pad_sequences(all_word_sequences, maxlen=max_length, padding='post', truncating='post')

# One-hot targets for the full training set.
all_word_labels = to_categorical(df['label'])

In [None]:
len(all_word_tokenizer.word_index)

In [None]:
word_test_sequences = all_word_tokenizer.texts_to_sequences(df_test['tokenized_title_okt'])
word_test_padded = pad_sequences(word_test_sequences, maxlen=max_length, padding='post', truncating='post')
test_labels = to_categorical(df_test['label'])

In [None]:
df_test

In [None]:
all_word_tokenizer.sequences_to_texts(word_test_sequences)

In [None]:
# Feature budget reused by the TF-IDF cells further down.
num_words = 7500

word_dim = 200

# The embedding table must cover every index the tokenizer can emit.
# all_word_tokenizer was built without a num_words cap, so indices run from
# 1 to len(word_index); +1 accounts for the padding index 0. A fixed
# input_dim of 7500 goes out of range once the vocabulary is larger.
vocab_size = len(all_word_tokenizer.word_index) + 1

# Word-level CNN classifier trained on the whole training set.
word_model = tf.keras.Sequential([
                             tfl.Embedding(input_dim=vocab_size, output_dim=word_dim, input_length=max_length),
                             tfl.Dropout(0.2),
                             tfl.Conv1D(32, 3, padding='same', activation='relu'),
                             tfl.GlobalMaxPooling1D(),
                             tfl.Dense(32, activation='relu'),
                             tfl.Dropout(0.2),
                             tfl.Dense(7, activation='softmax')
])


word_model.summary()


# Learning rate starts at 1e-3 and is multiplied by 0.96 every 100 steps.
lr_schedule = tf.keras.optimizers.schedules.ExponentialDecay(
    initial_learning_rate = 0.001,
    decay_steps = 100,
    decay_rate = 0.96,
    staircase=True
)

word_model.compile(loss='categorical_crossentropy', optimizer=tf.keras.optimizers.Adam(lr_schedule), metrics=['accuracy'])

# NOTE(review): the test split doubles as validation data below, so early
# stopping on val_loss (with restore_best_weights) picks the epoch using test
# labels; the score from evaluate() on the same split is therefore optimistic.
early_stopping = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=5,
                                                  restore_best_weights=True, verbose=1)

# early_stopping = tf.keras.callbacks.EarlyStopping(monitor='loss', patience=3,
#                                                   restore_best_weights=True, verbose=1)

history = word_model.fit(all_word_padded, all_word_labels,
                    epochs=20, verbose=2, batch_size=128, shuffle=True,
                    validation_data = (word_test_padded, test_labels),
                    callbacks = [early_stopping])

In [None]:
word_model.evaluate(word_test_padded, test_labels)

#### tfidf

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# The Mecab cells are commented out, so 'tokenized_title_mecab' normally does
# not exist and indexing it raised KeyError. Use that column only when it is
# present; otherwise fall back to the Okt tokens used by the Keras section.
tfidf_train_column = ('tokenized_title_mecab'
                      if 'tokenized_title_mecab' in df.columns
                      else 'tokenized_title_okt')

# Unigram + bigram TF-IDF over the whole training set, capped at num_words features.
final_vectorizer = TfidfVectorizer(ngram_range=(1,2),
                             max_features=num_words)

tfidf_final = final_vectorizer.fit_transform(df[tfidf_train_column]).todense()


In [None]:
# 'tokenized_title_mecab' is only created by the commented-out Mecab cells, so
# indexing it raised KeyError. Use it when present, else the Okt tokens.
tfidf_test_column = ('tokenized_title_mecab'
                     if 'tokenized_title_mecab' in df_test.columns
                     else 'tokenized_title_okt')

# Project the test titles onto the vocabulary learned from the training set.
tfidf_test = final_vectorizer.transform(df_test[tfidf_test_column]).todense()
test_labels = to_categorical(df_test['label'])

In [None]:
# Feed-forward classifier on TF-IDF features of the full training set:
# dense(128) -> relu -> dropout -> dense(32) -> relu -> dropout -> 7-way softmax.
tf_idf_model = tf.keras.Sequential([
                             # Size the input from the matrix itself: the vectorizer yields
                             # at most num_words columns, and fewer when the vocabulary is smaller.
                             tfl.Dense(128, input_shape=(tfidf_final.shape[1],)),
                            #  tfl.BatchNormalization(),
                             tfl.Activation('relu'),
                             tfl.Dropout(0.4),

                            #  tfl.Dense(128),
                            #  tfl.BatchNormalization(),
                            #  tfl.Activation('relu'),
                            #  tfl.Dropout(0.3),

                             tfl.Dense(32),
                            #  tfl.BatchNormalization(),
                             tfl.Activation('relu'),
                             tfl.Dropout(0.4),

                             tfl.Dense(7, activation='softmax')
])

# Dedicated schedule for this model (starts at 1e-2, x0.96 every 100 steps).
# It used to be bound to the misspelled name `lr_shedule`, so the optimizer
# silently reused the word model's `lr_schedule` (1e-3) instead.
tfidf_lr_schedule = tf.keras.optimizers.schedules.ExponentialDecay(
    initial_learning_rate = 1e-2,
    decay_steps = 100,
    decay_rate = 0.96,
    staircase=True
)

# Early stopping watches TRAINING accuracy here (not a validation metric).
early_stopping = tf.keras.callbacks.EarlyStopping(monitor='accuracy', patience=5,
                                                  restore_best_weights=True, verbose=1)

tf_idf_model.compile(loss='categorical_crossentropy',
                     optimizer=tf.keras.optimizers.Adam(tfidf_lr_schedule),
                     metrics=['accuracy'])

history = tf_idf_model.fit(tfidf_final, all_word_labels,
                    epochs=20, verbose=2, batch_size=128,
                    shuffle=True,
                    validation_data=(tfidf_test, test_labels),
                    callbacks = [early_stopping])

In [None]:
tf_idf_model.evaluate(tfidf_test, test_labels)

ML 적용

In [None]:
# labels = df['label']
# test_labels = df_test['label']

In [None]:
# from sklearn.svm import LinearSVC
# from sklearn import metrics
# svm = LinearSVC(C=1)
# svm.fit(tfidf_final, labels)
# pred = svm.predict(tfidf_test)

# print(metrics.accuracy_score(test_labels, pred))

## 추가) 사전 학습된 모델 사용하기

### KoBert
- KoBert 사용시 정확도가 0.8까지 올라가는 것을 보아 학습 데이터 부족의 문제도 있음을 확인

출처 : https://github.com/SKTBrain/KoBERT

In [None]:
!pip install git+https://git@github.com/SKTBrain/KoBERT.git@master

In [None]:
import torch
from torch import nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from kobert.utils import get_tokenizer
from kobert.pytorch_kobert import get_pytorch_kobert_model
from transformers import AdamW
from transformers.optimization import get_cosine_schedule_with_warmup
import gluonnlp as nlp


In [None]:
# Use the first GPU when one is available; otherwise fall back to the CPU so
# the later .to(device) calls do not fail on a CPU-only runtime.
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

In [None]:
bertmodel, vocab = get_pytorch_kobert_model()

In [None]:
# Hyper-parameters for the KoBERT fine-tuning cells below.
max_length = 14        # max_len passed to BERTDataset (maximum sequence length)
batch_size = 128       # batch size of both DataLoaders
warmup_ratio = 0.1     # fraction of all training steps used for warm-up
num_epochs = 5         # passes over the training loader
max_grad_norm = 1      # gradient-norm clipping threshold
log_interval = 200     # print training progress every this many batches
learning_rate = 5e-5   # learning rate passed to AdamW

In [None]:
# Read the train/test TSV files for KoBERT.
# field_indices=[1,0] requests file column 1 followed by column 0; BERTDataset
# is later called with sent_idx=0 and label_idx=1 on these samples.
# num_discard_samples=1 skips the first line of each file.
# NOTE(review): df.tsv / df_test.tsv are not written anywhere in the notebook
# as shown — confirm how they are produced and which column holds the label.
dataset_train = nlp.data.TSVDataset('/content/drive/MyDrive/자연어처리/실습/기말고사_대체과제/df.tsv', field_indices=[1,0], num_discard_samples=1)
dataset_test = nlp.data.TSVDataset('/content/drive/MyDrive/자연어처리/실습/기말고사_대체과제/df_test.tsv', field_indices=[1,0], num_discard_samples=1)

In [None]:
class BERTDataset(Dataset):
    """Torch dataset that pre-tokenises every sentence for BERT.

    All rows are transformed once in the constructor; indexing then returns
    the stored transform output with the integer label appended.

    Args:
        dataset: iterable of rows, each indexable by sent_idx / label_idx.
        sent_idx: position of the sentence inside a row.
        label_idx: position of the label inside a row.
        bert_tokenizer: tokenizer handed to BERTSentenceTransform.
        max_len: maximum sequence length for the transform.
        pad: forwarded to BERTSentenceTransform as ``pad``.
        pair: forwarded to BERTSentenceTransform as ``pair``.
    """

    def __init__(self, dataset, sent_idx, label_idx, bert_tokenizer, max_len,
                 pad, pair):
        to_bert_input = nlp.data.BERTSentenceTransform(
            bert_tokenizer, max_seq_length=max_len, pad=pad, pair=pair)

        # The transform takes a list of sentences; each row supplies exactly one.
        self.sentences = [to_bert_input([row[sent_idx]]) for row in dataset]
        self.labels = [np.int32(row[label_idx]) for row in dataset]

    def __getitem__(self, i):
        # Transform output followed by the label, as one flat tuple.
        return (*self.sentences[i], self.labels[i])

    def __len__(self):
        return len(self.labels)

In [None]:
tokenizer = get_tokenizer()
tok = nlp.data.BERTSPTokenizer(tokenizer, vocab, lower=False)

In [None]:
dataset_train = BERTDataset(dataset_train, 0, 1, tok, max_length, True, False)
dataset_test = BERTDataset(dataset_test, 0, 1, tok, max_length, True, False)

In [None]:
# Batch and DataLoader setup
# NOTE(review): shuffle is left at its default for the training loader, so
# batches presumably follow file order every epoch — confirm this is intended.
train_dataloader = torch.utils.data.DataLoader(dataset_train, batch_size=batch_size, num_workers=4)
test_dataloader = torch.utils.data.DataLoader(dataset_test, batch_size=batch_size, num_workers=4)

In [None]:
class BERTClassifier(nn.Module):
    """Linear classification head on top of a BERT encoder's pooled output.

    Args:
        bert: encoder module; it is called with ``input_ids``,
            ``token_type_ids`` and ``attention_mask`` and must return a pair
            whose second item is the pooled output.
        hidden_size: width of the pooled output fed to the linear layer.
        num_classes: number of target classes; set it to match the data
            (this notebook has 7 topics).
        dr_rate: dropout probability applied to the pooled output; a falsy
            value (``None`` or ``0``) disables dropout.
        params: accepted for call compatibility; not used.
    """

    def __init__(self,
                 bert,
                 hidden_size = 768,
                 num_classes=7,  # NOTE: change this to the number of classes in your data
                 dr_rate=None,
                 params=None):
        super(BERTClassifier, self).__init__()
        self.bert = bert
        self.dr_rate = dr_rate

        self.classifier = nn.Linear(hidden_size , num_classes)
        if dr_rate:
            self.dropout = nn.Dropout(p=dr_rate)

    def gen_attention_mask(self, token_ids, valid_length):
        """Build a float mask with 1 for the first valid_length[i] positions of row i."""
        attention_mask = torch.zeros_like(token_ids)
        for i, v in enumerate(valid_length):
            attention_mask[i][:v] = 1
        return attention_mask.float()

    def forward(self, token_ids, valid_length, segment_ids):
        """Return the class logits for a batch.

        Args:
            token_ids: integer token-id tensor, one row per sample.
            valid_length: per-sample count of real (non-padding) tokens.
            segment_ids: token-type ids, same shape as token_ids.
        """
        attention_mask = self.gen_attention_mask(token_ids, valid_length)

        _, pooler = self.bert(input_ids = token_ids, token_type_ids = segment_ids.long(), attention_mask = attention_mask.float().to(token_ids.device))
        # Use the raw pooled vector when dropout is disabled; previously `out`
        # stayed unbound in that case and the next line raised UnboundLocalError.
        out = self.dropout(pooler) if self.dr_rate else pooler
        return self.classifier(out)

In [None]:
model = BERTClassifier(bertmodel, dr_rate=0.5).to(device)

In [None]:
# Prepare optimizer and schedule (warm-up followed by a cosine schedule)
# Parameters whose name contains 'bias' or 'LayerNorm.weight' are excluded
# from weight decay; every other parameter uses weight_decay=0.01.
no_decay = ['bias', 'LayerNorm.weight']
optimizer_grouped_parameters = [
    {'params': [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},
    {'params': [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
]

optimizer = AdamW(optimizer_grouped_parameters, lr=learning_rate)
loss_fn = nn.CrossEntropyLoss()

# Total number of optimizer steps = batches per epoch * epochs; the first
# warmup_ratio share of them is the warm-up phase.
t_total = len(train_dataloader) * num_epochs
warmup_step = int(t_total * warmup_ratio)

scheduler = get_cosine_schedule_with_warmup(optimizer, num_warmup_steps=warmup_step, num_training_steps=t_total)

In [None]:
def calc_accuracy(X,Y):
    """Return the fraction of rows of X whose highest-scoring class equals Y.

    Args:
        X: 2-D tensor of per-class scores, one row per sample.
        Y: 1-D tensor of integer class labels, one per row of X.

    Returns:
        NumPy scalar: number of correct predictions divided by the number of rows.
    """
    _, predicted = torch.max(X, 1)
    n_correct = (predicted == Y).sum().data.cpu().numpy()
    n_samples = predicted.size()[0]
    return n_correct / n_samples

In [None]:
# Fine-tune for num_epochs; after each epoch report accuracy on the test loader.
for e in range(num_epochs):
    train_acc = 0.0
    test_acc = 0.0

    model.train()
    for batch_id, (token_ids, valid_length, segment_ids, label) in enumerate(tqdm_notebook(train_dataloader)):
        optimizer.zero_grad()
        # valid_length is passed through as delivered by the loader; only the
        # tensors below are moved to the target device.
        token_ids = token_ids.long().to(device)
        segment_ids = segment_ids.long().to(device)
        label = label.long().to(device)
        out = model(token_ids, valid_length, segment_ids)
        loss = loss_fn(out, label)
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_grad_norm)
        optimizer.step()
        scheduler.step()  # Update learning rate schedule
        train_acc += calc_accuracy(out, label)
        if batch_id % log_interval == 0:
            print("epoch {} batch id {} loss {} train acc {}".format(e+1, batch_id+1, loss.data.cpu().numpy(), train_acc / (batch_id+1)))
    print("epoch {} train acc {}".format(e+1, train_acc / (batch_id+1)))

    model.eval()
    # Evaluation needs no gradients: without no_grad() every forward pass
    # records an autograd graph that is never used.
    with torch.no_grad():
        for batch_id, (token_ids, valid_length, segment_ids, label) in enumerate(tqdm_notebook(test_dataloader)):
            token_ids = token_ids.long().to(device)
            segment_ids = segment_ids.long().to(device)
            label = label.long().to(device)
            out = model(token_ids, valid_length, segment_ids)
            test_acc += calc_accuracy(out, label)
    print("epoch {} validation acc {}".format(e+1, test_acc / (batch_id+1)))