# Prepare ML
* 필요한 모듈 설치 및 임포트
* Git 저장소(SKTBrain/KoBERT)에서 KoBERT용 HuggingFace 토크나이저 설치하기
* 파라미터 세팅

In [1]:
# Install the notebook's dependencies: gluonnlp (vocabulary and BERT sentence
# transform used below), mxnet, sentencepiece, transformers and torch, plus the
# KoBERT Hugging Face tokenizer package straight from the SKTBrain GitHub repo.
!pip install gluonnlp pandas tqdm   
!pip install mxnet
!pip install sentencepiece
!pip install transformers
!pip install torch
!pip install 'git+https://github.com/SKTBrain/KoBERT.git#egg=kobert_tokenizer&subdirectory=kobert_hf'

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting gluonnlp
  Downloading gluonnlp-0.10.0.tar.gz (344 kB)
[K     |████████████████████████████████| 344 kB 4.9 MB/s 
Building wheels for collected packages: gluonnlp
  Building wheel for gluonnlp (setup.py) ... [?25l[?25hdone
  Created wheel for gluonnlp: filename=gluonnlp-0.10.0-cp37-cp37m-linux_x86_64.whl size=595738 sha256=717e032a7ebf7a027b6957b0b8694313eadd49ea6fe2eadbe032ec83d33ed1a5
  Stored in directory: /root/.cache/pip/wheels/be/b4/06/7f3fdfaf707e6b5e98b79c041e023acffbe395d78a527eae00
Successfully built gluonnlp
Installing collected packages: gluonnlp
Successfully installed gluonnlp-0.10.0
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting mxnet
  Downloading mxnet-1.9.1-py3-none-manylinux2014_x86_64.whl (49.1 MB)
[K     |████████████████████████████████| 49.1 MB 2.2 MB/s 
Collecting graphviz<0.9.0,>=0.8.1
  

In [2]:
import torch
from torch import nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import gluonnlp as nlp
import numpy as np
from tqdm import tqdm, tqdm_notebook
import pandas as pd
from sklearn.model_selection import train_test_split
from kobert_tokenizer import KoBERTTokenizer
from transformers import BertModel
from transformers import AdamW
from transformers.optimization import get_cosine_schedule_with_warmup
from google.colab import drive

In [3]:
# Mount Google Drive so the dataset CSVs and saved checkpoints stored under
# /content/drive are reachable from this runtime.
drive.mount('/content/drive')
# Use the first GPU when one is available; otherwise fall back to the CPU so
# the notebook still runs on a runtime without CUDA.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [4]:
# Tokenizer and encoder are loaded from the same pretrained KoBERT checkpoint.
pretrained_checkpoint = 'skt/kobert-base-v1'
tokenizer = KoBERTTokenizer.from_pretrained(pretrained_checkpoint)
# return_dict=False: the classifier's forward pass unpacks the encoder output
# as a tuple.
bertmodel = BertModel.from_pretrained(pretrained_checkpoint, return_dict=False)
# Build the gluonnlp vocabulary from the tokenizer's SentencePiece model file.
vocab = nlp.vocab.BERTVocab.from_sentencepiece(
    tokenizer.vocab_file,
    padding_token='[PAD]',
)

Downloading spiece.model:   0%|          | 0.00/363k [00:00<?, ?B/s]

Downloading special_tokens_map.json:   0%|          | 0.00/244 [00:00<?, ?B/s]

Downloading tokenizer_config.json:   0%|          | 0.00/432 [00:00<?, ?B/s]

The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'XLNetTokenizer'. 
The class this function is called from is 'KoBERTTokenizer'.


Downloading config.json:   0%|          | 0.00/535 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/352M [00:00<?, ?B/s]

In [5]:
# Hyperparameters shared by the dataset, optimizer and training loop.
max_len = 64  # max_seq_length handed to the BERT sentence transform
batch_size = 64  # batch size of every DataLoader in this notebook
warmup_ratio = 0.1  # fraction of the total training steps used for LR warmup
num_epochs = 5  # number of passes over the training DataLoader
max_grad_norm = 1  # threshold passed to clip_grad_norm_
log_interval = 200  # print running loss/accuracy every this many batches
learning_rate =  5e-5  # learning rate passed to AdamW

# Alias for the tokenizer's tokenize method; passed to BERTDataset below.
tok = tokenizer.tokenize

# ML Model Class and Funcs
* 데이터셋 클래스
* BERT 분류 담당 클래스
* 입력값 데이터를 토큰화, argmax를 통해 가장 일치값이 높은 라벨 출력하는 predict 함수

In [6]:
class BERTDataset(Dataset):
    """Dataset pairing BERT-transformed sentences with integer labels.

    Every row of ``dataset`` is indexed with ``sent_idx`` to get the sentence
    and with ``label_idx`` to get the label. Sentences are run through
    ``nlp.data.BERTSentenceTransform`` once, at construction time; labels are
    converted with ``np.int32``.
    """

    def __init__(self, dataset, sent_idx, label_idx, bert_tokenizer, vocab,
                 max_len, pad, pair):
        to_bert_input = nlp.data.BERTSentenceTransform(
            bert_tokenizer,
            max_seq_length=max_len,
            vocab=vocab,
            pad=pad,
            pair=pair,
        )

        # Two separate passes over ``dataset``: first all sentences, then all
        # labels.
        encoded = []
        for row in dataset:
            encoded.append(to_bert_input([row[sent_idx]]))
        self.sentences = encoded

        targets = []
        for row in dataset:
            targets.append(np.int32(row[label_idx]))
        self.labels = targets

    def __getitem__(self, i):
        """Return the transformed sentence tuple with its label appended."""
        features = self.sentences[i]
        target = self.labels[i]
        return features + (target,)

    def __len__(self):
        """Return the number of samples (one label per sample)."""
        return len(self.labels)

In [7]:
class BERTClassifier(nn.Module):
    """Linear classification head on top of a BERT encoder's pooled output.

    Args:
        bert: Encoder module. It is called with ``input_ids``,
            ``token_type_ids`` and ``attention_mask`` keyword arguments and
            must return a 2-tuple whose second item is the pooled output.
        hidden_size: Width of the pooled output fed to the linear layer.
        num_classes: Number of output classes (adjust to the label set).
        dr_rate: Dropout probability applied to the pooled output. ``None``
            or ``0`` disables dropout.
        params: Unused; kept so existing callers keep working.
    """

    def __init__(self,
                 bert,
                 hidden_size=768,
                 num_classes=6,   # adjust to the number of classes
                 dr_rate=None,
                 params=None):
        super().__init__()
        self.bert = bert
        self.dr_rate = dr_rate

        self.classifier = nn.Linear(hidden_size, num_classes)
        if dr_rate:
            self.dropout = nn.Dropout(p=dr_rate)

    def gen_attention_mask(self, token_ids, valid_length):
        """Return a float mask shaped like ``token_ids``.

        Row ``i`` holds ones in its first ``valid_length[i]`` positions and
        zeros elsewhere. The mask is built on the device of ``token_ids``.
        """
        positions = torch.arange(token_ids.size(1), device=token_ids.device)
        lengths = torch.as_tensor(valid_length, device=token_ids.device)
        # Broadcast (1, seq_len) against (batch, 1) instead of looping per row.
        return (positions.unsqueeze(0) < lengths.unsqueeze(1)).float()

    def forward(self, token_ids, valid_length, segment_ids):
        """Return the classification logits for a batch of encoded sentences."""
        attention_mask = self.gen_attention_mask(token_ids, valid_length)

        _, pooler = self.bert(
            input_ids=token_ids,
            token_type_ids=segment_ids.long(),
            attention_mask=attention_mask.float().to(token_ids.device),
        )
        # Without dropout the pooled output goes to the classifier unchanged.
        # Previously ``out`` was left unassigned in that case and the call
        # raised UnboundLocalError.
        out = self.dropout(pooler) if self.dr_rate else pooler
        return self.classifier(out)

In [8]:
def calc_accuracy(X,Y):
    """Return the fraction of rows of X whose highest-scoring column equals Y.

    X holds one row of scores per sample; Y holds the expected class index
    per sample.
    """
    _, predicted = X.max(dim=1)
    n_correct = (predicted == Y).sum().data.cpu().numpy()
    return n_correct / predicted.size(0)

In [9]:
def predict(predict_sentence):
    """Classify one sentence and return the index of its highest-scoring label.

    Relies on the notebook globals ``model``, ``tok``, ``vocab``, ``max_len``,
    ``batch_size`` and ``device``.

    Args:
        predict_sentence: Raw sentence text to classify.

    Returns:
        The arg-max class index of the model output (a NumPy integer), or 0 if
        the loader yields no batch.
    """
    # BERTDataset needs a label column; '0' is a placeholder that is never
    # read back.
    rows = [[predict_sentence, '0']]
    encoded = BERTDataset(rows, 0, 1, tok, vocab, max_len, True, False)
    # A single sample gains nothing from worker processes, so load it in the
    # main process.
    loader = torch.utils.data.DataLoader(encoded, batch_size=batch_size, num_workers=0)

    model.eval()
    answer = 0
    # Inference only: skip autograd bookkeeping.
    with torch.no_grad():
        for token_ids, valid_length, segment_ids, _ in loader:
            token_ids = token_ids.long().to(device)
            segment_ids = segment_ids.long().to(device)
            out = model(token_ids, valid_length, segment_ids)
            # One output row per input sentence; the last row is the answer.
            answer = np.argmax(out[-1].cpu().numpy())
    return answer

# Train Model
* 데이터 셋(csv)을 모델 훈련에 적합한 라벨:데이터 구조로 변환
* 훈련 데이터를 train_test_split으로 훈련:테스트 = 4:1(test_size=0.2) 비율로 나누기
* 기존의 학습된 BERT 토크나이저 사용


In [None]:
# Load the training and validation CSVs of the dialogue corpus from the
# mounted Google Drive folder.
train_set = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/감성대화말뭉치_최종데이터__Training.csv')
validation_set = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/감성대화말뭉치_최종데이터__Validation.csv')

In [None]:
# Print a summary of the training frame (columns, non-null counts, dtypes).
train_set.info()

In [None]:
# List the distinct raw values of the '감정_대분류' column, which becomes the
# label column in the next cell.
train_set['감정_대분류'].unique()

In [None]:
# Keep only the '감정_대분류' and '사람문장1' columns, drop rows with missing
# values, then rename the two columns to 'label' and 'data'.
source_columns = ['감정_대분류', '사람문장1']
renamed_columns = ['label', 'data']

train_set = train_set.loc[:, source_columns].dropna()
validation_set = validation_set.loc[:, source_columns].dropna()

train_set.columns = renamed_columns
validation_set.columns = renamed_columns

In [None]:
# Check which label strings occur before mapping them to class ids.
train_set['label'].unique()

In [None]:
# Map each emotion name to its integer class id. Labels are stripped of
# surrounding whitespace first, which covers variants such as '기쁨 ' and
# '불안 ' without one line per variant. Values that are not strings (already
# encoded ids when the cell is re-run) and unknown names are left untouched.
emotion_to_id = {'기쁨': 0, '불안': 1, '당황': 2, '슬픔': 3, '분노': 4, '상처': 5}
train_set['label'] = train_set['label'].map(
    lambda value: emotion_to_id.get(value.strip(), value)
    if isinstance(value, str) else value
)

In [None]:
# Eyeball ten random rows to confirm the labels were encoded.
train_set.sample(10)

In [None]:
# Check which label strings occur in the validation frame before mapping.
validation_set['label'].unique()

In [None]:
# Map each emotion name to its integer class id, using the same ids as the
# training frame. Labels are stripped of surrounding whitespace first, so
# trailing-space variants (handled explicitly for the training frame) are
# encoded here too instead of being left as strings. Values that are not
# strings and unknown names are left untouched.
emotion_to_id = {'기쁨': 0, '불안': 1, '당황': 2, '슬픔': 3, '분노': 4, '상처': 5}
validation_set['label'] = validation_set['label'].map(
    lambda value: emotion_to_id.get(value.strip(), value)
    if isinstance(value, str) else value
)

In [None]:
# Show one random validation row.
validation_set.sample()

In [None]:
# Re-check the label values after the mapping cell above.
validation_set['label'].unique()

In [None]:
def _as_sentence_label_rows(frame):
    """Return [sentence, label-as-string] rows built from a 'data'/'label' frame."""
    return [
        [sentence, str(label)]
        for sentence, label in zip(frame['data'], frame['label'])
    ]


# Sentence at index 0, label at index 1: the positions BERTDataset is later
# told to read (sent_idx=0, label_idx=1).
train_set_data = _as_sentence_label_rows(train_set)
validation_set_data = _as_sentence_label_rows(validation_set)

In [None]:
# Hold out 20% of the training rows as a test split (fixed random_state).
# This rebinds train_set_data to the remaining 80%.
train_set_data, test_set_data = train_test_split(train_set_data, test_size = 0.2, random_state=4)

In [None]:
# Encode both splits (sentence at index 0, label at index 1, pad=True,
# pair=False). This rebinds the row lists to BERTDataset objects, so the cell
# cannot be re-run without re-running the split cell first.
train_set_data = BERTDataset(train_set_data, 0, 1, tok, vocab, max_len, True, False)
test_set_data = BERTDataset(test_set_data, 0, 1, tok, vocab, max_len, True, False)
# Reshuffle the training samples every epoch so batches are not identical
# from one epoch to the next; the evaluation loader keeps a fixed order.
train_dataloader = torch.utils.data.DataLoader(
    train_set_data, batch_size=batch_size, shuffle=True, num_workers=2)
test_dataloader = torch.utils.data.DataLoader(
    test_set_data, batch_size=batch_size, num_workers=2)

In [None]:
# Wrap the pretrained encoder in the classifier head (dropout 0.5 on the
# pooled output) and move the whole model to the selected device.
model = BERTClassifier(bertmodel, dr_rate=0.5).to(device)

In [None]:
# Prepare optimizer and schedule (warmup followed by cosine decay).
# Parameters whose name contains one of these markers get no weight decay.
no_decay = ['bias', 'LayerNorm.weight']

decay_params = []
no_decay_params = []
for param_name, param in model.named_parameters():
    if any(marker in param_name for marker in no_decay):
        no_decay_params.append(param)
    else:
        decay_params.append(param)

optimizer_grouped_parameters = [
    {'params': decay_params, 'weight_decay': 0.01},
    {'params': no_decay_params, 'weight_decay': 0.0},
]
optimizer = AdamW(optimizer_grouped_parameters, lr=learning_rate)
loss_fn = nn.CrossEntropyLoss()

# One scheduler step per batch: total steps = batches per epoch * epochs.
t_total = len(train_dataloader) * num_epochs
warmup_step = int(t_total * warmup_ratio)
scheduler = get_cosine_schedule_with_warmup(
    optimizer,
    num_warmup_steps=warmup_step,
    num_training_steps=t_total,
)

In [None]:
# tqdm.tqdm_notebook is deprecated in favour of tqdm.notebook.tqdm.
from tqdm.notebook import tqdm as notebook_tqdm

# Train for num_epochs; after each epoch, report accuracy on the held-out
# test split. Accuracies are averaged per batch.
for e in range(num_epochs):
    train_acc = 0.0
    test_acc = 0.0

    model.train()
    for batch_id, (token_ids, valid_length, segment_ids, label) in enumerate(notebook_tqdm(train_dataloader)):
        optimizer.zero_grad()
        token_ids = token_ids.long().to(device)
        segment_ids = segment_ids.long().to(device)
        label = label.long().to(device)
        out = model(token_ids, valid_length, segment_ids)
        loss = loss_fn(out, label)
        loss.backward()
        # Clip the gradient norm before the optimizer step.
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_grad_norm)
        optimizer.step()
        scheduler.step()  # Update learning rate schedule
        train_acc += calc_accuracy(out, label)
        if batch_id % log_interval == 0:
            print("epoch {} batch id {} loss {} train acc {}".format(e+1, batch_id+1, loss.data.cpu().numpy(), train_acc / (batch_id+1)))
    print("epoch {} train acc {}".format(e+1, train_acc / (batch_id+1)))

    model.eval()
    # Evaluation needs no gradients; skipping autograd saves memory and time.
    with torch.no_grad():
        for batch_id, (token_ids, valid_length, segment_ids, label) in enumerate(notebook_tqdm(test_dataloader)):
            token_ids = token_ids.long().to(device)
            segment_ids = segment_ids.long().to(device)
            label = label.long().to(device)
            out = model(token_ids, valid_length, segment_ids)
            test_acc += calc_accuracy(out, label)
    print("epoch {} test acc {}".format(e+1, test_acc / (batch_id+1)))

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  """


  0%|          | 0/511 [00:00<?, ?it/s]

epoch 1 batch id 1 loss 1.8544056415557861 train acc 0.125
epoch 1 batch id 201 loss 1.2539284229278564 train acc 0.35517723880597013
epoch 1 batch id 401 loss 1.145894169807434 train acc 0.47093204488778057
epoch 1 train acc 0.5031402401143106


Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`


  0%|          | 0/128 [00:00<?, ?it/s]

epoch 1 test acc 0.6222330729166666


  0%|          | 0/511 [00:00<?, ?it/s]

epoch 2 batch id 1 loss 1.0469151735305786 train acc 0.578125
epoch 2 batch id 201 loss 1.045835256576538 train acc 0.6215018656716418
epoch 2 batch id 401 loss 1.0081950426101685 train acc 0.6388715710723192
epoch 2 train acc 0.647220960301929


  0%|          | 0/128 [00:00<?, ?it/s]

epoch 2 test acc 0.63818359375


  0%|          | 0/511 [00:00<?, ?it/s]

epoch 3 batch id 1 loss 0.930655300617218 train acc 0.65625
epoch 3 batch id 201 loss 0.832099199295044 train acc 0.6826026119402985
epoch 3 batch id 401 loss 0.8286339640617371 train acc 0.6984491895261845
epoch 3 train acc 0.7074917295685397


  0%|          | 0/128 [00:00<?, ?it/s]

epoch 3 test acc 0.635986328125


  0%|          | 0/511 [00:00<?, ?it/s]

epoch 4 batch id 1 loss 0.8202559947967529 train acc 0.75
epoch 4 batch id 201 loss 0.6573715209960938 train acc 0.75
epoch 4 batch id 401 loss 0.7071491479873657 train acc 0.7627026184538653
epoch 4 train acc 0.7696553792749976


  0%|          | 0/128 [00:00<?, ?it/s]

epoch 4 test acc 0.6448974609375


  0%|          | 0/511 [00:00<?, ?it/s]

epoch 5 batch id 1 loss 0.6997534036636353 train acc 0.8125
epoch 5 batch id 201 loss 0.5330895781517029 train acc 0.7971859452736318
epoch 5 batch id 401 loss 0.7349119186401367 train acc 0.8056811097256857
epoch 5 train acc 0.8095296337713167


  0%|          | 0/128 [00:00<?, ?it/s]

epoch 5 test acc 0.6463216145833334


In [None]:
# Persist the trained classifier in two forms: the whole pickled module, and
# its state_dict alone.
full_model_path = '/content/drive/MyDrive/Colab Notebooks/SentimentAnalysisKOBert.pt'
state_dict_path = '/content/drive/MyDrive/Colab Notebooks/SentimentAnalysisKOBert_StateDict.pt'

torch.save(model, full_model_path)
torch.save(model.state_dict(), state_dict_path)

# Load Model
* 저장된 모델 로드
* Colab의 VM에 플라스크를 사용한 로컬 호스트 사용 
* 로컬 호스트 외부 네트워크 연결을 위한 ngrok 연동
* 간단한 API를 통해 데이터 입출력 함수 구현

In [10]:
# Reload the fully pickled classifier saved after training. map_location
# remaps the stored tensors onto the current `device`, so a checkpoint written
# on a GPU runtime also loads on a CPU-only one.
# NOTE: torch.load unpickles arbitrary objects; only load checkpoints from a
# trusted location.
model = torch.load(
    '/content/drive/MyDrive/Colab Notebooks/SentimentAnalysisKOBert.pt',
    map_location=device,
)
model.eval()

BERTClassifier(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(8002, 768, padding_idx=1)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True

In [11]:
!pip install flask-ngrok
!pip install flask==0.12.2
!pip install pyngrok==4.1.1
!ngrok authtoken '2E0itmXyrnKa7DoJmLdkZxE4Hk3_2hreUgB64mTNMJs6RjKfZ'
!wget https://bin.equinox.io/c/4VmDzA7iaHb/ngrok-stable-linux-amd64.tgz
!tar -xvf /content/ngrok-stable-linux-amd64.tgz

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting flask-ngrok
  Downloading flask_ngrok-0.0.25-py3-none-any.whl (3.1 kB)
Installing collected packages: flask-ngrok
Successfully installed flask-ngrok-0.0.25
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting flask==0.12.2
  Downloading Flask-0.12.2-py2.py3-none-any.whl (83 kB)
[K     |████████████████████████████████| 83 kB 1.3 MB/s 
Installing collected packages: flask
  Attempting uninstall: flask
    Found existing installation: Flask 1.1.4
    Uninstalling Flask-1.1.4:
      Successfully uninstalled Flask-1.1.4
Successfully installed flask-0.12.2
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting pyngrok==4.1.1
  Downloading pyngrok-4.1.1.tar.gz (18 kB)
Building wheels for collected packages: pyngrok
  Building wheel for pyngrok (setup.py) ... [?25l[?25hdo

In [12]:
from flask import Flask, jsonify, request
from flask_ngrok import run_with_ngrok
import requests

In [None]:
import threading

app = Flask(__name__)
app.config['JSONIFY_PRETTYPRINT_REGULAR'] = False
run_with_ngrok(app)  # Start ngrok when app is run


@app.route('/analysis', methods=['POST'])
def analysis():
    """Classify the text posted as JSON and return its predicted label.

    Expects a JSON object with a string 'content' field. Responds with
    ``{"label": "<class index>"}``, or with HTTP 400 and an ``error`` message
    when the body is not such an object.
    """
    # silent=True yields None for a missing or malformed JSON body instead of
    # raising, so every bad request gets the same explicit 400 below.
    content = request.get_json(silent=True)
    if not isinstance(content, dict) or not isinstance(content.get('content'), str):
        return jsonify({"error": "expected a JSON object with a string 'content' field"}), 400
    print(content)
    text = content['content']
    label = predict(text)
    return jsonify({"label": str(label)})


if __name__ == '__main__':
    app.run()  # If address is in use, may need to terminate other sessions:
               # Runtime > Manage Sessions > Terminate Other Sessions
# NOTE(review): app.run() above blocks until the server stops, so this second
# server (all interfaces, port 80) only starts afterwards. Confirm whether it
# is still wanted.
threading.Thread(target=app.run, kwargs={'host':'0.0.0.0','port':80}).start()

INFO:werkzeug: * Running on http://127.0.0.1:5000/ (Press CTRL+C to quit)


 * Running on http://e6f2-34-67-224-129.ngrok.io
 * Traffic stats available on http://127.0.0.1:4040
{'content': '안녕 안녕'}


INFO:werkzeug:127.0.0.1 - - [01/Sep/2022 01:40:23] "[37mPOST /analysis HTTP/1.1[0m" 200 -


{'content': '오늘은 날씨가 맑네요!'}


INFO:werkzeug:127.0.0.1 - - [01/Sep/2022 01:40:51] "[37mPOST /analysis HTTP/1.1[0m" 200 -


{'content': '비가 주룩주룩 내 마음도 주룩주룩'}


INFO:werkzeug:127.0.0.1 - - [01/Sep/2022 01:41:01] "[37mPOST /analysis HTTP/1.1[0m" 200 -


{'content': '날씨가 더워서 노트북이 맛이 갔어요 좀 짜증…'}


INFO:werkzeug:127.0.0.1 - - [01/Sep/2022 01:41:18] "[37mPOST /analysis HTTP/1.1[0m" 200 -


{'content': '왜 이렇게 힘이 들까… 사회란 힘들다 ㅠㅠ 흑흑'}


INFO:werkzeug:127.0.0.1 - - [01/Sep/2022 01:41:31] "[37mPOST /analysis HTTP/1.1[0m" 200 -


{'content': '너와 함께한 시간 모두 눈부셨다. 날이 좋아서 날이 좋지 않아서 날이 적당해서 모든 날이 좋았다. '}


INFO:werkzeug:127.0.0.1 - - [01/Sep/2022 01:52:44] "[37mPOST /analysis HTTP/1.1[0m" 200 -


{'content': '너와 함께한 시간 모두 눈부셨다. 날이 좋아서 날이 좋지 않아서 날이 적당해서 모든 날이 좋았다. '}


INFO:werkzeug:127.0.0.1 - - [01/Sep/2022 01:52:46] "[37mPOST /analysis HTTP/1.1[0m" 200 -


{'content': '싸늘하다. 가슴에 비수가 날아와 꽂힌다. 하지만 걱정하지 마라. 손은 눈보다 빠르니까. 아귀한테 밑에서 한 장, 정 마담도 밑에서 한 장, 나 한 장. 아귀한테 다시 밑에서 한 장, 이제 정 마담에게 마지막 한 장..'}


INFO:werkzeug:127.0.0.1 - - [01/Sep/2022 01:54:30] "[37mPOST /analysis HTTP/1.1[0m" 200 -


{'content': '내 드릴은 하늘을 뚫을 드릴이다'}


INFO:werkzeug:127.0.0.1 - - [01/Sep/2022 01:55:40] "[37mPOST /analysis HTTP/1.1[0m" 200 -


{'content': '맷돌에 뭘 갈려고 집어넣고 맷돌 돌리려고 하는데, (만년필을 뚜껑에서 뾱 꺼내며) 손잡이가 빠졌네? 이런 상황을 어이가 없다 그래요. 황당하잖아. 아무 것도 아닌 손잡이 땜에 해야 될 일을 못하니까.'}


INFO:werkzeug:127.0.0.1 - - [01/Sep/2022 01:58:22] "[37mPOST /analysis HTTP/1.1[0m" 200 -


{'content': '맷돌에 뭘 갈려고 집어넣고 맷돌 돌리려고 하는데, 손잡이가 빠졌네? 이런 상황을 어이가 없다 그래요.'}


INFO:werkzeug:127.0.0.1 - - [01/Sep/2022 01:58:46] "[37mPOST /analysis HTTP/1.1[0m" 200 -
