In [1]:
import sys
import numpy as np
import random as rn
import pandas as pd
import torch
from pytorch_pretrained_bert import BertModel
from torch import nn

from pytorch_pretrained_bert import BertTokenizer
from keras_preprocessing.sequence import pad_sequences
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from torch.optim import Adam
from torch.nn.utils import clip_grad_norm_
import torch.nn.functional as F
import torch.optim as optim

from IPython.display import clear_output
from transformers import AutoTokenizer, AutoModelForMaskedLM

import sqlite3
import jieba
import jieba.posseg as pseg

from sklearn.metrics import classification_report

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
rn.seed(321)
np.random.seed(321)
torch.manual_seed(321)
torch.cuda.manual_seed(321)

In [3]:
import pathlib
pathlib.Path().resolve() # CHECK ROUTE

WindowsPath('C:/Users/HackerByeBye/Documents/Therapy-Chatbot-Deploying-NLP/Training')

In [4]:
train_data = pd.read_csv('../train_PTT.csv')
test_data = pd.read_csv('../test_PTT.csv')

In [5]:
train_data = train_data.to_dict(orient='records')
test_data = test_data.to_dict(orient='records')
len(train_data), len(test_data)

(55824, 13956)

In [6]:
train_texts, train_labels = list(zip(*map(lambda d: (d['content'], d['label']), train_data)))
test_texts, test_labels = list(zip(*map(lambda d: (d['content'], d['label']), test_data)))

four_train_labels, four_test_labels = [], []

for x in train_labels:
    if x=="['1']":
        four_train_labels.append(0)
    elif  x=="['2']":
        four_train_labels.append(1)
    elif x=="['3']":
        four_train_labels.append(2)
    elif  x=="['4']":
        four_train_labels.append(3)
        
for x in test_labels:
    if x==1:
        four_test_labels.append(0)
    elif x==2:
        four_test_labels.append(1)
    elif x==3:
        four_test_labels.append(2)
    elif x==4:
        four_test_labels.append(3)
        
len(four_train_labels),len(four_test_labels)

(55824, 13956)

In [7]:
def jiebaSlice(content,mode):
    stopword_set = []
    content = str(content)
    with open('../stopword.txt','r', encoding='utf-8') as stopwords:
        for stopword in stopwords:
            stopword_set.append(stopword.strip('\n'))
    
    content = content.strip('\n')
    if mode == "POSSEG":
        words = pseg.cut(content,use_paddle=True)
        slicedWords = []
        for word, flag in words:
#             if word not in stopword_set:
            slicedWords.append(word)
        return slicedWords
    elif mode == "CUT_HMM":
        seg_list = jieba.cut(content,HMM=True,cut_all=True)
        slicedWords = list(seg_list)
        return slicedWords
    elif mode == "CUT_FOR_SEARCH":
        seg_list = jieba.cut_for_search(content,HMM=True)
        slicedWords = list(seg_list)
        return slicedWords
    elif mode == "NORMAL":
        seg_list = jieba.cut_for_search(content)
        slicedWords = list(seg_list)
        return slicedWords  

In [8]:
train_tokens = list(map(lambda t: ['[CLS]'] + jiebaSlice(t,'CUT_HMM') + ['[SEP]'], train_texts))
test_tokens = list(map(lambda t: ['[CLS]'] + jiebaSlice(t,'CUT_HMM') + ['[SEP]'], test_texts))

tokenizer = AutoTokenizer.from_pretrained('bert-base-chinese', do_lower_case=True)

train_tokens_ids = pad_sequences(list(map(tokenizer.convert_tokens_to_ids, train_tokens)), maxlen=512, truncating="post", padding="post", dtype="int")
test_tokens_ids = pad_sequences(list(map(tokenizer.convert_tokens_to_ids, test_tokens)), maxlen=512, truncating="post", padding="post", dtype="int")

Building prefix dict from the default dictionary ...
Loading model from cache C:\Users\HACKER~1\AppData\Local\Temp\jieba.cache
Loading model cost 0.533 seconds.
Prefix dict has been built successfully.
'HTTPSConnectionPool(host='huggingface.co', port=443): Max retries exceeded with url: /bert-base-chinese/resolve/main/tokenizer_config.json (Caused by ConnectTimeoutError(<urllib3.connection.HTTPSConnection object at 0x000001A72C33E148>, 'Connection to huggingface.co timed out. (connect timeout=10)'))' thrown while requesting HEAD https://huggingface.co/bert-base-chinese/resolve/main/tokenizer_config.json


In [9]:
train_y = np.array(four_train_labels)
test_y = np.array(four_test_labels) 
train_y.shape, test_y.shape

((55824,), (13956,))

In [10]:
train_masks = [[float(i > 0) for i in ii] for ii in train_tokens_ids]
test_masks = [[float(i > 0) for i in ii] for ii in test_tokens_ids]

In [11]:
class BertBasicClassifier(nn.Module):
    def __init__(self, dropout=0.1):
        super(BertBasicClassifier, self).__init__()
        self.bert = BertModel.from_pretrained('bert-base-chinese')
        self.dropout = nn.Dropout(dropout)
        self.linear = nn.Linear(768, 4)

    def forward(self, tokens, masks=None):
        _, pooled_output = self.bert(tokens, attention_mask=masks, output_all_encoded_layers=False)
        dropout_output = self.dropout(pooled_output)
        linear_output = self.linear(dropout_output)
        return F.log_softmax(linear_output)

In [12]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device

device(type='cuda')

In [13]:
BATCH_SIZE = 1
EPOCHS = 50

In [14]:
train_tokens_tensor = torch.tensor(train_tokens_ids)
train_y_tensor = torch.tensor(train_y.reshape(-1, 1)).float()

test_tokens_tensor = torch.tensor(test_tokens_ids)
test_y_tensor = torch.tensor(test_y.reshape(-1, 1)).float()

train_masks_tensor = torch.tensor(train_masks)
test_masks_tensor = torch.tensor(test_masks)

str(torch.cuda.memory_allocated(device)/1000000 ) + 'M'

'0.0M'

In [15]:
train_dataset = TensorDataset(train_tokens_tensor, train_masks_tensor, train_y_tensor)
train_sampler = RandomSampler(train_dataset)
train_dataloader = DataLoader(train_dataset, sampler=train_sampler, batch_size=BATCH_SIZE)

test_dataset = TensorDataset(test_tokens_tensor, test_masks_tensor, test_y_tensor)
test_sampler = SequentialSampler(test_dataset)
test_dataloader = DataLoader(test_dataset, sampler=test_sampler, batch_size=BATCH_SIZE)

In [16]:
bert_clf = BertBasicClassifier()
bert_clf = bert_clf.cuda()
optimizer = Adam(bert_clf.parameters(), lr=3e-6)

In [17]:
import os 
os.environ['CUDA_LAUNCH_BLOCKING'] = '1'
torch.cuda.empty_cache()

In [None]:
prev_train_acc = 0
epoch_loss = []
accuracy_list = []
for epoch_num in range(EPOCHS):
    bert_clf.train()
    train_loss = 0
    correct = 0
    
    for step_num, batch_data in enumerate(train_dataloader):
        token_ids, masks, labels = tuple(t.to(device) for t in batch_data)
        squeeze_labels = torch.squeeze(labels,1)
        long_labels = squeeze_labels.to(torch.long)
        
        logits = bert_clf(token_ids, masks)

        loss_func = nn.CrossEntropyLoss()

        batch_loss = loss_func(logits, long_labels) #計算loss
        train_loss += batch_loss.item()
        
        bert_clf.zero_grad() #清空前一次的gradient
        batch_loss.backward() #根據loss進行back propagation，計算gradient
        
        clip_grad_norm_(parameters=bert_clf.parameters(), max_norm=1.0)
        optimizer.step() #做gradient descent
        
        numpy_logits = logits.cpu().detach().numpy()
        preditct = list(numpy_logits[0, :]).index(max(list(numpy_logits[0, :])))
        
        if preditct == int(labels):
            correct += 1
        
        
        clear_output(wait=True)
        print('Epoch: ', epoch_num + 1)
        print("\r" + "{0}/{1} loss: {2} ".format(step_num, len(train_data) / BATCH_SIZE, train_loss / (step_num + 1)))
        accuracy = 100 * correct / len(train_texts) 
        print("Acc. "+ str(accuracy))
        final_epoch_loss = train_loss / (step_num + 1)
        print('Previous Train Acc: ', prev_train_acc)
    prev_train_acc = accuracy
    epoch_loss.append(final_epoch_loss)
    accuracy_list.append(accuracy)

Epoch:  14
54106/55824.0 loss: 0.2401265862915101 
Acc. 93.45263685869877
Previous Train Acc:  96.01246775580395


In [None]:
bert_clf.eval()
bert_predicted = []
all_logits = []
with torch.no_grad():
    for step_num, batch_data in enumerate(test_dataloader):
        token_ids, masks, labels = tuple(t.to(device) for t in batch_data)
        labels = torch.squeeze(labels,1)
        labels = labels.to(torch.long)
        logits = bert_clf(token_ids, masks)
#         loss_func = nn.BCELoss()
        loss_func = nn.CrossEntropyLoss()
        loss = loss_func(logits, labels)
        numpy_logits = logits.cpu().detach().numpy()
        print("\r" + "{0}/{1}".format(step_num, len(test_data) / BATCH_SIZE))
        bert_predicted.append(list(numpy_logits[0, :]).index(max(list(numpy_logits[0, :]))))
        all_logits += list(numpy_logits[:, 0])

In [None]:
print(classification_report(test_y, bert_predicted))

In [None]:
print(epoch_loss)

In [None]:
print(accuracy_list)