In [1]:
import sys
import numpy as np
import random as rn
import pandas as pd
import torch
from pytorch_pretrained_bert import BertModel
from torch import nn
# from torchnlp.datasets import imdb_dataset      # --> We are using our own uploaded dataset.
from pytorch_pretrained_bert import BertTokenizer
from keras_preprocessing.sequence import pad_sequences
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from torch.optim import Adam
from torch.nn.utils import clip_grad_norm_
import torch.nn.functional as F
import torch.optim as optim

from IPython.display import clear_output
from transformers import AutoTokenizer, AutoModelForMaskedLM

import sqlite3
import jieba
import jieba.posseg as pseg

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Seed every random number generator this notebook touches with one shared
# value: Python's `random`, NumPy's legacy global RNG, and torch (CPU + CUDA).
SEED = 321
for seed_rng in (rn.seed, np.random.seed, torch.manual_seed, torch.cuda.manual_seed):
    seed_rng(SEED)

In [3]:
import pathlib

# Display the absolute directory the kernel is running in; the relative
# '../...' paths used by this notebook are resolved against it.
notebook_dir = pathlib.Path('.').resolve()
notebook_dir

WindowsPath('C:/Users/HackerByeBye/Documents/Therapy-Chatbot-Deploying-NLP/Training')

In [4]:
# Locations of the two dataset splits, relative to the notebook's directory.
TRAIN_CSV = '../train.csv'
TEST_CSV = '../test.csv'

# Read each split into its own DataFrame.
train_data = pd.read_csv(TRAIN_CSV)
test_data = pd.read_csv(TEST_CSV)

In [5]:
# Convert each DataFrame into a list of per-row dicts.
# NOTE: train_data/test_data are rebound from DataFrame to list here, so this
# cell cannot be re-run without first re-running the CSV-loading cell.
train_data, test_data = (
    frame.to_dict(orient='records') for frame in (train_data, test_data)
)
len(train_data), len(test_data)

(25476, 8491)

In [6]:
# Split the row dicts into parallel text / label sequences.
# Texts are kept as tuples (the type the previous zip(*...) unpacking produced).
train_texts = tuple(record['title'] for record in train_data)
test_texts = tuple(record['title'] for record in test_data)

# Shift every label down by one so that class ids start at 0
# (presumably the CSV labels are 1-based -- confirm against the data files).
train_labels = [record['label'] - 1 for record in train_data]
test_labels = [record['label'] - 1 for record in test_data]

In [7]:
# Stop-word list used by the "POSSEG" mode, and a module-level cache so the
# file is parsed once instead of on every jiebaSlice() call.
_STOPWORD_FILE = '../Analyzing/stopword.txt'
_stopword_cache = None


def _load_stopwords():
    """Return the stop words in ``_STOPWORD_FILE`` as a set, reading the file only once."""
    global _stopword_cache
    if _stopword_cache is None:
        with open(_STOPWORD_FILE, 'r', encoding='utf-8') as stopwords:
            # Only the newline is stripped, so other whitespace in an entry is significant.
            _stopword_cache = {stopword.strip('\n') for stopword in stopwords}
    return _stopword_cache


def jiebaSlice(content, mode):
    """Segment ``content`` with jieba and return the resulting words as a list.

    Parameters
    ----------
    content : any
        Text to segment. It is converted with ``str()`` and leading/trailing
        newlines are removed before segmentation.
    mode : str
        One of:

        * ``"POSSEG"`` -- ``jieba.posseg.cut(..., use_paddle=True)``; words found
          in the stop-word file are dropped. This is the only mode that reads
          the stop-word file.
        * ``"CUT_HMM"`` -- ``jieba.cut(..., HMM=True, cut_all=True)``.
        * ``"CUT_FOR_SEARCH"`` -- ``jieba.cut_for_search(..., HMM=True)``.
        * ``"NORMAL"`` -- ``jieba.cut_for_search(...)`` with jieba's defaults.

    Returns
    -------
    list of str
        The segmented words, in the order jieba yields them.

    Raises
    ------
    ValueError
        If ``mode`` is not one of the names above (previously ``None`` was
        returned silently, which only failed later at the call site).
    FileNotFoundError
        In ``"POSSEG"`` mode, if the stop-word file does not exist.
    """
    content = str(content).strip('\n')

    if mode == "POSSEG":
        stopword_set = _load_stopwords()
        return [word for word, flag in pseg.cut(content, use_paddle=True)
                if word not in stopword_set]

    if mode == "CUT_HMM":
        # NOTE(review): with cut_all=True jieba appears to use full mode and
        # ignore the HMM flag -- confirm this is the intended segmentation.
        return list(jieba.cut(content, HMM=True, cut_all=True))

    if mode == "CUT_FOR_SEARCH":
        return list(jieba.cut_for_search(content, HMM=True))

    if mode == "NORMAL":
        # NOTE(review): this is also search-mode segmentation; if "NORMAL" was
        # meant to be plain jieba.cut(content), that is a separate change.
        return list(jieba.cut_for_search(content))

    raise ValueError(
        "unknown jiebaSlice mode %r; expected 'POSSEG', 'CUT_HMM', "
        "'CUT_FOR_SEARCH' or 'NORMAL'" % (mode,)
    )

In [8]:
# Tokenizer of the pretrained checkpoint named below; in this notebook it is
# only used to map tokens to vocabulary ids.
BERT_CHECKPOINT = 'bert-base-chinese'
tokenizer = AutoTokenizer.from_pretrained(BERT_CHECKPOINT, do_lower_case=True)

In [9]:
# Segment every title with jieba ('CUT_HMM' mode) and wrap the words in the
# [CLS] ... [SEP] markers.
# NOTE(review): these tokens are jieba words, not pieces produced by the
# tokenizer itself; any word absent from the checkpoint vocabulary will
# presumably map to the unknown-token id -- confirm this is intended.
train_tokens = [
    ['[CLS]'] + jiebaSlice(text, 'CUT_HMM') + ['[SEP]'] for text in train_texts
]

test_tokens = [
    ['[CLS]'] + jiebaSlice(text, 'CUT_HMM') + ['[SEP]'] for text in test_texts
]

Building prefix dict from the default dictionary ...
Loading model from cache C:\Users\HACKER~1\AppData\Local\Temp\jieba.cache
Loading model cost 0.516 seconds.
Prefix dict has been built successfully.


In [10]:
# Shared padding settings: every sequence is cut or padded at the end to
# exactly 512 integer ids.
PAD_OPTIONS = dict(maxlen=512, truncating="post", padding="post", dtype="int")

# Map each token list to its vocabulary ids (still of varying length).
train_ids_ragged = [tokenizer.convert_tokens_to_ids(tokens) for tokens in train_tokens]
test_ids_ragged = [tokenizer.convert_tokens_to_ids(tokens) for tokens in test_tokens]

# Pad/truncate into fixed-width id matrices.
train_tokens_ids = pad_sequences(train_ids_ragged, **PAD_OPTIONS)
test_tokens_ids = pad_sequences(test_ids_ragged, **PAD_OPTIONS)

In [11]:
# Turn the label lists into 1-D NumPy arrays and show their shapes.
train_y, test_y = map(np.array, (train_labels, test_labels))
train_y.shape, test_y.shape

((25476,), (8491,))

In [12]:
# Build one mask row per sequence: 1.0 where the token id is positive and 0.0
# elsewhere, returned as nested Python lists of floats.
train_masks = np.greater(train_tokens_ids, 0).astype(float).tolist()
test_masks = np.greater(test_tokens_ids, 0).astype(float).tolist()

In [13]:
# Training split: token ids, labels as a float column vector of shape (n, 1),
# and the masks.
train_tokens_tensor = torch.tensor(train_tokens_ids)
train_masks_tensor = torch.tensor(train_masks)
train_y_tensor = torch.tensor(train_y.reshape(-1, 1), dtype=torch.float32)

# Test split: same three tensors.
test_tokens_tensor = torch.tensor(test_tokens_ids)
test_masks_tensor = torch.tensor(test_masks)
test_y_tensor = torch.tensor(test_y.reshape(-1, 1), dtype=torch.float32)

In [15]:
from sklearn.metrics import classification_report
from sklearn.svm import SVC

# Baseline: an SVC with default settings, fitted directly on the padded
# token-id matrix.
# scikit-learn is given the NumPy arrays (1-D integer labels, id matrix) rather
# than the torch tensors: the (n, 1) float label tensor made fit() emit the
# column-vector DataConversionWarning and turned the class labels into floats.
clf = SVC()
clf.fit(train_tokens_ids, train_y)
baseline_predicted = clf.predict(test_tokens_ids)
print(classification_report(test_labels, baseline_predicted))

  y = column_or_1d(y, warn=True)


              precision    recall  f1-score   support

           0       0.12      0.18      0.15       849
           1       0.13      0.09      0.10       849
           2       0.14      0.12      0.13       849
           3       0.12      0.11      0.11       849
           4       0.14      0.12      0.13       849
           5       0.12      0.08      0.10       849
           6       0.13      0.22      0.16       849
           7       0.12      0.12      0.12       849
           8       0.13      0.15      0.14       849
           9       0.12      0.09      0.10       850

    accuracy                           0.13      8491
   macro avg       0.13      0.13      0.12      8491
weighted avg       0.13      0.13      0.12      8491



In [16]:
from sklearn.multiclass import OneVsRestClassifier
from sklearn.preprocessing import LabelBinarizer

# Second baseline: one-vs-rest wrapper around an SVC (one binary SVC per class),
# fitted on the same padded token-id matrix.
# As with any scikit-learn estimator, pass the NumPy arrays (1-D integer labels,
# id matrix) instead of the torch tensors / (n, 1) float label column.
classif = OneVsRestClassifier(estimator=SVC(random_state=1))
# fit() returns the estimator itself, so baseline_model and classif are the same object.
baseline_model = classif.fit(train_tokens_ids, train_y)
baseline_predicted = classif.predict(test_tokens_ids)
print(classification_report(test_labels, baseline_predicted))

              precision    recall  f1-score   support

           0       0.18      0.17      0.17       849
           1       0.21      0.20      0.20       849
           2       0.21      0.21      0.21       849
           3       0.17      0.17      0.17       849
           4       0.22      0.17      0.19       849
           5       0.14      0.21      0.17       849
           6       0.22      0.20      0.21       849
           7       0.21      0.15      0.17       849
           8       0.19      0.25      0.22       849
           9       0.20      0.16      0.18       850

    accuracy                           0.19      8491
   macro avg       0.19      0.19      0.19      8491
weighted avg       0.19      0.19      0.19      8491

