In [None]:
# Needed for the `from opencc import OpenCC` import in the next cell.
%pip install opencc-python-reimplemented

In [None]:
import time
import re
import jieba
import os
import re
import multiprocessing
from gensim.corpora import WikiCorpus
from gensim.models import Word2Vec
from gensim.models.word2vec import LineSentence
from opencc import OpenCC
from transformers import BertTokenizer, BertModel
from tqdm import tqdm
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader


#### Train Chinese word2vec on Wikipedia (CPU)

In [None]:
# Download the latest Chinese Wikipedia (zhwiki) articles dump; the preprocessing cell reads it by file name.
!wget https://dumps.wikimedia.org/zhwiki/latest/zhwiki-latest-pages-articles.xml.bz2

In [None]:
# Turn the raw zhwiki dump into a segmented Traditional Chinese corpus:
# one article per line, words separated by single spaces. That is the layout
# the training cell reads back with `LineSentence`.
input_filename = "zhwiki-latest-pages-articles.xml.bz2"
output_filename = (
    "./word2vec-model/wiki-preprocessed-raw.txt"  # TODO. modify it when add dataset
)

# open(..., "w") does not create missing parent directories.
os.makedirs(os.path.dirname(output_filename), exist_ok=True)

wiki = WikiCorpus(input_filename, dictionary={})
cc = OpenCC("s2t")  # simplified -> traditional
relu = re.compile(r"[ a-zA-Z]")  # spaces and ASCII letters, removed before segmentation

start = time.time()
with open(output_filename, "w", encoding="UTF-8") as output_f:
    for index, text in enumerate(wiki.get_texts()):
        # Convert the whole article with one OpenCC call, then split it back
        # into tokens. Previously the converted string was computed but the
        # unconverted tokens were the ones written out.
        art = cc.convert(" ".join(text))

        words = []
        for token in art.split():
            # delete english char and blank
            token = relu.sub("", token)
            if not token:
                continue
            words.extend(jieba.cut(token))

        # Separate words with spaces and articles with newlines. Writing the
        # pieces back-to-back fused the last word of one token with the first
        # word of the next and produced a file without line breaks.
        if words:
            output_f.write(" ".join(words) + "\n")

        if index % 10000 == 0:
            print("Saved " + str(index) + "articles")

print(f"Finished preprocessed data in {time.time() - start} seconds")


In [None]:
# Train word2vec on the segmented corpus.
# TODO. cannot train this on kaggle because of not enough ram(not gpu)
input_text_path = "./word2vec-model/wiki-preprocessed-raw.txt"  # TODO. modify it when add dataset
output_model_path = "./word2vec-model/wiki-out-model"  # TODO. modify it when add dataset

# Turn the file written by the preprocessing cell into an iterable of sentences.
sentences = LineSentence(input_text_path)

print("Training word2vec model")
# Only the worker count is set explicitly; everything else uses library defaults.
# Alternative configurations kept for reference:
#   Word2Vec(sentences, vector_size=256, epochs=8, sg=0, window=10, workers=24)
#   Word2Vec(sentences, size=4, window=5, min_count=5, workers=multiprocessing.cpu_count())
n_workers = multiprocessing.cpu_count()
model = Word2Vec(sentences, workers=n_workers)

print("Save the model...")
model.save(output_model_path)


In [None]:
# Sanity check: reload the saved model and print the nearest neighbours of one word.
model = Word2Vec.load("./word2vec-model/wiki-out-model")  # TODO. modify it when add dataset
result = model.wv.most_similar("男人")
for neighbour in result:
    print(neighbour)


#### Train BERT classifier (note: forgot to use a Chinese BERT model for training)

- `Difference`: use word2vec to group tags
  - difficulty
    - The Chinese Wikipedia corpus is too small.
    - The TA's tags are not in the word2vec vocabulary, so they cannot be compared (the grouping has no effect).
    - I forgot to consider the time complexity: the grouping takes about O(n^2), which takes a long time.
- `Difference`: handle http(s) links and numbers (including dates) before training
  - difficulty
    - After applying `re`, many columns turn into `''`, and I had no time to test whether this is better than not using `re`.

In [None]:
# ---- constants ----
dataset_dir = "./Data"  # TODO. add it when add dataset
cc = OpenCC("s2t")  # simplified -> traditional converter
# Matches a number: optional minus sign, optional integer part, optional
# decimal point, then at least one digit. Other cells refer to it as `relu`.
relu = re.compile(r"-?\d*\.?\d+")

# ---- word2vec ----
word2vec_model = Word2Vec.load("./word2vec-model/wiki-out-model")  # TODO. modify it when add dataset
word_set = set(word2vec_model.wv.index_to_key)  # vocabulary, for fast membership tests

# ---- bert ----
model_name = "bert-base-uncased"
tokenizer = BertTokenizer.from_pretrained(model_name)
batch_size = 32
epochs = 10
learning_rate = 5e-5
if torch.cuda.is_available():
    device = torch.device("cuda:0")
else:
    device = torch.device("cpu")


In [None]:
# word2vec # TODO. has no function
def count_similarity(word1, word2):
    """Return a similarity score for two words.

    Identical words score 1. Otherwise the score is the word2vec similarity
    when both words are in `word_set`, and 0 when either one is missing.
    """
    if word1 == word2:
        return 1
    if word1 not in word_set or word2 not in word_set:
        return 0
    return word2vec_model.wv.similarity(word1, word2)


In [None]:
# handle input data
dataset = []  # dataset[i]: normalized cell strings collected for tag group i
tags = []  # tags[i]: column names assigned to tag group i


def _normalize_cell(value):
    """Turn one raw cell value into the text stored in `dataset`.

    Values starting with "http" (which also covers "https") collapse to the
    literal "http". Everything else is converted with `cc` and has every
    match of the number pattern `relu` removed.
    """
    text = str(value)
    if text.startswith("http"):
        return "http"
    return relu.sub("", cc.convert(text))


def add_dataset(tag_idx, df1, df2, col1, col2):
    """Append the normalized values of two columns to `dataset[tag_idx]`.

    Args:
        tag_idx: index of the tag group in the module-level `dataset` list.
        df1, df2: data frames holding the two mapped columns.
        col1, col2: column names to read from `df1` and `df2` respectively.

    The values of `df1[col1]` are appended first, then those of `df2[col2]`.
    """
    for frame, column in ((df1, col1), (df2, col2)):
        dataset[tag_idx].extend(
            _normalize_cell(value) for value in frame[column].values.tolist()
        )


# Walk the dataset directory. Every folder that contains mapping.txt supplies
# Table1.csv / Table2.csv plus one column pair per mapping line. Each pair
# either joins the most similar existing tag group or starts a new one.
# NOTE: the number of groups (len(dataset)) is later used as the classifier's
# class count, so a change in grouping requires retraining the classifier.
SIMILARITY_THRESHOLD = 0.55  # minimum average similarity needed to join a group

start = time.time()
for (dirpath, dirnames, filenames) in os.walk(dataset_dir):
    if "mapping.txt" not in filenames:
        continue
    print("adding data...", dirpath)

    # Rows containing any missing value are dropped from both tables.
    df1 = pd.read_csv(os.path.join(dirpath, "Table1.csv")).dropna()
    df2 = pd.read_csv(os.path.join(dirpath, "Table2.csv")).dropna()

    with open(os.path.join(dirpath, "mapping.txt"), "r", encoding="UTF-8") as f:
        for line in f:
            if not line.strip():
                continue  # a blank line would otherwise raise IndexError below

            # First field minus its first character; second field up to the
            # newline minus its first and last characters.
            # Presumably lines look like "(col1, col2)" - TODO confirm.
            words = line.split(",")
            word1 = words[0][1:]
            word2 = words[1].split("\n")[0][1:-1]
            cc_word1 = cc.convert(word1)
            cc_word2 = cc.convert(word2)

            # Find the existing group with the highest average similarity.
            max_avg = 0
            max_idx = -1
            for i, group in enumerate(tags):
                sim = sum(
                    (count_similarity(tag, cc_word1) + count_similarity(tag, cc_word2)) / 2
                    for tag in group
                ) / len(group)
                # Keep the best score. The comparison used to be
                # `max_avg > sim`, which could never raise max_avg above its
                # initial 0, so the threshold below was unreachable.
                if sim > max_avg:
                    max_avg = sim
                    max_idx = i

            if max_avg >= SIMILARITY_THRESHOLD:
                # add to same tag
                tags[max_idx].append(cc_word1)
                tags[max_idx].append(cc_word2)
                add_dataset(max_idx, df1, df2, word1, word2)
            else:
                tags.append([cc_word1, cc_word2])
                dataset.append([])
                add_dataset(len(tags) - 1, df1, df2, word1, word2)

print(f"import dataset in {time.time() - start} secs")
print("tag_grouping")
print(tags)


In [None]:
# Number of tag groups collected by the import loop (one list per group).
print(len(dataset))

In [None]:
# bert
class BertDataset(Dataset):
    """Torch dataset of tokenized texts labelled by their group index.

    `dataset` is a sequence of groups; every text in group `i` receives the
    label `i`. All texts are tokenized eagerly in the constructor using the
    module-level `tokenizer`.
    """

    def __init__(self, dataset):
        self.dataset = dataset  # include indexes & data_groups
        self.idx = None  # token-id tensors, one entry per text
        self.att = None  # attention-mask tensors, aligned with self.idx
        self.label = None  # integer labels, aligned with self.idx

        self.process()

    def process(self):
        """Tokenize every text and (re)build `idx`, `att` and `label`."""
        self.idx, self.att, self.label = [], [], []

        for group_index, texts in enumerate(self.dataset):
            # One progress bar per group.
            for text in tqdm(texts):
                encoded = tokenizer(
                    text,
                    padding="max_length",
                    max_length=256,
                    truncation=True,
                    return_tensors="pt",
                )
                self.idx.append(encoded["input_ids"])
                self.att.append(encoded["attention_mask"])
                self.label.append(group_index)  # check via tags' index

    def __getitem__(self, index):
        """Return the sample at `index` as a dict of tensors."""
        return {
            "input_ids": self.idx[index],
            "attention_mask": self.att[index],
            "y": torch.tensor(self.label[index], dtype=torch.long),
        }

    def __len__(self):
        """Number of tokenized samples."""
        return len(self.label)


class BertClassifier(nn.Module):
    """Pretrained BERT encoder followed by one linear classification layer."""

    def __init__(self, num_class, model_name):
        super().__init__()
        self.num_class = num_class
        self.model_name = model_name

        self.bert = BertModel.from_pretrained(model_name)
        self.dense = nn.Linear(self.bert.config.hidden_size, num_class)

    def forward(self, input_ids, attention_mask):
        """Return `(logits, probabilities)` for a batch.

        The probabilities are the softmax of the logits along dim 1.
        """
        encoded = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        # keys(): ['last_hidden_state', 'pooler_output']
        pooled = encoded["pooler_output"]  # shape: (batch, hidden_size)
        logits = self.dense(pooled)  # shape: (batch, num_class)
        probabilities = F.softmax(logits, dim=1)

        return logits, probabilities


In [None]:
# train bert setting
train_set = BertDataset(dataset)
# BertDataset stores its samples group by group, so in index order batches
# would be dominated by a single label. Reshuffle the samples every epoch.
train_loader = DataLoader(
    train_set, num_workers=2, batch_size=batch_size, shuffle=True
)

# One output class per tag group.
model = BertClassifier(len(dataset), model_name).to(device)
loss_fn = nn.CrossEntropyLoss().to(device)
optimizer = optim.AdamW(model.parameters(), lr=learning_rate)


In [None]:
# train bert
model_save_path = "./bert-model/bert-model.pt"
# Create the output folder before training so that a missing directory cannot
# make the final torch.save fail after all epochs have already run.
os.makedirs(os.path.dirname(model_save_path), exist_ok=True)

start = time.time()

for ep in range(epochs):
    print("======== Epoch {:} / {:} ========".format(ep + 1, epochs))

    model.train()
    t0 = time.time()
    tr_loss = 0

    for batch in tqdm(train_loader):
        optimizer.zero_grad()

        # Each stored sample keeps a size-1 dim from tokenization; drop it
        # after batching.
        idx = batch["input_ids"].squeeze(dim=1).to(device)
        att = batch["attention_mask"].squeeze(dim=1).to(device)
        y = batch["y"].to(device)

        # Call the module itself rather than .forward() so that any
        # registered hooks run as well.
        logits, prob = model(idx, att)

        # ---------------------------------
        #  LOSS evaluation
        # ---------------------------------
        loss = loss_fn(logits, y)

        # ---------------------------------
        #  OPTIMIZATION
        # ---------------------------------
        # Calculate weight updates
        loss.backward()
        # Apply modifications
        optimizer.step()

        tr_loss += loss.item()

    # Calculate the average loss over all of the batches.
    avg_tr_loss = tr_loss / len(train_loader)

    print("")
    print("Average training loss: {0:.3f}".format(avg_tr_loss))
    print("Training epcoh took: {:}".format(time.time() - t0))

print(f"training bert in {time.time() - start} secs")

print("Save model...")
torch.save(model.state_dict(), model_save_path)
print("model saved")


#### Test classifier on all datasets

```python
print(len(dataset))
# 70
```

In [None]:
# Same values as in the training section.
dataset_dir = "./Data"
model_name = "bert-base-uncased"
tokenizer = BertTokenizer.from_pretrained(model_name)
if torch.cuda.is_available():
    device = torch.device("cuda:0")
else:
    device = torch.device("cpu")


In [None]:
# same above
class BertClassifier(nn.Module):
    """Pretrained BERT encoder followed by one linear classification layer.

    The saved state dict is loaded into this class in the next cell, so the
    submodule names `bert` and `dense` must stay as they are.
    """

    def __init__(self, num_class, model_name):
        super().__init__()
        self.num_class = num_class
        self.model_name = model_name

        self.bert = BertModel.from_pretrained(model_name)
        self.dense = nn.Linear(self.bert.config.hidden_size, num_class)

    def forward(self, input_ids, attention_mask):
        """Return `(logits, probabilities)` for a batch.

        The probabilities are the softmax of the logits along dim 1.
        """
        encoded = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        # keys(): ['last_hidden_state', 'pooler_output']
        pooled = encoded["pooler_output"]  # shape: (batch, hidden_size)
        logits = self.dense(pooled)  # shape: (batch, num_class)
        probabilities = F.softmax(logits, dim=1)

        return logits, probabilities


In [None]:
# Rebuild the classifier and restore the trained weights.
# `dataset` must already be populated: len(dataset) sets the number of output
# classes and has to match the saved checkpoint.
model = BertClassifier(num_class=len(dataset), model_name=model_name)
model = model.to(device)
model.load_state_dict(
    torch.load("./bert-model/bert-model.pt", map_location=device)
)
model.eval()


In [None]:
def get_classifier_val(text):
    """Return the class probabilities the classifier predicts for `text`.

    Uses the module-level `tokenizer`, `model` and `device`. The text is
    padded/truncated to 256 tokens, matching the training setup.

    Returns:
        A NumPy array holding the first row of the model's softmax output.

    NOTE(review): training texts were normalized by `add_dataset` (http
    collapsing, s2t conversion, number stripping), while this function
    tokenizes `text` as given. Confirm whether callers normalize first.
    """
    encoded_input = tokenizer(
        text,
        padding="max_length",
        max_length=256,
        truncation=True,
        return_tensors="pt",
    )

    # Inference only: no autograd bookkeeping needed.
    with torch.no_grad():
        _, softmaxed = model(
            encoded_input["input_ids"].squeeze(dim=1).to(device),
            encoded_input["attention_mask"].squeeze(dim=1).to(device),
        )

    # Tensor.numpy() only works on CPU tensors, so move the result off the
    # accelerator first; without .cpu() this raises when `device` is CUDA.
    return softmaxed[0].detach().cpu().numpy()
