Runs successfully on Kaggle.


In [None]:
%pip install opencc-python-reimplemented

In [None]:
import time
import os
import re
from transformers import BertTokenizer, BertModel
from tqdm import tqdm
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from opencc import OpenCC


#### Train BERT classifier (note: forgot to use a Chinese BERT model for training)


In [None]:
# constants
# Root folder that is walked for sub-folders containing mapping.txt, Table1.csv and Table2.csv.
dataset_dir = "./Data"  # TODO. add it when add dataset
# OpenCC converter using the "s2t" (Simplified -> Traditional Chinese) configuration.
cc = OpenCC("s2t")
# Matches optionally signed integers/decimals. In the code visible here it is
# only referenced from commented-out preprocessing, so it is currently unused.
relu = re.compile(r"-?\d*\.?\d+")  # dealwith number

# bert
# Pretrained checkpoint name shared by the tokenizer and the classifier's encoder.
model_name = "bert-base-multilingual-cased"
tokenizer = BertTokenizer.from_pretrained(model_name)
# Training hyper-parameters.
batch_size = 32
epochs = 10
learning_rate = 5e-5
# Use the first CUDA device when one is available, otherwise fall back to the CPU.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
# Destination file for the trained model's state_dict.
model_save_path = "./bert-model/bert-model.pt"


In [None]:
# handle input data
# Build the training set. Every folder below `dataset_dir` that contains a
# mapping.txt contributes one group per mapped column pair; a group holds the
# cell texts of the Table1 column followed by those of the matching Table2
# column. A group's position in `dataset` is later used as its class label.
dataset = []

start = time.time()

for (dirpath, dirnames, filenames) in os.walk(dataset_dir):
    if "mapping.txt" not in filenames:
        continue
    print("adding data...", dirpath)

    # Each non-blank line is parsed as "(<Table1 column>, <Table2 column>)":
    # drop the leading "(" of the first field, and the leading space and
    # trailing ")" of the second one.
    mapping = []
    with open(os.path.join(dirpath, "mapping.txt"), "r", encoding="UTF-8") as f:
        for line in f:
            if not line.strip():
                # A blank (e.g. trailing) line has no comma and would raise IndexError.
                continue
            words = line.split(",")
            mapping.append((words[0][1:], words[1].split("\n")[0][1:-1]))

    # NOTE(review): dropna() removes every row that has *any* missing value,
    # whereas the evaluation cell keeps such rows (dropna(how="all") + fillna).
    # Table1.csv
    df1 = pd.read_csv(os.path.join(dirpath, "Table1.csv"), low_memory=False)
    df1 = df1.dropna()
    # Table2.csv
    df2 = pd.read_csv(os.path.join(dirpath, "Table2.csv"), low_memory=False)
    df2 = df2.dropna()

    for col1, col2 in mapping:
        # Cells are stringified and converted with OpenCC only. The http / number
        # normalisation that used to be sketched here in comments stays disabled.
        new_group = [cc.convert(str(item)) for item in df1[col1].values.tolist()]
        new_group.extend(cc.convert(str(item)) for item in df2[col2].values.tolist())

        # WARNING: every element is a string holding the cell content.
        dataset.append(new_group)


print(f"import dataset in {time.time() - start} secs")


In [None]:
# Number of column-pair groups collected by the dataset import cell.
print(len(dataset))

# Every group is treated as one class (samples are labelled by group index),
# so the classifier needs one output unit per group.
model_classes = len(dataset)


In [None]:
# bert
class BertDataset(Dataset):
    """Tokenized view of the grouped training texts.

    `dataset` is a sequence of groups, each group being a sequence of strings.
    Every string becomes one sample whose label is the index of its group.
    Tokenization relies on the module-level `tokenizer`.
    """

    def __init__(self, dataset):
        self.dataset = dataset  # include indexes & data_groups
        # Filled by process(): token ids, attention masks and integer labels.
        self.idx = None
        self.att = None
        self.label = None

        self.process()

    def process(self):
        """Tokenize every text once and cache ids, masks and labels."""
        self.label, self.idx, self.att = [], [], []

        for class_id, group in enumerate(self.dataset):
            for text in tqdm(group):
                encoded = tokenizer(
                    text,
                    padding="max_length",
                    max_length=256,
                    truncation=True,
                    return_tensors="pt",
                )
                self.idx.append(encoded["input_ids"])
                self.att.append(encoded["attention_mask"])
                # The label is the position of the group inside the dataset.
                self.label.append(class_id)

    def __getitem__(self, index):
        """Return one sample as a dict with input_ids, attention_mask and y."""
        return {
            "input_ids": self.idx[index],
            "attention_mask": self.att[index],
            "y": torch.tensor(self.label[index], dtype=torch.long),
        }

    def __len__(self):
        """Number of cached samples."""
        return len(self.label)


class BertClassifier(nn.Module):
    """BERT encoder followed by a single linear classification layer."""

    def __init__(self, num_class, model_name):
        super().__init__()
        self.num_class = num_class
        self.model_name = model_name

        # Pretrained encoder plus a linear head with one output per class.
        self.bert = BertModel.from_pretrained(model_name)
        self.dense = nn.Linear(self.bert.config.hidden_size, num_class)

    def forward(self, input_ids, attention_mask):
        """Return `(logits, probabilities)`; the softmax is taken over dim 1."""
        # keys(): ['last_hidden_state', 'pooler_output']
        encoded = self.bert(input_ids=input_ids, attention_mask=attention_mask)

        pooled = encoded["pooler_output"]  # shape: (batch, hidden_size)
        logits = self.dense(pooled)  # shape: (batch, num_class)
        probabilities = F.softmax(logits, dim=1)

        return logits, probabilities


In [None]:
# train bert setting
train_set = BertDataset(dataset)
# BertDataset stores its samples group by group (all of class 0, then class 1,
# ...). Without shuffling, nearly every batch would contain a single label, so
# reshuffle the samples at every epoch.
train_loader = DataLoader(
    train_set, num_workers=2, batch_size=batch_size, shuffle=True
)

# One output unit per class; cross-entropy is computed on the raw logits.
model = BertClassifier(model_classes, model_name).to(device)
loss_fn = nn.CrossEntropyLoss().to(device)
optimizer = optim.AdamW(model.parameters(), lr=learning_rate)


In [None]:
# Create the checkpoint folder. Unlike the shell `mkdir`, this behaves the same
# on every platform and does not complain when the cell is run again.
os.makedirs("bert-model", exist_ok=True)

In [None]:
# train bert
# Fine-tune the classifier for `epochs` passes over `train_loader`, report the
# mean batch loss and wall time of each epoch, then save the state_dict.
start = time.time()

for ep in range(epochs):
    print("======== Epoch {:} / {:} ========".format(ep + 1, epochs))

    model.train()
    t0 = time.time()
    tr_loss = 0

    for batch in tqdm(train_loader):
        optimizer.zero_grad()

        # squeeze(dim=1) drops the extra axis each stored sample carries
        # (presumably (batch, 1, seq_len) -> (batch, seq_len); confirm against
        # the tokenizer output shape).
        idx = batch["input_ids"].squeeze(dim=1).to(device)
        att = batch["attention_mask"].squeeze(dim=1).to(device)
        y = batch["y"].to(device)

        # Call the module itself rather than .forward() so registered hooks run.
        # The softmax output is not needed for training.
        logits, _ = model(idx, att)
        # ---------------------------------
        #  LOSS evaluation
        # ---------------------------------
        loss = loss_fn(logits, y)

        # ---------------------------------
        #  OPTIMIZATION
        # ---------------------------------
        # Calculate weight updates
        loss.backward()
        # Apply modifications
        optimizer.step()

        tr_loss += loss.item()

    # Calculate the average loss over all of the batches.
    avg_tr_loss = tr_loss / len(train_loader)

    print("")
    print("Average training loss: {0:.3f}".format(avg_tr_loss))
    print("Training epcoh took: {:}".format(time.time() - t0))

print(f"training bert in {time.time() - start} secs")

print("Save model...")
# Make sure the target folder exists so a finished training run is not lost
# to a missing directory.
save_dir = os.path.dirname(model_save_path)
if save_dir:
    os.makedirs(save_dir, exist_ok=True)
torch.save(model.state_dict(), model_save_path)
print("model saved")


#### Test the classifier on all datasets


```python
print(len(dataset))
# 70
```


In [None]:
# same above
# Settings for the evaluation part; they repeat the training-time values so
# these cells can be run without re-running the training cells.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
dataset_dir = "./Data"
model_name = "bert-base-multilingual-cased"
# Checkpoint written by the training cell.
model_loc = "./bert-model/bert-model.pt"
# Hard-coded class count; it has to match the size of the classifier head
# stored in the checkpoint, otherwise load_state_dict will fail.
model_classes = 70  # len(dataset)
tokenizer = BertTokenizer.from_pretrained(model_name)
# Folder that receives one result CSV per evaluated dataset folder.
ans_dir = "./Result"
cc = OpenCC("s2t")
# Upper bound on how many cells of a column cal_vec() classifies.
MAX_COMPARED_WORDS = 200


In [None]:
# same above
class BertClassifier(nn.Module):
    """BERT encoder followed by a single linear classification layer."""

    def __init__(self, num_class, model_name):
        super().__init__()
        self.num_class = num_class
        self.model_name = model_name

        # Pretrained encoder plus a linear head with one output per class.
        self.bert = BertModel.from_pretrained(model_name)
        self.dense = nn.Linear(self.bert.config.hidden_size, num_class)

    def forward(self, input_ids, attention_mask):
        """Return `(logits, probabilities)`; the softmax is taken over dim 1."""
        # keys(): ['last_hidden_state', 'pooler_output']
        encoded = self.bert(input_ids=input_ids, attention_mask=attention_mask)

        pooled = encoded["pooler_output"]  # shape: (batch, hidden_size)
        logits = self.dense(pooled)  # shape: (batch, num_class)
        probabilities = F.softmax(logits, dim=1)

        return logits, probabilities


In [None]:
# same above
# load model
# Rebuild the architecture, then restore the trained weights from `model_loc`.
model = BertClassifier(model_classes, model_name).to(device)
# map_location places the stored tensors on `device`, so a checkpoint saved on
# a GPU can also be loaded on a CPU-only machine.
model.load_state_dict(torch.load(model_loc, map_location=device))
# Switch to evaluation mode for inference.
model.eval()


In [None]:
# Create the output folder. Unlike the shell `mkdir`, this behaves the same on
# every platform and does not complain when the cell is run again.
os.makedirs("Result", exist_ok=True)

In [None]:
def get_classifier_val(text):
    """Return the classifier's softmax probabilities for one string.

    Uses the module-level `tokenizer`, `model` and `device`.

    Args:
        text: The string to classify; it is padded/truncated to 256 tokens.

    Returns:
        A NumPy array holding the softmax output for the first (only) input.
    """
    encoded_input = tokenizer(
        text,
        padding="max_length",
        max_length=256,
        truncation=True,
        return_tensors="pt",
    )

    # Pure inference: disable gradient tracking so no autograd graph is built
    # and kept alive for every classified cell.
    with torch.no_grad():
        # squeeze(dim=1) mirrors the training loop; presumably a no-op here
        # because a single string already encodes to (1, 256) -- TODO confirm.
        _, softmaxed = model(
            encoded_input["input_ids"].squeeze(dim=1).to(device),
            encoded_input["attention_mask"].squeeze(dim=1).to(device),
        )

    return softmaxed[0].cpu().numpy()


def cal_vec(word_list):
    """Average the class probabilities of a column's cells.

    Each entry is stringified, converted with `cc` and classified with
    get_classifier_val(); iteration stops once MAX_COMPARED_WORDS entries have
    been processed. The mean vector is rescaled so that its components sum to 1.

    Args:
        word_list: Iterable of cell values (e.g. a DataFrame column).

    Returns:
        The normalised mean probability vector.
    """
    probabilities = []

    for seen, word in enumerate(word_list, start=1):
        probabilities.append(get_classifier_val(cc.convert(str(word))))

        # Cap the amount of classified cells per column.
        if seen >= MAX_COMPARED_WORDS:
            break

    vec = np.mean(np.array(probabilities), axis=0)

    # Rescale so the components add up to 1.
    scale = 1 / np.sum(vec)
    vec *= scale

    return vec


def findSuccessPair(df):
    """Greedily pair column labels with row labels by descending score.

    In each round the largest strictly positive value among the still unused
    columns and rows is selected and its (column, row) labels are recorded;
    both labels are then excluded from later rounds. On ties the first cell in
    column-major iteration order wins.

    Args:
        df: Score table; `df[column][row]` is the score of that pairing.

    Returns:
        A list of (column_label, row_label) tuples in selection order. It has
        at most min(#columns, #rows) entries and is shorter when no positive
        score is left.
    """
    x_tags = df.columns
    y_tags = df.index
    predicted_num = min(len(x_tags), len(y_tags))
    pairs = []
    # Sets give O(1) "already paired" checks.
    used_x = set()
    used_y = set()

    for _ in range(predicted_num):
        max_val = 0
        max_x = ""
        max_y = ""

        for x in x_tags:
            if x in used_x:
                continue
            column = df[x]  # look the column up once per x, not once per cell
            for y in y_tags:
                if y in used_y:
                    continue

                val = column[y]
                if max_val < val:
                    max_val = val
                    max_x = x
                    max_y = y

        if max_val == 0:
            # No positive score remains, so later rounds cannot find one either.
            break

        pairs.append((max_x, max_y))
        used_x.add(max_x)
        used_y.add(max_y)

    return pairs


In [None]:
# Evaluate every dataset folder: score each Table1 column against each Table2
# column, greedily pair them, compare the pairs with mapping.txt and write a
# result CSV per folder.
os.makedirs(ans_dir, exist_ok=True)  # do not rely on a separate mkdir cell

for (dirpath, dirnames, filenames) in os.walk(dataset_dir):
    if "mapping.txt" not in filenames:
        continue
    print("accessing...", dirpath)

    # Ground-truth pairs. Each non-blank line is parsed as
    # "(<Table1 column>, <Table2 column>)".
    mapping = []
    with open(os.path.join(dirpath, "mapping.txt"), "r", encoding="UTF-8") as f:
        for line in f:
            if not line.strip():
                # A blank (e.g. trailing) line has no comma and would raise IndexError.
                continue
            words = line.split(",")
            mapping.append((words[0][1:], words[1].split("\n")[0][1:-1]))

    # Table1.csv
    df1 = pd.read_csv(os.path.join(dirpath, "Table1.csv"), low_memory=False)
    df1 = df1.dropna(how="all").fillna("")
    # Table2.csv
    df2 = pd.read_csv(os.path.join(dirpath, "Table2.csv"), low_memory=False)
    df2 = df2.dropna(how="all").fillna("")

    # One class-probability vector per column, computed exactly once.
    dict1 = {col: cal_vec(df1[col]) for col in df1.columns}
    dict2 = {row: cal_vec(df2[row]) for row in df2.columns}

    # my generated pd: Table1 columns across, Table2 columns down; each cell
    # is the dot product of the two columns' probability vectors.
    cols = list(df1.columns)
    row_index = list(df2.columns)
    rows = [[np.dot(dict1[col], dict2[row]) for col in cols] for row in row_index]

    df = pd.DataFrame(rows, columns=cols, index=row_index)

    # write pair mapping
    # The summary area below writes into rows 0-9, so pad short tables with
    # empty rows first.
    while len(rows) < 10:
        rows.append(["" for _ in range(len(cols))])
        row_index.append("")

    # A blank spacer column followed by the label column of the summary area.
    cols.append("")
    cols.append("最佳配對")
    summary_labels = (
        (0, "Table1"),
        (1, "Table2"),
        (3, "successful pair"),
        (4, "Table1"),
        (5, "Table2"),
        (7, "failed pair"),
        (8, "Table1"),
        (9, "Table2"),
    )
    for row_no, label in summary_labels:
        rows[row_no].append("")
        rows[row_no].append(label)

    ## alg to find pair: get col_name and row_name
    pairs = findSuccessPair(df)

    succ_pc = 0
    fail_pc = 0
    for i, (col_name, row_name) in enumerate(pairs):
        # Rows 0-1 list every predicted pair.
        cols.append(f"pair_{i + 1}")
        rows[0].append(col_name)
        rows[1].append(row_name)

        if (col_name, row_name) in mapping:
            # Rows 3-5: predictions that appear in mapping.txt.
            succ_pc += 1
            rows[3].append(f"pair_{succ_pc}")
            rows[4].append(col_name)
            rows[5].append(row_name)
        else:
            # Rows 7-9: predictions that do not appear in mapping.txt.
            fail_pc += 1
            rows[7].append(f"pair_{fail_pc}")
            rows[8].append(col_name)
            rows[9].append(row_name)

    # Rows are ragged at this point (only the summary rows were extended).
    df = pd.DataFrame(rows, columns=cols, index=row_index)

    # Name the file after the trailing digits of the folder name. basename()
    # copes with both "/" and "\\" separators; fall back to the whole folder
    # name when it does not end in digits instead of crashing on None.
    folder_name = os.path.basename(os.path.normpath(dirpath))
    number = re.search(r"\d+$", folder_name)
    suffix = number.group(0) if number else folder_name
    df.to_csv(os.path.join(ans_dir, "result_{}.csv".format(suffix)))
