In [None]:
# Install the package that provides the `opencc` module imported in the next cell.
%pip install opencc-python-reimplemented

In [None]:
import time
import os
import re
from transformers import BertTokenizer, BertModel
from tqdm import tqdm
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from opencc import OpenCC


In [None]:
# Runtime configuration shared by the cells below.
# (original note: "same above" - presumably these values mirror the notebook
# that trained the model; confirm before changing any of them)

# Run on the first CUDA GPU when one is available, otherwise on the CPU.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
# Root folder that the processing loop walks looking for "mapping.txt" files.
dataset_dir = "./Data"
# Pretrained checkpoint name used for both the tokenizer and the BERT encoder.
model_name = "bert-base-multilingual-cased"
# File holding the state dict that is loaded into the classifier.
model_loc = "./bert-model/bert-model.pt"
# Size of the classifier's output layer.
model_classes = 70  # len(dataset)
tokenizer = BertTokenizer.from_pretrained(model_name)
# Folder the result CSV files are written to.
ans_dir = "./Result"
# OpenCC converter built with the "s2t" (Simplified -> Traditional Chinese) config;
# applied to every cell value before it is classified.
cc = OpenCC("s2t")
# Upper bound on how many values of one column are classified by cal_vec.
MAX_COMPARED_WORDS = 200


In [None]:
# same above
class BertClassifier(nn.Module):
    """BERT encoder followed by a single linear classification layer."""

    def __init__(self, num_class, model_name):
        """Build the encoder from ``model_name`` and a ``num_class``-wide head."""
        super().__init__()
        self.num_class = num_class
        self.model_name = model_name

        # The head's input width is taken from the encoder's own config.
        self.bert = BertModel.from_pretrained(model_name)
        self.dense = nn.Linear(self.bert.config.hidden_size, num_class)

    def forward(self, input_ids, attention_mask):
        """Classify a batch of tokenized inputs.

        Returns:
            A ``(logits, probabilities)`` tuple; the probabilities are the
            softmax of the logits taken over dimension 1.
        """
        encoded = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        # The encoder output exposes 'last_hidden_state' and 'pooler_output';
        # only the pooled representation feeds the head.
        pooled = encoded["pooler_output"]  # shape: (batch, hidden_size)
        logits = self.dense(pooled)  # shape: (batch, num_class)
        probabilities = F.softmax(logits, dim=1)

        return logits, probabilities


In [None]:
# (original note: "same above")
# Build the classifier on the selected device and load the saved weights into it.
model = BertClassifier(model_classes, model_name).to(device)
# NOTE(review): torch.load unpickles the checkpoint file, so only load
# checkpoints that come from a trusted source.
model.load_state_dict(torch.load(model_loc, map_location=device))
# Switch to evaluation mode for inference; as the last expression of the cell
# this also makes the notebook display the module.
model.eval()


In [None]:
# Create the folder the result CSVs are written to. Using os.makedirs with the
# configured `ans_dir` (instead of a shell `mkdir` with a hard-coded name) keeps
# the cell portable and lets it be re-run when the folder already exists.
os.makedirs(ans_dir, exist_ok=True)

In [None]:
def get_classifier_val(text):
    """Return the classifier's class probabilities for one piece of text.

    Args:
        text: The string to classify. It is tokenized with the module-level
            ``tokenizer`` and padded/truncated to exactly 256 tokens.

    Returns:
        A 1-D numpy array holding the softmax output of ``model`` for the
        first item of the batch (one value per class).
    """
    encoded_input = tokenizer(
        text,
        padding="max_length",
        max_length=256,
        truncation=True,
        return_tensors="pt",
    )

    # Inference only: disabling gradient tracking avoids building an autograd
    # graph on every call, which saves memory and time.
    with torch.no_grad():
        _, softmaxed = model(
            encoded_input["input_ids"].to(device),
            encoded_input["attention_mask"].to(device),
        )

    return softmaxed[0].cpu().numpy()


def cal_vec(word_list):
    """Summarise a column of values as one averaged class-probability vector.

    Each value is converted to a string, passed through the OpenCC converter
    ``cc`` and classified with ``get_classifier_val``; the resulting
    probability vectors are averaged and rescaled to sum to 1.

    Args:
        word_list: Iterable of cell values (e.g. a DataFrame column). At most
            ``MAX_COMPARED_WORDS`` leading values are used.

    Returns:
        A 1-D numpy array with one entry per class. For an empty input an
        all-zero vector of length ``model_classes`` is returned, so the column
        gets a similarity of 0 with every other column instead of NaN.
    """
    # zip() with range() caps the number of classified values without
    # consuming anything beyond the cap.
    probabilities = [
        get_classifier_val(cc.convert(str(word)))
        for _, word in zip(range(MAX_COMPARED_WORDS), word_list)
    ]

    if not probabilities:
        # np.mean of an empty array would yield a NaN scalar and poison every
        # dot product computed from it.
        return np.zeros(model_classes)

    vec = np.mean(np.array(probabilities), axis=0)

    # Rescale so the entries sum to 1 again after averaging.
    vec *= 1 / np.sum(vec)

    return vec


def findSuccessPair(df):
    """Greedily pair columns with rows by descending cell value.

    The largest strictly positive cell among the still-unused columns and rows
    is picked repeatedly; its column and row are then excluded from later
    rounds. On ties the first cell met while scanning column by column (rows
    in index order inside each column) wins.

    Args:
        df: DataFrame of scores; columns and index carry the labels to pair.

    Returns:
        A list of ``(column_label, row_label)`` tuples in the order they were
        picked. Cells that are zero, negative or NaN are never paired, so the
        list may be shorter than ``min(n_columns, n_rows)``.
    """
    x_tags = list(df.columns)
    y_tags = list(df.index)
    # Positional access avoids chained label indexing (df[x][y]) in the hot loop.
    values = df.to_numpy()

    pairs = []
    used_cols = set()
    used_rows = set()

    for _ in range(min(len(x_tags), len(y_tags))):
        max_val = 0
        best = None

        for col_pos in range(len(x_tags)):
            if col_pos in used_cols:
                continue
            for row_pos in range(len(y_tags)):
                if row_pos in used_rows:
                    continue

                candidate = values[row_pos, col_pos]
                if max_val < candidate:
                    max_val = candidate
                    best = (col_pos, row_pos)

        if best is None:
            # No positive cell is left; further rounds could not find one either.
            break

        col_pos, row_pos = best
        pairs.append((x_tags[col_pos], y_tags[row_pos]))
        used_cols.add(col_pos)
        used_rows.add(row_pos)

    return pairs


In [None]:
# Process every test case under `dataset_dir`. A test case is a directory that
# contains "mapping.txt" (ground-truth column pairs), "Table1.csv" and
# "Table2.csv". For each case a similarity table between the columns of the two
# tables is built, columns are paired greedily, and the table plus a summary of
# the predicted / successful / failed pairs is written to `ans_dir`.

# Make sure the output folder exists before the first CSV is written.
os.makedirs(ans_dir, exist_ok=True)

for dirpath, _dirnames, filenames in os.walk(dataset_dir):
    if "mapping.txt" not in filenames:
        continue

    print("accessing...", dirpath)

    # Ground-truth pairs. Each line is split on "," and the first/last
    # character around the two names is stripped (this looks like a
    # "(name1, name2)" layout - confirm against the data files).
    mapping = set()
    with open(os.path.join(dirpath, "mapping.txt"), "r", encoding="UTF-8") as f:
        for line in f:
            words = line.split(",")
            if len(words) < 2:
                # Blank or malformed line: nothing to record.
                continue
            mapping.add((words[0][1:], words[1].split("\n")[0][1:-1]))

    # Load both tables; drop fully empty rows and blank out remaining NaNs.
    df1 = pd.read_csv(os.path.join(dirpath, "Table1.csv"), low_memory=False)
    df1 = df1.dropna(how="all").fillna("")
    df2 = pd.read_csv(os.path.join(dirpath, "Table2.csv"), low_memory=False)
    df2 = df2.dropna(how="all").fillna("")

    # One averaged class-probability vector per column, computed once each.
    vecs1 = {col: cal_vec(df1[col]) for col in df1.columns}
    vecs2 = {row: cal_vec(df2[row]) for row in df2.columns}

    # Similarity table: Table1 columns across, Table2 columns down, each cell
    # the dot product of the two column vectors.
    cols = list(df1.columns)
    row_index = list(df2.columns)
    rows = [
        [np.dot(vecs1[col], vecs2[row]) for col in cols]
        for row in row_index
    ]

    df = pd.DataFrame(rows, columns=cols, index=row_index)

    # The summary block written next to the table needs rows 0..9, so pad
    # short tables with empty rows.
    while len(rows) < 10:
        rows.append(["" for _ in range(len(cols))])
        row_index.append("")

    # Spacer column plus the label column of the summary block.
    cols.append("")
    cols.append("最佳配對")
    summary_labels = {
        0: "Table1",
        1: "Table2",
        3: "successful pair",
        4: "Table1",
        5: "Table2",
        7: "failed pair",
        8: "Table1",
        9: "Table2",
    }
    for row_pos, label in summary_labels.items():
        rows[row_pos].extend(["", label])

    # Predicted pairs as (Table1 column, Table2 column).
    pairs = findSuccessPair(df)

    succ_pc = 0
    fail_pc = 0
    for i, (col_name, row_name) in enumerate(pairs, start=1):
        # Rows 0/1 list every predicted pair.
        cols.append(f"pair_{i}")
        rows[0].append(col_name)
        rows[1].append(row_name)

        if (col_name, row_name) in mapping:
            # Rows 3-5 list the pairs confirmed by the ground truth.
            succ_pc += 1
            rows[3].append(f"pair_{succ_pc}")
            rows[4].append(col_name)
            rows[5].append(row_name)
        else:
            # Rows 7-9 list the pairs missing from the ground truth.
            fail_pc += 1
            rows[7].append(f"pair_{fail_pc}")
            rows[8].append(col_name)
            rows[9].append(row_name)

    # Rows have different lengths at this point; pandas fills the gaps.
    df = pd.DataFrame(rows, columns=cols, index=row_index)

    # Name the output after the digits that end the case directory's name;
    # fall back to the directory name itself when it has no trailing digits.
    case_dir = os.path.basename(os.path.normpath(dirpath))
    digits = re.search(r"\d+$", case_dir)
    case_id = digits.group(0) if digits else case_dir

    df.to_csv(os.path.join(ans_dir, "result_{}.csv".format(case_id)))
