In [None]:
import pandas as pd
from tqdm import tqdm
import re
import torch
import numpy as np
from pytorch_pretrained_bert import BertTokenizer

In [None]:
# Read the raw training split into memory, one list entry per line of the file.
# The encoding is pinned so that decoding does not depend on the platform default.
# NOTE(review): UTF-8 is assumed for the dataset file — confirm against the download.
path = "BlurbGenreCollection_EN_train.txt"
with open(path, 'r', encoding='utf-8') as f:
    lines = f.readlines()

In [None]:
# Patterns for the tagged lines of the blurb file. Every pattern has a single
# lazy capture group holding the text between the opening and closing tag.
r1, r2 = (
    r'<title>(.*?)</title>\n',  # title line, including its trailing newline
    r'<body>(.*?)</body>\n',    # body line, including its trailing newline
)

# One pattern per genre tag, <d0> through <d3>.
r31, r32, r33, r34 = (
    r'<d0>(.*?)</d0>',
    r'<d1>(.*?)</d1>',
    r'<d2>(.*?)</d2>',
    r'<d3>(.*?)</d3>',
)

In [None]:
# Pre-allocate the table with one row per raw line of the file: 'title' starts
# as an empty string, every other column starts as None.
data = pd.DataFrame()
data['title'] = [''] * len(lines)
data = data.assign(
    **dict.fromkeys(('abstract', 'genre1', 'genre2', 'genre3', 'genre4'))
)

In [None]:
# Walk the raw file once and fill one row of `data` per book record.
# `index` is the row currently being filled; it only advances when a
# '<topics>' line is seen, so after the loop it equals the number of records.
#
# Cells are written with `data.at[row, column]` rather than the chained form
# `data[column][row] = ...`: under pandas copy-on-write the chained form
# assigns into a temporary object and leaves `data` unchanged.
index = 0
for i, line in enumerate(tqdm(lines)):

    if line[1:6] == 'title':
        data.at[index, 'title'] = re.findall(r1, line)[0]
    if line[1:5] == 'body':
        data.at[index, 'abstract'] = re.findall(r2, line)[0]
    if line == '<topics>\n':
        # The genre tags are on the line right after '<topics>'.
        genres = lines[i + 1]
        # Each genre column stores the list of all matches for its tag
        # (possibly empty).
        for column, pattern in (('genre1', r31), ('genre2', r32),
                                ('genre3', r33), ('genre4', r34)):
            data.at[index, column] = re.findall(pattern, genres)
        index += 1

In [None]:
# Keep only the first `index` rows, i.e. the rows that were filled with a record.
data = data.iloc[:index]

In [None]:
# Write the parsed table to disk without the row index.
train_csv_path = "train_data.csv"
data.to_csv(train_csv_path, index=False)

In [None]:
# Distinct 'genre1' labels over the first `index` rows, in order of first
# appearance (dict keys keep insertion order, so duplicates drop out in place).
labels_1 = list(dict.fromkeys(
    genre
    for genres in data["genre1"].iloc[:index]
    for genre in genres
))

In [None]:
# Number of distinct labels collected in labels_1.
num_1 = len(labels_1)

In [None]:
# Distinct 'genre2' labels over the first `index` rows, in order of first
# appearance (dict keys keep insertion order, so duplicates drop out in place).
labels_2 = list(dict.fromkeys(
    genre
    for genres in data["genre2"].iloc[:index]
    for genre in genres
))

In [None]:
# Number of distinct labels collected in labels_2.
num_2 = len(labels_2)

In [None]:
# Distinct 'genre3' labels over the first `index` rows, in order of first
# appearance (dict keys keep insertion order, so duplicates drop out in place).
labels_3 = list(dict.fromkeys(
    genre
    for genres in data["genre3"].iloc[:index]
    for genre in genres
))

In [None]:
# Number of distinct labels collected in labels_3.
num_3 = len(labels_3)

In [None]:
# Distinct 'genre4' labels over the first `index` rows, in order of first
# appearance (dict keys keep insertion order, so duplicates drop out in place).
labels_4 = list(dict.fromkeys(
    genre
    for genres in data["genre4"].iloc[:index]
    for genre in genres
))

In [None]:
# Number of distinct labels collected in labels_4.
num_4 = len(labels_4)

In [None]:
# Total number of labels across the four levels.
num_all = sum((num_1, num_2, num_3, num_4))

In [None]:
# Flat label vocabulary: level-1 labels first, then levels 2, 3 and 4.
# A label's position in this list is its column/row index everywhere below.
labels_all = [*labels_1, *labels_2, *labels_3, *labels_4]

In [None]:
# Read the label hierarchy file. Note that this rebinds `path` and `lines`,
# replacing the training-file contents held in `lines` before this cell.
# The encoding is pinned so that decoding does not depend on the platform default.
# NOTE(review): UTF-8 is assumed for the hierarchy file — confirm against the download.
path = "hierarchy.txt"
with open(path, 'r', encoding='utf-8') as f:
    lines = f.readlines()

In [None]:
# Zero matrix with one row per label and three side-by-side column blocks,
# each `num_all` wide.
mask = torch.zeros((num_all, 3 * num_all))

In [None]:
# parent-child relations
# Each hierarchy line is "<left>\t<right>"; the last two lines of the file are
# skipped. Relations that mention a label outside `labels_all` are ignored.
for relation in lines[:-2]:
    fields = relation.split("\t")
    parent = fields[0]
    child = fields[1].split("\n")[0]
    if parent in labels_all and child in labels_all:
        parent_idx = labels_all.index(parent)
        child_idx = labels_all.index(child)
        # First column block: the child's row marks its parent.
        mask[child_idx, parent_idx] = 1
        # Second column block: the parent's row marks its child.
        mask[parent_idx, num_all + child_idx] = 1

In [None]:
# sibling relations
# For every label below the first level, look up its parents in the first
# column block, then mark (in the third column block) every label that has
# one of those parents. The label itself is among them.
for node in range(num_1, num_all):
    parent_ids = np.flatnonzero(np.array(mask[node, :num_all]))
    for parent_id in parent_ids:
        sibling_ids = np.flatnonzero(np.array(mask[:, parent_id]))
        for sibling_id in sibling_ids:
            mask[node, 2 * num_all + sibling_id] = 1

# self-connection for first level
first_level = torch.arange(num_1)
mask[first_level, 2 * num_all + first_level] = 1

In [None]:
# Collapse the three column blocks into one (num_all x num_all) matrix by
# adding them element-wise.
parent_block = mask[:, 0:num_all]
child_block = mask[:, num_all:2 * num_all]
sibling_block = mask[:, 2 * num_all:3 * num_all]
mask = parent_block + child_block + sibling_block

In [None]:
# Persist the collapsed mask.
mask_path = "bgc_att_mask.pt"
torch.save(mask, mask_path)

In [None]:
# Preview the first rows only instead of rendering the whole table.
data.head()

In [None]:
# Tokenizer for the "bert-base-uncased" model, used below to turn text into token ids.
# NOTE(review): from_pretrained presumably downloads or reads a cached vocabulary — confirm offline behaviour.
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased") # fetch the tokenizer

In [None]:
# Token ids used to frame and pad each sequence, and the cap on the number of
# tokens kept per text (before [CLS] and [SEP] are added).
CLS_ID, SEP_ID, PAD_ID = 101, 102, 0
MAX_WORDPIECES = 100

# Encode "title abstract" for every row as [CLS] + ids + [SEP].
train_tokens = []
for title, abstract in tqdm(zip(data["title"], data["abstract"]), total=len(data)):
    wordpieces = tokenizer.tokenize(title + " " + abstract)[:MAX_WORDPIECES]
    train_tokens.append([CLS_ID, *tokenizer.convert_tokens_to_ids(wordpieces), SEP_ID])

# Right-pad every sequence with zeros up to the longest one.
max_len = max(map(len, train_tokens))
train_tokens = [ids + [PAD_ID] * (max_len - len(ids)) for ids in train_tokens]

In [None]:
# Convert the padded id lists to a single tensor and persist it.
train_x = torch.tensor(train_tokens)
torch.save(train_x, "bgc_train_x.pt")

In [None]:
# Multi-hot label matrix: one row per sample, one column per entry of
# `labels_all`; a 1 marks every genre (at any level) attached to the sample.
train_labels = torch.zeros(len(data), num_all)
genre_columns = zip(data["genre1"], data["genre2"], data["genre3"], data["genre4"])
for row, genre_lists in enumerate(tqdm(genre_columns, total=len(data))):
    for level_labels in genre_lists:
        for label in level_labels:
            train_labels[row, labels_all.index(label)] = 1

In [None]:
# `train_labels` is already a tensor, so it is saved as-is: wrapping it in
# torch.tensor() would only make a redundant copy and trigger a UserWarning.
torch.save(train_labels, "bgc_train_y.pt")

In [None]:
# One-column table holding the flat label vocabulary in index order.
labels = pd.DataFrame({"label": labels_all})

In [None]:
# Write the label vocabulary to disk without the row index.
labels_csv_path = "labels.csv"
labels.to_csv(labels_csv_path, index=False)

In [None]:
# Hard-coded number of labels at each of the four levels. These assignments
# replace whatever values num_1..num_4 and num_all held before this cell.
# NOTE(review): presumably these match the counts derived from the training
# data earlier in the notebook — confirm before relying on them.
LEVEL_SIZES = (7, 46, 77, 16)
num_1, num_2, num_3, num_4 = LEVEL_SIZES
num_all = sum(LEVEL_SIZES)

In [None]:
# Reload the multi-hot label matrix from disk.
train_y_path = "bgc_train_y.pt"
train_labels = torch.load(train_y_path)

In [None]:
# fre12[p, c]: among the samples tagged with first-level label p, the fraction
# that also carry second-level label c.
fre12 = torch.zeros(num_1, num_2)
level2_cols = slice(num_1, num_1 + num_2)
for parent in range(num_1):
    has_parent = train_labels[:, parent] == 1
    members = train_labels[has_parent]
    fre12[parent] = members[:, level2_cols].sum(dim=0) / members.shape[0]
torch.save(fre12, "bgc_fre12.pt")

In [None]:
# fre23[p, c]: among the samples tagged with second-level label p, the fraction
# that also carry third-level label c.
fre23 = torch.zeros(num_2, num_3)
level3_cols = slice(num_1 + num_2, num_1 + num_2 + num_3)
for parent in range(num_2):
    has_parent = train_labels[:, num_1 + parent] == 1
    members = train_labels[has_parent]
    fre23[parent] = members[:, level3_cols].sum(dim=0) / members.shape[0]
torch.save(fre23, "bgc_fre23.pt")

In [None]:
# fre34[i, j]: among the samples tagged with third-level label i, the fraction
# that also carry fourth-level label j.
fre34 = torch.zeros(num_3, num_4)
for i in range(num_3):
    # Rows of the label matrix where third-level label i is set.
    subset = train_labels[train_labels[:, num_1 + num_2 + i] == 1]
    fre34[i, :] = subset.sum(axis=0)[(num_1 + num_2 + num_3):num_all] / subset.shape[0]
# Persist fre34 (not fre23) under the fre34 file name.
torch.save(fre34, "bgc_fre34.pt")