In [1]:
import os
import json
import copy
import time
import torch
import pickle
import random
import string
import logging
import numpy as np
import pandas as pd
from tqdm import tqdm
import torch.nn as nn
from rouge import Rouge
import torch.nn.functional as F
from transformers import AutoModel, AutoTokenizer

In [2]:
# Pretrained SciBERT encoder and its tokenizer; both are read as module-level
# globals by get_Bert_vec.
SCIBERT_CHECKPOINT = "allenai/scibert_scivocab_uncased"
scibert = AutoModel.from_pretrained(SCIBERT_CHECKPOINT)
tokenizer = AutoTokenizer.from_pretrained(SCIBERT_CHECKPOINT)

Some weights of the model checkpoint at allenai/scibert_scivocab_uncased were not used when initializing BertModel: ['cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.bias', 'cls.seq_relationship.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


# General Functions

In [3]:
def _rouge_component(ref, pred, metric, kind):
    """Score ``pred`` against ``ref`` and return one component of one ROUGE metric.

    Both strings are lower-cased before scoring.

    Args:
        ref: reference text.
        pred: hypothesis text.
        metric: key of the metric in the scorer output ('rouge-1', 'rouge-2', 'rouge-l').
        kind: component key inside that metric (the callers in this notebook pass 'p' or 'f').

    Returns:
        The requested score, or 0.0 when the scorer raises ``ValueError``
        (the same fallback getRouge2 has always used).
    """
    try:
        return Rouge().get_scores(pred.lower(), ref.lower())[0][metric][kind]
    except ValueError:
        return 0.0


def getRouge2(ref, pred, kind): # tokenized input
    """ROUGE-2 component ``kind`` of ``pred`` vs ``ref``, rounded to 4 decimals (0.0 on ValueError)."""
    return round(_rouge_component(ref, pred, 'rouge-2', kind), 4)


def getRouge1(ref, pred, kind): # tokenized input
    """ROUGE-1 component ``kind`` of ``pred`` vs ``ref`` (0.0 on ValueError, like getRouge2)."""
    return _rouge_component(ref, pred, 'rouge-1', kind)


def getRougeL(ref, pred, kind): # tokenized input
    """ROUGE-L component ``kind`` of ``pred`` vs ``ref`` (0.0 on ValueError, like getRouge2)."""
    return _rouge_component(ref, pred, 'rouge-l', kind)

In [4]:
from rouge_metric import PerlRouge

# Shared scorer used by get_evaluation. Configured for n-grams up to 2 and the
# skip-bigram-plus-unigram variant with a gap of 4 (rouge_su=True, skip_gap=4);
# ROUGE-L, ROUGE-W and plain ROUGE-S are switched off.
rouge = PerlRouge(rouge_n_max=2, rouge_l=False, rouge_w=False,
    rouge_w_weight=1.2, rouge_s=False, rouge_su=True, skip_gap=4)

def get_evaluation(hypothese, references):
    """Evaluate hypotheses against references with the module-level ``rouge`` scorer.

    Args:
        hypothese: hypotheses passed straight to ``rouge.evaluate``.
        references: references passed straight to ``rouge.evaluate``.

    Returns:
        A dict ``{metric: {key: value}}`` copied from the scorer output, with
        every inner key containing ``"conf_int"`` left out.
    """
    raw_scores = rouge.evaluate(hypothese, references)
    return {
        metric: {
            stat: raw_scores[metric][stat]
            for stat in raw_scores[metric]
            if "conf_int" not in stat
        }
        for metric in raw_scores
    }

# Graph Construction Functions

In [5]:
def get_Bert_vec(text, limit_len=400):
    """Embed ``text`` with the module-level SciBERT model (``pooler_output``).

    The text is lower-cased first. If it encodes to at most 256 token ids it is
    embedded in one pass and a single row is returned. Otherwise it is split on
    ``' . '``, trailing pieces are dropped until at most ``limit_len``
    space-separated words remain, the remaining pieces are divided into two
    halves by word count, and each half is embedded separately.

    Args:
        text: input string.
        limit_len: word budget applied to long inputs before they are halved.

    Returns:
        A tensor with one row for short inputs and two rows (first half, second
        half) for long inputs.
    """
    sent = text.lower()
    input_ids = torch.tensor([tokenizer.encode(sent)])
    if input_ids.shape[1] <= 256:
        with torch.no_grad():
            return scibert(input_ids)["pooler_output"]

    edus = sent.split(' . ')
    wcnt = [len(s.split(' ')) for s in edus]
    wcnt_all = sum(wcnt)

    # Drop trailing pieces until the word budget is met, but always keep the
    # first piece: otherwise a long text without any ' . ' separator would be
    # emptied completely and two empty strings would be embedded.
    while wcnt_all > limit_len and len(edus) > 1:
        wcnt_all -= wcnt.pop()
        edus.pop()

    # A piece goes to the first half while the words seen before it are at most
    # half of the total; a running sum replaces the repeated sum(wcnt[:i]).
    part1, part2 = [], []
    words_before = 0
    for piece, piece_words in zip(edus, wcnt):
        if words_before <= wcnt_all / 2:
            part1.append(piece)
        else:
            part2.append(piece)
        words_before += piece_words

    # Truncate each half to the encoder's position limit so an unusually
    # token-dense half cannot overflow the position embeddings.
    max_tokens = scibert.config.max_position_embeddings
    halves = [' . '.join(part1), ' . '.join(part2)]
    with torch.no_grad():
        pooled = [
            scibert(torch.tensor([tokenizer.encode(half, truncation=True, max_length=max_tokens)]))["pooler_output"]
            for half in halves
        ]
    return torch.cat(pooled, dim=0)

In [6]:
def meanTokenVecs(sent, sp=0):
    """Return the value stored under ``sent["spans"]``.

    ``sp`` is accepted but not used.
    """
    span_vector = sent["spans"]
    return span_vector

def getPositionEncoding(pos, d=768, n=10000):
    """Sinusoidal position encoding for a single position.

    For ``i`` in ``0 .. int(d/2) - 1`` the result holds
    ``sin(pos / n**(2*i/d))`` at index ``2*i`` and ``cos(pos / n**(2*i/d))`` at
    index ``2*i + 1``. When ``d`` is odd the last entry stays 0.

    Args:
        pos: position index.
        d: length of the returned vector.
        n: base of the frequency schedule.

    Returns:
        A float ``numpy`` array of shape ``(d,)``.
    """
    half = int(d / 2)
    P = np.zeros(d)
    # One vectorized evaluation instead of a Python loop over d/2 entries; this
    # function is called twice per sentence during graph construction.
    angles = pos / np.power(n, 2 * np.arange(half) / d)
    P[0:2 * half:2] = np.sin(angles)
    P[1:2 * half:2] = np.cos(angles)
    return P


def removeRedundant(text):
    """Lower-case ``text`` and drop every space-separated token found in ``stop_w``.

    ``stop_w`` is a module-level collection defined in a later cell.
    """
    kept = []
    for token in text.lower().split(' '):
        if token in stop_w:
            continue
        kept.append(token)
    return ' '.join(kept)

def divideIntoSections(input_data):
    """Assign a global, contiguous section index to every sentence.

    Within each document, sentences sharing the same ``secid`` form one
    section. Sections are numbered in order of first appearance, continuing
    across documents, and the number is written in place to
    ``sent['section_new']`` for every sentence of ``input_data``.

    The earlier version also rebuilt per-section paragraphs and ran them
    through ``removeRedundant``, but nothing read those values, so that work
    has been removed. Documents without sentences (and an empty ``docs`` list)
    no longer raise.

    Args:
        input_data: dict with a ``'docs'`` list; each doc has a ``'sents'``
            list whose items carry a ``'secid'`` key.

    Returns:
        A tuple ``(sect_endsent, sent_num, edu_num)`` where
        ``sect_endsent[d]`` is the last global section index of document ``d``
        (one less than the document's first index when it has no sentences),
        ``sent_num`` is the total number of sections and ``edu_num`` the total
        number of sentences.
    """
    edu_num = 0
    next_section = 0          # first global section index available to the current doc
    sect_endsent = []

    for doc in input_data['docs']:
        sents = doc['sents']
        edu_num += len(sents)

        # secid -> global section index, in order of first appearance in this doc
        section_index = {}
        for sent in sents:
            secid = sent['secid']
            if secid not in section_index:
                section_index[secid] = next_section + len(section_index)
            sent['section_new'] = section_index[secid]

        next_section += len(section_index)
        sect_endsent.append(next_section - 1)

    return sect_endsent, next_section, edu_num

In [7]:
def graph_construction(input_data, label_data, threds):
    """Build a ``Graph`` for one input and one reference summary.

    Sentences of ``doc['sents']`` become the lowest-level nodes; sentences that
    share a ``secid`` inside a document are grouped into a section node, and
    every document gets its own node. ``divideIntoSections`` is called first
    and annotates ``input_data`` in place with ``'section_new'``.

    Args:
        input_data: dict with a ``'docs'`` list (see ``divideIntoSections``);
            each sentence needs ``'raw_sent'``, ``'secid'`` and ``'spans'``.
        label_data: reference summary text used for ROUGE-2 precision scoring.
        threds: thresholds forwarded unchanged to ``Graph``.

    Returns:
        The constructed ``Graph``.
    """
    sect_endsent, sent_num, edu_num = divideIntoSections(input_data)
    docs = input_data['docs']

    # Row d marks the sections of document d; row k marks the sentences of section k.
    sect_sent_mask = np.zeros((len(docs), sent_num))
    sent_edu_mask = np.zeros((sent_num, edu_num))

    edus, eduVecs, scores = [], [], []
    sent_scores, sent_text = [], []
    first_section = 0   # first global section index of the current document
    edu_idx = 0         # running global sentence index

    for d, doc in enumerate(docs):
        last_section = sect_endsent[d]
        sect_sent_mask[d][first_section:last_section + 1] = 1
        first_section = last_section + 1

        # secid -> concatenated raw text of that section (each sentence followed by a space)
        section_text = {}

        for s, sent in enumerate(doc['sents']):
            raw = sent['raw_sent']
            edus.append(raw)
            # Sentence vector plus position encodings of the document index and
            # of the sentence index inside the document.
            eduVecs.append(meanTokenVecs(sent) + getPositionEncoding(d) + getPositionEncoding(s))
            scores.append(getRouge2(label_data, raw, 'p'))

            sent_edu_mask[sent['section_new'], edu_idx] = 1
            edu_idx += 1

            secid = sent['secid']
            section_text[secid] = section_text.get(secid, "") + raw + " "

        # ROUGE-2 precision of every section text against the reference.
        for joined in section_text.values():
            sent_scores.append(getRouge2(label_data, joined.strip(), 'p'))
            sent_text.append(joined)

    return Graph(edus, sent_text, eduVecs, scores, sent_scores,
                 sect_sent_mask, sent_edu_mask, label_data, threds)

# Graph Encoder

In [8]:
class MLP(nn.Module):
    """Stack of ``nn.Linear`` layers with activation and dropout in between.

    With ``layers == 1`` the stack is a single ``in_dim -> out_dim`` layer.
    Otherwise it is ``in_dim -> hid_dim``, then ``layers - 2`` layers of
    ``hid_dim -> hid_dim``, then ``hid_dim -> out_dim``.

    Args:
        in_dim: input feature size.
        out_dim: output feature size.
        hid_dim: hidden feature size.
        layers: number of linear layers.
        act: activation module applied after every layer (the last one only
            when ``keep_last_layer`` is False).
        dropout_p: dropout probability applied after every non-final layer.
        keep_last_layer: when True the last layer's output is returned without
            activation.
    """

    def __init__(self, in_dim, out_dim, hid_dim, layers=2, act=nn.LeakyReLU(), dropout_p=0.3, keep_last_layer=False):
        super(MLP, self).__init__()
        self.layers = layers
        self.act = act
        self.dropout = nn.Dropout(dropout_p)
        self.keep_last = keep_last_layer

        # Describe the stack by its layer widths, then build one Linear per
        # consecutive pair. Any value other than 1 yields at least two layers.
        if layers == 1:
            widths = [in_dim, out_dim]
        else:
            widths = [in_dim] + [hid_dim] * max(layers - 1, 1) + [out_dim]
        self.mlp_layers = nn.ModuleList(
            [nn.Linear(fan_in, fan_out) for fan_in, fan_out in zip(widths[:-1], widths[1:])]
        )

    def forward(self, x):
        """Apply every layer in order; see the class docstring for the activation rules."""
        *hidden_layers, last_layer = self.mlp_layers
        for layer in hidden_layers:
            x = self.dropout(self.act(layer(x)))
        x = last_layer(x)
        return x if self.keep_last else self.act(x)

In [9]:
# borrowed from labml.ai
class GraphAttentionLayer(nn.Module):
    """Multi-head graph attention layer.

    Node features are projected once per head; for every ordered node pair the
    two projected vectors are concatenated and scored by a shared linear layer
    followed by LeakyReLU. Scores of pairs whose ``adj_mat`` entry is 0 are
    replaced by ``-1e9`` before a softmax over the second node axis, and the
    resulting weights (after dropout) average the projected features.

    Args:
        in_features: size of the input node features.
        out_features: size of the output node features.
        n_heads: number of attention heads.
        is_concat: if True head outputs are concatenated (``out_features``
            must be divisible by ``n_heads``); otherwise they are averaged.
        dropout: dropout probability applied to the attention weights.
        leaky_relu_negative_slope: negative slope of the score activation.
    """

    def __init__(self, in_features: int, out_features: int, n_heads: int,
                 is_concat: bool = True, dropout: float = 0.6,
                 leaky_relu_negative_slope: float = 0.2):
        super().__init__()
        self.is_concat = is_concat
        self.n_heads = n_heads

        # Per-head width: a share of out_features when concatenating, the full
        # width when averaging.
        if is_concat:
            assert out_features % n_heads == 0
        self.n_hidden = out_features // n_heads if is_concat else out_features

        self.linear = nn.Linear(in_features, self.n_hidden * n_heads, bias=False)
        self.attn = nn.Linear(self.n_hidden * 2, 1, bias=False)
        self.activation = nn.LeakyReLU(negative_slope=leaky_relu_negative_slope)
        self.softmax = nn.Softmax(dim=1)
        self.dropout = nn.Dropout(dropout)

    def forward(self, h: torch.Tensor, adj_mat: torch.Tensor):
        """Run attention over ``h`` (``[n_nodes, in_features]``).

        ``adj_mat`` must have three axes, each either of size 1 or matching
        ``(n_nodes, n_nodes, n_heads)``. Returns ``[n_nodes, n_heads * n_hidden]``
        when concatenating, ``[n_nodes, n_hidden]`` when averaging.
        """
        n_nodes = h.shape[0]
        heads, width = self.n_heads, self.n_hidden
        g = self.linear(h).view(n_nodes, heads, width)

        # pair[i, j] = concat(g[i], g[j]) for every ordered node pair.
        first = g.unsqueeze(1).expand(n_nodes, n_nodes, heads, width)
        second = g.unsqueeze(0).expand(n_nodes, n_nodes, heads, width)
        pair = torch.cat([first, second], dim=-1)
        e = self.activation(self.attn(pair)).squeeze(-1)

        for axis, size in enumerate((n_nodes, n_nodes, self.n_heads)):
            assert adj_mat.shape[axis] == 1 or adj_mat.shape[axis] == size

        # Unconnected pairs get a very negative score so softmax gives them ~0 weight.
        e = e.masked_fill(adj_mat == 0, float(-1e9))
        a = self.dropout(self.softmax(e))
        attn_res = torch.einsum('ijh,jhf->ihf', a, g)

        if not self.is_concat:
            return attn_res.mean(dim=1)
        return attn_res.reshape(n_nodes, self.n_heads * self.n_hidden)

In [10]:
class GAT(nn.Module):
    """Two-layer graph attention network.

    The first layer uses ``n_heads`` concatenated heads of total width
    ``n_hidden``; the output layer uses a single averaged head of width
    ``n_classes``. Dropout is applied to the input and between the layers.

    Args:
        in_features: size of the input node features.
        n_hidden: total width of the first layer output.
        n_classes: width of the output layer.
        n_heads: number of heads in the first layer.
        dropout: dropout probability (features and attention weights).
    """

    def __init__(self, in_features: int, n_hidden: int, n_classes: int, n_heads: int, dropout: float):
        super().__init__()
        self.layer1 = GraphAttentionLayer(in_features, n_hidden, n_heads, is_concat=True, dropout=dropout)
        self.activation = nn.ELU()
        self.output = GraphAttentionLayer(n_hidden, n_classes, 1, is_concat=False, dropout=dropout)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x: torch.Tensor, adj_mat: torch.Tensor):
        """Apply both attention layers.

        The leading axis of ``x`` and ``adj_mat`` is squeezed away on entry and
        a leading axis is added back to the result; ``adj_mat`` is converted to
        a boolean mask with a trailing head axis.
        """
        adj_mat = adj_mat.squeeze(0).unsqueeze(-1).bool()
        hidden = self.layer1(self.dropout(x.squeeze(0)), adj_mat)
        hidden = self.dropout(self.activation(hidden))
        return self.output(hidden, adj_mat).unsqueeze(0)

In [11]:
class StepWiseGraphConvLayer(nn.Module):
    """Graph encoder with two parallel GAT branches that are fused afterwards.

    One branch attends only over the columns before the last ``sect_num``
    nodes, the other only over the last ``sect_num`` columns. Each branch adds
    its input back as a residual; the two results are concatenated, fused by a
    linear layer, passed through two MLPs, and the original input is added as
    a final residual.

    Args:
        in_dim: node feature size (input and output).
        out_dim: accepted but not used by this class.
        hid_dim: hidden size of the GATs and MLPs.
        dropout_p: dropout probability.
        act: stored as ``self.act``; ``forward`` uses ``F.leaky_relu`` directly.
        nheads: number of heads in the first GAT layer.
        iter: number of stacked GATs per branch.
        final: accepted but not used by this class.
    """

    def __init__(self, in_dim, out_dim, hid_dim, dropout_p=0.3, act=nn.LeakyReLU(), nheads=6, iter=1, final="att"):
        super().__init__()
        self.act = act
        self.dropout = nn.Dropout(dropout_p)
        self.iter = iter

        def gat_stack():
            # `iter` independent GATs that map in_dim back to in_dim.
            return nn.ModuleList([
                GAT(in_features=in_dim, n_hidden=hid_dim, n_classes=in_dim,
                    dropout=dropout_p, n_heads=nheads)
                for _ in range(iter)
            ])

        self.gat = gat_stack()
        self.gat1 = gat_stack()

        self.feature_fusion_layer = nn.Linear(in_dim * 2, in_dim)
        self.ffn = MLP(in_dim, in_dim, hid_dim, dropout_p=dropout_p, layers=3)
        self.out_ffn = MLP(in_dim, in_dim, hid_dim, dropout_p=dropout_p)

    def _run_branch(self, gats, feature, adj):
        """Run ``self.iter`` GATs from ``gats`` on a copy of ``feature`` and add the copy back."""
        residual = feature.clone()
        hidden = residual
        for step in range(0, self.iter):
            hidden = gats[step](hidden, adj)
        hidden += residual
        return hidden

    def forward(self, feature, adj, sect_num):
        """Encode ``feature`` under adjacency ``adj``; ``sect_num`` is the count of trailing nodes."""
        # Branch masks: hide the last `sect_num` columns, or hide everything else.
        sent_adj = adj.clone()
        sent_adj[:, :, -sect_num:] = 0
        sect_adj = adj.clone()
        sect_adj[:, :, :-sect_num] = 0

        sent_branch = self._run_branch(self.gat, feature, sent_adj)
        sect_branch = self._run_branch(self.gat1, feature, sect_adj)

        fused = torch.concat([sect_branch, sent_branch], dim=-1)
        fused = self.dropout(F.leaky_relu(self.feature_fusion_layer(fused)))
        fused = self.ffn(fused)
        return self.out_ffn(fused) + feature

In [12]:
class Contrast_Encoder(nn.Module):
    """Graph encoder followed by a projection MLP.

    Args:
        graph_encoder: module called as ``graph_encoder(feature, adj, sect_num)``.
        hidden_dim: hidden size of the projection MLP.
        bert_hidden: accepted but not used by this class.
        in_dim: input and output size of the projection MLP.
        dropout_p: dropout probability of the projection MLP.
    """

    def __init__(self, graph_encoder, hidden_dim, bert_hidden=768, in_dim=768, dropout_p=0.3):
        super(Contrast_Encoder, self).__init__()
        self.graph_encoder = graph_encoder
        self.common_proj_mlp = MLP(in_dim, in_dim, hidden_dim, dropout_p=dropout_p, act=nn.LeakyReLU())

    def forward(self, p_gfeature, p_adj, sect_num):
        """Encode the graph (inputs cast to float) and project the node features."""
        encoded = self.graph_encoder(p_gfeature.float(), p_adj.float(), sect_num)
        return self.common_proj_mlp(encoded)

In [13]:
class End2End_Encoder(nn.Module):
    """Graph encoder followed by a scoring head for all but the last ``sect_num`` nodes.

    Args:
        graph_encoder: module called as ``graph_encoder(x, adj, sect_num)``.
        in_dim: node feature size.
        hidden_dim: hidden size of the projection MLP.
        dropout_p: dropout probability. ``self.dropout`` is created here but
            ``forward`` does not call it.
    """

    def __init__(self, graph_encoder, in_dim, hidden_dim, dropout_p):
        super(End2End_Encoder, self).__init__()
        self.graph_encoder = graph_encoder
        self.dropout = nn.Dropout(dropout_p)
        self.out_proj_layer_mlp = MLP(in_dim, in_dim, hidden_dim, act=nn.LeakyReLU(), dropout_p=dropout_p, layers=2)
        self.final_layer = nn.Linear(in_dim, 1)

    def forward(self, x, adj, sect_num):
        """Return one raw score per node, excluding the last ``sect_num`` nodes."""
        encoded = self.graph_encoder(x.float(), adj.float(), sect_num)
        # The trailing `sect_num` nodes are not scored.
        scored_nodes = encoded[:, :-sect_num, :]
        return self.final_layer(self.out_proj_layer_mlp(scored_nodes))

In [14]:
def mask_to_adj(sect_sent_mask, sent_edu_mask, have_edu=True):
    """Build the adjacency matrix of the three-level graph from two membership masks.

    Nodes are ordered as: the ``edu_num`` columns of ``sent_edu_mask`` first,
    then its ``sent_num`` rows, then the ``sect_num`` rows of
    ``sect_sent_mask``, and finally one extra node that is dropped from every
    returned matrix.

    Args:
        sect_sent_mask: array-like of shape ``(sect_num, sent_num)``.
        sent_edu_mask: array-like of shape ``(sent_num, edu_num)``.
        have_edu: if True return the matrix over all three levels; otherwise
            only the lower-right block over the last two levels.

    Returns:
        A view into the freshly built float matrix: ``(E+S+T, E+S+T)`` when
        ``have_edu`` is True, else ``(S+T, S+T)`` with ``E=edu_num``,
        ``S=sent_num``, ``T=sect_num``.
    """
    def self_outer(row):
        # Outer product of a mask row with itself: links every pair of members.
        if row.ndim == 1:
            row = row.reshape((1, -1))
        elif row.ndim == 0:
            row = np.array([row])  # wrap a lone scalar into a 1-D array
        return row * row.T

    sect_sent_mask = np.array(sect_sent_mask)
    sent_edu_mask = np.array(sent_edu_mask)

    edu_num = sent_edu_mask.shape[1]
    sent_num = sent_edu_mask.shape[0]
    sect_num = sect_sent_mask.shape[0]

    # Half-open index ranges of the middle and top node levels.
    sent_lo = edu_num
    sent_hi = sent_lo + sent_num
    sect_hi = sent_hi + sect_num
    size = sect_hi + 1
    adj = np.zeros((size, size))

    # Links between the first and the second level, in both directions.
    adj[sent_lo:sent_hi, :edu_num] = sent_edu_mask
    adj[:edu_num, sent_lo:sent_hi] = sent_edu_mask.T

    # Second-level nodes that share a third-level node are linked to each other.
    for i in range(sect_num):
        adj[sent_lo:sent_hi, sent_lo:sent_hi] += self_outer(sect_sent_mask[i])

    # Links between the second and the third level, and among third-level nodes.
    adj[sent_hi:sect_hi, sent_lo:sent_hi] = sect_sent_mask
    adj[sent_lo:sent_hi, sent_hi:sect_hi] = sect_sent_mask.T
    adj[sent_hi:sect_hi, sent_hi:sect_hi] = 1

    # First-level nodes that share a second-level node are linked to each other.
    for i in range(sent_num):
        adj[:edu_num, :edu_num] += self_outer(sent_edu_mask[i])

    # Row of the extra last node; it is sliced away in both return branches.
    adj[-1, sent_hi:] = 1

    if have_edu:
        return adj[:-1, :-1]
    return adj[sent_lo:sect_hi, sent_lo:sect_hi]

In [15]:
class Graph:
    """Tensors and metadata for one document graph.

    Node order follows ``mask_to_adj``: the sentence-level entries (``edus``)
    first, then one node per row of ``sent_edu_mask``, then one node per row of
    ``sect_sent_mask``. After construction ``self.feature`` and ``self.adj``
    only cover the last two groups.

    Args:
        edus: lowest-level text units; only their count is checked.
        sents: texts of the middle-level nodes, stored as ``self.sents``.
        eduVecs: one feature vector (NumPy array) per entry of ``edus``.
        scores: one score per entry of ``edus``; only their count is checked.
        sent_scores: one score per middle-level node.
        sect_sent_mask: membership mask, top level x middle level.
        sent_edu_mask: membership mask, middle level x lowest level.
        golden: reference summary text.
        threds: thresholds; ``threds[1]`` is the upper bound for negatives and
            ``threds[2]`` the lower bound for positives (``threds[0]`` is not used).
        init_doc_nodes: forwarded to ``init_node_vec``. Defaults to False,
            which keeps the top-level node features at zero exactly as before.
    """

    def __init__(self, edus, sents, eduVecs, scores, sent_scores, sect_sent_mask, sent_edu_mask, golden, threds,
                 init_doc_nodes=False):
        # The per-unit inputs must all have the same length.
        assert len(eduVecs) == len(scores) == len(edus), "Số lượng eduVecs, scores và edus không khớp"
        self.sect_num = len(sect_sent_mask)
        self.sent_num = len(sent_edu_mask)
        upper_num = self.sent_num + self.sect_num

        # Full adjacency over all three levels; init_node_vec needs it.
        self.adj = torch.from_numpy(mask_to_adj(sect_sent_mask, sent_edu_mask)).float()

        # Lowest-level features followed by zero rows for the two upper levels.
        self.feature = np.concatenate((np.array(eduVecs), np.zeros((upper_num, eduVecs[0].size))))

        # Binary labels for the middle-level nodes derived from the thresholds.
        right_neg_thred = threds[1]
        pos_thred = threds[2]

        self.sent_score = torch.from_numpy(np.array(sent_scores)).float()
        self.sent_score_onehot = (self.sent_score >= pos_thred).float()
        self.sent_score_onehot_neg = (self.sent_score <= right_neg_thred).float()

        # Keep the middle-level texts and the reference summary.
        self.sents = np.array(sents)
        self.golden = golden

        # Embedding of the reference summary.
        self.goldenVec = get_Bert_vec(golden)

        # Fill the upper-level feature rows from the levels below.
        self.init_node_vec(init_doc_nodes)

        # Keep only the two upper levels as model input.
        self.feature = torch.from_numpy(self.feature[-self.sect_num-self.sent_num:]).float()

        # mask_to_adj(..., have_edu=False) returns exactly the lower-right block
        # of the full matrix, so slice it instead of rebuilding the matrix.
        first_upper = self.adj.shape[0] - upper_num
        self.adj = self.adj[first_upper:, first_upper:].contiguous()

    def init_node_vec(self, init_doc_nodes=False):
        """Initialise upper-level feature rows as the mean of their members.

        Must be called while ``self.adj`` is the full three-level adjacency
        (a torch tensor) and ``self.feature`` is still a NumPy array.

        Every middle-level node becomes the mean of the lowest-level rows it is
        linked to. The top-level loop used to be written ``range(-sect_num,)``,
        which is ``range(-sect_num)`` and therefore never ran, so top-level
        features stayed zero. NOTE(review): the intended range was presumably
        ``range(-sect_num, 0)``; because checkpoints produced with the old code
        presumably saw zero vectors there, the corrected loop is opt-in.

        Args:
            init_doc_nodes: when True, also set every top-level node to the
                mean of the middle-level rows it is linked to.
        """
        sect_num, sent_num = self.sect_num, self.sent_num

        for i in range(-sent_num-sect_num, -sect_num):
            mask = self.adj[i].clone()
            mask[-sent_num-sect_num:] = 0   # keep only links to the lowest level
            # Convert explicitly: self.feature is NumPy, the mask is a torch tensor.
            self.feature[i] = np.mean(self.feature[mask.bool().numpy()], axis=0)

        if init_doc_nodes:
            for i in range(-sect_num, 0):
                mask = self.adj[i].clone()
                mask[-sect_num:] = 0        # drop links among top-level nodes
                self.feature[i] = np.mean(self.feature[mask.bool().numpy()], axis=0)

# Inference Functions

In [16]:
def val_e2e(val_dataloader, model, edu_num=0):
    """Summarise every graph in ``val_dataloader`` and score it with ROUGE-2 F.

    Both models are switched to eval mode first. The word budget comes from the
    module-level ``summary_max_word_num``.

    Args:
        val_dataloader: iterable of graph objects exposing ``sents`` and ``golden``.
        model: pair ``[c_model, s_model]`` as expected by ``val_e2e_batch``.
        edu_num: forwarded to ``get_summary`` (0 means "use the word budget").

    Returns:
        ``(mean ROUGE-2 F, summaries, reference texts, per-item ROUGE-2 F list)``.
    """
    for net in (model[0], model[1]):
        net.eval()

    all_summaries, all_gt, rouge2_score = [], [], []

    for data in val_dataloader:
        scores = val_e2e_batch(data, model)
        summary_text = get_summary(scores[0], data.sents, summary_max_word_num, edu_num)
        all_gt.append(data.golden)
        all_summaries.append(summary_text)
        rouge2_score.append(getRouge2(data.golden, summary_text, 'f'))

    return np.mean(rouge2_score), all_summaries, all_gt, rouge2_score


def val_e2e_batch(data_batch, model):
    """Score the nodes of one graph without tracking gradients.

    The graph tensors are moved to the device of the first model's parameters
    before the forward pass, so the function also works when the models live
    on a GPU, and the scores are returned on the CPU so callers can use them
    to index NumPy arrays.

    Args:
        data_batch: object exposing ``feature``, ``adj`` (tensors) and ``sect_num``.
        model: pair ``[c_model, s_model]``; each is called as
            ``m(feature, adj, sect_num)``.

    Returns:
        ``sigmoid`` of the second model's output with its last axis squeezed,
        as a CPU tensor.
    """
    c_model = model[0]
    s_model = model[1]

    # Follow the model's device; fall back to the data's own device for
    # parameter-free modules or plain callables.
    try:
        target_device = next(c_model.parameters()).device
    except (AttributeError, StopIteration):
        target_device = data_batch.feature.device

    feature = data_batch.feature.unsqueeze(0).to(target_device)
    adj = data_batch.adj.unsqueeze(0).to(target_device)
    sect_num = data_batch.sect_num

    with torch.no_grad():
        pg = c_model(feature, adj, sect_num)
        x = s_model(pg, adj, sect_num)
        scores = torch.sigmoid(x.squeeze(-1))

    return scores.cpu()

In [18]:
def _merge_marker_tokens(tokens):
    """Remove the underscore markers from a token list.

    A token ending with ``_`` that is directly followed by a token starting
    with ``_`` is kept without its trailing underscore and the following token
    is dropped. Any other token only loses one leading and/or one trailing
    underscore.
    """
    cleaned = []
    pos = 0
    while pos < len(tokens):
        token = tokens[pos]
        if token.endswith('_') and pos + 1 < len(tokens) and tokens[pos + 1].startswith('_'):
            cleaned.append(token[:-1])
            pos += 2  # skip the current token and the one after it
            continue
        if token.startswith('_'):
            token = token[1:]
        if token.endswith('_'):
            token = token[:-1]
        cleaned.append(token)
        pos += 1
    return cleaned


def get_summary(scores, edus, max_word_num, edu_num=0):
    """Greedily select the highest-scoring text units and join them into a summary.

    Units are visited by descending score. A unit is skipped when its score is
    negative or when its ROUGE-2 precision against an already selected unit is
    at least 0.35. Selection stops once ``edu_num`` units are chosen
    (``edu_num > 0``) or, when ``edu_num == 0``, once the selected units hold at
    least ``max_word_num`` space-separated words. The chosen units are emitted
    in their original order and underscore markers are cleaned from the tokens.

    Scores are flattened first and indices are handled as plain ints, so a
    single-candidate input no longer fails on indexing a 0-dim tensor and a
    ``(1, n)`` score tensor is accepted as well.

    Args:
        scores: tensor with one score per entry of ``edus``.
        edus: indexable sequence of strings (NumPy array or list).
        max_word_num: word budget used when ``edu_num == 0``.
        edu_num: exact number of units to select, or 0 to use the word budget.

    Returns:
        The summary as a single space-joined string.
    """
    scores = scores.reshape(-1)
    ranked_ids = torch.argsort(scores, dim=0, descending=True).tolist()

    wordCnt = 0
    chosen_ids = []
    for idx in ranked_ids:
        if wordCnt >= max_word_num and edu_num == 0:
            break
        elif edu_num > 0 and len(chosen_ids) == edu_num:
            break
        candidate = edus[idx]

        if scores[idx].item() < 0:
            continue
        # Skip candidates that overlap too much with something already selected.
        if any(getRouge2(edus[chosen], candidate, 'p') >= 0.35 for chosen in chosen_ids):
            continue

        wordCnt += len(candidate.split(' '))
        chosen_ids.append(idx)

    # Restore document order before joining.
    chosen_ids.sort()
    text = ' '.join(edus[idx] for idx in chosen_ids)
    return ' '.join(_merge_marker_tokens(text.split()))

# Process

In [29]:
# Run configuration. The cells shown in this notebook read 'seed', 'hidden',
# 'heads' and 'dropout'; the remaining keys are not referenced by them.
args = {'gpu': 2, 'seed': 42, 'batch_size': 1, 'input': 768, 'hidden': 512, 'heads': 64,
       'epochs': 100, 'log_every': 20, 'lr': 0.0003, 'dropout': 0.3, 'num_layers': 3}

In [30]:
# Negative and positive thresholds, unpacked by Graph.__init__ as
# [left_neg_thred, right_neg_thred, pos_thred].
threds = [0, 0.4, 0.6]
# Not referenced by the cells shown in this notebook.
topk_triplet = 5
# Word budget for a generated summary (passed to get_summary by val_e2e).
summary_max_word_num = 160

In [31]:
# Model checkpoints: c_file_path is loaded into contrast_filter and
# e_file_path into summarization_encoder.
c_file_path = "./source/models/2_layers/c_19_0.3693_Train_test_abs_con_clean_style_1.mdl"
e_file_path = "./source/models/2_layers/e_19_0.3693_Train_test_abs_con_clean_style_1.mdl"

In [32]:
# Test data: JSON files with the reference summaries and the model inputs,
# both indexed by the same document ID.
test_label_path = "./source/models/2_layers/test_label.json"
test_input_path = "./source/models/2_layers/test_input_abstract_conclusion_clean.json"

In [33]:
def seed_everything(seed):
    """Seed Python, NumPy and PyTorch (CPU and CUDA) RNGs with ``seed``.

    Also writes the seed to the ``PYTHONHASHSEED`` environment variable and
    turns on cuDNN deterministic mode.
    """
    os.environ['PYTHONHASHSEED'] = str(seed)
    for seed_fn in (random.seed, np.random.seed, torch.manual_seed, torch.cuda.manual_seed):
        seed_fn(seed)
    torch.backends.cudnn.deterministic = True
# Seed every RNG before the graph encoders are constructed, then use the GPU
# when one is available and the CPU otherwise.
seed_everything(args['seed'])
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [34]:
# Stop-word list used by removeRedundant: the literal '...', one entry per
# line of stopwords.txt (whitespace-stripped), then single punctuation characters.
stop_w = ['...']
with open('./source/stopwords.txt', 'r', encoding='utf-8') as stopword_file:
    stop_w.extend(line.strip() for line in stopword_file)
stop_w.extend('!"#$%&\'()*+,./:;<=>?@[\\]^`{|}~…“”’‘')

In [35]:
def _build_graph_encoder():
    """Create one StepWiseGraphConvLayer from ``args`` and move it to ``device``."""
    return StepWiseGraphConvLayer(in_dim=768, out_dim=args['hidden'], hid_dim=args['hidden'],
                                  dropout_p=args['dropout'], act=nn.LeakyReLU(),
                                  nheads=args['heads'], iter=1).to(device)


# Two independent graph encoders: one wrapped by the contrastive filter and one
# by the summarisation head. Construction order is kept so seeded
# initialisation is unchanged.
c_graph_encoder = _build_graph_encoder()
s_graph_encoder = _build_graph_encoder()
contrast_filter = Contrast_Encoder(c_graph_encoder, args['hidden'], dropout_p=args['dropout']).to(device)
summarization_encoder = End2End_Encoder(s_graph_encoder, 768, args['hidden'], args['dropout']).to(device)

## Load 

In [36]:
def _read_json(path):
    """Parse the UTF-8 encoded JSON file at ``path``."""
    with open(path, 'r', encoding='utf-8') as handle:
        return json.load(handle)


# Model inputs and reference summaries for the test set.
test_inputs = _read_json(test_input_path)
test_labels = _read_json(test_label_path)

In [37]:
def _load_weights(module, checkpoint_path):
    """Load ``checkpoint_path`` into ``module`` and report any key mismatch.

    Loading stays non-strict as before, but missing or unexpected keys are now
    logged instead of being discarded silently; otherwise a checkpoint that
    does not match the architecture would leave layers at their random
    initialisation without any sign of it.
    """
    # NOTE: torch.load unpickles the file; only load checkpoints from a trusted source.
    state = torch.load(checkpoint_path, map_location=torch.device('cpu'))
    result = module.load_state_dict(state, strict=False)
    if result.missing_keys or result.unexpected_keys:
        logging.warning(
            "Checkpoint %s does not fully match %s: missing keys %s, unexpected keys %s",
            checkpoint_path, type(module).__name__,
            list(result.missing_keys), list(result.unexpected_keys),
        )
    return result


_load_weights(summarization_encoder, e_file_path)
_load_weights(contrast_filter, c_file_path)
# Order expected by val_e2e / val_e2e_batch: [contrastive filter, summarisation head].
model = [contrast_filter, summarization_encoder]

In [38]:
# Containers filled (or merely declared) by the inference loop below.
list_edu_sum, predicts, predict_words = {}, {}, {}
testGraphs, rouge2_score_means = [], []

# One graph is built and evaluated per (document, reference) pair.
# predicts[ID] ends up holding the summaries from the last reference processed
# for that document.
for ID in tqdm(test_labels):
    for i, _ in enumerate(test_labels[ID]):
        input_data = test_inputs[ID]
        label_data = test_labels[ID][i]
        graph = graph_construction(input_data, label_data, threds)
        rouge2_score_mean, summs, goldens, rouge2_score_list = val_e2e([graph], model)
        rouge2_score_means.append(rouge2_score_mean)
        predicts[ID] = summs

100%|███████████████████████████████████████████| 20/20 [00:15<00:00,  1.30it/s]


In [39]:
# Mean of the ROUGE-2 F-scores collected per (document, reference) pair.
np.mean(rouge2_score_means)

0.3687403225806451

In [40]:
# One hypothesis (the first stored summary) and the full reference list per document.
hypotheses = [predicts[ID][0] for ID in test_labels]
references = [test_labels[ID] for ID in test_labels]

In [41]:
# Corpus-level ROUGE from the PerlRouge scorer, without confidence-interval entries.
score = get_evaluation(hypotheses, references)

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [42]:
# Display the filtered ROUGE scores.
score

{'rouge-1': {'r': 0.59701, 'p': 0.54714, 'f': 0.56783},
 'rouge-2': {'r': 0.39945, 'p': 0.36408, 'f': 0.37873},
 'rouge-su4': {'r': 0.41171, 'p': 0.37542, 'f': 0.39041}}

In [43]:
# Print every value on one tab-separated line, in the order the metrics and
# their components appear in `score`.
for metrics in score.values():
    for value in metrics.values():
        print(value, end="\t")

0.59701	0.54714	0.56783	0.39945	0.36408	0.37873	0.41171	0.37542	0.39041	