# IMPORT

In [1]:
!pip install gdown
!pip install rouge --quiet

Collecting gdown
  Downloading gdown-5.2.0-py3-none-any.whl.metadata (5.8 kB)
Downloading gdown-5.2.0-py3-none-any.whl (18 kB)
Installing collected packages: gdown
Successfully installed gdown-5.2.0


In [2]:
import os
import json
import copy
import time
import torch
import pickle
import random
import string
import logging
import numpy as np
import pandas as pd
from tqdm import tqdm
import torch.nn as nn
from rouge import Rouge
import torch.nn.functional as F
from transformers import AutoModel, AutoTokenizer

In [3]:
scibert = AutoModel.from_pretrained("allenai/scibert_scivocab_uncased")
tokenizer = AutoTokenizer.from_pretrained("allenai/scibert_scivocab_uncased")

config.json:   0%|          | 0.00/385 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/442M [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/228k [00:00<?, ?B/s]

# General Functions

In [4]:
# Define a logger to save the prints to a file
class Logger:
    def __init__(self, log_file='print.log'):
        self.log_file = log_file

    def log(self, message):
        with open(self.log_file, 'a') as f:
            f.write(message + '\n')

# Create a global logger instance
logger = Logger()

In [5]:
def getRouge2(ref, pred, kind): # tokenized input
    try:
        return round(Rouge().get_scores(pred.lower(), ref.lower())[0]['rouge-2'][kind], 4)
    except ValueError:
        return 0.0
def getRouge1(ref, pred, kind): # tokenized input
    return Rouge().get_scores(pred.lower(), ref.lower())[0]['rouge-1'][kind]
def getRougeL(ref, pred, kind): # tokenized input
    return Rouge().get_scores(pred.lower(), ref.lower())[0]['rouge-l'][kind]

# Graph Construction Functions

In [6]:
def get_Bert_vec(text, limit_len=400):
    sent = text.lower()
    input_ids = torch.tensor([tokenizer.encode(sent)])
    if input_ids.shape[1] > 256:
        edus = sent.split(' . ')
        wcnt = [len(s.split(' ')) for s in edus]
        wcnt_all = sum(wcnt)

        while wcnt_all > limit_len:
            wcnt_all -= wcnt[-1]
            edus.pop()
            wcnt.pop()

        part1, part2 = [], []
        for i, s in enumerate(edus):
            if sum(wcnt[:i]) <= wcnt_all / 2:
                part1.append(s)
            else:
                part2.append(s)

        edus = [' . '.join(part1), ' . '.join(part2)]
        input_ids = [torch.tensor([tokenizer.encode(sent)]) for sent in edus]
        with torch.no_grad():
            return torch.cat([scibert(input_ids[0])["pooler_output"], scibert(input_ids[1])["pooler_output"]],
                             dim=0)

    with torch.no_grad():
        features = scibert(input_ids)
    return features["pooler_output"]

In [7]:
def meanTokenVecs(sent, sp=0):
    return sent["spans"]

def getPositionEncoding(pos, d=768, n=10000):
    P = np.zeros(d)
    for i in np.arange(int(d/2)):
        denominator = np.power(n, 2*i/d)
        P[2*i] = np.sin(pos/denominator)
        P[2*i+1] = np.cos(pos/denominator)
    return P


def removeRedundant(text):
    text = text.lower()
    words = [w for w in text.split(' ') if w not in stop_w]
    return ' '.join(words)

def divideIntoSections(input_data):
    sent_num, edu_num = 0, 0
    paraList, paras, ids, newSentID = [], [], [], {}

    for d, doc in enumerate(input_data['docs']):
        edu_num += len(doc['sents'])
        paraList.append([])

        para, curOrgSentID = [], 0
        for s, sent in enumerate(doc['sents']):
            if sent['secid'] != curOrgSentID:  # Nếu section id thay đổi
                paraList[-1].append(' '.join(para))  # Tạo đoạn văn mới
                para, curOrgSentID = [], sent['secid']  # Cập nhật secid hiện tại
            para.append(sent['raw_sent'])  # Thêm câu vào đoạn văn hiện tại

        if para:  # Thêm đoạn văn cuối cùng vào danh sách
            paraList[-1].append(' '.join(para))

    # Xử lý đoạn văn và id
    for d, doc in enumerate(paraList):
        for p, para in enumerate(doc):
            paras.append(removeRedundant(para))
            ids.append((d, p))

    # Thay vì tính toán bằng LDA, sử dụng secid có sẵn để gán newSentID
    for d, doc in enumerate(input_data['docs']):
        for s, sent in enumerate(doc['sents']):
            newSentID[(d, sent['secid'])] = sent['secid']  # Dùng secid hiện tại luôn

    prevSentnum, sect_endsent = 0, []

    # Gán newSentID trực tiếp vào input_data['docs']
    for d, doc in enumerate(input_data['docs']):
        groupset = {}
        for s, sent in enumerate(doc['sents']):
            if newSentID[(d, sent['secid'])] not in groupset:
                groupset[newSentID[(d, sent['secid'])]] = len(groupset) + prevSentnum
            input_data['docs'][d]['sents'][s]['section_new'] = groupset[newSentID[(d, sent['secid'])]]
        prevSentnum = max(groupset.values()) + 1
        sect_endsent.append(max(groupset.values()))

    return sect_endsent, max(groupset.values()) + 1, edu_num

# Graph Encoder

In [8]:
class MLP(nn.Module):
    def __init__(self, in_dim, out_dim, hid_dim, layers=2, act=nn.LeakyReLU(), dropout_p=0.3, keep_last_layer=False):
        super(MLP, self).__init__()
        self.layers = layers
        self.act = act
        self.dropout = nn.Dropout(dropout_p)
        self.keep_last = keep_last_layer

        self.mlp_layers = nn.ModuleList([])
        if layers == 1:
            self.mlp_layers.append(nn.Linear(in_dim, out_dim))
        else:
            self.mlp_layers.append(nn.Linear(in_dim, hid_dim))
            for i in range(self.layers - 2):
                self.mlp_layers.append(nn.Linear(hid_dim, hid_dim))
            self.mlp_layers.append(nn.Linear(hid_dim, out_dim))

    def forward(self, x):
        for i in range(len(self.mlp_layers) - 1):
            x = self.dropout(self.act(self.mlp_layers[i](x)))
        if self.keep_last:
            x = self.mlp_layers[-1](x)
        else:
            x = self.act(self.mlp_layers[-1](x))
        return x

In [9]:
# borrowed from labml.ai
class GraphAttentionLayer(nn.Module):
    def __init__(self, in_features: int, out_features: int, n_heads: int,
                 is_concat: bool = True, dropout: float = 0.6,
                 leaky_relu_negative_slope: float = 0.2):
        super().__init__()
        self.is_concat = is_concat
        self.n_heads = n_heads

        if is_concat:
            assert out_features % n_heads == 0
            self.n_hidden = out_features // n_heads
        else:
            self.n_hidden = out_features

        self.linear = nn.Linear(in_features, self.n_hidden * n_heads, bias=False)
        self.attn = nn.Linear(self.n_hidden * 2, 1, bias=False)
        self.activation = nn.LeakyReLU(negative_slope=leaky_relu_negative_slope)
        self.softmax = nn.Softmax(dim=1)
        self.dropout = nn.Dropout(dropout)

    def forward(self, h: torch.Tensor, adj_mat: torch.Tensor):
        n_nodes = h.shape[0]
        g = self.linear(h).view(n_nodes, self.n_heads, self.n_hidden)
        g_repeat = g.repeat(n_nodes, 1, 1)
        g_repeat_interleave = g.repeat_interleave(n_nodes, dim=0)
        g_concat = torch.cat([g_repeat_interleave, g_repeat], dim=-1)
        g_concat = g_concat.view(n_nodes, n_nodes, self.n_heads, 2 * self.n_hidden)
        e = self.activation(self.attn(g_concat)).squeeze(-1)
        assert adj_mat.shape[0] == 1 or adj_mat.shape[0] == n_nodes
        assert adj_mat.shape[1] == 1 or adj_mat.shape[1] == n_nodes
        assert adj_mat.shape[2] == 1 or adj_mat.shape[2] == self.n_heads

        e = e.masked_fill(adj_mat == 0, float(-1e9))
        a = self.softmax(e)
        a = self.dropout(a)
        attn_res = torch.einsum('ijh,jhf->ihf', a, g)

        if self.is_concat:
            return attn_res.reshape(n_nodes, self.n_heads * self.n_hidden)
        else:
            return attn_res.mean(dim=1)

In [10]:
class GAT(nn.Module):
    def __init__(self, in_features: int, n_hidden: int, n_classes: int, n_heads: int, dropout: float):
        super().__init__()
        self.layer1 = GraphAttentionLayer(in_features, n_hidden, n_heads, is_concat=True, dropout=dropout)
        self.activation = nn.ELU()
        self.output = GraphAttentionLayer(n_hidden, n_classes, 1, is_concat=False, dropout=dropout)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x: torch.Tensor, adj_mat: torch.Tensor):
        x = x.squeeze(0)
        adj_mat = adj_mat.squeeze(0).unsqueeze(-1).bool()

        x = self.dropout(x)
        x = self.layer1(x, adj_mat)

        x = self.activation(x)
        x = self.dropout(x)
        return self.output(x, adj_mat).unsqueeze(0)

In [11]:
class StepWiseGraphConvLayer(nn.Module):
    def __init__(self, in_dim, out_dim, hid_dim, dropout_p=0.3, act=nn.LeakyReLU(), nheads=6, iter=1, final="att"):
        super().__init__()
        self.act = act
        self.dropout = nn.Dropout(dropout_p)
        self.iter = iter
        self.gat = nn.ModuleList([GAT(in_features=in_dim, n_hidden=hid_dim, n_classes=in_dim,
                                      dropout=dropout_p, n_heads=nheads) for _ in range(iter)])
        self.gat1 = nn.ModuleList([GAT(in_features=in_dim, n_hidden=hid_dim, n_classes=in_dim,
                                       dropout=dropout_p, n_heads=nheads) for _ in range(iter)])

        self.feature_fusion_layer = nn.Linear(in_dim * 2, in_dim)
        self.ffn = MLP(in_dim, in_dim, hid_dim, dropout_p=dropout_p, layers=3)
        self.out_ffn = MLP(in_dim, in_dim, hid_dim, dropout_p=dropout_p)

    def forward(self, feature, adj, sect_num):

        sent_adj = adj.clone()
        sent_adj[:, :, -sect_num:] = 0
        
        sect_adj = adj.clone()
        sect_adj[:, :, :-sect_num] = 0

        feature_sent = feature.clone()
        feature_sect = feature.clone()
        
        feature_resi = feature
        feature_sent_re = feature_sent
        feature_sect_re = feature_sect


        for i in range(0, self.iter):
            feature_sent = self.gat[i](feature_sent, sent_adj)
        feature_sent += feature_sent_re

        for i in range(0, self.iter):
            feature_sect = self.gat1[i](feature_sect, sect_adj)
        feature_sect += feature_sect_re
        
        feature = torch.concat([feature_sect, feature_sent], dim=-1)
        feature = self.dropout(F.leaky_relu(self.feature_fusion_layer(feature)))
        feature = self.ffn(feature)
        feature = self.out_ffn(feature) + feature_resi
        return feature

In [12]:
class Contrast_Encoder(nn.Module):
    def __init__(self, graph_encoder, hidden_dim, bert_hidden=768, in_dim=768, dropout_p=0.3):
        super(Contrast_Encoder, self).__init__()
        self.graph_encoder = graph_encoder
        self.common_proj_mlp = MLP(in_dim, in_dim, hidden_dim, dropout_p=dropout_p, act=nn.LeakyReLU())

    def forward(self, p_gfeature, p_adj, sect_num):
        pg = self.graph_encoder(p_gfeature.float(), p_adj.float(), sect_num)
        pg = self.common_proj_mlp(pg)
        
        return pg

In [13]:
class End2End_Encoder(nn.Module):
    def __init__(self, graph_encoder, in_dim, hidden_dim, dropout_p):
        super(End2End_Encoder, self).__init__()
        self.graph_encoder = graph_encoder
        self.dropout = nn.Dropout(dropout_p)
        self.out_proj_layer_mlp = MLP(in_dim, in_dim, hidden_dim, act=nn.LeakyReLU(), dropout_p=dropout_p, layers=2)
        self.final_layer = nn.Linear(in_dim, 1)

    def forward(self, x, adj, sect_num):
        x = self.graph_encoder(x.float(), adj.float(), sect_num)
        
        x_sent = x[:, :-sect_num, :]
        x_sent = self.out_proj_layer_mlp(x_sent)
        x_sent = self.final_layer(x_sent)
        
        return x_sent

In [14]:
def mask_to_adj(sect_sent_mask, sent_edu_mask, have_edu=True):
    sect_sent_mask = np.array(sect_sent_mask)
    sent_edu_mask = np.array(sent_edu_mask)

    edu_num = sent_edu_mask.shape[1]
    sent_num = sent_edu_mask.shape[0]
    sect_num = sect_sent_mask.shape[0]
    adj = np.zeros((edu_num + sent_num + sect_num + 1, edu_num + sent_num + sect_num + 1))
    # section connection
    adj[-sent_num - sect_num - 1:-sect_num - 1, 0:-sent_num - sect_num - 1] = sent_edu_mask
    adj[0:-sent_num - sect_num - 1, -sent_num - sect_num - 1:-sect_num - 1] = sent_edu_mask.T
    #sec_sec
    for i in range(0, sect_num):
        sect_mask = sect_sent_mask[i]

        # Đảm bảo sect_mask là mảng numpy và có chiều đúng để reshape: đảm bảo rằng sect_mask có đúng dạng để có thể nhân ma trận.
        if sect_mask.ndim == 1:
            sect_mask = sect_mask.reshape((1, -1))
        elif sect_mask.ndim == 0:
            sect_mask = np.array([sect_mask])  # Chuyển thành mảng 1D nếu là số đơn lẻ

        adj[edu_num:-sect_num - 1, edu_num:-sect_num - 1] += sect_mask * sect_mask.T #sec_sec của từng doc

    adj[-sect_num - 1:-1, -sent_num - sect_num - 1:-sect_num - 1] = sect_sent_mask
    adj[-sent_num - sect_num - 1:-sect_num - 1, -sect_num - 1:-1] = sect_sent_mask.T
    adj[-sect_num - 1: -1, -sect_num-1: -1] = 1 
    
    # build sentence connection
    for i in range(0, sent_num):
        sent_mask = sent_edu_mask[i]

        # Đảm bảo sent_mask là mảng numpy và có chiều đúng để reshape
        if sent_mask.ndim == 1:
            sent_mask = sent_mask.reshape((1, -1))
        elif sent_mask.ndim == 0:
            sent_mask = np.array([sent_mask])  # Chuyển thành mảng 1D nếu là số đơn lẻ

        adj[:edu_num, :edu_num] += sent_mask * sent_mask.T

    adj[-1, - sect_num - 1 :] = 1 #doc_sect

    if have_edu: return adj[:-1,:-1]
    else: return adj[-sect_num-sent_num-1:-1, -sect_num-sent_num-1:-1]

In [15]:
class Graph:
    def __init__(self, edus, sents, eduVecs, scores, sent_scores, sect_sent_mask, sent_edu_mask, golden, threds):
        # Kiểm tra độ dài của danh sách đầu vào
        assert len(eduVecs) == len(scores) == len(edus), "Số lượng eduVecs, scores và edus không khớp"
        self.sect_num = len(sect_sent_mask)
        self.sent_num = len(sent_edu_mask)
        
        # Tạo adjacency matrix từ mask
        self.adj = torch.from_numpy(mask_to_adj(sect_sent_mask, sent_edu_mask)).float()

        # Nối feature vectors với các vector không (cho các section và documents)
        self.feature = np.concatenate((np.array(eduVecs), np.zeros((self.sent_num + self.sect_num, eduVecs[0].size))))
        
        
        # Chuyển scores thành tensor và chuyển thành one-hot dựa trên ngưỡng
        left_neg_thred = threds[0]
        right_neg_thred = threds[1]
        pos_thred = threds[2]
        
        self.sent_score = torch.from_numpy(np.array(sent_scores)).float()
        self.sent_score_onehot = (self.sent_score >= pos_thred).float() 
        self.sent_score_onehot_neg = (self.sent_score <= right_neg_thred).float()

        # Lưu lại sentences và golden summary
        self.sents = np.array(sents)
        self.golden = golden

        # Lấy embedding cho golden summary
        self.goldenVec = get_Bert_vec(golden)

        # Khởi tạo các vector của node
        self.init_node_vec()

        # Chuyển feature thành tensor
        self.feature = torch.from_numpy(self.feature[-self.sect_num-self.sent_num:]).float()
        
        self.adj = torch.from_numpy(mask_to_adj(sect_sent_mask, sent_edu_mask, have_edu=False)).float()
        
    def init_node_vec(self):
        sect_num, sent_num = self.sect_num, self.sent_num

        for i in range(-sent_num-sect_num, -sect_num):
            mask = self.adj[i].clone()
            mask[-sent_num-sect_num:] = 0
            self.feature[i] = np.mean(self.feature[mask.bool()], axis=0)

        for i in range(-sect_num,): 
            mask = self.adj[i].clone()
            mask[-sect_num:] = 0
            self.feature[i] = np.mean(self.feature[mask.bool()], axis=0)

## Loss Functions

In [16]:
def l2_distance(a, b):
    """Calculate L2 distances between two tensors."""
    # Ensure the dimensions are compatible for broadcasting
    return torch.norm(a.unsqueeze(1) - b.unsqueeze(0), dim=2)  # [n, m] where n is size of a, m is size of b

def _similarity(h1: torch.Tensor, h2: torch.Tensor):
    """Calculate similarity between two sets of vectors."""
    h1 = F.normalize(h1, dim=1)  # Normalize each vector in h1
    h2 = F.normalize(h2, dim=1)  # Normalize each vector in h2
    return h1 @ h2.t()  # Compute the dot product between vectors

In [17]:
class InfoNCE(nn.Module):
    def __init__(self, tau):
        super(InfoNCE, self).__init__()
        self.tau = tau

    def forward(self, anchor, sample, pos_mask, neg_mask, *args, **kwargs):
        sim = _similarity(anchor, sample) / self.tau
        if len(anchor) > 1:
            sim, _ = torch.max(sim, dim=0)
        exp_sim = torch.exp(sim)
        loss = torch.log((exp_sim * pos_mask).sum(dim=1)) - torch.log((exp_sim * (pos_mask + neg_mask)).sum(dim=1))
        return -loss.mean()

# Training Functions

In [18]:
def train_e2e(train_dataloader, model, optimizer):
    model[0].train()
    model[1].train()
    c_loss, s_loss, loss, batch_num = 0, 0, 0, 0
    print_epo = 20
    rouge2_score = []

    for i, data in enumerate(train_dataloader):
        batch_loss, bc_loss, bs_loss, scores = train_e2e_batch(data, model, optimizer)
        loss += batch_loss
        c_loss += bc_loss
        s_loss += bs_loss
        batch_num += 1

        abs_text = data.golden
        summary_text = get_summary(scores[0], data.sents, summary_max_word_num)
        rouge2_score.append(getRouge2(data.golden, summary_text, 'f'))

        if i % print_epo == 0:
            print("Batch {}, Loss: {}".format(i, loss / batch_num))
            print("Batch {}, C-Loss: {}".format(i, c_loss / batch_num))
            print("Batch {}, S-Loss: {}".format(i, s_loss / batch_num))

    return loss / batch_num, np.mean(rouge2_score)

def train_e2e_batch(data_batch, model, optimizer):
    c_model = model[0]
    s_model = model[1]

    optimizer.zero_grad()
    feature = data_batch.feature.unsqueeze(0)
    adj = data_batch.adj.unsqueeze(0)
    sect_num = data_batch.sect_num
    sent_labels = data_batch.sent_score_onehot.unsqueeze(0)
    sent_labels_neg = data_batch.sent_score_onehot_neg.unsqueeze(0)
    sent_scores = data_batch.sent_score.unsqueeze(0).float()
    goldenVec = data_batch.goldenVec

    pg = c_model(feature.cuda(), adj.cuda(), sect_num)
    x_sent = s_model(pg.cuda(), adj.cuda(), sect_num)
    
    s_loss = F.binary_cross_entropy_with_logits(x_sent.squeeze(-1), sent_labels.cuda(), pos_weight=torch.tensor(10).cuda())
    pg = pg.squeeze(0)


    pos_mask = torch.zeros(1, feature.shape[1])
    pos_mask[:, :-sect_num] = sent_labels
    pos_mask[:, -sect_num:] = 1

    neg_mask = torch.zeros(1, feature.shape[1])
    neg_mask[:, :-sect_num] = sent_labels_neg
    neg_mask[:, -sect_num:] = 1
    
    # Tính toán Triplet loss
    infonce = InfoNCE(tau=0.2)
    c_loss = infonce(goldenVec.cuda(), pg, pos_mask.cuda(), neg_mask.cuda())

    loss = s_loss + 1. * c_loss
    loss.backward()
    optimizer.step()

    return loss.item(), c_loss.item(), s_loss.item(), torch.sigmoid(x_sent.squeeze(-1))

def val_e2e(val_dataloader, model, mode='val', edu_num=0):
    model[0].eval()
    model[1].eval()
    loss, c_loss, s_loss = 0,0,0
    batch_num = 0
    rouge2_score = []

    all_summaries = []
    all_gt = []
    for i, data in enumerate(val_dataloader):
        cur_loss, c_loss_b, s_loss_b, scores = val_e2e_batch(data, model)
        loss += cur_loss
        c_loss += c_loss_b
        s_loss += s_loss_b

        abs_text = data.golden
        summary_text = get_summary(scores[0], data.sents, summary_max_word_num, edu_num)
        all_gt.append(data.golden)
        all_summaries.append(summary_text)
        rouge2_score.append(getRouge2(data.golden, summary_text, 'f'))
        batch_num += 1

    rouge2_score_mean = np.mean(rouge2_score)
    loss = loss / batch_num
    c_loss /= batch_num
    s_loss /= batch_num

    if mode != 'val':
        return rouge2_score_mean, all_summaries, all_gt, rouge2_score
    return rouge2_score_mean, loss, c_loss, s_loss

def val_e2e_batch(data_batch, model):
    c_model = model[0]
    s_model = model[1]
    feature = data_batch.feature.unsqueeze(0)
    adj = data_batch.adj.unsqueeze(0)
    sect_num = data_batch.sect_num
    goldenVec = data_batch.goldenVec

    sent_labels = data_batch.sent_score_onehot.unsqueeze(0)
    sent_labels_neg = data_batch.sent_score_onehot_neg.unsqueeze(0)
    sent_scores = data_batch.sent_score.unsqueeze(0).float()

    with torch.no_grad():
        pg = c_model(feature.cuda(), adj.cuda(), sect_num)
        x_sent = s_model(pg.cuda(), adj.cuda(), sect_num)

        pg = pg.squeeze(0)

        pos_mask = torch.zeros(1, feature.shape[1])
        pos_mask[:, :-sect_num] = sent_labels
        pos_mask[:, -sect_num:] = 1

        neg_mask = torch.zeros(1, feature.shape[1])
        neg_mask[:, :-sect_num] = sent_labels_neg
        neg_mask[:, -sect_num:] = 1

        # Tính toán Triplet loss
        # Kiểm tra nếu pos_mask có bất kỳ giá trị nào khác 0
        infonce = InfoNCE(tau=0.2)
        c_loss = infonce(goldenVec.cuda(), pg, pos_mask.cuda(), neg_mask.cuda())

        s_loss = F.binary_cross_entropy_with_logits(x_sent.squeeze(-1), sent_labels.cuda(), pos_weight=torch.tensor(10).cuda())

        loss = c_loss * 1. + s_loss
        scores = torch.sigmoid(x_sent.squeeze(-1))

    return loss.item(), c_loss.item(), s_loss.item(), scores

# Inference Functions

In [19]:
def get_summary(scores, edus, max_word_num, edu_num=0):
    assert  len(scores) == len(edus)
    ranked_score_idxs = torch.argsort(scores, dim=0, descending=True)
    wordCnt = 0
    summEduIDList = []
    for i in ranked_score_idxs:
        if wordCnt >= max_word_num and edu_num == 0: break
        elif edu_num > 0 and len(summEduIDList) == edu_num: break
        s = edus[i]

        replicated = False
        
        # Kiểm tra điểm số âm và Rouge2 overlap để loại bỏ câu lặp
        if scores.squeeze(0)[i].item() < 0:
            replicated = True
        
        for chosedID in summEduIDList:
            if getRouge2(edus[chosedID], s, 'p') >= 0.65:
                replicated = True
                break
        if replicated: continue

        wordCnt += len(s.split(' '))
        summEduIDList.append(i)
    
    
    # Xử lý token theo yêu cầu và tạo ra văn bản tóm tắt cuối cùng
    summEduIDList = sorted(summEduIDList)
    text = ' '.join([s for i, s in enumerate(edus) if i in summEduIDList])
    tokens = text.split()
    processed_tokens = []
    
    i = 0
    while i < len(tokens):
        token = tokens[i]
        
        # Kiểm tra điều kiện `abc_ _abc`
        if token.endswith('_') and i + 1 < len(tokens) and tokens[i + 1].startswith('_'):
            processed_tokens.append(token[:-1])  # Giữ lại phần 'abc'
            i += 2  # Bỏ qua token hiện tại và token tiếp theo
        else:
            # Chỉ loại bỏ dấu gạch dưới ở đầu hoặc cuối
            if token.startswith('_'):
                token = token[1:]
            if token.endswith('_'):
                token = token[:-1]
            processed_tokens.append(token)
            i += 1

    # Kết quả sau khi xử lý
    final_summary_text = ' '.join(processed_tokens)
    return final_summary_text

# Model Training Process

In [20]:
# stopwords
!gdown --id 1SGfX4ZJwTLL5jn9SZbGsoFHnDXXmSWH3

# train_label
!gdown --id 12mnUs-QFHYDH36ulKZt8z1Do8_4fJRsI

# test_label
!gdown --id 1iKoS5q50kStGt9JQmwPyG447dWb2ARvN

# CONFIG
# train_input_abstract_conclusion_citing_clean
!gdown --id 1mSMyulLKQMdDqOQ6_tDKrSnK3C9xAEIi

# test_input_abstract_conclusion_citing_clean
!gdown --id 1YDvYdWZnhbL0INJAJc3fA8Bwkh5_j-3Y

Downloading...
From: https://drive.google.com/uc?id=1SGfX4ZJwTLL5jn9SZbGsoFHnDXXmSWH3
To: /kaggle/working/stopwords.txt
100%|██████████████████████████████████████| 2.63k/2.63k [00:00<00:00, 12.8MB/s]
Downloading...
From: https://drive.google.com/uc?id=12mnUs-QFHYDH36ulKZt8z1Do8_4fJRsI
To: /kaggle/working/train_label.json
100%|██████████████████████████████████████| 43.5k/43.5k [00:00<00:00, 80.1MB/s]
Downloading...
From: https://drive.google.com/uc?id=1iKoS5q50kStGt9JQmwPyG447dWb2ARvN
To: /kaggle/working/test_label.json
100%|██████████████████████████████████████| 64.4k/64.4k [00:00<00:00, 82.2MB/s]
Downloading...
From: https://drive.google.com/uc?id=1mSMyulLKQMdDqOQ6_tDKrSnK3C9xAEIi
To: /kaggle/working/train_input_abstract_conclusion_v0.json
100%|███████████████████████████████████████| 68.6M/68.6M [00:00<00:00, 180MB/s]
Downloading...
From: https://drive.google.com/uc?id=1YDvYdWZnhbL0INJAJc3fA8Bwkh5_j-3Y
To: /kaggle/working/test_input_abstract_conclusion_v0.json
1

## CONFIG

In [21]:
model_version = "Train_test_abs_con_citing_clean_style_1"
args = {'gpu': 2, 'seed': 42, 'batch_size': 1, 'input': 768, 'hidden': 512, 'heads': 32,
       'epochs': 100, 'log_every': 20, 'lr': 0.0003, 'dropout': 0.3, 'num_layers': 3}

model_save_root_path = '/kaggle/working/'
c_patient = 30
best_r2, best_c_loss, best_s_loss = 0, 10000, 10000
history = {'loss': [], 'val_loss': []}

In [22]:
# Neg thred và pos thred
threds = [0, 0.4, 0.6]
topk_triplet = 5
summary_max_word_num = 160

In [23]:
def seed_everything(seed):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True
seed_everything(args['seed'])

In [24]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device

device(type='cuda')

In [25]:
with open("/kaggle/working/train_label.json", 'r', encoding='utf-8') as f:
     train_labels = json.load(f)
with open("/kaggle/working/test_label.json", 'r', encoding='utf-8') as f:
     test_labels = json.load(f)
with open("/kaggle/working/train_input_abstract_conclusion_v0.json", 'r', encoding='utf-8') as f:
     train_inputs = json.load(f)
with open("/kaggle/working/test_input_abstract_conclusion_v0.json", 'r', encoding='utf-8') as f:
     test_inputs = json.load(f)

In [26]:
stop_w = ['...']
with open('/kaggle/working/stopwords.txt', 'r', encoding='utf-8') as f:
    for w in f.readlines():
        stop_w.append(w.strip())
stop_w.extend([c for c in '!"#$%&\'()*+,./:;<=>?@[\\]^`{|}~…“”’‘'])

## Load and construct graph

In [27]:
def graph_construction(input_data, label_data, threds):
    edus, eduVecs, scores, sentIDs = [], [], [], []
    
    sent_scores = []
    sent_text = []
    
    sect_endsent, sent_num, edu_num = divideIntoSections(input_data)
    sect_sent_mask = np.zeros((len(input_data['docs']), sent_num))
    sent_edu_mask = np.zeros((sent_num, edu_num))
    cur_sent, cur_edu = 0, 0

    for d, doc in enumerate(input_data['docs']):
        sect_sent_mask[d][cur_sent:sect_endsent[d] + 1] = 1
        cur_sent = sect_endsent[d] + 1
        
        sent_array = {}

        for s, sent in enumerate(doc['sents']):
            edus.append(sent['raw_sent'])
            eduVecs.append(meanTokenVecs(sent) + getPositionEncoding(d) + getPositionEncoding(s))

            rouge_score = getRouge2(label_data, sent['raw_sent'], 'p')
            scores.append(rouge_score)

            sent_edu_mask[sent['section_new'], cur_edu] = 1
            cur_edu += 1
            
            # Gom các câu có cùng `secid` vào `text_section`
            sentid = sent['secid']
            if sentid not in sent_array:
                sent_array[sentid] = ""
            sent_array[sentid] += sent['raw_sent'] + " "
        
        # Tính điểm ROUGE cho từng đoạn `text_section`
        for sentid, sent_raw in sent_array.items():
            sent_rouge_score = getRouge2(label_data, sent_raw.strip(), 'p')
            sent_scores.append(sent_rouge_score)
            sent_text.append(sent_raw)
            
    sents = sent_text

    tmp_graph = Graph(edus, sents, eduVecs, scores, sent_scores, sect_sent_mask, sent_edu_mask, label_data, threds)
    return tmp_graph

In [28]:
trainGraphs = []

for ID in tqdm(train_labels):
    input_data = train_inputs[ID]
    label_data = train_labels[ID][0]
    tmp_graph = graph_construction(input_data, label_data, threds)
    trainGraphs.append(tmp_graph)

100%|██████████| 40/40 [00:30<00:00,  1.33it/s]


In [29]:
testGraphs = []

for ID in tqdm(test_labels):
    for i in range(len(test_labels[ID])):
        input_data = test_inputs[ID]
        label_data = test_labels[ID][i]
        tmp_graph = graph_construction(input_data, label_data, threds)
        testGraphs.append(tmp_graph)

100%|██████████| 20/20 [00:30<00:00,  1.55s/it]


## Train model

In [30]:
c_graph_encoder = StepWiseGraphConvLayer(in_dim=768, out_dim=args['hidden'], hid_dim=args['hidden'],
                                         dropout_p=args['dropout'], act=nn.LeakyReLU(), nheads=args['heads'], iter=1).to(device)
s_graph_encoder = StepWiseGraphConvLayer(in_dim=768, out_dim=args['hidden'], hid_dim=args['hidden'],
                                         dropout_p=args['dropout'], act=nn.LeakyReLU(), nheads=args['heads'], iter=1).to(device)
contrast_filter = Contrast_Encoder(c_graph_encoder, args['hidden'], dropout_p=args['dropout']).to(device)
summarization_encoder = End2End_Encoder(s_graph_encoder, 768, args['hidden'], args['dropout']).to(device)

In [31]:
optimizer = torch.optim.Adam([ {'params': summarization_encoder.parameters()},
                            {'params': contrast_filter.parameters()}], lr=args['lr'], weight_decay=1e-5)

In [32]:
trainset, valset = trainGraphs, testGraphs

In [33]:
for i in range(args['epochs']):
    print("Epoch {}".format(i))
    random.shuffle(trainset)

    if c_patient < 0:
        for p in contrast_filter.parameters():
            p.requires_grad = False
        print("Stop Training Contrast")

    model = [contrast_filter, summarization_encoder]
    loss, rouge2_score = train_e2e(trainset, model, optimizer)
    history['loss'].append(loss)
    print("At Epoch {}, Train Loss: {}, R2 score: {}".format(i, loss, rouge2_score))
    torch.cuda.empty_cache()

    rouge2_score, loss, c_loss, s_loss = val_e2e(valset, model)
    torch.cuda.empty_cache()
    history['val_loss'].append(loss)
    print("At Epoch {}, Val Loss: {}, Val CLoss: {}, Val SLoss: {},Val R2: {}".format(i, loss, c_loss, s_loss, rouge2_score))
    if rouge2_score > best_r2:
        rouge2_score = round(rouge2_score, 4)

        model_save_path = os.path.join(model_save_root_path, "e_{}_{}_{}.mdl".format(i, rouge2_score, model_version))
        torch.save(summarization_encoder.state_dict(), model_save_path)

        model_save_path = os.path.join(model_save_root_path, "c_{}_{}_{}.mdl".format(i, rouge2_score, model_version))
        torch.save(contrast_filter.state_dict(), model_save_path)
        best_r2 = rouge2_score
        print("Epoch {} Has best R2 Score of {}, saved Model to {}".format(i, best_r2, model_save_path))

    if c_loss < best_c_loss and c_patient >= 0:
        best_c_loss = c_loss
        c_patient = 30
    else:
        c_patient -= 1

Epoch 0
Batch 0, Loss: 3.446943998336792
Batch 0, C-Loss: 1.2129547595977783
Batch 0, S-Loss: 2.2339892387390137
Batch 20, Loss: 2.404976260094416
Batch 20, C-Loss: 0.987259104138329
Batch 20, S-Loss: 1.417717158794403
At Epoch 0, Train Loss: 2.399048948287964, R2 score: 0.2145975
At Epoch 0, Val Loss: 2.61875069333661, Val CLoss: 0.7072135248491841, Val SLoss: 1.9115371713715215,Val R2: 0.30971290322580647
Epoch 0 Has best R2 Score of 0.3097, saved Model to /kaggle/working/c_0_0.3097_Train_test_abs_con_citing_clean_style_1.mdl
Epoch 1
Batch 0, Loss: 1.9659512042999268
Batch 0, C-Loss: 0.8419032096862793
Batch 0, S-Loss: 1.1240479946136475
Batch 20, Loss: 2.227609844434829
Batch 20, C-Loss: 0.7363731520516532
Batch 20, S-Loss: 1.4912366952214922
At Epoch 1, Train Loss: 2.1870339304208755, R2 score: 0.21740500000000001
At Epoch 1, Val Loss: 2.5563116554291017, Val CLoss: 0.7007630832733647, Val SLoss: 1.855548576001198,Val R2: 0.31164516129032255
Epoch 1 Has best R2 Score of 0.3116, sav