In [None]:
import gensim
import gensim.downloader

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from torchtext import data
from torchtext.datasets import Multi30k
from torchtext.data import Field, BucketIterator
from torchtext.vocab import Vectors
import time
import pandas as pd
import math
import random
from tqdm import tqdm
from torchtext.vocab import GloVe
from gensim.models.word2vec import Word2Vec
from gensim.corpora.dictionary import Dictionary
from nltk import word_tokenize

In [None]:
# Select the compute device. The notebook pins CUDA device 1; only do so when
# that device exists, so the cell also runs on CPU-only and single-GPU hosts
# instead of raising from torch.cuda.set_device(1).
if torch.cuda.is_available():
    # Fall back to GPU 0 when there is no second GPU.
    torch.cuda.set_device(1 if torch.cuda.device_count() > 1 else 0)
    device = torch.device('cuda')
else:
    device = torch.device('cpu')

In [None]:
# Both text fields share NLTK's word tokenizer and are lower-cased.
tokenize = word_tokenize
# Source side: wrapped in <bos> ... <eos>.
Article = data.Field(sequential=True, tokenize=tokenize, lower=True, init_token='<bos>', eos_token='<eos>',
                  pad_token='<pad>', unk_token='<oov>')
# Target side: only <eos> is appended (no init_token is configured here).
Title = data.Field(sequential=True, tokenize=tokenize, lower=True, eos_token='<eos>', 
                  pad_token='<pad>', unk_token='<oov>')
# Raw values that bypass the vocabulary (used for lengths and example ids).
Len = data.Field(sequential=False, use_vocab=False)
ID = data.Field(sequential=False, use_vocab=False)

In [None]:
# Module-level cache: example id -> adjacency matrix (filled by GCN.load_data).
pmi_dct = {}

In [None]:
# CSV files for the three splits, resolved relative to the working directory.
train_path = 'train_data.csv'
valid_path = 'valid_data.csv'
test_path = 'test_data.csv'


class MyDataset(data.Dataset):
    """torchtext dataset built from a CSV file with ``article`` and ``title`` columns.

    Every CSV row becomes one example with the fields ``id``, ``article``,
    ``title``, ``art_len`` and ``title_len``.
    """

    # Offset added to the CSV row index for each split (see getID).
    # NOTE(review): ids only stay unique across splits while train has at most
    # 100000 rows and valid at most 10000 rows -- confirm against the data.
    _ID_OFFSETS = {'train': 0, 'valid': 100000, 'test': 110000}

    def __init__(self, csv_path, article_field, title_field, cat, **kwargs):
        """Read ``csv_path`` and build one example per row.

        Args:
            csv_path: path of a CSV file with ``article`` and ``title`` columns.
            article_field: field used to process the ``article`` column.
            title_field: field used to process the ``title`` column.
            cat: split name, one of ``'train'``, ``'valid'`` or ``'test'``.
            **kwargs: accepted for interface compatibility; not used.

        Raises:
            ValueError: if ``cat`` is not a known split name.
        """
        # Fail before the (slow) tokenisation pass instead of producing
        # examples whose id is None.
        if cat not in self._ID_OFFSETS:
            raise ValueError(
                "cat must be one of %s, got %r" % (sorted(self._ID_OFFSETS), cat))

        csv_data = pd.read_csv(csv_path)

        fields = [("id", ID), ("article", article_field), ("title", title_field), ("art_len", Len), ("title_len", Len)]
        examples = []

        rows = zip(csv_data.index, csv_data['article'], csv_data['title'])
        # total= lets tqdm show a real progress bar (zip has no length).
        for row_id, text, label in tqdm(rows, total=len(csv_data)):
            # +2 / +1 account for the special tokens the fields add:
            # <bos> and <eos> on articles, <eos> only on titles.
            art_len = len(word_tokenize(text)) + 2
            title_len = len(word_tokenize(label)) + 1
            examples.append(data.Example.fromlist(
                [self.getID(row_id, cat), text, label, art_len, title_len], fields))
        super(MyDataset, self).__init__(examples, fields)

    def getID(self, id, cat):
        """Return the example id: the row index ``id`` shifted by the split offset.

        Raises:
            ValueError: if ``cat`` is not ``'train'``, ``'valid'`` or ``'test'``.
        """
        try:
            return id + self._ID_OFFSETS[cat]
        except KeyError:
            raise ValueError(
                "cat must be one of %s, got %r" % (sorted(self._ID_OFFSETS), cat))

    def shuffle(self, text):
        """Return ``text`` with its whitespace-separated words randomly permuted."""
        # Local import: at this point in the notebook numpy has not been
        # imported yet (the module-level import lives in a later cell).
        import numpy as np
        words = np.random.permutation(text.strip().split())
        return ' '.join(words)

In [None]:
# Build the three splits; each call tokenises the whole CSV, so this is slow.
# NOTE: the name `train` is rebound later in this notebook by `def train(...)`,
# so cells that need the training dataset must run before that definition.
train = MyDataset(train_path, article_field=Article, title_field=Title, cat='train')
valid = MyDataset(valid_path, article_field=Article, title_field=Title, cat='valid')
test = MyDataset(test_path, article_field=Article, title_field=Title, cat='test')

In [None]:
from collections import defaultdict
# Pre-trained embeddings read from a local text file.
vectors = Vectors(name='glove-wiki-gigaword-300.txt')
# The vocabulary is built over all three splits and aligned with the vectors.
Article.build_vocab(train, valid, test, vectors=vectors)
# Make lookups of unknown tokens resolve to the '<oov>' index.
default = Article.vocab.stoi['<oov>']
Article.vocab.stoi.default_factory = lambda: default
# Titles share the article vocabulary, so both sides use the same token ids.
Title.vocab = Article.vocab

In [None]:
from torchtext.data import Iterator, BucketIterator

# Batch iterators for training and validation: batches of 16 examples placed
# on `device`, with article token count as the sort key.
train_iter, val_iter = BucketIterator.splits(
        (train, valid), 
        batch_size=16, 
        device=device, 
        sort_key=lambda x: len(x.article),
        sort_within_batch=True,
        repeat=False 
)


# GCN

## layer

In [None]:
import math

import torch

from torch.nn.parameter import Parameter
from torch.nn.modules.module import Module


class GraphConvolution(Module):
    """
    Simple GCN layer, similar to https://arxiv.org/abs/1609.02907

    Computes ``adj @ (input @ weight)`` and adds ``bias`` when enabled.

    Args:
        in_features: size of each input node feature vector.
        out_features: size of each output node feature vector.
        device: device on which the parameters are allocated.
        bias: when False, no bias term is created.
    """

    def __init__(self, in_features, out_features, device, bias=True):
        super(GraphConvolution, self).__init__()
        self.in_features = in_features
        self.out_features = out_features
        self.device = device
        # Allocate the parameters directly on `device`. Writing
        # Parameter(...).to(device) instead stores, for any non-CPU device,
        # the plain tensor returned by .to(); that tensor is not registered
        # as a parameter, so the optimizer and state_dict would never see it.
        self.weight = Parameter(
            torch.empty(in_features, out_features, dtype=torch.float32, device=device))
        if bias:
            self.bias = Parameter(
                torch.empty(out_features, dtype=torch.float32, device=device))
        else:
            self.register_parameter('bias', None)
        self.reset_parameters()

    def reset_parameters(self):
        """Initialise weight and bias uniformly in ``[-1/sqrt(out_features), 1/sqrt(out_features)]``."""
        stdv = 1. / math.sqrt(self.weight.size(1))
        with torch.no_grad():
            self.weight.uniform_(-stdv, stdv)
            if self.bias is not None:
                self.bias.uniform_(-stdv, stdv)

    def forward(self, input, adj):
        """Apply the layer.

        Args:
            input: 2-D node feature matrix (multiplied by ``weight`` with ``torch.mm``).
            adj: 2-D adjacency matrix (multiplied with the projected features).

        Returns:
            ``adj @ (input @ weight)``, plus ``bias`` when the layer has one.
        """
        support = torch.mm(input, self.weight)
        output = torch.mm(adj, support)
        if self.bias is not None:
            return output + self.bias
        else:
            return output

    def __repr__(self):
        return self.__class__.__name__ + ' (' \
               + str(self.in_features) + ' -> ' \
               + str(self.out_features) + ')'


In [None]:
import torch.nn as nn
import torch.nn.functional as F
import torch
import math
import numpy as np



class GCN(nn.Module):
    """Two-layer graph convolutional network over a per-sequence word graph.

    For each input sequence a graph is built whose nodes are the distinct
    token ids plus one extra node that is connected to every token node.
    Token-token edges are weighted by positive PMI computed over sliding
    windows. The module returns the final feature vector of the extra node.
    """

    def __init__(self, input_dim, nfeat, nhid, nout, dropout, device):
        """
        Args:
            input_dim: stored on the instance; not used by the computation here.
            nfeat: size of the node input features.
            nhid: output size of the first graph convolution.
            nout: output size of the second graph convolution.
            dropout: dropout probability applied between the two layers.
            device: device used for the layers and for tensors created here.
        """
        super(GCN, self).__init__()
        self.input_dim = input_dim
        self.nfeat = nfeat
        self.gc1 = GraphConvolution(nfeat, nhid, device)
        self.gc2 = GraphConvolution(nhid, nout, device)
        self.device = device
        self.dropout = dropout

    def forward(self, input_seq, seq_embed, input_length, id=None):
        """Encode one sequence into a single vector.

        Args:
            input_seq: 1-D sequence of token ids, possibly padded.
            seq_embed: embeddings of ``input_seq``, indexed by position along
                the first dimension.
            input_length: number of leading positions of ``input_seq`` to use.
            id: optional example id used as key of the adjacency cache.

        Returns:
            The last row of the second layer's output after ``tanh``, i.e. the
            features of the extra node.
        """
        input_seq = input_seq[:input_length]

        adj, uni_words = self.load_data(input_seq, id)
        # Position of the first occurrence of every token, built in one pass
        # instead of a quadratic list.index() over tensors.
        first_pos = {}
        for pos, tok in enumerate(torch.as_tensor(input_seq).tolist()):
            first_pos.setdefault(tok, pos)
        uni2idx = [first_pos[tok] for tok in uni_words.tolist()]
        x = seq_embed[uni2idx]
        # The extra node starts from an all-zero feature vector.
        x = torch.cat((x, torch.zeros((1, self.nfeat), device=self.device)), 0)
        # x: [node_num, feature_dim]
        x = F.relu(self.gc1(x, adj))
        x = F.dropout(x, self.dropout, training=self.training)
        x = self.gc2(x, adj)
        x = torch.tanh(x)  # [node_num, nout]
        return x[-1, :]

    def unranked_unique(self, nparray):
        """Return the distinct values of ``nparray`` in first-seen order as a float array."""
        # A set tracks what was seen; testing membership against a
        # zero-initialised output buffer would treat 0 as already present.
        seen = set()
        ordered = []
        for x in nparray:
            if x not in seen:
                seen.add(x)
                ordered.append(x)
        return np.asarray(ordered, dtype=float)

    def load_data(self, word_seq, id):
        """Build (or fetch from ``pmi_dct``) the normalised adjacency matrix.

        Args:
            word_seq: 1-D sequence of token ids.
            id: cache key, or None to bypass the cache.

        Returns:
            ``(adj, words)``: the symmetrically normalised adjacency matrix and
            the distinct token ids as an int64 tensor.
        """
        if id is not None:
            id = int(id)
        word_seq = torch.as_tensor(word_seq, device=self.device)
        words = torch.unique(word_seq, sorted=False)
        if id is not None and id in pmi_dct:
            adj = pmi_dct[id]
        else:
            # Normalise on every path: without this, calls with id=None (the
            # predict path) would use an un-normalised matrix while cached
            # training-time matrices are normalised.
            adj = self.get_pmi(word_seq, words, 3, self.device)
            adj = self.normalize(adj).detach().to(dtype=torch.float32, device=self.device)
            if id is not None:
                pmi_dct[id] = adj
        words = words.to(dtype=torch.int64, device=self.device)
        return adj, words

    def get_pmi(self, word_seq, uni_words, window_size, device):
        """Build the un-normalised adjacency matrix for one sequence.

        Counts, for every pair of distinct tokens, the number of sliding
        windows containing both, converts the counts to
        ``p(i, j) / (p(i) * p(j))``, takes the log and clamps negatives to 0.
        The diagonal is then set to 1, and one extra last node is connected
        to every token node with weight 1 (it gets no self-loop).

        Args:
            word_seq: 1-D tensor of token ids.
            uni_words: distinct token ids; their order defines the node order.
            window_size: sliding window length (clamped to the sequence length).
            device: device for the returned matrix.

        Returns:
            Tensor of shape ``[len(uni_words) + 1, len(uni_words) + 1]``.
        """
        word_len = len(word_seq)
        word_num = len(uni_words)
        tokens = uni_words.tolist() if torch.is_tensor(uni_words) else list(uni_words)
        seq2idx = {int(tok): idx for idx, tok in enumerate(tokens)}
        # Clamp the window BEFORE counting windows; otherwise a sequence
        # shorter than the window yields window_num <= 0 and a NaN matrix.
        window_size = max(1, min(window_size, word_len))
        window_num = max(word_len - window_size + 1, 1)
        win_num_matrix = torch.zeros((word_num + 1, word_num + 1), device=device)

        for i in range(window_num):
            window_tokens = torch.unique(word_seq[i:i + window_size], sorted=False).tolist()
            indices = [seq2idx[int(tok)] for tok in window_tokens]
            for row in indices:
                win_num_matrix[row, indices] += 1

        pos_matrix = win_num_matrix / window_num
        pos_matrix[-1, -1] = 1
        # Dividing by p(i) and p(j) is the same as multiplying on both sides
        # by the inverse of the diagonal matrix, without a matrix inversion.
        inv_freq = 1.0 / torch.diagonal(pos_matrix)
        pos_matrix = pos_matrix * inv_freq.unsqueeze(1) * inv_freq.unsqueeze(0)
        pmi_matrix = torch.clamp(torch.log(pos_matrix), min=0)
        # Force every diagonal entry to exactly 1.
        pmi_matrix = pmi_matrix + torch.diag_embed(1 - torch.diag(pmi_matrix))
        pmi_matrix[:, -1] += 1
        pmi_matrix[-1, :] += 1
        pmi_matrix[-1, -1] = 0
        return pmi_matrix

    def normalize(self, mx):
        """Symmetrically normalize adjacency matrix: ``D^-1/2 @ mx @ D^-1/2``.

        Rows whose sum is not positive get a scaling factor of 0 instead of
        producing inf/NaN.
        """
        rowsum = mx.sum(1)
        positive = rowsum > 0
        safe_sum = torch.where(positive, rowsum, torch.ones_like(rowsum))
        r_inv = torch.where(positive, torch.pow(safe_sum, -0.5), torch.zeros_like(rowsum)).flatten()
        r_mat_inv = torch.diag(r_inv)
        mx = torch.mm(r_mat_inv, mx)
        mx = torch.mm(mx, r_mat_inv)
        return mx


In [None]:
class Encoder(nn.Module):
    """GRU encoder whose final hidden state is extended with a GCN graph vector.

    The embedding table is initialised from ``Article.vocab.vectors``.
    """

    def __init__(self, gcn, input_dim, emb_dim, hid_dim, n_layers, device, dropout=0.5, bidirectional=True):
        """
        Args:
            gcn: module called once per sequence to produce a graph vector.
            input_dim: number of rows of the embedding table.
            emb_dim: embedding size (also the GRU input size).
            hid_dim: GRU hidden size; also the width of the graph vector buffer.
            n_layers: number of GRU layers.
            device: device on which the graph vector buffer is created.
            dropout: dropout passed to the GRU.
            bidirectional: whether the GRU is bidirectional.
        """
        super().__init__()
        self.hid_dim = hid_dim
        self.n_layers = n_layers
        self.GCN = gcn
        self.embedding = nn.Embedding(input_dim, emb_dim)
        # Start from the pre-trained vectors attached to the vocabulary.
        self.embedding.weight.data.copy_(Article.vocab.vectors)
        self.device = device
        self.gru = nn.GRU(emb_dim, hid_dim, n_layers, dropout=dropout, bidirectional=bidirectional)

    def forward(self, input_seqs, input_lengths, ids, hidden):
        """Encode a batch.

        Args:
            input_seqs: token ids, ``[seq_len, batch]``.
            input_lengths: per-sequence lengths, indexable by batch position.
            ids: per-sequence example ids forwarded to the GCN, or None.
            hidden: initial GRU hidden state.

        Returns:
            ``(outputs, hidden)`` where ``outputs`` is the padded GRU output
            and ``hidden`` is the GRU hidden state with the graph vector
            concatenated along the last dimension.
        """
        embedded = self.embedding(input_seqs)  # [seq_len, batch, emb_dim]
        batch_size = embedded.shape[1]

        # One graph vector per sequence in the batch: [batch, hid_dim].
        gcn_outputs = torch.zeros((batch_size, self.hid_dim), device=self.device)
        for col in range(batch_size):
            gcn_args = (input_seqs[:, col], embedded[:, col, :], input_lengths[col])
            if ids is not None:
                gcn_args = gcn_args + (ids[col],)
            gcn_outputs[col, :] = self.GCN(*gcn_args)

        packed = torch.nn.utils.rnn.pack_padded_sequence(embedded, input_lengths, enforce_sorted=False)
        outputs, hidden = self.gru(packed, hidden)

        # Repeat the graph vectors once per entry of the first hidden
        # dimension and append them to the hidden state.
        graph_state = gcn_outputs.repeat(hidden.shape[0], 1, 1)
        hidden = torch.cat((hidden, graph_state), -1)

        outputs, _ = torch.nn.utils.rnn.pad_packed_sequence(outputs)
        # outputs = [seq_len, batch, hid_dim * n_directions]
        # hidden  = [n_layers * n_directions, batch, hid_dim * 2]
        return outputs, hidden


In [None]:
class Decoder(nn.Module):
    """Single-step GRU decoder producing log-probabilities over the vocabulary.

    The GRU hidden size is ``hid_dim * 2``. The embedding table is
    initialised from ``Article.vocab.vectors``.
    """

    def __init__(self, output_dim, emb_dim, hid_dim, n_layers, dropout=0.5, bidirectional=True):
        """
        Args:
            output_dim: vocabulary size (embedding rows and output width).
            emb_dim: embedding size.
            hid_dim: base hidden size; the GRU uses ``hid_dim * 2``.
            n_layers: number of GRU layers.
            dropout: dropout for the GRU and for the embedded input.
            bidirectional: whether the GRU is bidirectional.
        """
        super().__init__()

        self.output_dim = output_dim
        self.hid_dim = hid_dim
        self.n_layers = n_layers

        self.embedding = nn.Embedding(output_dim, emb_dim)
        self.embedding.weight.data.copy_(Article.vocab.vectors)
        self.gru = nn.GRU(emb_dim, hid_dim*2, n_layers, dropout=dropout, bidirectional=bidirectional)

        # The projection consumes the GRU output of all directions.
        fc_in_features = hid_dim*4 if bidirectional else hid_dim*2
        self.fc_out = nn.Linear(fc_in_features, output_dim)
        self.dropout = nn.Dropout(dropout)
        self.softmax = nn.LogSoftmax(dim=1)

    def forward(self, token_inputs, hidden):
        """Run one decoding step.

        Args:
            token_inputs: token ids for the current step, ``[batch]``.
            hidden: GRU hidden state from the previous step.

        Returns:
            ``(log_probs, hidden)`` with ``log_probs`` of shape
            ``[batch, output_dim]``.
        """
        n_tokens = token_inputs.size(0)
        step_input = self.embedding(token_inputs).view(1, n_tokens, -1)
        step_input = self.dropout(step_input)  # [1, batch, emb_dim]

        gru_out, hidden = self.gru(step_input, hidden)

        logits = self.fc_out(gru_out.squeeze(0))
        log_probs = self.softmax(logits)  # [batch, output_dim]
        return log_probs, hidden

In [None]:
class Seq2Seq(nn.Module):
    """Encoder-decoder wrapper.

    In training/evaluation mode ``forward`` returns the NLL loss of a batch;
    when ``predict`` is True it greedily decodes a single sequence and
    returns the generated token ids.
    """

    def __init__(self,
                 encoder,
                 decoder,
                 device,
                 predict=False,
                 basic_dict=None,
                 max_len=100
                 ):
        """
        Args:
            encoder: encoder module exposing ``gru``, ``hid_dim`` and ``n_layers``.
            decoder: decoder module exposing ``gru``, ``hid_dim``, ``n_layers``
                and ``output_dim``.
            device: device for tensors created inside ``forward``.
            predict: when True, ``forward`` decodes instead of computing a loss.
            basic_dict: mapping with the ids of ``"<bos>"``, ``"<eos>"`` and ``"<pad>"``.
            max_len: maximum number of tokens generated in predict mode.
        """
        super().__init__()

        self.device = device
        self.encoder = encoder
        self.decoder = decoder

        self.predict = predict
        self.basic_dict = basic_dict
        self.max_len = max_len

        self.enc_n_layers = self.encoder.gru.num_layers
        self.enc_n_directions = 2 if self.encoder.gru.bidirectional else 1
        self.dec_n_directions = 2 if self.decoder.gru.bidirectional else 1

        assert encoder.hid_dim == decoder.hid_dim, \
            "Hidden dimensions of encoder and decoder must be equal!"
        assert encoder.n_layers == decoder.n_layers, \
            "Encoder and decoder must have equal number of layers!"
        assert self.enc_n_directions >= self.dec_n_directions, \
            "If decoder is bidirectional, encoder must be bidirectional either!"

    def forward(self, input_batches, input_lengths, target_batches=None, target_lengths=None, ids=None, teacher_forcing_ratio=0.5):
        """Compute the batch loss, or decode greedily in predict mode.

        Args:
            input_batches: source token ids, ``[seq_len, batch]``.
            input_lengths: per-sequence source lengths.
            target_batches: target token ids, ``[seq_len, batch]`` (loss mode only).
            target_lengths: per-sequence target lengths (loss mode only).
            ids: optional example ids forwarded to the encoder.
            teacher_forcing_ratio: per-step probability of feeding the gold
                token instead of the model's own prediction.

        Returns:
            A scalar loss tensor, or in predict mode a list of token ids.
        """
        batch_size = input_batches.size(1)
        BOS_token = self.basic_dict["<bos>"]
        EOS_token = self.basic_dict["<eos>"]
        PAD_token = self.basic_dict["<pad>"]

        initial_hidden = torch.zeros(
            self.enc_n_layers*self.enc_n_directions, batch_size, self.encoder.hid_dim,
            device=self.device)
        encoder_output, encoder_hidden = self.encoder(
            input_batches, input_lengths, ids, initial_hidden)

        decoder_input = torch.tensor([BOS_token] * batch_size, dtype=torch.long, device=self.device)
        if self.enc_n_directions != self.dec_n_directions:
            # Sum the two directions of every encoder layer.
            n_states = encoder_hidden.size(0)
            decoder_hidden = encoder_hidden[range(0, n_states, 2)] + encoder_hidden[range(1, n_states, 2)]
        else:
            decoder_hidden = encoder_hidden

        if self.predict:
            assert batch_size == 1, "batch_size of predict phase must be 1!"
            return self._greedy_decode(decoder_input, decoder_hidden, EOS_token)

        max_target_length = max(target_lengths)
        all_decoder_outputs = torch.zeros(
            (max_target_length, batch_size, self.decoder.output_dim), device=self.device)

        for t in range(max_target_length):
            # The coin is flipped once per time step for the whole batch.
            teacher_forced = random.random() < teacher_forcing_ratio
            decoder_output, decoder_hidden = self.decoder(decoder_input, decoder_hidden)
            all_decoder_outputs[t] = decoder_output
            if teacher_forced:
                decoder_input = target_batches[t]
            else:
                _, best = decoder_output.topk(1)  # [batch, 1]
                decoder_input = best.squeeze(1)

        loss_fn = nn.NLLLoss(ignore_index=PAD_token)
        return loss_fn(
            all_decoder_outputs.reshape(-1, self.decoder.output_dim),  # [seq_len*batch, output_dim]
            target_batches.reshape(-1)                                 # [seq_len*batch]
        )

    def _greedy_decode(self, decoder_input, decoder_hidden, EOS_token):
        """Generate tokens one at a time until EOS or ``max_len`` tokens."""
        output_tokens = []
        while True:
            decoder_output, decoder_hidden = self.decoder(decoder_input, decoder_hidden)
            _, best = decoder_output.topk(1)
            decoder_input = best.squeeze(1)
            output_token = best.squeeze().detach().item()
            if output_token == EOS_token or len(output_tokens) == self.max_len:
                return output_tokens
            output_tokens.append(output_token)

In [None]:
def epoch_time(start_time, end_time):
    """Split the interval between two timestamps (seconds) into whole minutes and seconds.

    Returns:
        ``(minutes, seconds)`` as integers, both truncated towards zero.
    """
    elapsed = end_time - start_time
    minutes = int(elapsed / 60)
    seconds = int(elapsed - (minutes * 60))
    return minutes, seconds

In [None]:
def train(
    model,
    data_loader,
    optimizer,
    clip=1,
    teacher_forcing_ratio=0.5,
    print_every=1
    ):
    """Run one training epoch and return the mean batch loss.

    Args:
        model: model whose call returns the batch loss; its ``predict`` flag
            is set to False and it is switched to training mode.
        data_loader: iterable of batches exposing ``id``, ``article``,
            ``title``, ``art_len`` and ``title_len``.
        optimizer: optimizer stepped once per batch.
        clip: maximum gradient norm.
        teacher_forcing_ratio: forwarded to the model.
        print_every: accepted for interface compatibility; the running loss
            is printed every 500 steps regardless of this value.

    Returns:
        Sum of the batch losses divided by ``len(data_loader)``.
    """
    model.predict = False
    model.train()

    log_interval = 500
    running_loss = 0
    epoch_loss = 0
    for step, batch in enumerate(tqdm(data_loader, position=0, leave=True), start=1):
        ids = batch.id
        sources, targets = batch.article, batch.title
        source_lens = list(batch.art_len)
        target_lens = list(batch.title_len)

        optimizer.zero_grad()
        loss = model(sources, source_lens, targets, target_lens, ids, teacher_forcing_ratio)
        loss_value = loss.item()
        running_loss += loss_value
        epoch_loss += loss_value

        loss.backward()
        # Clip before stepping to limit the size of the update.
        torch.nn.utils.clip_grad_norm_(model.parameters(), clip)
        optimizer.step()

        if step % log_interval == 0:
            print('\tCurrent Loss: %.4f' % (running_loss / log_interval))
            running_loss = 0

    return epoch_loss / len(data_loader)

In [None]:
def evaluate(
    model,
    data_loader,
    print_every=None
    ):
    """Compute the mean batch loss over ``data_loader`` without gradient tracking.

    Args:
        model: model whose call returns the batch loss; its ``predict`` flag
            is set to False and it is switched to eval mode.
        data_loader: iterable of batches exposing ``id``, ``article``,
            ``title``, ``art_len`` and ``title_len``.
        print_every: when truthy, print the running mean loss every
            ``print_every`` batches (a value of 0 is treated as 1).

    Returns:
        Sum of the batch losses divided by ``len(data_loader)``.
    """
    model.predict = False
    model.eval()
    if print_every == 0:
        print_every = 1

    running_loss = 0
    epoch_loss = 0
    with torch.no_grad():
        for batch_no, batch in enumerate(data_loader, start=1):
            ids = batch.id
            sources, targets = batch.article, batch.title
            source_lens = list(batch.art_len)
            target_lens = list(batch.title_len)

            # Teacher forcing is disabled: the decoder feeds on its own output.
            loss = model(sources, source_lens, targets, target_lens, ids, teacher_forcing_ratio=0)
            loss_value = loss.item()
            running_loss += loss_value
            epoch_loss += loss_value

            if print_every and batch_no % print_every == 0:
                print('\tCurrent Loss: %.4f' % (running_loss / print_every))
                running_loss = 0

    return epoch_loss / len(data_loader)

In [None]:
def summary(
    model,
    sample,
    idx2token=None
    ):
    """Generate a summary string for one sample.

    Args:
        model: model that, with ``predict`` set to True, returns a list of
            token ids when called with ``(src, src_len)``.
        sample: dict with the keys ``'src'`` (input batch) and ``'src_len'``
            (input lengths).
        idx2token: indexable mapping from token id to token string.

    Returns:
        The generated tokens joined by single spaces.
    """
    model.predict = True
    model.eval()

    # shape = [seq_len, 1]
    input_batch = sample['src']
    input_len = sample['src_len']

    # Inference only: skip autograd bookkeeping so decoding does not build
    # (and keep alive) a computation graph.
    with torch.no_grad():
        output_tokens = model(input_batch, input_len)

    return " ".join(idx2token[t] for t in output_tokens)


In [None]:
# Vocabulary size. Use len(vocab) rather than len(vocab.stoi): stoi is a
# defaultdict here, so any lookup of an unknown token adds a key and would
# make its length differ from the number of rows in vocab.vectors, which the
# embedding tables are copied from.
INPUT_DIM = len(Article.vocab)
OUTPUT_DIM = len(Article.vocab)
ENC_EMB_DIM = 300
DEC_EMB_DIM = 300
HID_DIM = 256
N_LAYERS = 2
ENC_DROPOUT = 0.5
DEC_DROPOUT = 0.5
LEARNING_RATE = 1e-4
N_EPOCHS = 100
CLIP = 1  # maximum gradient norm used during training

bidirectional = True
# GCN hidden and output sizes both equal HID_DIM; its dropout is 0.5.
gcn = GCN(INPUT_DIM, ENC_EMB_DIM, HID_DIM, HID_DIM, 0.5, device).to(device)
enc = Encoder(gcn, INPUT_DIM, ENC_EMB_DIM, HID_DIM, N_LAYERS, device, ENC_DROPOUT, bidirectional)
dec = Decoder(OUTPUT_DIM, DEC_EMB_DIM, HID_DIM, N_LAYERS, DEC_DROPOUT, bidirectional)
model = Seq2Seq(enc, dec, device, basic_dict=Article.vocab.stoi).to(device)

optimizer = optim.Adam(model.parameters(), lr=LEARNING_RATE)

In [None]:
# Early stopping: stop after PATIENCE consecutive epochs without a new best
# validation loss.
PATIENCE = 5
best_valid_loss = float('inf')
count = 0

for epoch in range(N_EPOCHS):

    start_time = time.time()
    train_loss = train(model, train_iter, optimizer, CLIP)
    valid_loss = evaluate(model, val_iter)
    end_time = time.time()

    # Record the epoch that was just trained BEFORE deciding whether to stop,
    # so its result is never discarded.
    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        count = 0
        torch.save(model.state_dict(), 'gcn+seq2seq(graph).pt')
    else:
        count += 1

    epoch_mins, epoch_secs = epoch_time(start_time, end_time)
    print(f'Epoch: {epoch+1:02} | Time: {epoch_mins}m {epoch_secs}s')

    print(f'\tTrain Loss: {train_loss:.3f} | Val. Loss: {valid_loss:.3f}')

    if count == PATIENCE:
        break

In [None]:
from rouge import Rouge

# Restore the best checkpoint. map_location remaps the stored tensors onto the
# current device, so the file also loads on a machine whose devices differ
# from the one that wrote it.
model.load_state_dict(torch.load('gcn+seq2seq(graph).pt', map_location=device))

In [None]:
# Score the model on (at most) the first 10000 test examples with ROUGE recall.
dct = Article.vocab.stoi
r1 = 0
r2 = 0
rl = 0
rouge = Rouge()
n_samples = min(10000, len(test))
for i in tqdm(range(n_samples)):
    sample = test[i].article
    label = ' '.join(test[i].title)
    sample = [dct['<bos>']] + [dct[ele] for ele in sample] + [dct['<eos>']]
    test_sample = {}
    test_sample["src"] = torch.tensor(sample, dtype=torch.long, device=device).reshape(-1, 1)
    test_sample["src_len"] = [len(sample)]
    # summary() already returns a space-joined string; joining it again would
    # put a space between every character.
    predict = summary(model, test_sample, Article.vocab.itos)
    if not predict.strip() or not label.strip():
        # Rouge rejects empty hypotheses; count such a sample as a zero score.
        continue
    # Rouge.get_scores takes (hypothesis, reference) in that order.
    score = rouge.get_scores(predict, label)[0]
    r1 += score['rouge-1']['r']
    r2 += score['rouge-2']['r']
    rl += score['rouge-l']['r']

# Guard against an empty test set.
denominator = max(n_samples, 1)
print(r1 / denominator)
print(r2 / denominator)
print(rl / denominator)