In [1]:
import pickle

import numpy as np

trainData=np.load('./train.pkl',allow_pickle=True)
valData=np.load('./valid.pkl',allow_pickle=True)
testData=np.load('./test.pkl',allow_pickle=True)
embedding=np.load('./embedding.pkl',allow_pickle=True)

In [91]:
embedding=embedding.vectors

In [115]:
from torch.utils.data import Dataset,DataLoader
import numpy as np
import torch
from tensorflow.keras.utils import to_categorical

class TrainData(Dataset):
    def __init__(self,trainData,max_len=300):
        trainX=[]
        trainY=[]
        bound=[]
        padding_idx=trainData.padding
        for i in trainData:
            text=i['text']
            label=i['label']
            bod=i['sent_range']
            if(len(text)<max_len):
                text=text+[padding_idx]*(max_len-len(text))
                label=label+[padding_idx]*(max_len-len(label))
            trainX.append(text)
            label=to_categorical(label)
            trainY.append(label)
        self.trainX=trainX
        self.trainY=trainY
        
    def __len__(self):
        return len(self.trainX)
    def __getitem__(self,idx):
        if torch.is_tensor(idx):
            idx=idx.tolist()
        
        return np.array(self.trainX[idx],dtype=np.long),np.array(self.trainY[idx],dtype=np.float)
    
    def get_bound(self,idx,batch_size):
        return self.bound[idx*batch_size:(idx+1)*batch_size],self.extract[idx*batch_size:(idx+1)*batch_size]
    

In [86]:
import torch.nn as nn
class ExtractiveSeqTagging(nn.Module):
    def __init__(self,hidden_size,pretrain_embedding,bidirectional=True):
        super(ExtractiveSeqTagging,self).__init__()
        self.hidden_size=hidden_size
        self.bidirectional=bidirectional
        self.embedding=nn.Embedding.from_pretrained(pretrain_embedding)
        self.gru=nn.GRU(hidden_size,hidden_size,bidirectional=bidirectional)
        if bidirectional==True:
            self.out=nn.Linear(hidden_size*2,2)
    
    def forward(self,x,hidden):
        embedded=self.embedding(x).transpose(0,1)
        out,_=self.gru(embedded,hidden)
        out=self.out(out)
        return out
#     h_0 of shape (num_layers * num_directions, batch, hidden_size): 
    def initHidden(self,batch):
        shape=(1,batch,self.hidden_size)
        if self.bidirectional==True:
            shape=(2,batch,self.hidden_size)
            
        return torch.zeros(shape,device='cuda')
    def predict(self,x,hidden):
        embedded=self.embedding(x).transpose(0,1)
        
        
        out=self.gru(embedded,hidden)
        out=self.out(out)
        out=nn.functional.softmax(out)
        return out

In [78]:
from rouge_score.rouge_scorer import RougeScorer
from multiprocessing import Pool,cpu_count

def calculate_rouge_score(prediction,target):
    ROUGE_TYPES = ['rouge1', 'rouge2', 'rougeL']
    USE_STEMMER = False


    rouge_scorer = RougeScorer(ROUGE_TYPES, use_stemmer=USE_STEMMER)
    with Pool(cpu_count()) as pool:
        scores = pool.starmap(rouge_scorer.score,
                            [(t, p) for t, p in zip(target, prediction)])

    r1s = np.array([s['rouge1'].fmeasure for s in scores])
    r2s = np.array([s['rouge2'].fmeasure for s in scores])
    rls = np.array([s['rougeL'].fmeasure for s in scores])
    scores = {
        'mean': {
            'rouge-1': r1s.mean(),
            'rouge-2': r2s.mean(),
            'rouge-l': rls.mean()
        },
        'std': {
            'rouge-1': r1s.std(),
            'rouge-2': r2s.std(),
            'rouge-l': rls.std()
        },
    }
    return scores

def predict2sentence(data,pred_arr):
    target=[]
    prediction=[]
    assert len(pred_arr)==len(data)
    for p,j in zip(pred_arr,data):
        target.append(j['summary'])
        sent_bounds = {i: bound for i, bound in enumerate(j['sent_bounds'])}
        predict_sent=''
        for sent_idx in p['predict_sentence_index']:
            start, end = sent_bounds.get(sent_idx, (0, 0))
            predict_sent += j['text'][start:end]
        prediction.append(predict_sent)
    return target,prediction

In [136]:
to_categorical(trainData[12]['label'],num_classes=2)

array([[1., 0.],
       [1., 0.],
       [1., 0.],
       [1., 0.],
       [1., 0.],
       [1., 0.],
       [1., 0.],
       [1., 0.],
       [1., 0.],
       [1., 0.],
       [1., 0.],
       [1., 0.],
       [1., 0.],
       [1., 0.],
       [1., 0.],
       [1., 0.],
       [1., 0.],
       [1., 0.],
       [1., 0.],
       [1., 0.],
       [1., 0.],
       [1., 0.],
       [1., 0.],
       [1., 0.],
       [1., 0.],
       [1., 0.],
       [1., 0.],
       [1., 0.],
       [1., 0.],
       [1., 0.],
       [1., 0.],
       [1., 0.],
       [1., 0.],
       [1., 0.],
       [1., 0.],
       [1., 0.],
       [1., 0.],
       [1., 0.],
       [1., 0.],
       [1., 0.],
       [1., 0.],
       [1., 0.],
       [1., 0.],
       [1., 0.],
       [1., 0.],
       [1., 0.],
       [1., 0.],
       [1., 0.],
       [1., 0.],
       [1., 0.],
       [1., 0.],
       [1., 0.],
       [1., 0.],
       [1., 0.],
       [1., 0.],
       [1., 0.],
       [1., 0.],
       [1., 0.],
       [1., 0.

In [134]:
trainData[12]

{'id': '1000012',
 'label': [0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0

In [133]:
extract_train[12]

(array([ 8470, 14513,    23,   178,  2930,    32,   349,     8,    33,
           83,    20, 17059,     5,     3,    24,    47,    83,  1197,
           55,   116,   671,     8,    62, 15435,    11,  5817,     4,
            6,  3348, 17801,    23,  5421,     7, 32085,     4,    68,
           70,   799,   150,     8,  1407,    58,    52,  5530, 15671,
           11,   285,     9,     6,  5005,     5,     3,     6, 17061,
           47,   103,     6,  2093,    47,    83,    10,    12,   462,
          219,    12,    15,   150,     5,     3,    62,   803,  2930,
          980,     8,    41,   250,    10,   245,    19,   143,  1997,
           20, 74705,     5,     3,    11,    10,  2042,    27, 74705,
          249,    11, 13420,     4,     6, 17061,  9568,     6,  2093,
           23,    95,    15,   166,    61,  7180,     8,   150,     7,
           62,   877,     5,     3,    12,   104,   108,    18,   104,
         1296,    15, 31723,    81,   104,   197,    18,   104,    33,
      

In [122]:
for idx,i in enumerate(trainLoader):
    print(idx,i[0])


0 tensor([[    6,  2177,   104,  1164,     3,  3935,    32,   678,    26,    10,
           517,  2814,    18, 12677,    97,     6,  1385,    27,    54, 14420,
         48253,     5,     3,    10, 71421,    19,   262,   273,     4,    31,
          2051, 11426,     4,    47,    83,  3040,    20,  9395,     9,  5082,
             8,   578,     4,  2613,  1326,     7,  1295,    22,  3912,  2257,
             4,  1005,   103,     5,     3, 53184,  1005,   103,     6,   273,
          1786,    11,  1005,  6191,     7,    51,    33,   591,    15,  6782,
             8,   470,    90,     5,     3,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,

Exception ignored in: <function _MultiProcessingDataLoaderIter.__del__ at 0x7fc1494496a8>
Traceback (most recent call last):
  File "/home/c4lab/miniconda3/lib/python3.7/site-packages/torch/utils/data/dataloader.py", line 926, in __del__
    self._shutdown_workers()
  File "/home/c4lab/miniconda3/lib/python3.7/site-packages/torch/utils/data/dataloader.py", line 906, in _shutdown_workers
    w.join()
  File "/home/c4lab/miniconda3/lib/python3.7/multiprocessing/process.py", line 140, in join
    res = self._popen.wait(timeout)
  File "/home/c4lab/miniconda3/lib/python3.7/multiprocessing/popen_fork.py", line 48, in wait
    return self.poll(os.WNOHANG if timeout == 0.0 else 0)
  File "/home/c4lab/miniconda3/lib/python3.7/multiprocessing/popen_fork.py", line 28, in poll
    pid, sts = os.waitpid(self.pid, flag)
KeyboardInterrupt: 


RuntimeError: Caught RuntimeError in DataLoader worker process 0.
Original Traceback (most recent call last):
  File "/home/c4lab/miniconda3/lib/python3.7/site-packages/torch/utils/data/_utils/worker.py", line 178, in _worker_loop
    data = fetcher.fetch(index)
  File "/home/c4lab/miniconda3/lib/python3.7/site-packages/torch/utils/data/_utils/fetch.py", line 47, in fetch
    return self.collate_fn(data)
  File "/home/c4lab/miniconda3/lib/python3.7/site-packages/torch/utils/data/_utils/collate.py", line 79, in default_collate
    return [default_collate(samples) for samples in transposed]
  File "/home/c4lab/miniconda3/lib/python3.7/site-packages/torch/utils/data/_utils/collate.py", line 79, in <listcomp>
    return [default_collate(samples) for samples in transposed]
  File "/home/c4lab/miniconda3/lib/python3.7/site-packages/torch/utils/data/_utils/collate.py", line 64, in default_collate
    return default_collate([torch.as_tensor(b) for b in batch])
  File "/home/c4lab/miniconda3/lib/python3.7/site-packages/torch/utils/data/_utils/collate.py", line 55, in default_collate
    return torch.stack(batch, 0, out=out)
RuntimeError: invalid argument 0: Sizes of tensors must match except in dimension 0. Got 1 and 2 in dimension 2 at /pytorch/aten/src/TH/generic/THTensor.cpp:689


In [None]:
ex

In [118]:
from torch.utils.data import DataLoader
batch_size=2
extract_train=TrainData(trainData)
extract_val=TrainData(valData)
trainLoader=DataLoader(extract_train,batch_size=batch_size,num_workers=1)
valLoader=DataLoader(extract_val,batch_size=batch_size,num_workers=1)

In [105]:
#calculate positive negative ratio
positive=0
negative=0
for i in extract_train.trainY:
    
    positive+=np.sum(i)
    negative+=len(i)

negative/positive

11.633468724614135

In [108]:
import torch
import torch.nn as nn
import torch.optim as optim
import time
data_dim=embedding.shape[1]
hidden_size=data_dim
out_feat=2

num_epoch=600
lr=0.001
pretrain=torch.Tensor(embedding)


model=ExtractiveSeqTagging(hidden_size,pretrain).cuda()
criterion=nn.BCEWithLogitsLoss(pos_weight=torch.Tensor([11.633]).cuda())
optimizer=optim.Adam(params=model.parameters(),lr=lr)
# model=ExtractiveSeqTagging(30rameters(),lr=lr)
loss=0
best_acc=0


fo = open("loss_train_adl.txt", "w")

for epoch in range(1,num_epoch+1):
    epoch_start_time = time.time()
    train_loss=0
    val_loss=0
    pred_arr=[]
    cnt_train=0
    cnt_val=0
    model.train()
    
    for i,e in enumerate(trainLoader):
        hidden=model.initHidden(e[0].shape[0])
        optimizer.zero_grad()
        out=model(e[0].cuda(),hidden)
        loss=criterion(out.transpose(0,1),e[1].cuda())
        loss.backward()
        optimizer.step()
        train_loss+=loss.item()
        
        result=np.argmax(out.detach().cpu(),2).T.numpy()
        
        batch_bounds,_=extract_train.get_bound(i,batch_size)
        for num,bounds in enumerate(batch_bounds):            
            prediction={}
            candidate=[]
            values=[]
            for num2,j in enumerate(bounds):
                sentence_cnt=np.sum(result[num,:][j[0]:j[1]])
                if(sentence_cnt>0):
                    candidate.append(num2)
                    values.append(sentence_cnt)
            prediction['id']=trainData[cnt_train]['id']
            
            prediction['predict_sentence_index']=[candidate[i] for i in np.argsort(values)]
            cnt_train+=1
            pred_arr.append(prediction)
        progress = ('#' * int(float(i)/len(trainLoader)*40)).ljust(40)
        print('[%03d|%03d] %2.2f sec(s) | %s |' %(epoch,num_epoch,time.time()-epoch_start_time,progress),end='\r',flush=True)
    target_word,prediction_word=predict2sentence(trainData,pred_arr)
    scores_train=calculate_rouge_score(prediction_word,target_word)
    
    pred_arr=[]
    model.eval()
    with torch.no_grad():
        for i,e in enumerate(valLoader):

            hidden=model.initHidden(e[0].shape[0])
            out=model(e[0].cuda(),hidden)
            loss=criterion(out.transpose(0,1),e[1].cuda())
            val_loss+=loss.item()
            
            result=np.argmax(out.detach().cpu(),2).T.numpy()
        
            batch_bounds,_=extract_val.get_bound(i,batch_size)
            
            for num,bounds in enumerate(batch_bounds):            
                prediction={}
                candidate=[]
                values=[]
                for num2,j in enumerate(bounds):
                    sentence_cnt=np.sum(result[num,:][j[0]:j[1]])
                    if(sentence_cnt>0):
                        candidate.append(num2)
                        values.append(sentence_cnt)
                prediction['id']=valData[cnt_val]['id']

                prediction['predict_sentence_index']=[candidate[i] for i in np.argsort(values)]
                cnt_val+=1
                pred_arr.append(prediction)
                
            progress = ('#' * int(float(i)/len(trainLoader)*40)).ljust(40)
            print('[%03d|%03d] %2.2f sec(s) | %s |' %(epoch,num_epoch,time.time()-epoch_start_time,progress),end='\r',flush=True)
    
    target_word,prediction_word=predict2sentence(valData,pred_arr)
    scores_val=calculate_rouge_score(prediction_word,target_word)
    
    fo.write('[%03d|%03d] %2.2f sec(s) | train loss: %2.5f | rouge-1: %2.4f | rouge-2: %2.4f |rouge-l: %2.4f |val loss: %2.5f | rouge-1: %2.4f | rouge-2: %2.4f |rouge-l: %2.4f \n' \
          %(epoch,num_epoch,time.time()-epoch_start_time,train_loss/len(trainLoader.sampler),scores_train['mean']['rouge-1']\
            ,scores_train['mean']['rouge-2'],scores_train['mean']['rouge-l'],val_loss/len(valLoader.sampler),\
            scores_val['mean']['rouge-1'],scores_val['mean']['rouge-2'],scores_val['mean']['rouge-l'] \
            ))
    
    print('[%03d|%03d] %2.2f sec(s) | train loss: %2.5f | rouge-1: %2.4f | rouge-2: %2.4f |rouge-l: %2.4f |val loss: %2.5f | rouge-1: %2.4f | rouge-2: %2.4f |rouge-l: %2.4f' \
          %(epoch,num_epoch,time.time()-epoch_start_time,train_loss/len(trainLoader.sampler),scores_train['mean']['rouge-1']\
            ,scores_train['mean']['rouge-2'],scores_train['mean']['rouge-l'],val_loss/len(valLoader.sampler),\
            scores_val['mean']['rouge-1'],scores_val['mean']['rouge-2'],scores_val['mean']['rouge-l'] \
            ))
    
fo.close()
    
    


RuntimeError: Caught RuntimeError in DataLoader worker process 0.
Original Traceback (most recent call last):
  File "/home/c4lab/miniconda3/lib/python3.7/site-packages/torch/utils/data/_utils/worker.py", line 178, in _worker_loop
    data = fetcher.fetch(index)
  File "/home/c4lab/miniconda3/lib/python3.7/site-packages/torch/utils/data/_utils/fetch.py", line 47, in fetch
    return self.collate_fn(data)
  File "/home/c4lab/miniconda3/lib/python3.7/site-packages/torch/utils/data/_utils/collate.py", line 79, in default_collate
    return [default_collate(samples) for samples in transposed]
  File "/home/c4lab/miniconda3/lib/python3.7/site-packages/torch/utils/data/_utils/collate.py", line 79, in <listcomp>
    return [default_collate(samples) for samples in transposed]
  File "/home/c4lab/miniconda3/lib/python3.7/site-packages/torch/utils/data/_utils/collate.py", line 64, in default_collate
    return default_collate([torch.as_tensor(b) for b in batch])
  File "/home/c4lab/miniconda3/lib/python3.7/site-packages/torch/utils/data/_utils/collate.py", line 55, in default_collate
    return torch.stack(batch, 0, out=out)
RuntimeError: invalid argument 0: Sizes of tensors must match except in dimension 0. Got 2 and 1 in dimension 2 at /pytorch/aten/src/TH/generic/THTensor.cpp:689


In [90]:
embedding.vectors

tensor([[-0.5494, -0.9779, -0.4602,  ...,  0.0237, -0.2096,  0.3666],
        [ 0.6949,  0.9781,  0.8065,  ..., -0.1200, -0.3407, -0.6864],
        [ 0.0671, -0.2315, -0.0094,  ..., -0.3814, -0.0147, -0.0139],
        ...,
        [-0.5142, -0.2361, -0.0353,  ..., -0.2072, -0.6076,  1.0162],
        [ 0.2020, -0.5053,  0.1782,  ..., -0.0385,  0.1093, -0.1146],
        [ 0.2228, -0.2964,  0.6941,  ...,  0.0939, -0.1655, -0.3459]])