In [None]:
#process 32195

In [3]:
import pickle

import numpy as np

trainData=np.load('./train.pkl',allow_pickle=True)
valData=np.load('./valid.pkl',allow_pickle=True)
testData=np.load('./test.pkl',allow_pickle=True)
embedding=np.load('./embedding.pkl',allow_pickle=True)
embedding=embedding.vectors

test_remove=[7421, 10381, 10606, 11367, 13617, 14054, 15613, 15755, 17445]
train_remove=[10318, 13123, 15367, 19945, 22458, 22993, 28541, 32094, 35147, 36770, 43614, 46068, 49182, 53332, 54932, 65569, 68450, 68494, 69867, 70103, 70858]
val_remove=[1147, 7085, 8520, 10125]


from torch.utils.data import Dataset,DataLoader
import numpy as np
import torch
from tensorflow.keras.utils import to_categorical

class TrainData(Dataset):
    def __init__(self,trainData,remove_arr,max_len=300):
        trainX=[]
        trainY=[]
        bound=[]
        padding_idx=trainData.padding
        for idx,i in enumerate(trainData):
            if idx in remove_arr:
                continue
            text=i['text']
            label=i['label']
            
            if(len(text)<max_len):
                text=text+[padding_idx]*(max_len-len(text))
                label=label+[padding_idx]*(max_len-len(label))
            trainX.append(text)
            label=to_categorical(label,num_classes=2)
            trainY.append(label)
            bound.append(i['sent_range'])
        self.trainX=trainX
        self.trainY=trainY
        self.bound=bound
    def __len__(self):
        return len(self.trainX)
    def __getitem__(self,idx):
        if torch.is_tensor(idx):
            idx=idx.tolist()
        
        return np.array(self.trainX[idx],dtype=np.long),np.array(self.trainY[idx],dtype=np.float)
    
    def get_bound(self,idx,batch_size):
        return self.bound[idx*batch_size:(idx+1)*batch_size]
    

    
from rouge_score.rouge_scorer import RougeScorer
from multiprocessing import Pool,cpu_count

def calculate_rouge_score(prediction,target):
    ROUGE_TYPES = ['rouge1', 'rouge2', 'rougeL']
    USE_STEMMER = False


    rouge_scorer = RougeScorer(ROUGE_TYPES, use_stemmer=USE_STEMMER)
    with Pool(cpu_count()) as pool:
        scores = pool.starmap(rouge_scorer.score,
                            [(t, p) for t, p in zip(target, prediction)])

    r1s = np.array([s['rouge1'].fmeasure for s in scores])
    r2s = np.array([s['rouge2'].fmeasure for s in scores])
    rls = np.array([s['rougeL'].fmeasure for s in scores])
    scores = {
        'mean': {
            'rouge-1': r1s.mean(),
            'rouge-2': r2s.mean(),
            'rouge-l': rls.mean()
        },
        'std': {
            'rouge-1': r1s.std(),
            'rouge-2': r2s.std(),
            'rouge-l': rls.std()
        },
    }
    return scores

def predict2sentence(data,pred_arr):
    target=[]
    prediction=[]
    assert len(pred_arr)==len(data)
    for p,j in zip(pred_arr,data):
        target.append(j['summary'])
        sent_bounds = {i: bound for i, bound in enumerate(j['sent_bounds'])}
        predict_sent=''
        for sent_idx in p['predict_sentence_index']:
            start, end = sent_bounds.get(sent_idx, (0, 0))
            predict_sent += j['text'][start:end]
        prediction.append(predict_sent)
    return target,prediction

import json
def load_json_data(file_name):
    data=[]
    with open(file_name,'r') as r:
        line=r.readline()
        while line:
            data.append(json.loads(line))
            line=r.readline()
    return data
def remove_dead_entry(data):
    tmp=[]
    for index,i in enumerate(data):
        if(i['text']=='\n' or i['text']==''):
            tmp.append(index)
            
    for index in sorted(tmp, reverse=True):
        del data[index]
    print('remove entry:',tmp)
    return data
def write_revise_data(data,file_name):
    with open(file_name,'w') as w:
        for idx,i in enumerate(data):
            if(idx==len(data)-1):
                word=json.dumps(i)
            else:
                word=json.dumps(i)+'\n'
            w.write(word)


In [16]:
import torch.nn as nn
class ExtractiveSeqTagging(nn.Module):
    def __init__(self,hidden_size,pretrain_embedding,bidirectional=True):
        super(ExtractiveSeqTagging,self).__init__()
        self.hidden_size=hidden_size
        self.bidirectional=bidirectional
        self.embedding=nn.Embedding.from_pretrained(pretrain_embedding)
        self.gru=nn.LSTM(hidden_size,hidden_size,bidirectional=bidirectional)
        if bidirectional==True:
            self.out=nn.Linear(hidden_size*2,2)
    
    def forward(self,x,hidden):
        embedded=self.embedding(x).transpose(0,1)
        out,_=self.gru(embedded,hidden)
        out=self.out(out)
        return out
#     h_0 of shape (num_layers * num_directions, batch, hidden_size): 
    def initHidden(self,batch):
        shape=(1,batch,self.hidden_size)
        if self.bidirectional==True:
            shape=(2,batch,self.hidden_size)
            
        return torch.zeros(shape,device='cuda')
    def predict(self,x,hidden):
        embedded=self.embedding(x).transpose(0,1)
        
        
        out,_=self.gru(embedded,hidden)
        out=self.out(out)
        out=nn.functional.softmax(out,2)
        return out
    

In [None]:
revise_testData=load_json_data('../../data/test_revise.jsonl')
revise_trainData=load_json_data('../../data/train_revise.jsonl')
revise_valData=load_json_data('../../data/val_revise.jsonl')


In [4]:
from torch.utils.data import DataLoader
batch_size=64
extract_train=TrainData(trainData,train_remove)
extract_val=TrainData(valData,val_remove)
trainLoader=DataLoader(extract_train,batch_size=batch_size,num_workers=1)
valLoader=DataLoader(extract_val,batch_size=batch_size,num_workers=1)

In [14]:
np.argsort([3,5,2])[]

array([2, 0, 1])

In [28]:
import torch
import torch.nn as nn
import torch.optim as optim
import time
data_dim=embedding.shape[1]
hidden_size=data_dim
out_feat=2

num_epoch=50
lr=0.001
pretrain=torch.Tensor(embedding)


model=ExtractiveSeqTagging(hidden_size,pretrain).cuda()
criterion=nn.BCEWithLogitsLoss(pos_weight=torch.Tensor([11.63]).cuda())
optimizer=optim.Adam(params=model.parameters(),lr=lr)
# model=ExtractiveSeqTagging(30rameters(),lr=lr)
loss=0
best_acc=0


fo = open("loss_train_adl_64_LSTMHID_1.txt", "w")

for epoch in range(1,num_epoch+1):
    epoch_start_time = time.time()
    train_loss=0
    val_loss=0
    pred_arr=[]
    cnt_train=0
    cnt_val=0
    model.train()
    
    for i,e in enumerate(trainLoader):
        hidden=model.initHidden(e[0].shape[0])
        optimizer.zero_grad()
        out=model(e[0].cuda(),(hidden,hidden))
        loss=criterion(out.transpose(0,1),e[1].cuda())
        loss.backward()
        optimizer.step()
        train_loss+=loss.item()
        
        result=np.argmax(out.detach().cpu(),2).T.numpy()
        batch_bounds=extract_train.get_bound(i,batch_size)
        for num,bounds in enumerate(batch_bounds):            
            prediction={}
            candidate=[]
            values=[]
            for num2,j in enumerate(bounds):
                sentence_cnt=np.sum(result[num,:][j[0]:j[1]])
                if(sentence_cnt>0):
                    candidate.append(num2)
                    values.append(sentence_cnt)
            values=np.array(values)
            candidate=np.array(candidate)
            prediction['id']=trainData[cnt_train]['id']
            
            #post processing 設定 threshold
            prediction['predict_sentence_index']=[candidate[i] for i in np.argsort(-values)]
            
            cnt_train+=1
            pred_arr.append(prediction)
        progress = ('#' * int(float(i)/len(trainLoader)*40)).ljust(40)
        print('[%03d|%03d] %2.2f sec(s) | %s |' %(epoch,num_epoch,time.time()-epoch_start_time,progress),end='\r',flush=True)
    target_word,prediction_word=predict2sentence(revise_trainData,pred_arr)
    scores_train=calculate_rouge_score(prediction_word,target_word)
    
    pred_arr=[]
    model.eval()
    with torch.no_grad():
        for i,e in enumerate(valLoader):

            hidden=model.initHidden(e[0].shape[0])
            out=model(e[0].cuda(),(hidden,hidden))
            loss=criterion(out.transpose(0,1),e[1].cuda())
            val_loss+=loss.item()
            
            result=np.argmax(out.detach().cpu(),2).T.numpy()
        
            batch_bounds=extract_val.get_bound(i,batch_size)
            
            for num,bounds in enumerate(batch_bounds):            
                prediction={}
                candidate=[]
                values=[]
                for num2,j in enumerate(bounds):
                    sentence_cnt=np.sum(result[num,:][j[0]:j[1]])
                    if(sentence_cnt>0):
                        candidate.append(num2)
                        values.append(sentence_cnt)
                prediction['id']=valData[cnt_val]['id']
                
                values=np.array(values)
                candidate=np.array(candidate)
                
                prediction['predict_sentence_index']=[candidate[i] for i in np.argsort(-values)]
                cnt_val+=1
                pred_arr.append(prediction)
                
            progress = ('#' * int(float(i)/len(valLoader)*40)).ljust(40)
            print('[%03d|%03d] %2.2f sec(s) | %s |' %(epoch,num_epoch,time.time()-epoch_start_time,progress),end='\r',flush=True)
    
    target_word,prediction_word=predict2sentence(revise_valData,pred_arr)
    scores_val=calculate_rouge_score(prediction_word,target_word)
    
    fo.write('[%03d|%03d] %2.2f sec(s) | train loss: %2.5f | rouge-1: %2.4f | rouge-2: %2.4f |rouge-l: %2.4f |val loss: %2.5f | rouge-1: %2.4f | rouge-2: %2.4f |rouge-l: %2.4f \n' \
          %(epoch,num_epoch,time.time()-epoch_start_time,train_loss/len(trainLoader.sampler),scores_train['mean']['rouge-1']\
            ,scores_train['mean']['rouge-2'],scores_train['mean']['rouge-l'],val_loss/len(valLoader.sampler),\
            scores_val['mean']['rouge-1'],scores_val['mean']['rouge-2'],scores_val['mean']['rouge-l'] \
            ))
    
    print('[%03d|%03d] %2.2f sec(s) | train loss: %2.5f | rouge-1: %2.4f | rouge-2: %2.4f |rouge-l: %2.4f |val loss: %2.5f | rouge-1: %2.4f | rouge-2: %2.4f |rouge-l: %2.4f' \
          %(epoch,num_epoch,time.time()-epoch_start_time,train_loss/len(trainLoader.sampler),scores_train['mean']['rouge-1']\
            ,scores_train['mean']['rouge-2'],scores_train['mean']['rouge-l'],val_loss/len(valLoader.sampler),\
            scores_val['mean']['rouge-1'],scores_val['mean']['rouge-2'],scores_val['mean']['rouge-l'] \
            ))
    if best_acc<scores_val['mean']['rouge-1']:
        best_acc=scores_val['mean']['rouge-1']
        torch.save(model.state_dict(), './state_dict/model'+str(epoch)+'.pth')
        
fo.close()
    
    


[001|050] 142.78 sec(s) | train loss: 0.01482 | rouge-1: 0.0052 | rouge-2: 0.0009 |rouge-l: 0.0038 |val loss: 0.01518 | rouge-1: 0.0157 | rouge-2: 0.0028 |rouge-l: 0.0117
[002|050] 145.31 sec(s) | train loss: 0.01535 | rouge-1: 0.0196 | rouge-2: 0.0036 |rouge-l: 0.0145 |val loss: 0.01566 | rouge-1: 0.0244 | rouge-2: 0.0045 |rouge-l: 0.0183
[003|050] 145.29 sec(s) | train loss: 0.01580 | rouge-1: 0.0289 | rouge-2: 0.0051 |rouge-l: 0.0214 |val loss: 0.01599 | rouge-1: 0.0312 | rouge-2: 0.0057 |rouge-l: 0.0233
[004|050] 22.58 sec(s) | #######                                  |

IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)



[013|050] 149.64 sec(s) | train loss: 0.01852 | rouge-1: 0.2129 | rouge-2: 0.0417 |rouge-l: 0.1481 |val loss: 0.02885 | rouge-1: 0.1790 | rouge-2: 0.0281 |rouge-l: 0.1257
[014|050] 144.67 sec(s) | train loss: 0.01915 | rouge-1: 0.2203 | rouge-2: 0.0438 |rouge-l: 0.1529 |val loss: 0.03221 | rouge-1: 0.1746 | rouge-2: 0.0270 |rouge-l: 0.1229
[015|050] 75.70 sec(s) | ########################                 |

IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)



[024|050] 149.97 sec(s) | train loss: 0.02589 | rouge-1: 0.2465 | rouge-2: 0.0518 |rouge-l: 0.1718 |val loss: 0.05433 | rouge-1: 0.1752 | rouge-2: 0.0271 |rouge-l: 0.1232
[025|050] 149.67 sec(s) | train loss: 0.02643 | rouge-1: 0.2472 | rouge-2: 0.0519 |rouge-l: 0.1723 |val loss: 0.05699 | rouge-1: 0.1759 | rouge-2: 0.0272 |rouge-l: 0.1240
[026|050] 138.38 sec(s) | ######                                   |

IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)



[036|050] 149.07 sec(s) | train loss: 0.02985 | rouge-1: 0.2532 | rouge-2: 0.0538 |rouge-l: 0.1777 |val loss: 0.06631 | rouge-1: 0.1768 | rouge-2: 0.0270 |rouge-l: 0.1237
[037|050] 149.36 sec(s) | train loss: 0.03013 | rouge-1: 0.2537 | rouge-2: 0.0541 |rouge-l: 0.1782 |val loss: 0.06666 | rouge-1: 0.1766 | rouge-2: 0.0274 |rouge-l: 0.1237
[038|050] 29.99 sec(s) | #########                                |

IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)



[047|050] 149.19 sec(s) | train loss: 0.03217 | rouge-1: 0.2550 | rouge-2: 0.0546 |rouge-l: 0.1795 |val loss: 0.06819 | rouge-1: 0.1748 | rouge-2: 0.0268 |rouge-l: 0.1226
[048|050] 140.71 sec(s) | train loss: 0.03209 | rouge-1: 0.2558 | rouge-2: 0.0548 |rouge-l: 0.1801 |val loss: 0.07069 | rouge-1: 0.1746 | rouge-2: 0.0270 |rouge-l: 0.1227
[049|050] 45.94 sec(s) | ##############################           |

IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)



In [23]:
! mkdir state_dict

In [None]:
from torch.utils.data import DataLoader
batch_size=64
extract_train=TrainData(trainData,train_remove)
extract_val=TrainData(valData,val_remove)
trainLoader=DataLoader(extract_train,batch_size=batch_size,num_workers=1)
valLoader=DataLoader(extract_val,batch_size=batch_size,num_workers=1)

In [5]:
data_dim=embedding.shape[1]
hidden_size=data_dim

In [18]:
model=ExtractiveSeqTagging(hidden_size,embedding).cuda()
model.load_state_dict(torch.load('./state_dict/model30.pth'))

model.eval()

pred=[]


for data in valLoader:
    hidden=model.initHidden(data[0].shape[0])
    pred.append(model.predict(data[0].cuda(),(hidden,hidden)).cpu().detach().numpy())


In [40]:
pred_arr=[]
cnt_val=0
for idx,i in enumerate(pred):
    result=np.argmax(i,2).T
    batch_bounds=extract_val.get_bound(idx,batch_size)
    for num,bounds in enumerate(batch_bounds):            
        prediction={}
        candidate=[]
        values=[]
        for num2,j in enumerate(bounds):
            sentence_cnt=np.sum(result[num,:][j[0]:j[1]])
            if(sentence_cnt>0):
                candidate.append(num2)
                values.append(sentence_cnt)
        values=np.array(values)
        candidate=np.array(candidate)
        prediction['id']=valData[cnt_val]['id']

        #post processing 設定 threshold
        prediction['predict_sentence_index']=[candidate[i] for i in np.argsort(-values)]

        cnt_val+=1
        pred_arr.append(prediction)

In [48]:
def predict2sentence(data,pred_arr):
    target=[]
    prediction=[]
    assert len(pred_arr)==len(data)
    for p,j in zip(pred_arr,data):
        target.append(j['summary'])
        sent_bounds = {i: bound for i, bound in enumerate(j['sent_bounds'])}
        predict_sent=''
        
        for idx,sent_idx in enumerate(p['predict_sentence_index']):
            start, end = sent_bounds.get(sent_idx, (0, 0))
            predict_sent += j['text'][start:end]
            if(idx==1):
                break
        prediction.append(predict_sent)
    return target,prediction

In [50]:
target_word,prediction_word=predict2sentence(revise_valData,pred_arr)
scores_train=calculate_rouge_score(prediction_word,target_word)

In [47]:
pred_arr

[{'id': '2000000', 'predict_sentence_index': [10]},
 {'id': '2000001', 'predict_sentence_index': [7]},
 {'id': '2000002', 'predict_sentence_index': [0, 5, 6]},
 {'id': '2000003', 'predict_sentence_index': [4, 6]},
 {'id': '2000004', 'predict_sentence_index': [1]},
 {'id': '2000005', 'predict_sentence_index': [8]},
 {'id': '2000006', 'predict_sentence_index': [2]},
 {'id': '2000007', 'predict_sentence_index': [5, 19, 2, 10, 0]},
 {'id': '2000008', 'predict_sentence_index': [4, 2]},
 {'id': '2000009', 'predict_sentence_index': [3]},
 {'id': '2000010', 'predict_sentence_index': [3, 0, 1, 6]},
 {'id': '2000011', 'predict_sentence_index': [1]},
 {'id': '2000012', 'predict_sentence_index': [0]},
 {'id': '2000013', 'predict_sentence_index': [9, 0, 8]},
 {'id': '2000014', 'predict_sentence_index': [3]},
 {'id': '2000015', 'predict_sentence_index': [2, 1]},
 {'id': '2000016', 'predict_sentence_index': [7, 1]},
 {'id': '2000017', 'predict_sentence_index': [0]},
 {'id': '2000018', 'predict_senten

In [None]:
result=np.argmax(out.detach().cpu(),2).T.numpy()
        batch_bounds=extract_train.get_bound(i,batch_size)
        for num,bounds in enumerate(batch_bounds):            
            prediction={}
            candidate=[]
            values=[]
            for num2,j in enumerate(bounds):
                sentence_cnt=np.sum(result[num,:][j[0]:j[1]])
                if(sentence_cnt>0):
                    candidate.append(num2)
                    values.append(sentence_cnt)
            values=np.array(values)
            candidate=np.array(candidate)
            prediction['id']=trainData[cnt_train]['id']
            
            #post processing 設定 threshold
            prediction['predict_sentence_index']=[candidate[i] for i in np.argsort(-values)]
            
            cnt_train+=1
            pred_arr.append(prediction)
        progress = ('#' * int(float(i)/len(trainLoader)*40)).ljust(40)
        print('[%03d|%03d] %2.2f sec(s) | %s |' %(epoch,num_epoch,time.time()-epoch_start_time,progress),end='\r',flush=True)
    target_word,prediction_word=predict2sentence(revise_trainData,pred_arr)
    scores_train=calculate_rouge_score(prediction_word,target_word)