# Utils folder

# Vocab.py

In [None]:
import torch

class Vocab():
    """Word <-> index vocabulary that turns raw documents into padded index tensors.

    Index 0 is reserved for padding and index 1 for unknown words; `w2i`
    falls back to the unknown index for any word missing from `word2id`.
    """

    def __init__(self,embed,word2id):
        """Store the embeddings and build the reverse (id -> word) lookup.

        Args:
            embed: the given embeddings; only stored, never read by this class.
            word2id: mapping of word -> integer id.

        Raises:
            AssertionError: if two words share the same id, since the reverse
                mapping would then silently lose entries.
        """
        self.embed = embed
        self.word2id = word2id
        self.id2word = {v:k for k,v in word2id.items()}
        assert len(self.word2id) == len(self.id2word)
        self.PAD_IDX = 0
        self.UNK_IDX = 1
        self.PAD_TOKEN = 'PAD_TOKEN'
        self.UNK_TOKEN = 'UNK_TOKEN'

    def __len__(self):
        """Return the number of words in the vocabulary."""
        # The original read the bare name `word2id`, which is not defined in
        # this scope and raised NameError on every call.
        return len(self.word2id)

    def i2w(self,idx):
        """Return the word stored for `idx` (KeyError if the id is unknown)."""
        return self.id2word[idx]

    def w2i(self,w):
        """Return the id of word `w`, or UNK_IDX when the word is not known."""
        return self.word2id.get(w, self.UNK_IDX)

    def _index_and_pad(self,sents_list,sent_trunc):
        """Tokenize, truncate, index and right-pad sentences into a LongTensor.

        Each sentence is split on whitespace and cut to at most `sent_trunc`
        words; shorter rows are filled with PAD_IDX up to the longest
        (truncated) sentence in `sents_list`.
        """
        batch_sents = [sent.split()[:sent_trunc] for sent in sents_list]
        max_sent_len = max((len(words) for words in batch_sents), default=0)
        features = [
            [self.w2i(w) for w in words] + [self.PAD_IDX] * (max_sent_len - len(words))
            for words in batch_sents
        ]
        return torch.LongTensor(features)

    def make_features(self,batch,sent_trunc=50,doc_trunc=100,split_token='\n'):
        """Build training/evaluation tensors from a batch of labelled documents.

        Args:
            batch: mapping with keys 'doc', 'labels' and 'summaries'. 'doc' and
                'labels' are parallel sequences of strings whose sentences /
                per-sentence integer labels are separated by `split_token`.
            sent_trunc: maximum number of words kept per sentence.
            doc_trunc: maximum number of sentences kept per document.
            split_token: separator between sentences (and between labels).

        Returns:
            (features, targets, summaries, doc_lens) where `features` is a
            LongTensor with one row of word ids per kept sentence, `targets`
            is a LongTensor of the kept labels, `summaries` is
            batch['summaries'] unchanged, and `doc_lens` lists the number of
            kept sentences per document (used to slice rows back per document).
        """
        sents_list,targets,doc_lens = [],[],[]
        for doc,label in zip(batch['doc'],batch['labels']):
            sents = doc.split(split_token)
            labels = [int(l) for l in label.split(split_token)]
            max_sent_num = min(doc_trunc,len(sents))
            sents = sents[:max_sent_num]
            # labels are cut to the same count so they stay aligned with sents
            labels = labels[:max_sent_num]
            sents_list += sents
            targets += labels
            doc_lens.append(len(sents))

        features = self._index_and_pad(sents_list, sent_trunc)
        targets = torch.LongTensor(targets)
        summaries = batch['summaries']

        return features,targets,summaries,doc_lens

    def make_predict_features(self, batch, sent_trunc=150, doc_trunc=100, split_token='. '):
        """Build inference tensors from a batch of raw (unlabelled) documents.

        Args:
            batch: iterable of document strings.
            sent_trunc: maximum number of words kept per sentence.
            doc_trunc: maximum number of sentences kept per document.
            split_token: separator used to cut a document into sentences.

        Returns:
            (features, doc_lens): a LongTensor with one padded row of word ids
            per kept sentence, and the number of kept sentences per document.
        """
        sents_list, doc_lens = [],[]
        for doc in batch:
            sents = doc.split(split_token)
            sents = sents[:min(doc_trunc,len(sents))]
            sents_list += sents
            doc_lens.append(len(sents))

        features = self._index_and_pad(sents_list, sent_trunc)

        return features, doc_lens

# Dataset.py

In [None]:
import csv
import torch
import torch.utils.data as data #Dataset stores the samples and their corresponding labels can be existing or user data
from torch.autograd import Variable
#from .Vocab import Vocab      #importing the above vocab class
import numpy as np

class Dataset(data.Dataset):
    """Thin `torch.utils.data.Dataset` wrapper around a list of examples.

    Items are returned exactly as stored. The `training` flag and the
    `shuffle` / `dropout` helpers are kept available, but `__getitem__` does
    not call them.
    """

    def __init__(self, examples):
        super().__init__()
        # presumably each example is a dict of strings (e.g. 'labels',
        # 'summaries'); the class itself never inspects them
        self.examples = examples
        self.training = False

    def train(self):
        """Switch the training flag on; returns self so calls can be chained."""
        self.training = True
        return self

    def test(self):
        """Switch the training flag off; returns self so calls can be chained."""
        self.training = False
        return self

    def shuffle(self,words):
        """Shuffle `words` in place and return them joined by single spaces."""
        np.random.shuffle(words)
        return ' '.join(words)

    def dropout(self,words,p=0.3):
        """Return `words` joined by spaces after removing randomly drawn positions.

        int(len(words) * p) positions are drawn with `np.random.choice`
        (its default samples with replacement, so the same position may be
        drawn more than once and fewer distinct words may end up removed).
        """
        total = len(words)
        dropped = np.random.choice(total,int(total*p))
        kept = []
        for position, word in enumerate(words):
            if position in dropped:
                continue
            kept.append(word)
        return ' '.join(kept)

    def __getitem__(self, idx):
        """Return the stored example at `idx`, unmodified."""
        return self.examples[idx]

    def __len__(self):
        """Return the number of stored examples."""
        return len(self.examples)

In [None]:
#from .Dataset import Dataset
#from .Vocab import Vocab

# Models folder

# Model.py

In [None]:
import torch
from torch.autograd import Variable #used to create variable that provides backward method in backpropogation Eg.stores loss values and cals gradient in backprop.
class BasicModule(torch.nn.Module):
    """Base class for the models: per-document padding plus checkpoint save/load."""

    def __init__(self, args):
        """Keep the run configuration and derive the checkpoint name.

        Args:
            args: configuration namespace; this class reads `device`,
                `save_dir` and `seed` from it.
        """
        super(BasicModule,self).__init__()
        self.args = args
        # Use the bare class name. str(type(self)) yields "<class '...'>",
        # which ends up verbatim inside the checkpoint file name built in save().
        self.model_name = type(self).__name__

    def pad_doc(self,words_out,doc_lens):
        """Regroup a flat stack of sentence vectors into a zero-padded batch.

        Args:
            words_out: 2-D tensor whose rows are the sentence vectors of all
                documents, concatenated in document order.
            doc_lens: number of rows belonging to each document.

        Returns:
            Tensor of shape (len(doc_lens), max(doc_lens), words_out.size(1));
            documents shorter than the longest one are padded with zero rows.
        """
        pad_dim = words_out.size(1)
        max_doc_len = max(doc_lens)
        sent_input = []
        start = 0
        for doc_len in doc_lens:
            stop = start + doc_len
            valid = words_out[start:stop]                      # (doc_len, pad_dim)
            start = stop
            if doc_len < max_doc_len:
                # new_zeros allocates with the dtype and device of words_out,
                # so the padding can always be concatenated with it regardless
                # of what args.device says.
                pad = words_out.new_zeros(max_doc_len - doc_len, pad_dim)
                valid = torch.cat([valid, pad])
            sent_input.append(valid.unsqueeze(0))              # (1, max_len, pad_dim)
        return torch.cat(sent_input, dim=0)                    # (B, max_len, pad_dim)

    def save(self):
        """Write the state dict and args to disk and return the file path.

        The path is `<save_dir><model_name>_seed_<seed>.pt`; its parent
        directory is created first when it does not exist yet.
        """
        import os

        checkpoint = {'model':self.state_dict(), 'args': self.args}
        best_path = '%s%s_seed_%d.pt' % (self.args.save_dir,self.model_name,self.args.seed)
        parent = os.path.dirname(best_path)
        if parent:
            os.makedirs(parent, exist_ok=True)
        torch.save(checkpoint,best_path)

        return best_path

    def load(self, best_path):
        """Load the 'model' state dict stored at `best_path` into this module.

        When `args.device` is None the stored tensors are mapped onto CPU
        storage while loading; otherwise the module is moved to CUDA after
        loading.

        Returns:
            self (or the result of `self.cuda()` when a device is configured).
        """
        on_gpu = self.args.device is not None
        if on_gpu:
            data = torch.load(best_path)['model']
        else:
            # keep every storage on the CPU regardless of where it was saved from
            data = torch.load(best_path, map_location=lambda storage, loc: storage)['model']
        self.load_state_dict(data)
        return self.cuda() if on_gpu else self

In [None]:
#from .BasicModule import BasicModule

# Preprocessing and main program files

# Preprocessor.py

In [None]:
import argparse
import json
import numpy as np
from collections import OrderedDict
from glob import glob
from time import time
from multiprocessing import Pool,cpu_count
from itertools import chain

def build_vocab(args):
    """Convert a text embedding file into an .npz matrix plus a word->id JSON.

    The first line of `args.embed` is a header whose second field is the
    embedding dimension; every following line is `word v1 ... v_dim`.
    Rows 0 and 1 of the saved matrix are all-zero vectors for PAD_TOKEN and
    UNK_TOKEN, and each word's id equals the row index of its vector.

    Args:
        args: namespace with `embed` (input text file), `vocab` (output path
            for the compressed matrix, saved under the key 'embedding') and
            `word2id` (output JSON path).
    """
    print('start building vocab')

    PAD_IDX = 0
    UNK_IDX = 1
    PAD_TOKEN = 'PAD_TOKEN'
    UNK_TOKEN = 'UNK_TOKEN'

    word2id = OrderedDict()  # insertion order == id order
    word2id[PAD_TOKEN] = PAD_IDX
    word2id[UNK_TOKEN] = UNK_IDX

    with open(args.embed) as f:
        embed_dim = int(next(f).split()[1])

        # zero vectors for PAD and UNK occupy rows 0 and 1
        embed_list = [[0.0] * embed_dim, [0.0] * embed_dim]

        for line in f:
            tokens = line.split()
            if len(tokens) <= embed_dim:
                # blank or truncated line: there is no word in front of the vector
                continue
            word = tokens[0]
            if word in word2id:
                # Keep the first occurrence only. Re-assigning len(word2id) to
                # an existing key does not grow the dict, so the next new word
                # would receive the same id and ids would drift away from the
                # embedding rows.
                continue
            embed_list.append([float(num) for num in tokens[-embed_dim:]])
            word2id[word] = len(word2id)

    embed = np.array(embed_list,dtype=np.float32)
    np.savez_compressed(file=args.vocab, embedding=embed)
    with open(args.word2id,'w') as f:
        json.dump(word2id,f)

def worker(files):
    """Parse raw article files into {'doc', 'labels', 'summaries'} examples.

    Each file is read as latin-1 and split on blank lines into parts:
    parts[1] holds one `sentence<TAB><TAB><TAB>label` per line, parts[2] the
    summary lines, and the last part `placeholder:replacement` lines. Tokens
    equal to a placeholder are replaced by the lower-cased replacement.
    Labels other than '1' are stored as '0', and '*' is stripped from summary
    lines.

    Files whose last part contains a line without ':' are skipped.

    Args:
        files: iterable of file paths.

    Returns:
        List of dicts whose values are newline-joined strings.
    """
    examples = []
    for path in files:
        # close the handle deterministically; a worker may go through many files
        with open(path,encoding='latin-1') as fh:
            parts = fh.read().split('\n\n')
        try:
            entities = {}
            for line in parts[-1].split('\n'):
                fields = line.strip().split(':')
                entities[fields[0]] = fields[1].lower()
        except IndexError:
            # a line without ':' means the file has no usable entity table
            continue
        sents,labels,summaries = [],[],[]
        # content
        for line in parts[1].strip().split('\n'):
            content, label = line.split('\t\t\t')
            tokens = [entities.get(token, token) for token in content.strip().split()]
            sents.append(' '.join(tokens))
            labels.append('1' if label == '1' else '0')
        # summary
        for line in parts[2].strip().split('\n'):
            tokens = [entities.get(token, token) for token in line.strip().split()]
            summaries.append(' '.join(tokens).replace('*',''))
        examples.append({'doc':'\n'.join(sents),'labels':'\n'.join(labels),'summaries':'\n'.join(summaries)})
    return examples

def build_dataset(args):
    """Parse every file matching `args.source_dir` and write one JSON object per line.

    The matched files are split into `args.worker_num` contiguous groups
    (the last group takes the remainder), each group is parsed by `worker`
    in a process pool, and all resulting examples are written to
    `args.target_dir` in group order.

    Args:
        args: namespace with `worker_num` (number of processes),
            `source_dir` (glob pattern) and `target_dir` (output file path).
    """
    t1 = time()

    print('start building dataset')
    if args.worker_num == 1 and cpu_count() > 1:
        print('[INFO] There are %d CPUs in your device, please increase -worker_num to speed up' % (cpu_count()))
        print("       It's a IO intensive application, so 2~10 may be a good choise")

    files = glob(args.source_dir)
    group_size = len(files) // args.worker_num
    groups = []
    for i in range(args.worker_num):
        if i == args.worker_num - 1:
            groups.append(files[i*group_size : ])
        else:
            groups.append(files[i*group_size : (i+1)*group_size])

    # The context manager tears the pool down even when a worker raises;
    # results are collected inside the block, before the pool is terminated.
    with Pool(processes=args.worker_num) as pool:
        pending = [pool.apply_async(worker,(fs,)) for fs in groups]
        results = [job.get() for job in pending]

    with open(args.target_dir, 'w') as f:
        for row in chain.from_iterable(results):
            f.write(json.dumps(row, ensure_ascii=False) + "\n")

    t2 = time()
    print('Time Cost : %.1f seconds' % (t2 - t1))
if __name__ == '__main__':
    # Command-line entry point: build the vocabulary when -build_vocab is
    # given, otherwise build a dataset file from the raw articles.

    parser = argparse.ArgumentParser()

    parser.add_argument('-build_vocab',action='store_true')
    parser.add_argument('-embed', type=str, default='data/100.w2v')
    parser.add_argument('-vocab', type=str, default='data/embedding.npz')
    parser.add_argument('-word2id',type=str,default='data/word2id.json')

    parser.add_argument('-worker_num',type=int,default=1)
    parser.add_argument('-source_dir', type=str, default='data/neuralsum/dailymail/validation/*')
    parser.add_argument('-target_dir', type=str, default='data/val.json')

    # Notebook kernels append their own arguments (e.g. "-f <kernel>.json"),
    # which made parse_args() exit with "unrecognized arguments". Unknown
    # arguments are reported and skipped instead of aborting the run.
    args, unknown = parser.parse_known_args()
    if unknown:
        print('[WARN] ignoring unrecognized arguments: %s' % ' '.join(unknown))

    if args.build_vocab:
        build_vocab(args)
    else:
        build_dataset(args)

usage: colab_kernel_launcher.py [-h] [-build_vocab] [-embed EMBED] [-vocab VOCAB]
                                [-word2id WORD2ID] [-worker_num WORKER_NUM]
                                [-source_dir SOURCE_DIR] [-target_dir TARGET_DIR]
colab_kernel_launcher.py: error: unrecognized arguments: -f /root/.local/share/jupyter/runtime/kernel-9bd0326e-fc4e-41aa-b5f8-57c2fb671dc3.json


SystemExit: 2

# Main.py

In [None]:
import json
import models
import utils
import argparse,random,logging,numpy,os
import torch
import torch.nn as nn
import torch.nn.functional as F
import numpy as np
from torch.autograd import Variable
from torch.utils.data import DataLoader
from torch.nn.utils import clip_grad_norm
from time import time
from tqdm import tqdm

# Module-level setup: logging, command-line options, device selection, seeding.
logging.basicConfig(level=logging.INFO, format='%(asctime)s [INFO] %(message)s')
parser = argparse.ArgumentParser(description='extractive summary')

# (flag, type, default) for every option that takes a value, in registration order
_VALUE_OPTIONS = [
    # model
    ('-save_dir', str, 'checkpoints/'),
    ('-embed_dim', int, 100),
    ('-embed_num', int, 100),
    ('-pos_dim', int, 50),
    ('-pos_num', int, 100),
    ('-seg_num', int, 10),
    ('-kernel_num', int, 100),
    ('-kernel_sizes', str, '3,4,5'),
    ('-model', str, 'RNN_RNN'),
    ('-hidden_size', int, 200),
    # train
    ('-lr', float, 1e-3),
    ('-batch_size', int, 32),
    ('-epochs', int, 5),
    ('-seed', int, 1),
    ('-train_dir', str, 'data/train.json'),
    ('-val_dir', str, 'data/val.json'),
    ('-embedding', str, 'data/embedding.npz'),
    ('-word2id', str, 'data/word2id.json'),
    ('-report_every', int, 1500),
    ('-seq_trunc', int, 50),
    ('-max_norm', float, 1.0),
    # test
    ('-load_dir', str, 'checkpoints/RNN_RNN_seed_1.pt'),
    ('-test_dir', str, 'data/test.json'),
    ('-ref', str, 'outputs/ref'),
    ('-hyp', str, 'outputs/hyp'),
    ('-filename', str, 'x.txt'),  # text file to be summarized
    ('-topk', int, 15),
]
for _flag, _type, _default in _VALUE_OPTIONS:
    parser.add_argument(_flag, type=_type, default=_default)

# device: left unset (None) means "run on CPU"
parser.add_argument('-device', type=int)

# mode switches
for _flag in ('-test', '-debug', '-predict'):
    parser.add_argument(_flag, action='store_true')

args = parser.parse_args()
use_gpu = args.device is not None

if torch.cuda.is_available() and not use_gpu:
    print("WARNING: You have a CUDA device, should run with -device 0")

# select the CUDA device (if any), then seed every random number generator in use
if use_gpu:
    torch.cuda.set_device(args.device)
for _seed_fn in (torch.cuda.manual_seed, torch.manual_seed, random.seed, numpy.random.seed):
    _seed_fn(args.seed)

def eval(net,vocab,data_iter,criterion):
    """Return the mean per-batch loss of `net` over `data_iter`.

    The model is put in evaluation mode for the pass and is switched back to
    training mode afterwards, even if a batch raises.

    Args:
        net: model called as `net(features, doc_lens)`.
        vocab: object providing `make_features(batch)`.
        data_iter: iterable of batches.
        criterion: loss callable taking `(probs, targets)`.

    Returns:
        Average loss as a Python float, or NaN when `data_iter` yields no
        batch (the original raised ZeroDivisionError in that case).
    """
    net.eval()
    total_loss = 0.0
    batch_num = 0
    try:
        # evaluation needs no gradients; skipping them saves memory and time
        with torch.no_grad():
            for batch in data_iter:
                features,targets,_,doc_lens = vocab.make_features(batch)
                targets = targets.float()
                if use_gpu:
                    features = features.cuda()
                    targets = targets.cuda()
                probs = net(features,doc_lens)
                # .item() replaces loss.data[0], which fails on the 0-dim
                # loss tensors returned by current PyTorch releases
                total_loss += criterion(probs,targets).item()
                batch_num += 1
    finally:
        net.train()
    if batch_num == 0:
        return float('nan')
    return total_loss / batch_num

def train():
    """Train the model selected by `args.model` and checkpoint the best one.

    Loads the embedding matrix, vocabulary, training and validation sets from
    the paths in `args`, then runs `args.epochs` epochs of Adam with gradient
    norm clipping. Every `args.report_every` batches (including batch 0 of
    each epoch) the validation loss is computed with `eval`; the model is
    saved through `net.save()` whenever that loss improves. With `-debug`
    the per-batch loss is printed and validation/saving is skipped.
    """
    logging.info('Loading vocab,train and val dataset.Wait a second,please')

    embed = torch.Tensor(np.load(args.embedding)['embedding'])
    with open(args.word2id) as f:
        word2id = json.load(f)
    vocab = utils.Vocab(embed, word2id)

    # both data files hold one JSON example per line
    with open(args.train_dir) as f:
        examples = [json.loads(line) for line in f]
    train_dataset = utils.Dataset(examples)

    with open(args.val_dir) as f:
        examples = [json.loads(line) for line in f]
    val_dataset = utils.Dataset(examples)

    # update args so the model is sized from the loaded embedding matrix
    args.embed_num = embed.size(0)
    args.embed_dim = embed.size(1)
    if isinstance(args.kernel_sizes, str):
        # '3,4,5' -> [3, 4, 5]; skipped when a previous call already converted it
        args.kernel_sizes = [int(ks) for ks in args.kernel_sizes.split(',')]
    # build model: look the class up by name in the models package
    net = getattr(models,args.model)(args,embed)
    if use_gpu:
        net.cuda()
    # load dataset
    train_iter = DataLoader(dataset=train_dataset,
            batch_size=args.batch_size,
            shuffle=True)
    val_iter = DataLoader(dataset=val_dataset,
            batch_size=args.batch_size,
            shuffle=False)
    # loss function: binary cross entropy between predictions and 0/1 targets
    # (BCELoss expects inputs in [0, 1]; presumably net ends in a sigmoid,
    # confirm against the model definitions)
    criterion = nn.BCELoss()

    # model info
    print(net)
    params = sum(p.numel() for p in net.parameters()) / 1e6
    print('#Params: %.1fM' % (params))

    min_loss = float('inf')  # any first validation loss counts as an improvement
    optimizer = torch.optim.Adam(net.parameters(),lr=args.lr)
    net.train()

    t1 = time()
    for epoch in range(1,args.epochs+1):
        for i,batch in enumerate(train_iter):
            features,targets,_,doc_lens = vocab.make_features(batch)
            targets = targets.float()
            if use_gpu:
                features = features.cuda()
                targets = targets.cuda()
            probs = net(features,doc_lens)
            loss = criterion(probs,targets)
            optimizer.zero_grad()
            loss.backward()
            # clip_grad_norm_ is the in-place successor of the deprecated
            # clip_grad_norm
            nn.utils.clip_grad_norm_(net.parameters(), args.max_norm)
            optimizer.step()
            if args.debug:
                # .item() replaces loss.data[0], which fails on 0-dim tensors
                print('Batch ID:%d Loss:%f' %(i,loss.item()))
                continue
            if i % args.report_every == 0:
                cur_loss = eval(net,vocab,val_iter,criterion)
                if cur_loss < min_loss:
                    min_loss = cur_loss
                    net.save()
                logging.info('Epoch: %2d Min_Val_Loss: %f Cur_Val_Loss: %f'
                        % (epoch,min_loss,cur_loss))
    t2 = time()
    logging.info('Total Cost:%f h'%((t2-t1)/3600))

def test():
    """Score the test set with a saved model and write hypothesis/reference files.

    For every document the `args.topk` highest-scoring sentences (fewer for
    short documents) are written, in document order, to
    `<args.hyp>/<n>.txt`; the reference summary goes to `<args.ref>/<n>.txt`.
    `n` counts documents from 1. Both output directories are created when
    missing. Finally the throughput of the forward passes is printed.
    """
    embed = torch.Tensor(np.load(args.embedding)['embedding'])
    with open(args.word2id) as f:
        word2id = json.load(f)
    vocab = utils.Vocab(embed, word2id)

    # The "mbcs" codec only exists on Windows and raises LookupError
    # elsewhere; the platform default encoding is used instead.
    with open(args.test_dir) as f:
        examples = [json.loads(line) for line in f]
    test_dataset = utils.Dataset(examples)

    test_iter = DataLoader(dataset=test_dataset,
                            batch_size=args.batch_size,
                            shuffle=False)
    if use_gpu:
        checkpoint = torch.load(args.load_dir)
    else:
        # map every stored tensor onto CPU storage
        checkpoint = torch.load(args.load_dir, map_location=lambda storage, loc: storage)

    # checkpoint['args'].device holds the device used at training time;
    # when testing on a CPU it must be overridden with None
    if not use_gpu:
        checkpoint['args'].device = None
    net = getattr(models,checkpoint['args'].model)(checkpoint['args'])
    net.load_state_dict(checkpoint['model'])
    if use_gpu:
        net.cuda()
    net.eval()

    # open(..., 'w') does not create missing directories
    os.makedirs(args.ref, exist_ok=True)
    os.makedirs(args.hyp, exist_ok=True)

    doc_num = len(test_dataset)
    time_cost = 0
    file_id = 1
    for batch in tqdm(test_iter):
        features,_,summaries,doc_lens = vocab.make_features(batch)
        if use_gpu:
            features = features.cuda()
        t1 = time()
        with torch.no_grad():  # inference only
            probs = net(features, doc_lens)
        t2 = time()
        time_cost += t2 - t1
        start = 0
        for doc_id,doc_len in enumerate(doc_lens):
            stop = start + doc_len
            prob = probs[start:stop]  # scores of this document's sentences
            topk = min(args.topk,doc_len)
            topk_indices = prob.topk(topk)[1].cpu().numpy()
            topk_indices.sort()  # restore original sentence order
            doc = batch['doc'][doc_id].split('\n')[:doc_len]
            hyp = [doc[index] for index in topk_indices]
            ref = summaries[doc_id]
            with open(os.path.join(args.ref,str(file_id)+'.txt'), 'w') as f:
                f.write(ref)
            with open(os.path.join(args.hyp,str(file_id)+'.txt'), 'w') as f:
                f.write('\n'.join(hyp))
            start = stop
            file_id = file_id + 1
    # guard against an empty dataset or a timer too coarse to register
    speed = doc_num / time_cost if time_cost > 0 else float('inf')
    print('Speed: %.2f docs / s' % speed)

def predict(examples):
    """Summarize raw documents with a saved model and write one file per document.

    Each document is split into sentences on '. '. The `args.topk`
    highest-scoring sentences (fewer for short documents) are joined with
    '. ' in document order and written to `<args.hyp>/<n>.txt`, with `n`
    counting documents from 1. The output directory is created when missing.
    Finally the throughput of the forward passes is printed.

    Args:
        examples: sequence of document strings.
    """
    embed = torch.Tensor(np.load(args.embedding)['embedding'])
    with open(args.word2id) as f:
        word2id = json.load(f)
    vocab = utils.Vocab(embed, word2id)
    pred_dataset = utils.Dataset(examples)

    pred_iter = DataLoader(dataset=pred_dataset,
                            batch_size=args.batch_size,
                            shuffle=False)
    if use_gpu:
        checkpoint = torch.load(args.load_dir)
    else:
        # map every stored tensor onto CPU storage
        checkpoint = torch.load(args.load_dir, map_location=lambda storage, loc: storage)

    # checkpoint['args'].device holds the device used at training time;
    # when predicting on a CPU it must be overridden with None
    if not use_gpu:
        checkpoint['args'].device = None
    net = getattr(models,checkpoint['args'].model)(checkpoint['args'])
    net.load_state_dict(checkpoint['model'])
    if use_gpu:
        net.cuda()
    net.eval()

    # open(..., 'w') does not create missing directories
    os.makedirs(args.hyp, exist_ok=True)

    doc_num = len(pred_dataset)
    time_cost = 0
    file_id = 1
    for batch in tqdm(pred_iter):
        features, doc_lens = vocab.make_predict_features(batch)
        if use_gpu:
            features = features.cuda()
        t1 = time()
        with torch.no_grad():  # inference only
            probs = net(features, doc_lens)
        t2 = time()
        time_cost += t2 - t1
        start = 0
        for doc_id,doc_len in enumerate(doc_lens):
            stop = start + doc_len
            prob = probs[start:stop]  # scores of this document's sentences
            topk = min(args.topk,doc_len)
            topk_indices = prob.topk(topk)[1].cpu().numpy()
            topk_indices.sort()  # restore original sentence order
            # same '. ' separator that make_predict_features uses by default
            doc = batch[doc_id].split('. ')[:doc_len]
            hyp = [doc[index] for index in topk_indices]
            with open(os.path.join(args.hyp,str(file_id)+'.txt'), 'w') as f:
                f.write('. '.join(hyp))
            start = stop
            file_id = file_id + 1
    # guard against an empty input or a timer too coarse to register
    speed = doc_num / time_cost if time_cost > 0 else float('inf')
    print('Speed: %.2f docs / s' % speed)

if __name__=='__main__':
    # Mode dispatch: -test takes precedence over -predict; with neither flag
    # the script trains.
    if not (args.test or args.predict):
        train()
    elif args.test:
        test()
    else:
        # the whole input file is summarized as a single document
        with open(args.filename) as source:
            documents = [source.read()]
        predict(documents)

# Outputs folder

# Rouge_score.py

In [None]:
from rouge_score import rouge_scorer
# Average ROUGE-1 / ROUGE-2 / ROUGE-L precision, recall and F-measure over
# the ./hyp/<i>.txt (system output) and ./ref/<i>.txt (reference) pairs.
scorer = rouge_scorer.RougeScorer(['rouge1','rouge2', 'rougeL'], use_stemmer=True)
avg_precision1,avg_recall1,avg_fscore1=0,0,0
avg_precision2,avg_recall2,avg_fscore2=0,0,0
avg_precisionl,avg_recalll,avg_fscorel=0,0,0
tot=300  # highest file id to look for; afterwards the number of pairs scored
scored=0
for i in range(1,tot+1):
    try:
        with open('./hyp/'+str(i)+'.txt') as hyp_file:
            sys_gen = hyp_file.read()
        with open('./ref/'+str(i)+'.txt') as ref_file:
            text = ref_file.read()
    except OSError:
        # a missing or unreadable pair is left out of the average; any other
        # error (e.g. inside the scorer) is no longer silently swallowed
        continue
    # RougeScorer.score takes (target, prediction): the reference comes first.
    # With the arguments reversed, precision and recall are swapped.
    score=scorer.score(text,sys_gen)
    avg_precision1+=score['rouge1'].precision
    avg_precision2+=score['rouge2'].precision
    avg_precisionl+=score['rougeL'].precision

    avg_recall1+=score['rouge1'].recall
    avg_recall2+=score['rouge2'].recall
    avg_recalll+=score['rougeL'].recall

    avg_fscore1+=score['rouge1'].fmeasure
    avg_fscore2+=score['rouge2'].fmeasure
    avg_fscorel+=score['rougeL'].fmeasure
    scored+=1
tot=scored
if tot == 0:
    print('No hyp/ref pairs found under ./hyp and ./ref')
else:
    print(avg_precision1/tot,avg_recall1/tot,avg_fscore1/tot)
    print(avg_precision2/tot,avg_recall2/tot,avg_fscore2/tot)
    print(avg_precisionl/tot,avg_recalll/tot,avg_fscorel/tot)

In [None]:
# Scratch check: size(1) is the length of the second dimension (3 columns here).
_rows = [[1, 2, 3], [4, 5, 6]]
arr = torch.tensor(np.array(_rows))
print(arr.shape[1])