In [43]:
from transformers import BartTokenizer, BartForConditionalGeneration
import textwrap
import glob
import itertools
import json
import pandas as pd
import time
import numpy as np

In [44]:
# The tokenizer and the model must come from the same checkpoint, so the name
# is defined once and shared by both from_pretrained calls.
MODEL_CHECKPOINT = "facebook/bart-large-cnn"

# Module-level objects used by Summary.new_model for encoding, generation and decoding.
tokenizer = BartTokenizer.from_pretrained(MODEL_CHECKPOINT)
condi_gen = BartForConditionalGeneration.from_pretrained(MODEL_CHECKPOINT)

In [45]:
class Summary:
    """Summarise one text with the module-level ``tokenizer`` and ``condi_gen``.

    The generation settings are fixed when the object is created; calling
    ``new_model`` runs the generation and stores the result on ``output``.
    """

    def __init__(self,text,l,u,beams,stop,skip):
        """
        text - the string to summarise
        l - lower bound as a float below 1; multiplied by the input token count to get min_length
        u - upper bound as a float below 1, always greater than l; multiplied by the input token count to get max_length
        beams - number of beams as an int
        stop - true or false for early stopping
        skip - true or false for skipping special tokens when decoding
        """
        self.text = text
        self.output = ""  # filled in by new_model()
        self.l, self.u = l, u
        self.beams = beams
        self.stop = stop
        self.skip = skip
        # Label for this hyper-parameter combination, e.g. "0.3_0.5_2_False_True".
        self.name = "_".join(str(part) for part in (l, u, beams, stop, skip))

    def new_model(self):
        """Generate the summary of ``self.text``, store it on ``self.output`` and return it.

        The input is encoded with truncation at 1024 tokens. The generation
        length limits are ``int(l * n)`` and ``int(u * n)``, where ``n`` is the
        second dimension of the encoded ``input_ids``.
        """
        encoding = tokenizer.batch_encode_plus(
            [self.text], return_tensors="pt", max_length=1024, truncation=True
        )
        source_ids = encoding["input_ids"]
        token_count = source_ids.shape[1]
        shortest = int(self.l * token_count)
        longest = int(self.u * token_count)

        generated = condi_gen.generate(
            source_ids,
            max_length=longest,
            min_length=shortest,
            num_beams=self.beams,
            early_stopping=self.stop,
        )
        decoded = tokenizer.decode(generated.squeeze(), skip_special_tokens=self.skip)

        # The wrap width (in characters) reuses the max_length token budget.
        self.output = textwrap.fill(decoded, longest)
        return self.output
    
    

def get_filenames(files):
    """Expand glob patterns into one flat list of matching paths.

    files - iterable of glob pattern strings
    Returns a list holding the matches of each pattern, concatenated in the
    order the patterns were given (duplicate matches are kept).
    """
    matches = []
    for pattern in files:
        matches.extend(glob.glob(pattern))
    return matches

In [46]:
# Run this cell only once: re-running it rebinds df to an empty table and
# discards every row collected so far.
df = pd.DataFrame(
    columns=[
        "input",
        "output",
        "output_length",
        "input_length",
        "percentage_decrease",
        "model_name",
        "time",
    ]
)

In [75]:
# Keep re-running this cell to append rows to df for different hyper-parameters.
path = ["Json files/*.json"]
json_files = get_filenames(path)

# Report sections that get summarised; every other key in the JSON is ignored.
SECTION_KEYS = {
    "PROBLEM DESCRIPTION",
    "TARGET CONDITION",
    "CURRENT CONDITION",
    "ROOT CAUSE ANALYSIS",
    "COUNTERMEASURES",
    "EFFECT CONFIRMATION",
    "FOLLOW UP ACTION",
}

# Hyper-parameters for this run (see Summary.__init__); edit between runs.
LOWER_FRACTION = 0.3
UPPER_FRACTION = 0.5
NUM_BEAMS = 2
EARLY_STOPPING = False
SKIP_SPECIAL_TOKENS = True

# Texts shorter than this many characters are skipped.
MIN_INPUT_CHARS = 5

# NOTE(review): nothing is ever written to jsons2.txt; the handle is kept only
# so the cell still creates the file as before. Confirm whether it can go.
with open('jsons2.txt', 'a') as g:
    for json_path in json_files:
        # Close each input file as soon as it is parsed instead of leaking the handle.
        with open(json_path) as f:
            dicts = json.load(f)

        for vals in dicts.values():
            for key, text in vals.items():
                # Skip unrelated sections and too-short texts before doing any work.
                if key not in SECTION_KEYS or len(text) < MIN_INPUT_CHARS:
                    continue

                start = time.time()
                sum_ = Summary(text, LOWER_FRACTION, UPPER_FRACTION, NUM_BEAMS,
                               EARLY_STOPPING, SKIP_SPECIAL_TOKENS)
                summary_text = sum_.new_model()
                elapsed = time.time() - start

                # Rows are appended one at a time so results already computed
                # stay in df if a later generation fails.
                df.loc[len(df)] = [
                    text,
                    summary_text,
                    len(summary_text),
                    len(text),
                    (len(text) - len(summary_text)) / len(text),
                    sum_.name,
                    elapsed,
                ]

In [78]:
# Order the collected rows by the character length of the input text.
df = df.sort_values(by="input_length")

In [79]:
# Write the results table to table.csv in the current working directory.
df.to_csv('table.csv')