In [43]:
import sentencepiece
from transformers import pipeline, AutoTokenizer, AutoModelForTokenClassification, BartTokenizer, BartForConditionalGeneration
import glob
import itertools
import json
import spacy
import pytextrank
import en_core_web_sm
import re
import sys
from deepmultilingualpunctuation import PunctuationModel
import time
import numpy as np
import textwrap
import pandas as pd


In [2]:
#!python -m spacy download en_core_web_md 

In [33]:
# punctuation model
model_p = PunctuationModel()



In [34]:
summarizer = pipeline("summarization", model="facebook/bart-large-cnn")
tokenizer = BartTokenizer.from_pretrained("facebook/bart-large-cnn")
condi_gen = BartForConditionalGeneration.from_pretrained("facebook/bart-large-cnn")

In [52]:
def get_filenames(files):
    """ returns file names in the folder"""
    file_names = []
    for file in files:
        file_names.append(glob.glob(file))
    file_names = list(itertools.chain.from_iterable(file_names))
    return file_names

def make_summary_f(text, max_length, min_length):
    """ use the facebook model to get a summary"""
    sum_list = summarizer(text, min_length=min_length, max_length = max_length, do_sample=False)
    string = next(iter(sum_list[0].items()))[1]
    
    return string

def punct_model(text):
    """punctuate the text"""
    return(model_p.restore_punctuation(text))

def text_rank(text):
    """ use text rank to rank the phrases"""
    nlp = spacy.load('en_core_web_md')
    nlp.add_pipe('textrank', last=True)
    doc = nlp(text)
    str_arr = []
    for string in doc._.textrank.summary(limit_phrases=5, limit_sentences=1):
        str_arr.append(string)
    return str_arr[0].text

def model(text, run_text_rank, fb_length, min_len):
    """ run_text_rank is a bool, true if text rank should be
        used in the model. Set length for make_summary_f"""
    if len(text)<150:
        return text
    if run_text_rank:
        temp =  text_rank(text)
        return make_summary_f(temp,fb_length, min_len)
    
    return make_summary_f(text, fb_length, min_len)

def short_min(text):
    """
    cut text to first punctuation mark
    """
    symbols = [".",";"]
    arr = []
    i = 0
    counter = 0
    for char in text:
        if char in symbols:
            counter+=1
            arr.append((char, i))
        i+=1
        
    min_val = sys.maxsize
    for tup in arr:
        if min_val>tup[1]:
            min_val = tup[1]
    
    #if you have a good sentence of the right size return it
    return text[0:min_val]

def check_str(text,l,u):
    """ give lower and upper bounds and return true or false
        if the text is within the range"""
    if text == 0:
        return False
    elif len(text)<=u and len(text)>=l:
        return True
    else:
        return False

def shorten_string(text):
    """
    cut text to closest punctuation mark to 80 characters
    """
    symbols = [".",";"]
    arr = []
    i = 0
    counter = 0
    #get all locations of the punctuations
    for char in text:
        if char in symbols:
            counter+=1
            arr.append((char, i))
        i+=1
        
    min_val = sys.maxsize
    for tup in arr:
        if min_val>tup[1]:
            min_val = tup[1]
    
    i = 0
    up_bound = 0
    low_bound = 0
    # get punctuations closest to character 90
    while i<len(arr):
        if up_bound !=0 and low_bound !=0:
            break
        elif arr[i][1]>=90:
            up_bound = arr[i][1]
            low_bound = arr[i-1][1]
        i+=1
    #print(arr)
    #print(up_bound, low_bound)
    
    difu = abs(up_bound-90)
    difl = abs(low_bound-90)
    diful = up_bound-low_bound
    
    # if the upper and lower bounds are close
    if up_bound !=0:
        #shorten to upper bound if the difference between them is small
        # or the upper bound is smaller than the lower bound or the lower bound
        # is 35 or less
        if diful<=5 or difu<=difl or difl<35:
            short_string = text[0:up_bound]
            return short_string
        elif difu>=difl:
            short_string = text[0:low_bound]
            return short_string
    
    return text[0:min_val]
    
def check_stops(text):
    """return how many full stops in the text"""
    symbols = ["."]
    counter = 0
    for char in text:
        if char == ".":
            counter+=1
    return counter

def similar(len1, len2):
    # find out if 2 texts are similar in length
        if abs(len1-len2)<=3:
            return True
        return False

def cut(a, l, u, grp):
    if len(a)<150:
        return str(a)
    # check if the model gives a suitable summary
    
    # shorten the text to the first punctuation and see if it meets the criteria    
    temp_f = short_min(a)

    # shorten the text to the punctutation mark closest to 80 characters and see if it
    # meets the criteria
    temp_e = shorten_string(a)
    
    # add punctuation to the SHORTENED TEXT to FIRST punctuation. 
    # shorten it again to the FIRST punctuation and check if it meets the criteria
    punct_temp_f = punct_model(temp_f)
    temp_fpf = short_min(punct_temp_f)

    # add punctuation to the SHORTENED TEXT to FIRST punctuation.
    # shorten it again to the 80TH punctuation and check if it meets the criteria
    temp_fpe = shorten_string(punct_temp_f)

    # add punctuation to the SHORTENED TEXT to 80TH punctuation.
    # shorten it again to the FIRST punctuation and check if it meets the criteria
    punct_temp_e = punct_model(temp_e)
    temp_epf = short_min(punct_temp_e)

    # add punctuation to the SHORTHENED TEXT to 80TH punctuation.
    # shorten it again to the 80TH punctuation and check if it meets the criteria
    temp_epe = shorten_string(punct_temp_e)

    # add punctuaction to the TEXT and shorten it to the FIRST punctuation
    punct_temp_t = punct_model(a)
    temp_tpf = short_min(punct_temp_t)

    # add punctuation to the TEXT and shorten it to the 80TH punctuation
    temp_tpe = shorten_string(punct_temp_t)

    summaries = []
    lengths = []
    summaries.extend([a, temp_f, temp_e, temp_fpf, temp_fpe, temp_epf, temp_epe, temp_tpf, temp_tpe])
    lengths.extend([len(a), len(temp_f), len(temp_e), len(temp_fpf), len(temp_fpe), len(temp_epf)
                    , len(temp_epe), len(temp_tpf), len(temp_tpe)])
    
    summaries.sort()
    lengths.sort()
    i = len(summaries)-1
    while i>0:
        if not similar(lengths[i],lengths[8]) and check_str(summaries[i],l,u):
            return str(summaries[i])
        elif not similar(lengths[i],lengths[8]) or check_str(summaries[i],l,u):
            return str(summaries[i])
        i-=1
    
    return str(summaries[8])

def loc_full(text):
    """ 
    Get all locations of full stops
    """
    symbols = ["."]
    arr = []
    i = 0
    counter = 0
    #get all locations of the punctuations
    for char in text:
        if char in symbols:
            counter+=1
            arr.append((char, i))
        i+=1
    return arr

def split_paragraph(text):
    """
    input a text and split it up into num_para paragraphs. num_para should
    be more than one since the default of the model is 1 paragraph
    """

    locations = loc_full(text)
    num_para = len(locations)
    if num_para<2:
        return [text]
    new_string = ""
    j = 0
    arr = []
    while j<num_para:
        new_string = str(text[0:locations[j][1]] +  " \n" + text[locations[j][1]:])
        arr.append(new_string)
        #print(new_string)
        #print(locations)
        j+=1
    return arr

def new_model(text):
    input_tokens = tokenizer.batch_encode_plus([text], return_tensors = "pt", max_length = 1024, truncation =True)["input_ids"]
    num_token = input_tokens.shape[1]
    min_ = int(0.3*num_token)
    max_ = int(0.7*num_token)

    encoded_ids = condi_gen.generate(input_tokens, max_length = max_, min_length = min_, num_beams = 4,
                                early_stopping = True)
    summary = tokenizer.decode(encoded_ids.squeeze(), skip_special_tokens = True)
    return textwrap.fill(summary,max_)
    
    

In [52]:
path = ["Json files/*.json"]
json_files = get_filenames(path)
i=0

desc = "PROBLEM DESCRIPTION "
target = "TARGET CONDITION "
current = "CURRENT CONDITION "
root = "ROOT CAUSE ANALYSIS "
counter = "COUNTERMEASURES "
effect = "EFFECT CONFIRMATION "
follow = "FOLLOW UP ACTION "
long_strs = []
#print(len(json_files))
while i < len(json_files):
    f = open(json_files[i])
    dicts = json.load(f)
    for vals in dicts.values():
        for val in vals.items():
            l = 40
            u = 150
            if val[0] == "PROBLEM DESCRIPTION":
                desc += val[1]
            elif val[0] == "TARGET CONDITION":
                target+=val[1]
            elif val[0] == "CURRENT CONDITION":
                current += val[1]
            elif val[0] == "ROOT CAUSE ANALYSIS":
                root+=val[1]
            elif val[0] == "COUNTERMEASURES":
                counter +=val[1]
            elif val[0] == "EFFECT CONFIRMATION":
                effect+=val[1]
            elif val[0] == "FOLLOW UP ACTION":
                follow+=val[1]
                #g.write("\n")
                #a = model(val[1], False, 53)
                #g.write(cut(a,l,u,val[0]))
    i+=1
#print(desc)

In [42]:
long_strs = np.array([desc, target, current, root, counter, effect, follow])
start = time.time()

with open('jsons.txt', 'a') as g:
    for string in long_strs:
        print(string)
        l = 40
        u = 150
        max_len = int(0.8*len(string.split(" ")))
        min_len = int(0.3*len(string.split(" ")))
        a = model(string, False,max_len, min_len)
        g.write(cut(a,l,u))
        g.write("\n")
f.close()
end = time.time()
print("The time of execution of above program is :",
      (end-start))

PROBLEM DESCRIPTION WO# K150-08, Deka 10 TP1008 DV dynamic lean shift failure observed In RSG E-10 test bench mechanical durability per S1297 Injectors were manufactured in Prototype Services May 29, 2019. Configuration: 12.55mm OD,Extra Short, 10mm Tip, Schaleger MPG, NPN ATB, NPN LSSBMW complained first issues 07/2018 (19.07.) @plant Dingolfing (Hr. Habold MUC). There're no specific OBD failure code. The failure appears after first SCR system initiation (F1_Test). Failed at test step: System leak test. Service 31 ID 0x301 Detailed failure description see attachment (EINFÜGEN!).1) What vehicle CN7/CN7a 2) Product SIM3K-541, SW version 6VA600 3) Who issued HMMA QC 4) How HMMA QC reported that '5' instead of 'D' was displayed for the select lever switch information via UDS $22 in Roll &Brake process of CN7 vehicle. And compared to the project Nu ATK CVT, different value was displayed for D range. 5) Containment action 9,124EA to be reprogrammed in HMMA plantInjectors mechanically inoper

IndexError: index out of range in self

In [67]:
df = pd.DataFrame(columns = ["input", "output", "output_length", "input_length", "model_name", "time"])

In [70]:

path = ["Json files/*.json"]
json_files = get_filenames(path)
i=0
with open('jsons2.txt', 'a') as g:
    while i < len(json_files):
        f = open(json_files[i])
        dicts = json.load(f)
        for vals in dicts.values():
            for val in vals.items():
                l = 40
                u = 150
                if val[0] == "PROBLEM DESCRIPTION" or val[0] == "TARGET CONDITION" or val[0] == "CURRENT CONDITION" or val[0] == "ROOT CAUSE ANALYSIS" or val[0] == "COUNTERMEASURES" or val[0] == "EFFECT CONFIRMATION" or val[0] == "FOLLOW UP ACTION":
                    start = time.time()
                    max_len = int(0.8*len(val[1].split(" ")))
                    min_len = int(0.3*len(val[1].split(" ")))
                    #g.write("\ninputs\n")
                    #g.write("\n"+val[1]+"\n")
                    #g.write(val[0])
                    #print(val[0],val[1])
                    #g.write("\n")
                    #split_string = split_paragraph(val[1])
                    a = new_model(val[1])
                    #g.write("\noutputs\n")
                    #g.write(a)
                    #g.write("\n")
                    arr = []
                    #for j in split_string:
                        #b = model(j,False,max_len, min_len)
                        
                        #b = new_model(j)
                        #g.write("\noutputs\n")
                        #g.write(b)
                        #a = str(val[0] + ": " + b)
                    if len(b)<5:
                        #arr.append(val[1])
                        g.write(val[1])
                        #else:
                            #g.write(a)
                            #c = cut(b,l,u,val[0])
                            #arr.append(c)
                            #g.write(cut(b,l,u,val[0]))
                        #g.write("\n")
                    end =  time.time()
                    df.loc[len(df)] = [val[1], a, len(a), len(val[1]), "uncut_unsplit_80_false_4", end-start]
                            
        i+=1

Input length of decoder_input_ids is 1, but `max_length` is set to 1. This can lead to unexpected behavior. You should consider increasing `max_new_tokens`.


RuntimeError: The expanded size of the tensor (1) must match the existing size (2) at non-singleton dimension 0.  Target sizes: [1].  Tensor sizes: [2]

In [71]:
df

Unnamed: 0,input,output,output_length,input_length,model_name,time
0,"WO# K150-08, Deka 10 TP1008 DV dynamic lean sh...","WO# K150-08, Deka 10 TP1008 DV dynamic lean sh...",90,266,cut_split_dynamic_false,4.753284
1,Specification (+5% / -3%) 200M Audit: 47362 = ...,"200M Audit: 47362 = -10.1%, 47371 = -6.2%, 47",45,189,cut_split_dynamic_false,9.356972
2,Twenty four of twenty-four injectors successfu...,Twenty four of twenty-four injectors successfu...,109,109,cut_split_dynamic_false,0.000997
3,Reference 8D ES 191216231144 for detailed anal...,RCH2 - Deka 10. design and process variation i...,145,1061,cut_split_dynamic_false,5.075427
4,"WO# K150-08, Deka 10 TP1008 DV dynamic lean sh...","WO# K150-08, Deka 10 TP1008 DV dynamic lean sh...",204,266,uncut_unsplit_80_false_4,4.080085
5,Specification (+5% / -3%) 200M Audit: 47362 = ...,Specification (+5% / -3%) 200M Audit: 47362 = ...,108,189,uncut_unsplit_80_false_4,4.110008
6,Twenty four of twenty-four injectors successfu...,Twenty four of\ntwenty-four\ninjectors\nsucces...,91,109,uncut_unsplit_80_false_4,1.311477
7,Reference 8D ES 191216231144 for detailed anal...,RCH2 - Deka 10 design and process variation in...,352,1061,uncut_unsplit_80_false_4,6.824746
8,1. Review of defined dynamic tolerance vs syst...,1. Review of defined dynamic tolerance vs syst...,194,544,uncut_unsplit_80_false_4,3.422844
9,AMC PV6 (L062-05) PV8 (L113-03) and PV9 (L209-...,AMC PV6 (L062-05) PV8 (L113-03) and PV9 (L209-...,179,348,uncut_unsplit_80_false_4,4.434141


In [50]:
df = pd.DataFrame(columns = ["input", "output", "output_length", "input_length", "model_name"])
df

Unnamed: 0,input,output,output_length,input_length,time_taken,model_name
