In [1]:
import sentencepiece
from transformers import pipeline, AutoTokenizer, AutoModelForTokenClassification
import glob
import itertools
import json
import spacy
import pytextrank
import en_core_web_sm
import re
import sys
from deepmultilingualpunctuation import PunctuationModel
import time

In [2]:
#!python -m spacy download en_core_web_md 

In [3]:
# Punctuation model shared by the notebook, built with PunctuationModel's default
# arguments. punct_model() calls its restore_punctuation() method.
model_p = PunctuationModel()



In [4]:
# transformers `pipeline` for the "summarization" task, using the
# facebook/bart-large-cnn model. make_summary_f() calls it to produce summaries.
summarizer = pipeline("summarization", model="facebook/bart-large-cnn")

In [5]:
def get_filenames(files):
    """Expand every glob pattern in ``files`` into one flat list of paths.

    Args:
        files: Iterable of glob patterns (e.g. ``["Json files/*.json"]``).

    Returns:
        A list with the matches of each pattern, in pattern order; patterns
        that match nothing contribute no entries.
    """
    matches = []
    for pattern in files:
        # glob.glob returns a list per pattern; extend keeps the result flat.
        matches.extend(glob.glob(pattern))
    return matches

def make_summary_f(text, max_length):
    """Summarise ``text`` with the module-level ``summarizer`` pipeline.

    Args:
        text: Text handed to the summariser.
        max_length: Forwarded as the pipeline's ``max_length``; ``min_length``
            is fixed at 20 and sampling is disabled.

    Returns:
        The value stored under the first key of the first result dict.
    """
    results = summarizer(text, min_length=20, max_length=max_length, do_sample=False)
    first_result = results[0]
    # Only the first value of the first result is used.
    return next(iter(first_result.values()))

def punct_model(text):
    """Run ``text`` through ``model_p.restore_punctuation`` and return the result."""
    punctuated = model_p.restore_punctuation(text)
    return punctuated

def text_rank(text):
    """Return the top sentence of ``text`` as ranked by the ``textrank`` component.

    The spaCy pipeline (``en_core_web_md`` plus a trailing ``textrank`` pipe) is
    built on the first call and cached on the function object, so later calls do
    not reload the model from disk.

    Args:
        text: Text to analyse.

    Returns:
        The ``.text`` of the first sentence yielded by
        ``doc._.textrank.summary(limit_phrases=5, limit_sentences=1)``.

    Raises:
        IndexError: If the summary yields no sentence at all.
    """
    # Loading the model is by far the most expensive step; do it only once.
    nlp = getattr(text_rank, "_nlp", None)
    if nlp is None:
        nlp = spacy.load('en_core_web_md')
        nlp.add_pipe('textrank', last=True)
        text_rank._nlp = nlp

    doc = nlp(text)
    ranked = list(doc._.textrank.summary(limit_phrases=5, limit_sentences=1))
    return ranked[0].text

def model(text, run_text_rank, fb_length):
    """Summarise ``text``, optionally reducing it with ``text_rank`` first.

    Args:
        text: Text to summarise.
        run_text_rank: When truthy, ``text`` is first passed through
            ``text_rank`` and only that result is summarised.
        fb_length: Passed to ``make_summary_f`` as its ``max_length``.

    Returns:
        Whatever ``make_summary_f`` returns for the chosen input.
    """
    source = text_rank(text) if run_text_rank else text
    return make_summary_f(source, fb_length)

def short_min(text):
    """Return ``text`` up to, but not including, its first "." or ";".

    When ``text`` contains neither mark it is returned in full.
    """
    marks = (".", ";")
    first_mark = next(
        (pos for pos, char in enumerate(text) if char in marks),
        None,
    )
    if first_mark is None:
        # Nothing to cut at: slice with an effectively unbounded end.
        return text[0:sys.maxsize]
    return text[0:first_mark]

def check_str(text,l,u):
    """Tell whether ``len(text)`` lies within the inclusive range ``[l, u]``.

    A ``text`` that compares equal to ``0`` is rejected before its length is
    examined.
    """
    if text == 0:
        return False
    size = len(text)
    return l <= size <= u

def shorten_string(text, target=90):
    """Cut ``text`` at a "." or ";" near character index ``target``.

    The returned string never includes the punctuation mark it was cut at.

    Args:
        text: String to shorten.
        target: Character index the cut should land near. Defaults to 90, the
            value that used to be hard-coded in the body.

    Returns:
        * ``text`` unchanged if it contains no "." or ";".
        * ``text`` up to its first mark if every mark lies before ``target``.
        * ``text`` up to the first mark at/after ``target`` (the upper bound)
          if there is no usable mark before ``target``.
        * Otherwise ``text`` up to the upper bound or up to the last mark before
          ``target`` (the lower bound), chosen by the rule in the body.
    """
    symbols = (".", ";")
    positions = [idx for idx, char in enumerate(text) if char in symbols]
    if not positions:
        return text

    # Index into `positions` of the first mark at or beyond the target.
    upper_idx = next(
        (k for k, pos in enumerate(positions) if pos >= target),
        None,
    )
    if upper_idx is None:
        # Every mark lies before the target: fall back to the first one.
        return text[0:positions[0]]

    up_bound = positions[upper_idx]

    # A lower bound is usable only if a mark exists before the target and that
    # mark is not at index 0 (cutting there would return an empty string).
    # Previously 0 doubled as the "not found" sentinel, so a leading mark made
    # the scan skip to a later, farther pair of marks.
    if upper_idx == 0 or positions[upper_idx - 1] == 0:
        return text[0:up_bound]
    low_bound = positions[upper_idx - 1]

    difu = up_bound - target      # overshoot of the upper bound
    difl = target - low_bound     # undershoot of the lower bound
    diful = up_bound - low_bound  # gap between the two candidates

    # Cut at the upper bound when the two marks are at most 5 apart, when the
    # upper bound is at least as close to the target as the lower one, or when
    # the lower bound is within 35 characters of the target.
    # NOTE(review): the original comment read "the lower bound is 35 or less",
    # which suggests `low_bound <= 35` may have been intended instead of
    # `difl < 35`; the existing condition is kept -- confirm with the author.
    if diful <= 5 or difu <= difl or difl < 35:
        return text[0:up_bound]
    return text[0:low_bound]
    
def check_stops(text):
    """Count the full stops (".") that occur in ``text``."""
    return sum(1 for char in text if char == ".")

def similar(len1, len2):
    """Tell whether two lengths differ by at most 3."""
    return abs(len1 - len2) <= 3

def cut(a, l, u, grp):
    """Build shortened variants of ``a`` and return one, prefixed with ``grp``.

    Nine candidates are produced: ``a`` itself plus combinations of
    ``short_min`` (cut at the first "." / ";"), ``shorten_string`` (cut at a
    mark near its target index) and ``punct_model`` (punctuation restoration).

    The candidates are ordered by length and scanned from the longest down to
    the second shortest. The first one that either satisfies
    ``check_str(candidate, l, u)`` or whose length is not ``similar`` to the
    longest candidate's length is returned. If none qualifies, the longest
    candidate is returned.

    Previously the candidate strings were sorted alphabetically while their
    lengths were sorted numerically in a separate list, so ``lengths[i]`` did
    not describe ``summaries[i]``. The lengths are now derived from the sorted
    candidates.

    Args:
        a: Text to shorten.
        l: Lower length bound passed to ``check_str``.
        u: Upper length bound passed to ``check_str``.
        grp: Label placed in front of the chosen candidate.

    Returns:
        ``grp + ": " + candidate``.
    """
    # Variants cut directly from the input text.
    temp_f = short_min(a)
    temp_e = shorten_string(a)

    # Re-punctuate the first-mark cut, then cut it again both ways.
    punct_temp_f = punct_model(temp_f)
    temp_fpf = short_min(punct_temp_f)
    temp_fpe = shorten_string(punct_temp_f)

    # Re-punctuate the near-target cut, then cut it again both ways.
    punct_temp_e = punct_model(temp_e)
    temp_epf = short_min(punct_temp_e)
    temp_epe = shorten_string(punct_temp_e)

    # Re-punctuate the full text, then cut it both ways.
    punct_temp_t = punct_model(a)
    temp_tpf = short_min(punct_temp_t)
    temp_tpe = shorten_string(punct_temp_t)

    # Order by length so each entry of `lengths` belongs to the candidate at
    # the same index.
    summaries = sorted(
        [a, temp_f, temp_e, temp_fpf, temp_fpe, temp_epf, temp_epe, temp_tpf, temp_tpe],
        key=len,
    )
    lengths = [len(candidate) for candidate in summaries]
    longest = lengths[-1]

    # NOTE(review): the scan stops before index 0, so the shortest candidate is
    # never examined, and a candidate outside [l, u] is accepted as soon as its
    # length differs from the longest by more than 3. Both are kept as found --
    # confirm the intended rule.
    i = len(summaries) - 1
    while i > 0:
        if not similar(lengths[i], longest) or check_str(summaries[i], l, u):
            return grp + ": " + summaries[i]
        i -= 1

    return grp + ": " + summaries[-1]

def check_length(text, length):
    """Return the first ``length`` characters of ``text``.

    This is a plain slice: shorter inputs come back unchanged and no word
    counting is performed.
    """
    return text[:length]
    

In [6]:
# Concatenate the text of each report section across all JSON files and time it.
start = time.time()
path = ["Json files/*.json"]
json_files = get_filenames(path)
length = 2500
# Length bounds for the per-section summarisation step that is currently
# disabled (see the commented lines in the loop).
l = 40
u = 150

# Text fragments collected per section heading, in file order.
section_parts = {
    "PROBLEM DESCRIPTION": [],
    "TARGET CONDITION": [],
    "CURRENT CONDITION": [],
    "ROOT CAUSE ANALYSIS": [],
    "COUNTERMEASURES": [],
    "EFFECT CONFIRMATION": [],
    "FOLLOW UP ACTION": [],
}

# The output file is opened in append mode for the disabled write step;
# nothing is written to it at present.
with open('jsons4.0.txt', 'a') as g:
    for json_path in json_files:
        # Close each input as soon as it is parsed. Previously only the last
        # file was closed, and an empty file list raised NameError on close.
        with open(json_path) as f:
            dicts = json.load(f)
        for vals in dicts.values():
            for heading, body in vals.items():
                if heading in section_parts:
                    section_parts[heading].append(body)
                    # Disabled summarisation/write step:
                    #g.write("\n")
                    #a = model(body, False, 53)
                    #g.write(cut(a,l,u,heading))

# Each variable is its heading, a space, then every fragment joined together.
desc = "PROBLEM DESCRIPTION " + "".join(section_parts["PROBLEM DESCRIPTION"])
target = "TARGET CONDITION " + "".join(section_parts["TARGET CONDITION"])
current = "CURRENT CONDITION " + "".join(section_parts["CURRENT CONDITION"])
root = "ROOT CAUSE ANALYSIS " + "".join(section_parts["ROOT CAUSE ANALYSIS"])
counter = "COUNTERMEASURES " + "".join(section_parts["COUNTERMEASURES"])
effect = "EFFECT CONFIRMATION " + "".join(section_parts["EFFECT CONFIRMATION"])
follow = "FOLLOW UP ACTION " + "".join(section_parts["FOLLOW UP ACTION"])

print(desc)
end = time.time()
print("The time of execution of above program is :",
      (end-start))

PROBLEM DESCRIPTION WO# K150-08, Deka 10 TP1008 DV dynamic lean shift failure observed In RSG E-10 test bench mechanical durability per S1297 Injectors were manufactured in Prototype Services May 29, 2019. Configuration: 12.55mm OD,Extra Short, 10mm Tip, Schaleger MPG, NPN ATB, NPN LSSBMW complained first issues 07/2018 (19.07.) @plant Dingolfing (Hr. Habold MUC). There're no specific OBD failure code. The failure appears after first SCR system initiation (F1_Test). Failed at test step: System leak test. Service 31 ID 0x301 Detailed failure description see attachment (EINFÜGEN!).1) What vehicle CN7/CN7a 2) Product SIM3K-541, SW version 6VA600 3) Who issued HMMA QC 4) How HMMA QC reported that '5' instead of 'D' was displayed for the select lever switch information via UDS $22 in Roll &Brake process of CN7 vehicle. And compared to the project Nu ATK CVT, different value was displayed for D range. 5) Containment action 9,124EA to be reprogrammed in HMMA plantInjectors mechanically inoper

In [None]:
start = time.time()
path = ["Json files/*.json"]
json_files = get_filenames(path)
i=0
length = 2500