In [1]:
%load_ext autoreload
%autoreload 15

In [2]:
import signal
from contextlib import contextmanager

class TimeoutException(Exception): pass

@contextmanager
def time_limit(seconds):
    def signal_handler(signum, frame):
        raise TimeoutException("Timed out!")
    signal.signal(signal.SIGALRM, signal_handler)
    signal.alarm(seconds)
    try:
        yield
    finally:
        signal.alarm(0)

In [3]:
import sys
import os
# !pip install fitz
# import comtypes.client
import argparse
import time

def wordToPdf(file1):
    wdFormatPDF = 17
    input_file = os.path.abspath(file1)
    output_file = os.path.splitext(input_file)[0]+".pdf"
    output_file = os.path.abspath(output_file)
    word = comtypes.client.CreateObject('Word.Application')
    word.Visible = True
    time.sleep(3)
    
    doc = word.Documents.Open(input_file)

    doc.SaveAs(output_file, FileFormat=wdFormatPDF)

    doc.Close()
    word.Quit()

    return output_file
   



In [4]:
import json
import re
from bs4 import BeautifulSoup
import os
import requests
from pprint import pprint
import uuid
from statistics import mode
from nltk.corpus import stopwords
from pdf2txt import convert_pdf
# !pip install tika
import tika
from definitions_v4 import *



In [5]:
pointsRE_heading=re.compile('(?:\s*\([a-z]{1,3}\)|[A-Z]{1}\s+[a-zA-Z0-9_\s]{5})')
stop_words = list(set(stopwords.words('english')))
y_cordinate_Re=re.compile('top:(\d{1,5})px')
height_re=re.compile('height:(\d{1,5})px')

In [6]:
def check_underline(positions,y_cord_text):
    positions=[i for i in positions if y_cord_text<=i< y_cord_text+16]
    if positions:
        return True
    else:
        return False
    

In [7]:
def get_pages_no(soup):
    total_items = int(soup.find('span', id='NewReleases_total').text)
    items_per_page = int(soup.find('span', id='NewReleases_end').text)
    return round(total_items/items_per_page)

In [8]:
from pdfminer.high_level import extract_pages

# print(len(list(extract_pages('test/Chubb16-250-1019 Chubb EBM Business Pack Product Disclosure Statement (PDS) and Policy Wording.pdf'))))

In [9]:
def get_underlines(soup):
    positions=[]
    for span in soup.find_all('span'):
        y_cord_list=y_cordinate_Re.findall(str(span))
        if y_cord_list:
            y_cord=int(y_cord_list[0])
        else:
            continue
        style="position:absolute; border: black 1px solid" in str(span)
        height_px_li=height_re.findall(str(span))
        if height_px_li:
            height_px=height_px_li[0]
        else:
            continue
        height=int(height_px)<15
        if all([style,height]):
            positions.append(y_cord)
    return list(set(positions))

In [10]:
def ext_extract(text_list,language):
    ext_flag=False
    if language=='english':
        ext_lis=['extension']
    else:
        ext_lis=['extensión','extensiones']
   
    ext_count=0
    position_list=[]
    position=0
    for text in text_list:
        # if any two of 'exclusion ,condition or extension is present in the text, that text is omitted from the logic'
        bool_list=['condition' in text.lower(),'exclusion' in text.lower(),'extension' in text.lower()]
        if sum(bool_list)>=2:
            continue
        for i in ext_lis:
            for match in re.finditer(i, text.lower()):
                position_list.append(position+match.start())               
                ext_count+=1
        position+=len(text)
    if ext_count>=1:
        ext_flag=True
    return(ext_flag,ext_count,position_list)

In [11]:
def excl_extract(text_list,language):
    exclusion_flag=False
    if language=='english':
        excl_lis=['exclud','not cover','except','does not mean','not includ','exclusion']
    else:
        excl_lis=['excepto','excepción','no in cluido','exclusión','excluidos','excluirlo','excluyendo','exclusiones','exclusion','excluyen','excluyentes']
   
    exclusion_count=0
    position_list=[]
    position=0
    for text in text_list:
        # if any two of 'exclusion ,condition or extension is present in the text, that text is omitted from the logic'
        bool_list=['condition' in text.lower(),'exclusion' in text.lower(),'extension' in text.lower()]
        if sum(bool_list)>=2:
            continue
        for i in excl_lis:
            for match in re.finditer(i, text.lower()):
                position_list.append(position+match.start())               
                exclusion_count+=1
        position+=len(text)
    if exclusion_count>=1:
        exclusion_flag=True
    return(exclusion_flag,exclusion_count,position_list)

In [12]:
def ext_check(text,language):
    if language=='english':
        ext_bag=['extension']
    else:
        ext_bag=['extensión','extensiones']
    for word in ext_bag:
        if word in text.lower():
            return True
    return False
    

In [13]:
def cond_extract(text_list,language):
    cond_flag=False
    if language=='english':
        cond_lis=['condition']
    else:
        cond_lis=['condición','condiciones','condicionado']
   
    cond_count=0
    position_list=[]
    position=0
    for text in text_list:
        # if any two of 'exclusion ,condition or extension is present in the text, that text is omitted from the logic'
        bool_list=['condition' in text.lower(),'exclusion' in text.lower(),'extension' in text.lower()]
        if sum(bool_list)>=2:
            continue
        for i in cond_lis:
            for match in re.finditer(i, text.lower()):
                position_list.append(position+match.start())               
                cond_count+=1
        position+=len(text)
    if cond_count>=1:
        cond_flag=True
    return(cond_flag,cond_count,position_list)

In [14]:
def font_extraction(soup):
    fontsizes=[]
    for divs in soup.findAll('div'):
        for j in divs.find_all('span'):
            ext_size=re.findall(r'font-size:(.*)px">',str(j))
            if ext_size:
                fontsizes.append(int(ext_size[0]))
    return(fontsizes)

In [15]:
def check_box(positions,y_cord_text):
    positions=[i for i in positions if y_cord_text-7 <=i< y_cord_text]
    if positions:
        return True
    else:
        return False

In [16]:
def get_underlines_n_boxes(soup):
    underline_positions=[]
    box_positions=[]
    for span in soup.find_all('span'):
        y_cord_list=y_cordinate_Re.findall(str(span))
       
        if y_cord_list:
            y_cord=int(y_cord_list[0])
        else:
            continue
        style="position:absolute; border: black 1px solid" in str(span)
        height_px_li=height_re.findall(str(span))
        if height_px_li:
            height_px=height_px_li[0]
        else:
            continue
        height=int(height_px)<15
        heigh_box=18<=int(height_px)<50
       
        if all([style,height]):
            underline_positions.append(y_cord)
        elif all([style,heigh_box]) :
            box_positions.append(y_cord)
#             print(height_px,'HEIGHT_BOX')
#             print()
#             print(y_cord,'YCHORDDDD')
           
    return list(set(underline_positions)),list(set(box_positions))

In [17]:
def stopword_check(word,text,language):
    if language=='spanish':
        stop_words = list(set(stopwords.words('spanish')))
    else:
        stop_words = list(set(stopwords.words('english')))
        transition="although  instead  whereas  despite  conversely  otherwise  however moreover  likewise  comparatively  correspondingly  similarly  furthermore  additionallyver  rather  nevertheless  nonetheless  regardless  notwithstanding consequently  therefore  thereupon  forthwith  accordingly  henceforth"
        transition_words=transition.split()
        transition_words
        stop_words.extend(transition_words)

    if exclusion_check(text,language):
        return True
    if word.lower() in stop_words:
        return False
    else:
        return True

In [18]:
def exclusion_check(text,language):
    if language=='spanish':
        exclusion_bag=['excepto','excepción','no in cluido','exclusión','excluidos','excluirlo','excluyendo',' excluyen','exclusiones','exclusiones','exclusion','excluyentes']
    else:
        exclusion_bag=['exclusion','excluded','not covered','will not cover','will not pay']
    for word in exclusion_bag:
        if word in text.lower():
            return True
    return False
    

In [19]:
def criteria_check(text,language):
    if language=='spanish':
        exclusion_bag=['condición','condiciones','condicionado']
    else:
        exclusion_bag=['condition','conditions']
    for word in exclusion_bag:
        if word in text.lower():
            return True
    return False
    

In [20]:
def ext_check(text,language):
    if language=='english':
        ext_bag=['extension']
    else:
        ext_bag=['extensión','extensiones']
    for word in ext_bag:
        if word in text.lower():
            return True
    return False
    

In [21]:
def endorsement_check(text,language):
    if language=='english':
        ext_bag=['additional terms and conditions','endorsement 1']
#     else:
#         ext_bag=['extensión','extensiones']
    for word in ext_bag:
        if word in text.lower():
            return True
    return False
    

In [22]:
def ins_agg_check(text,language):
    if language=='spanish':
        exclusion_bag=['excepto','excepción','no in cluido','exclusión','excluidos','excluirlo','excluyendo',' excluyen','exclusiones','exclusiones','exclusion','excluyentes']
    else:
        exclusion_bag=['appendix','contact us','about chubb','introduction','general information','tax audit','financial strength rating','fair insurance code','duty of disclosure','privacy statement','important information','important notices','finance act','insurance act','authorization and regulation','french prudential supervision and resolution authority','compensation scheme','financial services','product disclosure','insurer complaints procedure','dispute resolution','complaints procedure','exclusion','excluded','not covered','will not cover','will not pay','additional terms and conditions','condition','conditions','definition','extension','data protection']
    for word in exclusion_bag:
        if word in text.lower():
            return False
    return True
    

In [23]:
start=re.compile('.*\d{1,3}.*(?:\-|\.|–)')
def clean(text):
    text=text.replace('\n','').replace('\t',' ')
    text=start.sub('',text).strip()
    

    return(text)
def clean_pharses(phrases):
    phrases=[clean(i) for i in phrases if len(i)> 3]
    phrases=list(set(phrases))
    return phrases

In [24]:
def create_local_index(path,subdir,language,country):
    pointsRE_heading=re.compile('(?:\s*\([a-z]{1,3}\)|(?<![A-z])[A-Z]{1}\s+[a-zA-Z0-9_\s]{5})')
    pageNumRE=re.compile('Page\s*(\d{1,3})',re.IGNORECASE)
    neglect_def=['policy','insured','schedule']
    local_indexed={}
    previous_span=''
    bold_phrases_indexed={}
    for file in os.listdir(path):
        if file.endswith('.pdf'):
            print(file)
            try:
                pg_no=len(list(extract_pages(os.path.join(path,file))))
            except:
                pg_no=0
            try:
                html=convert_pdf(os.path.join(path,file),'html')
            except:
                continue
            soup = BeautifulSoup(html, 'html5lib')
            underline_positions,box_positions=get_underlines_n_boxes(soup)
            
            try:
                fontsizes=font_extraction(soup)
                file_font_size_mode=mode(fontsizes)
            except:
                file_font_size_mode=8
                
            text_para,text_plain_para='',''
            text_lis,text_plain_lis,bold_lis,page_list,=[],[],[],[]
            bold=False
            sub_page_def_list=[]
            try:
                definitions=def_extraction2(os.path.join(path,file))
            except Exception as e:
                print(e)
                continue
            definitions={key:value for key,value in definitions.items() if key.lower().strip() not in neglect_def}
            def_terms=list(definitions.keys())
            definition_text=''
            header_match_object=(0,'',False)
            cond_header_match_object=(0,'',False)
            ext_header_match_object=(0,'',False)
            endorsement_header_match_object=(0,'',False)
            ins_agg_header_match_object=(0,'',False)
            endorsement_y_cord=0
            second_category=False
            head_found=False
            def_flag=False
            span_text_pos=0
            header_pos=[]
            cond_head_found=False
            ext_head_found=False
            endorsement_head_found=False
            ins_agg_head_found=False
            single_page_head_found=False
            condition_text=''
            ext_text=''
            excl_text=''
            endorsement_text=''
            ins_agg_text=''
            previous_pg_num=[]
            def_in_page=[]
            cond_single_page_head_found=False
            ext_single_page_head_found=False
            endorsement_single_page_head_found=False
            ins_agg_single_page_head_found=False
            
            font_size=file_font_size_mode

            for divs in soup.findAll('div'):
                div_text_list=[span.text for span in divs.find_all('span') ]
                page_str=str(divs.find_all('a'))
                page_num_results=pageNumRE.findall(page_str)
                if page_num_results:
                    pagenum=page_num_results[0]

                    if pagenum!=previous_pg_num:
                        sub_definitions_in_page=[]
                    previous_pg_num=pagenum

                    if text_lis:
                        text_lis=text_lis[:-1]
                        text_para=''.join(text_lis)
                        text_plain_lis=text_plain_lis[:-1]
                        text_plain_para=''.join(text_plain_lis)
                        if(len(re.findall('\.',text_para))>(len(text_para)/3) or (len(re.findall('\d',text_para))>50 and 'gbp' not in text_para.lower() and 'content' in text_para.lower())): # and int(pagenum)<=2
                            print('PASS')
                            text_para,text_plain_para=' ',' '
                            text_lis,text_plain_lis=[],[]
                            single_page_head_found=False
                            head_found=False
                            span_text_pos=0
                            header_pos=[]
                            cond_head_found=False
                            ext_head_found=False
                            endorsement_head_found=False
                            ins_agg_head_found=False
                            continue
                        
                        else:
                            bold_lis=clean_pharses(bold_lis)
                            pagenum=int(pagenum)-1
                            excl_flag,excl_count,excl_pos_lis=excl_extract(text_plain_lis,language)
                            cond_flag,cond_count,cond_pos_lis=cond_extract(text_plain_lis,language)
                            ext_flag,ext_count,ext_pos_lis=ext_extract(text_plain_lis,language)
                            if int(pagenum)==1:
                                if head_found or single_page_head_found or excl_count>0:
                                    excl_flag=True
                                else:
                                    excl_flag=False
                            else:
                                if head_found or single_page_head_found:
                                    excl_flag=True

                            if cond_head_found or cond_single_page_head_found or cond_count>0:
                                cond_flag=True
                            else:
                                cond_flag=False
    
                            if ext_head_found or ext_single_page_head_found or ext_count>0:
                                ext_flag=True
                            else:
                                ext_flag=False
    
                            if endorsement_head_found or endorsement_single_page_head_found:
                                endorsement_flag=True
                                print(pagenum,endorsement_flag)
                            else:
                                endorsement_flag=False
                                print(pagenum,endorsement_flag)
                            
                            if ins_agg_head_found or ins_agg_single_page_head_found:
                                ins_agg_flag=True
#                                 print(pagenum,True)
                            else:
                                ins_agg_flag=False
#                                 print(pagenum,False)
                                
                            for term,defs in definitions.items():
                                if defs[1]==pagenum:  ###defs[1] is the pagenumber
                                    definition_text=definition_text+'          '+term+' '+defs[0]
                                    def_in_page.append({'name':term,'text':defs[0]})

                            if definition_text:
                                def_flag=True
                            
                            if def_flag:
                                def_search_flag=True
                            else:
                                def_search_flag=False   
                            
                            if pg_no>2:
                                endorsements=False
                            else:
                                endorsements=True
#                             print('PAGE',pagenum)
#                             print(endorsements,'ENDORSEMENTS!!!')
                            if text_para in local_indexed.keys():
                                text_para=text_para+' '
                            
                            local_indexed[text_para]=(file,pagenum,bold_lis,text_plain_para,excl_flag,excl_count,excl_pos_lis,subdir,sub_page_def_list,country,language,definition_text,def_search_flag,endorsements,excl_text,condition_text,cond_flag,cond_count,cond_pos_lis,def_in_page,ext_text,ext_flag,ext_count,ext_pos_lis,span_text_pos,header_pos,endorsement_flag,endorsement_text,ins_agg_flag,ins_agg_text)
#                             print(sub_page_def_list,'DEF LIST!!!!!1')
                            sub_page_def_list=[]
                            span_text_pos=0
                            header_pos=[]
                            page_list.append(pagenum)
                            text_para,text_plain_para='',''
                            text_lis,text_plain_lis,bold_lis=[],[],[]
                            definition_text=''
                            span_text_pos=0
                            cond_single_page_head_found=False
                            ext_single_page_head_found=False
                            endorsement_single_page_head_found=False
                            ins_agg_single_page_head_found=False
                            condition_text=''
                            ext_text=''
                            excl_text=''
                            ins_agg_text=''
                            endorsement_text=''
                            single_page_head_found=False
                            def_in_page=[]
                            def_flag=False
                for span in divs.find_all('span'):

                    bold=False
                    upper=False
                    bullet=False
                    def_flag=False
                    span_position=div_text_list.index(span.text)
#                     span_text_pos+=len(span.text)
# #                     print(span_text_pos,'SPAN POS')
                    if "Bold" in str(span) or 'CIDFont+F3' in str(span):
                        bold_lis.append(span.text)
                        bold=True
                    if span.text.isupper():
                        upper=True
                    font_family_match=re.findall(r"font-family: b'(.*)';",str(span))
                    if font_family_match:
                        font_family=font_family_match[0]
                    else:
                        font_family=''
#                     print(font_family)
                    font_size_match=re.findall(r'font-size:(.*)px">',str(span))
                    if font_size_match:
                        font_size=int(font_size_match[0])
                    if pointsRE_heading.findall(span.text):
                        bullet=True
#@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
                    y_cord_list=y_cordinate_Re.findall(str(divs))
                    if y_cord_list:
                        y_cord=int(y_cord_list[0])
#                     underlined_text=check_underline(underline_positions,y_cord)
                    underlined_text=check_underline(underline_positions,y_cord)
                    
                    boxed_text=check_box(box_positions,y_cord)
#                     if 'Endorsement 1' in span.text:
#                         print(box_positions,y_cord,'BOX POS AND Y')
                    if span.text.split('\n')[0]!='' and span.text.split('\n')[0]!=' ':
                        head_check_text=span.text.split('\n')[0]
                    elif len(span.text.split('\n'))>1:
                        head_check_text=span.text.split('\n')[1]
                    else:
                        head_check_text=''
##############################3  
                    if str(previous_span).endswith('<br/></span>'):
                        span_position=0
                    ##################INSERT FUNCTION####################z
                    #EXCLUSION 
                    if  span_position==0 and (bold or font_size>= file_font_size_mode +2 ) and (font_size> file_font_size_mode or upper) and 5<len(head_check_text.strip())<80:                    
                        header_pos.append(span_text_pos)
#                         print('HEADER FOUND', span.text,span_text_pos)
                        if exclusion_check(head_check_text,language) and not head_found: 
                            head_found=True
                            header_match_object=(font_size,font_family,bold,upper,bullet)
                            First_category=True
                            second_category=False
                            single_page_head_found=True
#                             print('FOUND Heading for exclusion ...',span.text)
          
    
                        elif ((font_size,font_family,bold,upper,bullet)==header_match_object  or font_size>header_match_object[0] ) and head_found and not exclusion_check(head_check_text,language):
                            head_found=False
                            single_page_head_found=True
#                             print('FOUND Closure for exclusion ...',span.text)
#                     if 'CLAIMS CONDITIONS' in span.text:
#                         print ('^^^^^^^',(font_size,font_family,bold,upper,bullet),header_match_object, font_size>header_match_object[0] , head_found , not exclusion_check(head_check_text,language))
#                         print(pointsRE_heading.findall(span.text))
                    words_title=[word.istitle() for word in head_check_text.split() if stopword_check(word,head_check_text,language) and not word.isdigit() ]
                    if span_position==0 and all(words_title) and head_check_text.strip() not in definitions and  (len(words_title) >1 or underlined_text )and 5<len(head_check_text.strip())<80 :
#                         print('MMMMMMMMARARARA',span.text)
                        header_pos.append(span_text_pos)
#                         print('HEADER 2 FOUND', span.text,span_text_pos)
                        if exclusion_check(head_check_text,language) and  not head_found:
                            head_found=True
                            second_category=True
                            First_category=False
                            single_page_head_found=True
                            header_match_object=(font_size,font_family,bold,upper,bullet)
#                             print("FOUND Exclusion heading type2 ....",span.text)
                        elif head_found and not exclusion_check(head_check_text,language) and second_category and ((font_size,font_family,bold,upper,bullet)==header_match_object or font_size > header_match_object[0] ) :
                            head_found=False
                            second_category=False
                            single_page_head_found=True
#                             print('FOUND Exclusion closure type 2',span.text)
                    if head_found:
                        excl_text=excl_text+' '+span.text     
                     
                    #CONDITIONS
                    if  span_position==0 and (bold or font_size>= file_font_size_mode +2 ) and (font_size> file_font_size_mode or upper) and 5<len(head_check_text.strip())<80:                    
                        if criteria_check(head_check_text,language) and not cond_head_found:
#                             print(int(pagenum),'COND PAGE')
#                             print(span_position==0,(bold or font_size>= file_font_size_mode +2 ), (font_size> file_font_size_mode or upper), 5<len(head_check_text.strip())<80)
                            cond_head_found=True
                            cond_header_match_object=(font_size,font_family,bold,upper,bullet)
                            cond_First_category=True
                            cond_second_category=False
                            cond_single_page_head_found=True
#                             print('FOUND Heading for condition...',span.text)
                            
                        elif ((font_size,font_family,bold,upper,bullet)==cond_header_match_object or font_size>cond_header_match_object[0] ) and not criteria_check(head_check_text,language)  and cond_head_found:
                            cond_head_found=False
                            cond_single_page_head_found=True

                    cond_words_title=[word.istitle() for word in head_check_text.split() if stopword_check(word,head_check_text,language) and not word.isdigit() ]
                    if span_position==0 and all(cond_words_title) and head_check_text.strip() not in definitions and  (len(words_title) >1 or underlined_text )and 5<len(head_check_text.strip())<80 :
#                         print('MMMMMMMMARARARA',span.text)

                        if criteria_check(head_check_text,language) and  not cond_head_found:
                            cond_head_found=True
                            cond_second_category=True
                            cond_First_category=False
                            cond_single_page_head_found=True
                            cond_header_match_object=(font_size,font_family,bold,upper,bullet)
#                             print("FOUND Condition heading type2 ....",span.text)
                        elif cond_head_found and not criteria_check(head_check_text,language) and cond_second_category and ((font_size,font_family,bold,upper,bullet)==cond_header_match_object or font_size > cond_header_match_object[0] ) :
                            cond_head_found=False
                            cond_second_category=False
                            cond_single_page_head_found=True
#                             print('FOUND Condition closure type 2',span.text)

                    if cond_head_found==True:
                        condition_text=condition_text+' '+span.text
                            
                    #EXTENSIONS
                    if  span_position==0 and (bold or font_size>= file_font_size_mode +2 ) and (font_size> file_font_size_mode or upper) and 5<len(head_check_text.strip())<80:                    
                        if ext_check(head_check_text,language) and not ext_head_found: 
                            ext_head_found=True
                            ext_header_match_object=(font_size,font_family,bold,upper,bullet)
                            ext_First_category=True
                            ext_second_category=False
                            ext_single_page_head_found=True
#                             print('FOUND Extension Heading for ext...',span.text)
                    
                        elif ((font_size,font_family,bold,upper,bullet)==ext_header_match_object or font_size>ext_header_match_object[0] ) and not ext_check(head_check_text,language)  and ext_head_found:
                            ext_head_found=False
                            ext_single_page_head_found=True
#                             print('FOUND Extension Closure for ext ...',span.text)
                
                    
                    ext_words_title=[word.istitle() for word in head_check_text.split() if stopword_check(word,head_check_text,language) and not word.isdigit() ]
                    if span_position==0 and all(ext_words_title) and head_check_text.strip() not in definitions and  (len(words_title) >1 or underlined_text )and 5<len(head_check_text.strip())<80 :
#                         print('MMMMMMMMARARARA',span.text)

                        if ext_check(head_check_text,language) and  not ext_head_found:
                            ext_head_found=True
                            ext_second_category=True
                            ext_First_category=False
                            ext_single_page_head_found=True
                            ext_header_match_object=(font_size,font_family,bold,upper,bullet)
#                             print("FOUND Extension heading type2 ext....",span.text)
                        elif ext_head_found and not ext_check(head_check_text,language) and ext_second_category and ((font_size,font_family,bold,upper,bullet)==ext_header_match_object or font_size > ext_header_match_object[0] ) :
                            ext_head_found=False
                            ext_second_category=False
                            ext_single_page_head_found=True
#                             print('FOUND Extension closure type 2 ext',span.text)
# 
                    if ext_head_found==True:
                        ext_text=ext_text+' '+span.text
                    
                #ENDORSEMENTS
                    if  span_position==0 and (bold or font_size>= file_font_size_mode +2 ) and (font_size> file_font_size_mode or upper) and 5<len(head_check_text.strip())<80:                    
                        if endorsement_check(head_check_text,language) and not endorsement_head_found: 
                            endorsement_head_found=True
                            endorsement_header_match_object=(font_size,font_family,bold,upper,bullet)
                            endorsement_First_category=True
                            endorsement_second_category=False
                            endorsement_single_page_head_found=True
                            print('FOUND Endorsement Heading for ext...',span.text)
                    
                        elif ((font_size,font_family,bold,upper,bullet)==endorsement_header_match_object or font_size>endorsement_header_match_object[0] ) and not endorsement_check(head_check_text,language)  and endorsement_head_found:
                            endorsement_head_found=False
                            endorsement_single_page_head_found=True
                            print('FOUND Endorsement Closure for ext ...',span.text)
                
                    
                    endorsement_words_title=[word.istitle() for word in head_check_text.split() if stopword_check(word,head_check_text,language) and not word.isdigit() ]
                    if span_position==0 and head_check_text.strip() not in definitions and  (len(words_title) >1 or boxed_text) and 5<len(head_check_text.strip())<80 :
#                         print('MMMMMMMMARARARA',span.text)
                        if endorsement_check(head_check_text,language):                           
                                endorsement_y_cord=y_cord
                        if endorsement_check(head_check_text,language) and  not endorsement_head_found:
                            endorsement_head_found=True       
                            endorsement_second_category=True
                            endorsement_First_category=False
                            endorsement_single_page_head_found=True
                            endorsement_header_match_object=(font_size,font_family,bold,upper,bullet,boxed_text)
                            print("FOUND Endorsement heading type2 ext....",span.text)
                            print(endorsement_header_match_object)
                        elif (y_cord-endorsement_y_cord>50) and (endorsement_head_found and not endorsement_check(head_check_text,language) and endorsement_second_category and ((font_size,font_family,bold,upper,bullet,boxed_text)==endorsement_header_match_object or font_size > endorsement_header_match_object[0]) ) :
                            endorsement_head_found=False
                            print(endorsement_header_match_object,y_cord,endorsement_y_cord,y_cord-endorsement_y_cord)
                            endorsement_second_category=False
                            endorsement_single_page_head_found=True
                            print('FOUND Endorsement closure type 2 ext',span.text)
# 
                    if endorsement_head_found==True:
                        endorsement_text=endorsement_text+' '+span.text
    #                     else:
                    
                    
                    #INSURING AGREEMENTS
                    if  span_position==0 and (bold or font_size>= file_font_size_mode +2 ) and (font_size> file_font_size_mode or upper) and 5<len(head_check_text.strip())<80:                    
                        if ins_agg_check(head_check_text,language) and not ins_agg_head_found and not head_found and not ext_head_found and not cond_head_found: 
                            ins_agg_head_found=True
                            ins_agg_header_match_object=(font_size,font_family,bold,upper,bullet)
                            ins_agg_First_category=True
                            ins_agg_second_category=False
                            ins_agg_single_page_head_found=True
#                             print('FOUND INS AGG Heading for ext...',span.text)
                    
                        elif ins_agg_head_found and not ins_agg_check(head_check_text,language):
                            ins_agg_head_found=False
                            ins_agg_single_page_head_found=True
#                             print('FOUND INS AGG Closure for ext ...',span.text)
                
                    
                    ins_agg_words_title=[word.istitle() for word in head_check_text.split() if stopword_check(word,head_check_text,language) and not word.isdigit() ]
                    if span_position==0 and all(ins_agg_words_title) and head_check_text.strip() not in definitions and  (len(words_title) >1 or underlined_text )and 5<len(head_check_text.strip())<60 :
                        if ins_agg_check(head_check_text,language) and  not ins_agg_head_found and not head_found and not cond_head_found and not ext_head_found:
                            ins_agg_head_found=True
                            ins_agg_second_category=True
                            ins_agg_First_category=False
                            ins_agg_single_page_head_found=True
                            ins_agg_header_match_object=(font_size,font_family,bold,upper,bullet)
#                             print("FOUND INS AGG heading type2 ext....",span.text)
                        elif ins_agg_head_found and not ins_agg_check(head_check_text,language):
                            ins_agg_head_found=False
                            ins_agg_second_category=False
                            ins_agg_single_page_head_found=True
#                             print('FOUND INS AGG closure type 2 ext',span.text)
# 
                    if ins_agg_head_found==True:
                        ins_agg_text=ins_agg_text+' '+span.text
###############################             
                        
                    if font_size_match:
                        if font_size<=file_font_size_mode-1:
                            text_lis.append(' ')
                    text=span.text
                    text_plain=text
                   
                    for term in def_terms:
                        if term.lower().strip() in text.lower() and definitions[term][0].strip() not in text and term not in sub_definitions_in_page:
#                             def_page.append(definitions[term])
#                             print(pagenum)
                            src_str  = re.compile(re.escape(term), re.IGNORECASE)
                            text=src_str.sub('###{}@@{}%%%'.format(term,definitions[term][0]),text)
#                             print(text,'\n',term,'TERM!!!!@#$%^&*&^%#$%^&')
                            sub_definitions_in_page.append(term)
                            sub_page_def_list.append({'name':term,'text':definitions[term][0]})
#                             print(definitions_in_page)
                            break
#                     print(definitions_in_page)

                    
                    text_lis.append(text)
                    text_plain_lis.append(text_plain)
                    span_text_pos+=len(text_plain)
                    previous_span=span
                    

            if text_lis:
                if(int(pagenum) in [1,2]) and 'wording' in file:
                    continue

                text_lis=text_lis[:-1]
                text_para=''.join(text_lis)
                text_plain_lis=text_plain_lis[:-1]
                text_plain_para=''.join(text_plain_lis)
#                 print(len(re.findall('\d',text_para))>20,'content' in text_para,int(pagenum))
                if(len(re.findall('\.',text_para))>(len(text_para)/3) or (len(re.findall('\d',text_para))>50 and 'gbp' not in text_para.lower() and 'content' in text_para.lower())) : # and int(pagenum)<=2
#                     print('PASS')
                    text_para,text_plain_para=' ',' '
                    text_lis,text_plain_lis=[],[]
                    single_page_head_found=False
                    head_found=False
                    cond_head_found=False
                    ext_head_found=False
                    endorsement_head_found=False
                    continue
                else:
                    bold_lis=clean_pharses(bold_lis)
                    if pagenum in page_list and pagenum!='1':
                        pagenum=int(pagenum)+1
                    excl_flag,excl_count,excl_pos_lis=excl_extract(text_plain_lis,language)
                    cond_flag,cond_count,cond_pos_lis=cond_extract(text_plain_lis,language)
                    ext_flag,ext_count,ext_pos_lis=ext_extract(text_plain_lis,language)
                    if int(pagenum)==1:
                        if head_found or single_page_head_found or excl_count>0:
#                             print(pagenum,True,'!!!',excl_count)
                            excl_flag=True
                        else:
                            excl_flag=False
                    else:
                        if head_found or single_page_head_found:
#                             print(pagenum,True,'!!!')
                            excl_flag=True
#                             print(excl_flag,'EXCL')
                        else:
                            print(pagenum,excl_flag,'\n')
                    #####################
                    if cond_head_found or cond_single_page_head_found or cond_count>0:
                        cond_flag=True
#                         print('COND','TRUE')
                    else:
                        cond_flag=False
                    
                    if ext_head_found or ext_single_page_head_found or ext_count>0:
                        ext_flag=True
#                         print('EXT','TRUE',pagenum)
                    else:
#                         print('EXT','FALSE',pagenum)
                        ext_flag=False
                    if endorsement_head_found or endorsement_single_page_head_found:
                        endorsement_flag=True
                        print(pagenum,endorsement_flag)
                    else:
                        endorsement_flag=False
                        print(pagenum,endorsement_flag)
                    if ins_agg_head_found or ins_agg_single_page_head_found:
                        ins_agg_flag=True
#                         print(pagenum)
                    else:
#                         print(pagenum)
                        ins_agg_flag=False

                    for term,defs in definitions.items():
                        if defs[1]==pagenum:  ###defs[1] is the pagenumber
                            definition_text=definition_text+'          '+term+' '+defs[0]
                            def_in_page.append({'name':term,'text':defs[0]})        
#                     print(definition_text)
                    print(pagenum,definition_text)
                    if definition_text:
                        def_flag=True

                    if def_flag:
                        def_search_flag=True
#                         print('DEF','TRUE')
                    else:
                        def_search_flag=False
#                         print('DEF','FALSE')
                    
                    if pg_no>2:
                        endorsements=False
                    else:
                        endorsements=True
                    print('PAGE',pagenum)        
                    if text_para in local_indexed.keys():
                        text_para=text_para+' '
                    local_indexed[text_para]=(file,pagenum,bold_lis,text_plain_para,excl_flag,excl_count,excl_pos_lis,subdir,sub_page_def_list,country,language,definition_text,def_search_flag,endorsements,excl_text,condition_text,cond_flag,cond_count,cond_pos_lis,def_in_page,ext_text,ext_flag,ext_count,ext_pos_lis,span_text_pos,header_pos,endorsement_flag,endorsement_text,ins_agg_flag,ins_agg_text)
#                     print(sub_page_def_list,'DEF LIST!!!!!1')
                    sub_page_def_list=[]
                    span_text_pos=0
                    header_pos=[]
                    text_para,text_plain_para='','' 
                    text_plain_lis,text_lis,bold_lis=[],[],[]
                    definition_text=''
                    single_page_head_found=False
                    cond_single_page_head_found=False
                    ext_single_page_head_found=False
                    endorsement_single_page_head_found=False
                    ins_agg_single_page_head_found=False
                    condition_text=''
                    ext_text=''
                    excl_text=''
                    ins_agg_text=''
                    endorsement_text=''
                    def_in_page=[]
                    def_flag=False
    return(local_indexed)

In [25]:
def doc_pdf_path(path):
    file_list=os.listdir(path)
    for file in file_list:
        raw_filename=file[:file.rfind(".")]
        pdf_filename=raw_filename+'.pdf'
#         print(raw_filename,pdf_filename,'FILE!!')
        if pdf_filename in file_list:
            print('continued!!!!!!!!!!!!!')
            continue
        if file.endswith('doc') or file.endswith('docx'):
            print('processing!!!')
            wordToPdf(os.path.join(path,file))

In [27]:
# rootDir='V://COG//AU- Property//'
# for dirname,subdirlist,filelist in os.walk(rootDir):
#     print(dirname)
#     for subdir in subdirlist:
#         path=os.path.join(dirname,subdir)
# #         doc_pdf_path(path)

In [28]:
import os

rootDir='test/tes/'
country='uk'
local_indexed={}
language='english'
if 'spanish' in rootDir:
    language='spanish'
for dirname,subdirlist,filelist in os.walk(rootDir):
    for subdir in subdirlist:
        print(subdir, 'SUBDIR!!!!!!!!!!!!!')
        path=os.path.join(dirname,subdir)    
        local_indexed_subdir=create_local_index(path,subdir,language,country)
        local_indexed.update(local_indexed_subdir)

.ipynb_checkpoints SUBDIR!!!!!!!!!!!!!
Broker Wording SUBDIR!!!!!!!!!!!!!
REDACTED_07852D6B-E59F-4DF9-AEA5-2BBFE5CEC60A.pdf
1 False
PASS
3 False
4 False
5 False
6 False
7 False
8 False
9 False
10 False
11 False
12 False
13 False
14 False
15 False
16 False
FOUND Endorsement heading type2 ext.... Additional terms and conditions 

(9, '', True, False, False, True)
17 True
18 True
(9, '', True, False, False, True) 16168 14385 1783
FOUND Endorsement closure type 2 ext Data Protection 

19 True
20 False
21 False
22 False
23 False
24 False
25 False
26 False
27 False
28 False
29 False
30 False
31 False
32 False
33 False
34 False
35 False
36 False
37 False
38 False
39 False
40 False
41 False
42 False
43 False
44 False
45 False
46 False
47 False
48 False
49 False
50 False
51 False
52 False
53 False
54 False
55 False
56 False
57 False
58 False
59 False
60 False
61 False
62 False
63 False
64 False
65 False
66 False
67 False
68 False
69 False
70 False
71 False
72 False
73 False
74 False
75 False
76

In [29]:
import fitz
import random
doc_text=""
doc = fitz.open('V:\COG\TRACK AND TRACE\Data\Documents\MasterPackage\')
for page in doc:
    doc_text= page.getText("text")+"\n"
    print(doc_text.find('Costa Rica'))

SyntaxError: EOL while scanning string literal (<ipython-input-29-c7d7881669f4>, line 4)

In [None]:
len(local_indexed)

In [None]:
from azure.cosmos import exceptions, CosmosClient, PartitionKey

cosmo_endpoint="https://nf-poc-cdb-sql.documents.azure.com"
cosmo_key="iEcEfrxYe0Fm9QtoxDrOpLvGsfzjowwybULlWT9Uz4XxV4RmOIAnRuLdgRFUu1LPU5Vwk3UGivRrPrxnk7083w=="
client = CosmosClient(cosmo_endpoint, cosmo_key)
database=client.get_database_client('policy-analysis')
container=database.get_container_client('tt_documents')

In [30]:
import random
for key,value in local_indexed.items():
    doc={}
    doc['text']=key
    doc['page']=int(value[1])
    if value[4]==True or value[0].find('Exclusion')!=-1:
        doc['IsExclusion']=True
    else:
        doc['IsExclusion']=False
    doc['doc_name']=value[0].replace('.pdf','')
    doc['id']=doc['doc_name']+' '+str(doc['page'])
    doc['bold_phrases']=value[2]
    doc['plain_text']=value[3]
#     print(value[3],'\n')
    doc['excl_count']=value[5]
    doc['excl_pos']=value[6]
    doc['folder']=value[7]
    doc['definitions']=[{'name':key,'text':value} for key,value in re.findall('###(.*?)@@(.*?)%%%',doc['text'])]
#     print(doc['definitions'],'\n\n\n')
    doc['country']=value[9]
    doc['language']=value[10]
    doc['definition_text']=value[11]
#     print(doc['definition_text'],'\n\n')
    if doc['definition_text']:
        doc['definition_flag']=True
    else:
        doc['definition_flag']=False
    if value[14]==True or 'endorsement' in value[0].lower(): 
        doc['endorsements']=True
    else:
        doc['endorsements']=value[13]
    doc['excl_text']=value[14]
    doc['cond_text']=value[15]
    doc['cond_flag']=value[16]
    doc['cond_count']=value[17]
    doc['cond_pos']=value[18]
    doc['definitions_in_page']=value[19]
    doc['ext_text']=value[20]
    doc['ext_flag']=value[21]
    doc['ext_count']=value[22]
    doc['ext_pos']=value[23]
    doc['span_len']=value[24]
    doc['header_pos']=value[25]
    doc['endorsement_flag']=value[26]
    doc['endorsement_text']=value[27]
#     if not doc['excl_text'] and not doc['cond_text'] and not doc['ext_text'] and not doc['definition_text']:
    doc['insuring_agreement_flag']=value[28]
    doc['insuring_text']=value[29]
    doc['effective_from']='{}-09-15T23:14:25.7251173Z'.format(random.randint(1995,2010))
    doc['effective_till']='{}-09-15T23:14:25.7251173Z'.format(random.randint(2011,2021))
#         print(doc['insuring_agreement_flag'])
#     else:
#         doc['insuring_agreement_flag']=False
#         doc['insuring_text']=''
#         print(doc['insuring_agreement_flag'])
#     try:       
#         container.create_item(body=doc)
#     except Exception as err:
#         print("Exception in clause search!!!!!!!!",str(err))
#         pass
#     print(doc['span_len'])
    if doc['insuring_agreement_flag']:
        print(doc['doc_name'],doc['page'],doc['definition_text'])
# #     print(doc['id'],value[1],'EXT ',doc['ext_flag'])
# #     if doc['page']==23:
#     print(doc['page'],doc['plain_text'].find('Costs, Fees and Expenses'),doc['header_pos'])
# #     if doc['page']==22:
# #     print(doc['doc_name'],doc['page'],doc['plain_text'],'EXCL!!!!!!!!!!',doc['excl_text'],'\n\n\n\n\n\n')
# #     print(doc['doc_name'],doc['page'],'PLAIN TEXT@!!!',doc['plain_text'],'\n\nEXCL!!!!!!!!',doc['excl_text'],'\n\n\n\n')

REDACTED_07852D6B-E59F-4DF9-AEA5-2BBFE5CEC60A 1 
REDACTED_07852D6B-E59F-4DF9-AEA5-2BBFE5CEC60A 3 
REDACTED_07852D6B-E59F-4DF9-AEA5-2BBFE5CEC60A 4 
REDACTED_07852D6B-E59F-4DF9-AEA5-2BBFE5CEC60A 5           no_name_found means premises , anywhere within the Territorial Limits,  owned, leased or occupied by the Insured for the purpose of the Business
REDACTED_07852D6B-E59F-4DF9-AEA5-2BBFE5CEC60A 6 
REDACTED_07852D6B-E59F-4DF9-AEA5-2BBFE5CEC60A 7 
REDACTED_07852D6B-E59F-4DF9-AEA5-2BBFE5CEC60A 8 
REDACTED_07852D6B-E59F-4DF9-AEA5-2BBFE5CEC60A 9 
REDACTED_07852D6B-E59F-4DF9-AEA5-2BBFE5CEC60A 10 
REDACTED_07852D6B-E59F-4DF9-AEA5-2BBFE5CEC60A 11 
REDACTED_07852D6B-E59F-4DF9-AEA5-2BBFE5CEC60A 14 
REDACTED_07852D6B-E59F-4DF9-AEA5-2BBFE5CEC60A 15 
REDACTED_07852D6B-E59F-4DF9-AEA5-2BBFE5CEC60A 16 
REDACTED_07852D6B-E59F-4DF9-AEA5-2BBFE5CEC60A 17           As used herein, a Communicable Disease means any    physical distress, illness, or disease caused or transmitted directly or indirectly by any vi

In [31]:
len(local_indexed)

113