# Simplified data structure for Term search

In [1]:
#need to pip install tika
from tika import parser
import pandas as pd
import numpy as np

In [2]:
#change file path

k = parser.from_file('c:\\users\\hello\\desktop\\metric_stream\\v0.02\\data\\Converted contract - Acciona (Spain) ASP Executed.pdf')

In [3]:
pdf_doc = k['content']
#print(pdf_doc)

In [4]:
#Library of terminology

category_term1 = \
    {'Time': ['hours', 'days', 'date', 'term', 'termination'], \
    'Person': ['owner', 'client', 'contact', 'name'], \
    'Agreement': ['agreement', 'renewal', 'backup', 'breach', 'churn', 'user', 'support', 'uptime'],\
    'Financial': ['value', 'fee', 'currency', 'escrow', 'renewal', 'term', 'price'],\
    'Address': ['address'], \
    'Product': ['SKU', 'license', 'user'], \
    'Legal': ['contract', 'legal', 'law', 'liability', 'indemnity', 'term', 'penalty', 'renegotiation'],\
    'Financial': ['fee']}

term1_term2 = \
    {'hours': ['12', '24', '48'],\
     'days': ['15', '30'],\
     'owner': ['account', 'opportunity'],\
     'agreement': ['type'],\
     'value': ['renewal'],\
     'fee': ['automatically','cloud','license','partner','platform','support','product'],\
     'renewal': ['automatically', 'date'],\
     'address': ['billing', 'corporate'],\
     'churn': ['acv', 'type'],\
     'date': ['churn','effective','executed','renewal','notice','subscription','term'],\
     'client': ['consent', 'name'],\
     'SKU': ['cloud', 'product'],\
     'contract': ['term', 'relationship'],\
     'legal': ['customer', 'contact'],\
     'law': ['governing'],\
     'license': ['type'],\
     'user': ['license', 'maximum', 'total'],\
     'liability': ['limitation'],\
     'contact': ['operation'],\
     'indemnity': ['nonstandard'],\
     'term': ['order', 'payment', 'renewal'],\
     'name': ['partner'],\
     'penalty': ['sla'],\
     'renegotiation': ['comments'],\
     'price': ['renewal'],\
     'termination': ['convenience'],\
     'support': ['type']}


In [5]:
import spacy
from spacy import displacy
import en_core_web_sm
from spacy import displacy
from spacy.lang.en import English
from spacy.pipeline import EntityRuler 

def sent_segment(txt):
    """ sentence tokenization

    Parameters:
    txt : tex to tokenize into sentences
    Returns: list of sentences

    """

    # Load English tokenizer, tagger, parser, NER and word vectors
    nlp = English() 

    # A simple pipeline component, to allow custom sentence boundary detection logic 
    # that doesn’t require the dependency parse. It splits on punctuation by default
    sbd = nlp.create_pipe('sentencizer')

    # Add the component to the pipeline
    nlp.add_pipe(sbd)

    #nlp is used to create documents with linguistic annotations.
    doc = nlp(txt)   

    # create list of sentence tokens
    sents_list = []
    for sent in doc.sents:
        sents_list.append(sent.text)

    return sents_list 

In [6]:
#synonym finder

import pandas as pd
from nltk.corpus import wordnet
from itertools import chain

# #find synonyms based on the keyword and build into a dataframe
def get_synonyms(term_dic):

    #list of tuples containing (category, original term, synonym)
    term_tuples = []

    
    ####Issues passing number into wordnet. 
    L = []
    for key in term_dic:
        for item in term_dic[key]:
            syn = wordnet.synsets(item)
            
            syn_list = [item]

            #flatten all lists by chain, remove duplicates by set
            lemmas = list(set(chain.from_iterable([w.lemma_names() for w in syn])))
            
            for i in lemmas[:3]:
                syn_list.append(i)
    
            syn_list = list(set(syn_list))
        
            for syn in syn_list:
                L.append((key, item, syn))

            
    return (pd.DataFrame(L, columns=['Category','Original_Keyword','Searched_Keyword']))    


In [7]:
# dic_ex = {'fine': ['fee']}
# get_synonyms(dic_ex)



In [20]:
# search proper keyword in the list of sentences
# if found, grab surrounding lines, combine, and add to 3rd dataframe

#sentence_df contains list of sentence segment in each row
#term_df contains the category, original term, and synonym term in each row
# Want to search the synonym term in sentence rows. Then if match found, add all those 4 columns to new dataframe


def search_term(sentence_df, term_df):
    
    category_column_names = ['Category', 'Original_Keyword', 'Searched_Keyword', 'sentence_number','Text']
    df_metadata = pd.DataFrame(columns = category_column_names)

    array_metadata = df_metadata.values
    
    for i,j in sentence_df.iterrows():
        for m,n in term_df.iterrows():
            if n['Searched_Keyword'] in j['Sentences'].lower():
                ####### do the iteration, search the term and add to dataframe
                
                df_found = pd.DataFrame([(n['Category'], n['Original_Keyword'], n['Searched_Keyword'], i, j['Sentences'])])
                
                array_metadata = np.concatenate(\
                        (array_metadata, df_found.values), axis=0)

    df_metadata = pd.DataFrame(array_metadata, columns = category_column_names)
    
    return df_metadata


#currently adding things to an np.array continually. then at the end, add it to the data frame


In [9]:
#create context. takes current line of text and associated index. 
# lokos at the sentence dataframe and grabs correct rows.
# allows us to search multiple segments for each original term

#pass the original sentence dataframe and current row of the metadata dataframe
def create_context(sentence_df, metadata_df, idx):
    
    sentence_idx = metadata_df.iloc[idx]['sentence_number']
    
    try:
        context = sentence_df.iloc[sentence_idx - 1]['Sentences'] + \
        sentence_df.iloc[sentence_idx]['Sentences'] + \
        sentence_df.iloc[sentence_idx + 1]['Sentences']
    except KeyError:
        
        context = sentence_df.iloc[sentence_idx]['Sentences']
    
    return context

In [19]:
#Uses a secondary library of terms
# Keys in this library will match the 'original keyword' column in the metadata_df
# Iterate through associated list of terms
# If secondary term found, add it to new column
# adds to an existing dataframe with enough columns so just need to generate list and add to new column

#Need to add greater context to each line. grab sentence from sentence dataframe

def search_secondary(sentence_df, metadata_df, term_dic):
    
    secondary_term_outer = []
    secondary_syn_outer = []
    secondary_context_outer = []
    
    for i,j in metadata_df.iterrows():
        
        context = create_context(sentence_df, metadata_df, i)
        
        secondary_term_inner = [] 
        secondary_syn_inner = []
        secondary_context_inner = None
        
        for key in term_dic:
            
            for item in term_dic[key]:

                syn = wordnet.synsets(item)
                
                syn_list = [item]
                
                #flatten all lists by chain, remove duplicates by set
                lemmas = list(set(chain.from_iterable([w.lemma_names() for w in syn])))                
                
                for m in lemmas[:3]:
                    syn_list.append(m)

                syn_list = list(set(syn_list))

                
                for syn in syn_list:

                    if syn in context.lower() and key == j['Original_Keyword'] and secondary_context_inner == None:
                        secondary_term_inner.append(item)
                        secondary_syn_inner.append(syn)
                        secondary_context_inner = context

                    elif syn in context.lower() and key == j['Original_Keyword']:
                        secondary_term_inner.append(item)
                        secondary_syn_inner.append(syn)

                    else:
                        pass

                
        secondary_term_outer.append(secondary_term_inner)
        secondary_syn_outer.append(secondary_syn_inner)
        secondary_context_outer.append(secondary_context_inner)
    
    metadata_df['Secondary_Term'] = secondary_term_outer
    metadata_df['Secondary_Synonym'] = secondary_syn_outer
    metadata_df['Secondary_Context'] = secondary_context_outer
    
    return metadata_df

# Process
Start with input dictionary containing category and keyword.  
Use keyword to search synonyms. Create a tuple of (category, keyword, synonym) and append those to a list.  
Iterate through list and search for synonym in text of df_sentence.  
Create a dataframe of ['category', 'keyword', 'synonym', 'text']  


Best algorithm for this? Right now looking at O(n_squared)
    - (number of terms) * (number of sentences)
    

In [11]:
sentence_list = sent_segment(pdf_doc)
sentence_list_clean = []

for i in range(0,len(sentence_list)):
    remove_newline = sentence_list[i].replace('\n', '')
    sentence_list_clean.append(remove_newline)
    
#Write sentence list dataframe to file

dic_sentence = {'Sentences' : sentence_list_clean} 
df_sentence = pd.DataFrame.from_dict(dic_sentence)
df_sentence.head()

#df_sentence.to_csv('sentences0604.csv')

Unnamed: 0,Sentences
0,DocuSign Envelope ID: F3F9AB55-4121-49E9-BE97...
1,"Europa , 18 P.E. La Moraleja 28108 Alcobendas ..."
2,"“Order Form"" Attached as Exhibit A “Service Le..."
3,This Agreement shall consist of the Metric Str...
4,This Agreement constitutes the entire agreemen...


In [12]:
#df_metadata0.to_csv('metadata_test1.csv')

In [21]:
df_words = get_synonyms(category_term1)
df_metadata0 = search_term(df_sentence, df_words)

In [22]:
df_metadata0.head()

Unnamed: 0,Category,Original_Keyword,Searched_Keyword,sentence_number,Text
0,Time,date,date,0,DocuSign Envelope ID: F3F9AB55-4121-49E9-BE97...
1,Agreement,agreement,agreement,0,DocuSign Envelope ID: F3F9AB55-4121-49E9-BE97...
2,Legal,law,law,0,DocuSign Envelope ID: F3F9AB55-4121-49E9-BE97...
3,Time,term,condition,1,"Europa , 18 P.E. La Moraleja 28108 Alcobendas ..."
4,Time,term,term,1,"Europa , 18 P.E. La Moraleja 28108 Alcobendas ..."


In [23]:
df_metadata0.tail()

Unnamed: 0,Category,Original_Keyword,Searched_Keyword,sentence_number,Text
444,Agreement,backup,support,277,MetricStream will not store Customer data on a...
445,Agreement,support,support,277,MetricStream will not store Customer data on a...
446,Time,term,term,278,Metric Stream will not store customer data for...
447,Agreement,agreement,agreement,278,Metric Stream will not store customer data for...
448,Legal,term,term,278,Metric Stream will not store customer data for...


In [25]:
df_metadata_2 = search_secondary(df_sentence, df_metadata0, term1_term2)
df_metadata_2.to_csv("secondary_test.csv")