In [1]:
#need to pip install tika
from tika import parser
import pandas as pd
import numpy as np

# Create Metadata Term Dictionary

In [2]:
#Return true if our current value is NaN

def isNan(a):
    """Tell whether a value is NaN.

    Parameters:
    a : value to test (any type)
    Returns: result of comparing a with itself for inequality; NaN is the
        value that compares unequal to itself
    """
    differs_from_itself = a != a
    return differs_from_itself

#generate dictionary 
#key is metadata field
#value is a list of term1, term2, term3
# we use this to match metadata fields to found terms in our sentence output document

def metadata_term_library(df_metadata):
    """Build the term and category lookups from the metadata library table.

    Parameters:
    df_metadata : dataframe with the columns 'Metadata', 'Metadata_Category',
        'Term1', 'Term2' and 'Term3'
    Returns: tuple (terms, categories)
        terms      : dict, metadata field -> [Term1] plus Term2 / Term3 when they are not NaN
        categories : dict, metadata field -> [Metadata_Category]
        A field that appears in several rows keeps the values of its last row.
    """
    terms_by_field = {}
    category_by_field = {}

    for _, record in df_metadata.iterrows():
        field = record['Metadata']
        category_by_field[field] = [record['Metadata_Category']]

        # Term1 is always kept; the optional terms only when their cell is filled in
        terms = [record['Term1']]
        for optional_column in ('Term2', 'Term3'):
            optional_term = record[optional_column]
            if isNan(optional_term):
                continue
            terms.append(optional_term)

        terms_by_field[field] = terms

    return (terms_by_field, category_by_field)
          

# Simplified data structure for Term search

In [3]:
import spacy
from spacy import displacy
import en_core_web_sm
from spacy import displacy
from spacy.lang.en import English
from spacy.pipeline import EntityRuler 

def sent_segment(txt):
    """ Split raw text into sentences

    Parameters:
    txt : text to tokenize into sentences
    Returns: list of sentence strings, in document order

    """
    # English language pipeline; the only component added to it is the sentencizer
    pipeline = English()

    # Sentence boundary detection that does not require the dependency parse
    # (splits on punctuation by default)
    sentencizer = pipeline.create_pipe('sentencizer')
    pipeline.add_pipe(sentencizer)

    # Running the pipeline yields a document whose .sents are the sentence spans
    return [span.text for span in pipeline(txt).sents]

In [21]:
#synonym finder

import pandas as pd
from nltk.corpus import wordnet
from itertools import chain

# #find synonyms based on the keyword and build into a dataframe
def get_synonyms(term_dic):
    """Expand the first term of every metadata field with WordNet synonyms.

    Parameters:
    term_dic : dict, metadata field -> list of terms; only element 0 of each
        list is looked up
    Returns: dataframe with columns ['Metadata_Field', 'Original_Term', 'Synonym_Term'],
        one row per (field, synonym). For each field the original term comes
        first, followed by at most three distinct WordNet lemma names.
    """
    rows = []

    for field, terms in term_dic.items():
        original_term = terms[0]

        # WordNet is queried with a string; the stored term may be a number
        synsets = wordnet.synsets(str(original_term))

        # Flatten the lemma names of all synsets and drop duplicates with
        # dict.fromkeys instead of set(): a set has no defined order, so slicing
        # "the first three" out of it picked different synonyms on different runs.
        lemma_names = list(dict.fromkeys(
            chain.from_iterable(synset.lemma_names() for synset in synsets)))

        # The original term is always searched; de-duplicate again in case it is
        # also one of the selected lemma names
        candidates = dict.fromkeys([original_term] + lemma_names[:3])

        rows.extend((field, original_term, candidate) for candidate in candidates)

    return pd.DataFrame(rows, columns=['Metadata_Field','Original_Term','Synonym_Term'])


In [5]:
# #dataframe containing Metadata field, original term1, and the synonym term1

# df_term1 = get_synonyms(dic_meta_term)

In [23]:
# search proper keyword in the list of sentences
# if found, grab surrounding lines, combine, and add to 3rd dataframe

#sentence_df contains list of sentence segment in each row
#term_df contains the category, original term, and synonym term in each row
# Want to search the synonym term in sentence rows. Then if match found, add all those 4 columns to new dataframe


def search_term(sentence_df, term_df):
    """Find every (sentence, synonym) pair where the synonym occurs in the sentence.

    The match is a case-insensitive substring test.

    Parameters:
    sentence_df : dataframe with a 'Sentences' column (one sentence per row)
    term_df : dataframe with columns 'Metadata_Field', 'Original_Term', 'Synonym_Term'
    Returns: dataframe with columns ['Metadata_Field', 'Original_Keyword',
        'Searched_Keyword', 'Sentence_Number', 'Text'] (object dtype), one row per
        match, ordered by sentence and then by term. 'Sentence_Number' is the
        index label of the sentence in sentence_df, 'Text' the unmodified sentence.
    """
    category_column_names = ['Metadata_Field','Original_Keyword', 'Searched_Keyword', 'Sentence_Number','Text']

    # Prepare the search strings once instead of once per sentence. The term is
    # lower-cased because the sentence is: without it a term containing a capital
    # letter could never be found.
    needles = [
        (term['Metadata_Field'], term['Original_Term'], term['Synonym_Term'],
         str(term['Synonym_Term']).lower())
        for _, term in term_df.iterrows()
    ]

    # Collect plain tuples and build the dataframe once at the end; growing an
    # array with a concatenate per match copies all earlier matches every time.
    matches = []
    for sentence_number, sentence_row in sentence_df.iterrows():
        sentence = sentence_row['Sentences']
        haystack = sentence.lower()

        for field, original_term, synonym, needle in needles:
            if needle in haystack:
                matches.append((field, original_term, synonym, sentence_number, sentence))

    return pd.DataFrame(matches, columns=category_column_names, dtype=object)


#currently adding things to an np.array continually. then at the end, add it to the data frame


In [6]:
#create context. takes current line of text and associated index. 
# looks at the sentence dataframe and grabs the correct rows.
# allows us to search multiple segments for each original term

#pass the original sentence dataframe and current row of the metadata dataframe
def create_context(sentence_df, metadata_df, idx):
    """Return a matched sentence together with its neighbouring sentences.

    Parameters:
    sentence_df : dataframe with a 'Sentences' column (one sentence per row)
    metadata_df : dataframe with a 'Sentence_Number' column holding row positions
        in sentence_df
    idx : row position (iloc) in metadata_df of the match to build context for
    Returns: string made of the previous, matched and next sentence concatenated
        without a separator; at either end of the document the missing neighbour
        is simply left out
    Raises: IndexError if the sentence number is not a row position of sentence_df
    """
    sentence_idx = int(metadata_df.iloc[idx]['Sentence_Number'])
    sentence_count = len(sentence_df)

    if not 0 <= sentence_idx < sentence_count:
        raise IndexError('sentence number %d is outside the sentence dataframe' % sentence_idx)

    # Clamp the window to the document. iloc reads -1 as "last row", so without
    # the lower clamp the first sentence would get the final sentence of the
    # document as its "previous" one.
    first = max(sentence_idx - 1, 0)
    last = min(sentence_idx + 1, sentence_count - 1)

    return ''.join(sentence_df['Sentences'].iloc[first:last + 1])

In [28]:
#Uses a secondary library of terms
# Keys in this library will match the 'original keyword' column in the metadata_df
# Iterate through associated list of terms
# If secondary term found, add it to new column
# adds to an existing dataframe with enough columns so just need to generate list and add to new column

#Need to add greater context to each line. grab sentence from sentence dataframe

def search_secondary(sentence_df, metadata_df, term_dic):
    """Look for the secondary terms of each matched metadata field around the match.

    For every row of metadata_df the context (matched sentence plus neighbours,
    from create_context) is searched, case-insensitively, for each term after the
    first one in term_dic[Metadata_Field] and for up to three WordNet lemma names
    of that term.

    Parameters:
    sentence_df : dataframe with a 'Sentences' column
    metadata_df : dataframe with 'Metadata_Field' and 'Sentence_Number' columns
    term_dic : dict, metadata field -> list of terms (element 0 is the primary term)
    Returns: metadata_df itself, modified in place, with three added columns:
        'Secondary_Term'    : list of secondary terms found (one entry per matching synonym)
        'Secondary_Synonym' : list of the synonyms that actually matched
        'Secondary_Context' : the context string if anything matched, else None
    """
    secondary_term_outer = []
    secondary_syn_outer = []
    secondary_context_outer = []

    # create_context addresses rows by position, so count positions here rather
    # than passing index labels, which only coincide for a default index
    for position, (_, match_row) in enumerate(metadata_df.iterrows()):

        context = create_context(sentence_df, metadata_df, position)
        context_lower = context.lower()

        secondary_term_inner = []
        secondary_syn_inner = []
        secondary_context_inner = None

        # Element 0 is the primary term; everything after it is secondary
        for term in term_dic[match_row['Metadata_Field']][1:]:
            item = str(term)

            # Ordered de-duplication (dict.fromkeys) instead of set(), so the three
            # lemma names that get searched are the same on every run
            lemma_names = list(dict.fromkeys(
                chain.from_iterable(synset.lemma_names() for synset in wordnet.synsets(item))))
            candidates = dict.fromkeys([item] + lemma_names[:3])

            for candidate in candidates:
                # The context is lower-cased, so the candidate has to be as well
                if candidate.lower() in context_lower:
                    secondary_term_inner.append(item)
                    secondary_syn_inner.append(candidate)
                    if secondary_context_inner is None:
                        secondary_context_inner = context

        secondary_term_outer.append(secondary_term_inner)
        secondary_syn_outer.append(secondary_syn_inner)
        secondary_context_outer.append(secondary_context_inner)

    metadata_df['Secondary_Term'] = secondary_term_outer
    metadata_df['Secondary_Synonym'] = secondary_syn_outer
    metadata_df['Secondary_Context'] = secondary_context_outer

    return metadata_df

In [1]:
#inputs metadata dictionary
#sentence found keyword dataframe
#iterate through sentence dataframe
# look at the combined terminology list of original_keyword and secondary_term
# see if the metadata search terms are contained in the list
# if all terms are in the sentence term list, then add the associated metadata term to the row as possible match

def combine_keyword_list(df_keyword):
    """Add a 'Complete_Term_List' column: secondary terms plus the original keyword.

    Parameters:
    df_keyword : dataframe with 'Secondary_Term' and 'Original_Keyword' columns.
        A 'Secondary_Term' cell may be a list, or the text form of a list as it
        appears after the dataframe has been written to and read back from CSV.
    Returns: df_keyword itself with the added 'Complete_Term_List' column, where
        each cell is the secondary terms followed by str(Original_Keyword)
    """
    import ast  # only needed here; kept local so the cell runs on its own

    complete_term_list = []

    for _, row in df_keyword.iterrows():
        secondary_terms = row['Secondary_Term']

        if isinstance(secondary_terms, str):
            # literal_eval only parses Python literals, so text coming from a file
            # cannot execute code the way eval() would. Cells that already hold a
            # list skip this step (eval() raised TypeError on them).
            secondary_terms = ast.literal_eval(secondary_terms)

        # Work on a copy so the append below does not alter the 'Secondary_Term' cell
        complete_terms = list(secondary_terms)
        complete_terms.append(str(row['Original_Keyword']))

        complete_term_list.append(complete_terms)

    df_keyword['Complete_Term_List'] = complete_term_list

    return df_keyword

##test
#combine_keyword_list(df_sentence_keywords)

In [None]:
# check each row to see if metadata fields are in the keywords list
# create new column in dataframe that shows all possible metadata matches

def metadata_match(df_keyword, dic_metadata, dic_category):
    """Attach to every row the metadata fields whose terms are all present in it.

    Parameters:
    df_keyword : dataframe with a 'Complete_Term_List' column (list of terms per row)
    dic_metadata : dict, metadata field -> list of terms that must all be present
    dic_category : dict, metadata field -> list whose element 0 is the category
        and whose optional element 1 is the sub-category
    Returns: the rows of df_keyword that matched at least one metadata field.
        Three columns are added (to df_keyword as well):
        'Metadata_Matches'     : matching metadata fields, in dic_metadata order
        'Metadata_Category'    : their distinct categories, in first-seen order
        'Metadata_SubCategory' : their distinct sub-categories (None where a
                                 field has no sub-category), in first-seen order
    """
    metadata_matches_outer = []
    category1_match_outer = []
    category2_match_outer = []

    for _, row in df_keyword.iterrows():

        term_list = row['Complete_Term_List']

        # A field matches when every one of its terms is in the row's term list
        metadata_matches_inner = [
            field for field, required_terms in dic_metadata.items()
            if all(term in term_list for term in required_terms)
        ]

        category1_match_inner = []
        category2_match_inner = []
        for field in metadata_matches_inner:
            field_categories = dic_category[field]
            category1_match_inner.append(field_categories[0])

            try:
                category2_match_inner.append(field_categories[1])
            except IndexError:
                # No sub-category recorded for this field
                category2_match_inner.append(None)

        metadata_matches_outer.append(metadata_matches_inner)
        # dict.fromkeys drops duplicates like set() but keeps a stable order
        category1_match_outer.append(list(dict.fromkeys(category1_match_inner)))
        category2_match_outer.append(list(dict.fromkeys(category2_match_inner)))

    df_keyword['Metadata_Matches'] = metadata_matches_outer
    df_keyword['Metadata_Category'] = category1_match_outer
    df_keyword['Metadata_SubCategory'] = category2_match_outer

    # Keep only the rows that matched at least one metadata field
    df_keyword = df_keyword[df_keyword['Metadata_Matches'].map(len) > 0]

    return df_keyword
    

# Process
Start with input dictionary containing category and keyword.  
Use keyword to search synonyms. Create a tuple of (category, keyword, synonym) and append those to a list.  
Iterate through list and search for synonym in text of df_sentence.  
Create a dataframe of ['category', 'keyword', 'synonym', 'text']  


Best algorithm for this? The current search is $O(n^2)$:
    - (number of terms) * (number of sentences)
    

In [9]:
from os import listdir
from os.path import isfile, join

# Build the list of contract files to process.
# NOTE(review): absolute, machine-specific Windows path - the notebook cannot
# run on another machine without editing this line.
contract_path = 'C:\\Users\\Hello\\Desktop\\metric_stream\\v0.022\\metadata_pipeline\\data\\'
#contract_names = listdir(contract_path)
#remove .pdf Use for file name in csv export at end

# Full path of every regular file directly inside contract_path (directories are
# skipped). Plain string concatenation is used, which relies on contract_path
# ending with a path separator.
contract_files = [contract_path + f for f in listdir(contract_path) if isfile(join(contract_path, f))]

# Only the first file is parsed here; loop over contract_files to process them all.
# The extracted text is taken from the 'content' entry of the parser result.
parsed_contract = parser.from_file(contract_files[0])
pdf_doc = parsed_contract['content']


#print(contract_files)

2020-06-11 16:36:46,777 [MainThread  ] [WARNI]  Failed to see startup log message; retrying...


# Generate sentence list

In [18]:
# Load the metadata library spreadsheet (file name is relative to the working
# directory) and turn it into the two lookup dictionaries:
#   dic_meta_term     : metadata field -> list of terms associated with that field
#   dic_meta_category : metadata field -> metadata category (original metadata field provided to us)
metadata_file = pd.read_excel('metadata_library.xlsx')
dic_meta_term, dic_meta_category = metadata_term_library(metadata_file)

In [11]:
# Split the contract text into sentences, then drop the newline characters
# left inside each sentence
sentence_list = sent_segment(pdf_doc)
sentence_list_clean = [sentence.replace('\n', '') for sentence in sentence_list]

# Sentence dataframe: a single 'Sentences' column, one sentence per row
dic_sentence = {'Sentences' : sentence_list_clean}
df_sentence = pd.DataFrame.from_dict(dic_sentence)
df_sentence.head()

#Write sentence list dataframe to file
#df_sentence.to_csv('sentences0604.csv')

Unnamed: 0,Sentences
0,MetricStream 1 | P a g e MS-INF-CLD-PRM/...
1,3 This Order Form (“Order Form”) supplements t...
2,MetricStream and Customer may be referred to i...
3,"For good and valuable consideration, the recei..."
4,Type of License: Annual Subscription License.


# Match sentences with Metadata terminology

In [30]:
# Step 1: dataframe containing Metadata field, original term1, and the synonym term1
df_words = get_synonyms(dic_meta_term)


# Step 2: one row per (sentence, synonym) pair where the synonym occurs in the sentence
df_metadata0 = search_term(df_sentence, df_words)

# Step 3: look for each field's secondary terms in the sentences around every match.
# search_secondary adds its columns to the dataframe it is given and returns it.
df_metadata_2 = search_secondary(df_sentence, df_metadata0, dic_meta_term)
#df_metadata_2.to_csv('test2.csv')

# Match found metadata terms to original metadata field

In [None]:
# Add the 'Complete_Term_List' column: each row's secondary terms plus its original keyword
df_combined_terms = combine_keyword_list(df_metadata_2)

In [None]:
#df_combined_terms generated above
#AAAAAA associates the combined term list to a metadata field
#BBBBBBB associates the metadata field with Group and Sub-group

#metadata_match(df_combined_terms, AAAAAAA,BBBBBBBB)