In [1]:
#need to pip install tika
from tika import parser
import pandas as pd
import numpy as np

# Create Metadata Term Dictionary

In [2]:
#Return true if our current value is NaN

def isNan(a):
    return a != a

#generate dictionary 
#key is metadata field
#value is a list of term1, term2, term3
# we use this to match metadata fields to found terms in our sentence output document

def metadata_term_library(df_metadata):
    
    dic_metadata_terms = {}
    dic_metadata_original = {}
    dic_metadata_category = {}
    dic_metadata_subcategory = {}
    
    for i,j in df_metadata.iterrows():
        
        dic_metadata_original[j['Metadata']] = [j['Metadata_Original']]
        dic_metadata_category[j['Metadata']] = [j['Category']]
        dic_metadata_subcategory[j['Metadata']] = [j['Sub-Category']]

        dic_metadata_terms[j['Metadata']] = [j['Term1']]
                
        if not isNan(j['Term2']):
            dic_metadata_terms[j['Metadata']].append(j['Term2'])
        else:
            pass
        
        if not isNan(j['Term3']):
            dic_metadata_terms[j['Metadata']].append(j['Term3'])
        else:
            pass
        
    return(dic_metadata_terms, dic_metadata_original, dic_metadata_category, dic_metadata_subcategory)
          

# Simplified data structure for Term search

In [3]:
import spacy
from spacy import displacy
import en_core_web_sm
from spacy import displacy
from spacy.lang.en import English
from spacy.pipeline import EntityRuler 

def sent_segment(txt):
    """ sentence tokenization

    Parameters:
    txt : tex to tokenize into sentences
    Returns: list of sentences

    """

    # Load English tokenizer, tagger, parser, NER and word vectors
    nlp = English() 

    # A simple pipeline component, to allow custom sentence boundary detection logic 
    # that doesn’t require the dependency parse. It splits on punctuation by default
    sbd = nlp.create_pipe('sentencizer')

    # Add the component to the pipeline
    nlp.add_pipe(sbd)

    #nlp is used to create documents with linguistic annotations.
    doc = nlp(txt)   

    # create list of sentence tokens
    sents_list = []
    for sent in doc.sents:
        sents_list.append(sent.text)

    return sents_list 

In [4]:
#synonym finder

import pandas as pd
from nltk.corpus import wordnet
from itertools import chain

# #find synonyms based on the keyword and build into a dataframe
def get_synonyms(term_dic):

    #list of tuples containing (category, original term, synonym)
    term_tuples = []

    
    ####Issues passing number into wordnet. 
    L = []
    for key in term_dic:

        syn = wordnet.synsets(str(term_dic[key][0]))

            
        syn_list = [term_dic[key][0]]

        #flatten all lists by chain, remove duplicates by set
        lemmas = list(set(chain.from_iterable([w.lemma_names() for w in syn])))

        for i in lemmas[:3]:
            syn_list.append(i)

        syn_list = list(set(syn_list))

        for syn in syn_list:
            L.append((key, term_dic[key][0], syn))

            
    return (pd.DataFrame(L, columns=['Metadata_Field','Original_Term','Synonym_Term']))    


In [5]:
# #dataframe containing Metadata field, original term1, and the synonym term1

# df_term1 = get_synonyms(dic_meta_term)

In [4]:
# search proper keyword in the list of sentences
# if found, grab surrounding lines, combine, and add to 3rd dataframe

#sentence_df contains list of sentence segment in each row
#term_df contains the category, original term, and synonym term in each row
# Want to search the synonym term in sentence rows. Then if match found, add all those 4 columns to new dataframe


def search_term(sentence_df, term_df):
    
    category_column_names = ['Metadata_Field','Original_Keyword', 'Searched_Keyword', 'Sentence_Number','Text']
    df_metadata = pd.DataFrame(columns = category_column_names)

    array_metadata = df_metadata.values
    
    for i,j in sentence_df.iterrows():
        for m,n in term_df.iterrows():
            if str(n['Synonym_Term']) in j['Sentences'].lower():
                ####### do the iteration, search the term and add to dataframe
                
                df_found = pd.DataFrame([(n['Metadata_Field'],n['Original_Term'], n['Synonym_Term'], i, j['Sentences'])])
                
                array_metadata = np.concatenate(\
                        (array_metadata, df_found.values), axis=0)

    df_metadata = pd.DataFrame(array_metadata, columns = category_column_names)
    
    return df_metadata


#currently adding things to an np.array continually. then at the end, add it to the data frame


In [5]:
#create context. takes current line of text and associated index. 
# lokos at the sentence dataframe and grabs correct rows.
# allows us to search multiple segments for each original term

#pass the original sentence dataframe and current row of the metadata dataframe
def create_context(sentence_df, metadata_df, idx):
    
    sentence_idx = metadata_df.iloc[idx]['Sentence_Number']
    
    try:
        context = sentence_df.iloc[sentence_idx - 1]['Sentences'] + \
        sentence_df.iloc[sentence_idx]['Sentences'] + \
        sentence_df.iloc[sentence_idx + 1]['Sentences']
    except:
        
        context = sentence_df.iloc[sentence_idx]['Sentences']
    
    return context

In [6]:
#Uses a secondary library of terms
# Keys in this library will match the 'original keyword' column in the metadata_df
# Iterate through associated list of terms
# If secondary term found, add it to new column
# adds to an existing dataframe with enough columns so just need to generate list and add to new column

#Need to add greater context to each line. grab sentence from sentence dataframe

def search_secondary(sentence_df, metadata_df, term_dic):
    
    secondary_term_outer = []
    secondary_syn_outer = []
    secondary_context_outer = []
    
    for i,j in metadata_df.iterrows():
        
        context = create_context(sentence_df, metadata_df, i)
        
        secondary_term_inner = [] 
        secondary_syn_inner = []
        secondary_context_inner = None
        term1 = term_dic[j['Metadata_Field']]
        
    
        if len(term1) > 1:

            for k in range(1,len(term1)):

                item = str(term1[k])

                syn = wordnet.synsets(item)

                syn_list = [item]

                #flatten all lists by chain, remove duplicates by set
                lemmas = list(set(chain.from_iterable([w.lemma_names() for w in syn])))                

                for m in lemmas[:3]:
                    syn_list.append(m)

                syn_list = list(set(syn_list))

                for syn in syn_list:

                    if syn in context.lower() and secondary_context_inner == None:
                        secondary_term_inner.append(item)
                        secondary_syn_inner.append(syn)
                        secondary_context_inner = context

                    elif syn in context.lower():
                        secondary_term_inner.append(item)
                        secondary_syn_inner.append(syn)

                    else:
                        pass
        else:
            secondary_context_inner = context


                                      
        secondary_term_outer.append(secondary_term_inner)
        secondary_syn_outer.append(secondary_syn_inner)
        secondary_context_outer.append(secondary_context_inner)
    
    metadata_df['Secondary_Term'] = secondary_term_outer
    metadata_df['Secondary_Synonym'] = secondary_syn_outer
    metadata_df['Secondary_Context'] = secondary_context_outer
    
    return metadata_df

In [7]:
#inputs metadata dictionary
#sentence found keyword dataframe
#iterate through sentence dataframe
# look at the combined terminology list of original_keyword and seondary_term
# see if metadata search temrs are contained in the list
# if all terms are in the sentence term list, then add the associated metadata term to the row as possible match

def combine_keyword_list(df_keyword):
    
    complete_term_list = []
    
    for i,j in df_keyword.iterrows():
        secondary_term_tolist = eval(str(j['Secondary_Term']))
        original_keyword_string = str(j['Original_Keyword'])
        
        #print(type(original_keyword_string))
        
        #print(secondary_term_tolist)
        secondary_term_tolist.append(original_keyword_string)
        complete_terms = secondary_term_tolist
        
        complete_term_list.append(complete_terms)
        
    df_keyword['Complete_Term_List'] = complete_term_list
    
    return df_keyword

##test
#combine_keyword_list(df_sentence_keywords)

In [8]:
# check each row to see if metadata fields are in the keywords list
# create new column in dataframe that shows all possible metadata matches

def metadata_match(df_keyword, dic_metadata, dic_original, dic_cat, dic_subcat):
    
    metadata_matches_outer = []
    metadata_original_match_outer = []
    metadata_category_outer = []
    metadata_subcategory_outer = []
    
    
    for i,j in df_keyword.iterrows():
        
        metadata_matches_inner = []
        original_match_inner = []
        category_match_inner = []
        subcategory_match_inner = []

                   
        term_list = j['Complete_Term_List']
        
        for key in dic_metadata:
            
            if all(elem in term_list for elem in dic_metadata[key]):
                metadata_matches_inner.append(key)
                original_match_inner.append(dic_original[key][0])
                category_match_inner.append(dic_cat[key][0])
                subcategory_match_inner.append(dic_subcat[key][0])
                                   
            else:
                pass
            
        metadata_matches_outer.append(metadata_matches_inner)
        metadata_original_match_outer.append(list(set(original_match_inner)))
        metadata_category_outer.append(list(set(category_match_inner)))
        metadata_subcategory_outer.append(list(set(subcategory_match_inner)))
    
    df_keyword['Metadata_Matches'] = metadata_matches_outer
    df_keyword['Metadata_Original'] = metadata_original_match_outer
    df_keyword['Metadata_Category'] = metadata_category_outer
    df_keyword['Metadata_Subcategory'] = metadata_subcategory_outer
    
    #remove lines with know metadata field match
    df_keyword = df_keyword[df_keyword['Metadata_Matches'].map(lambda d: len(d)) > 0]
    
    #print(df_keyword['Metadata_Matches'][1])
    return df_keyword
    

# Process
Start with input dictionary containing category and keyword.  
Use keyword to search synonyms. Create a tuple of (category, keyword, synonym) and append those to a list.  
Iterate through list and search for synonym in text of df_sentence.  
Create a dataframe of ['category', 'keyword', 'synonym', 'text']  


Best algorithm for this? Right now looking at O(n_squared)
    - (number of terms) * (number of sentences)
    

In [9]:
# from os import listdir
# from os.path import isfile, join

#create contract list
# contract_path = 'C:\\Users\\Hello\\Desktop\\metric_stream\\v0.022\\metadata_pipeline\\data\\'
#contract_names = listdir(contract_path)
#remove .pdf Use for file name in csv export at end

# contract_files = [contract_path + f for f in listdir(contract_path) if isfile(join(contract_path, f))]

#can loop through list for contract files
# parsed_contract = parser.from_file(contract_files[0])
# pdf_doc = parsed_contract['content']


#print(contract_files)

2020-06-16 08:36:15,822 [MainThread  ] [WARNI]  Failed to see startup log message; retrying...


In [None]:
from os import listdir
from os.path import isfile, join


#put your contract path + file name in contract_files

contract_files = 'C:\\Users\\Hello\\Desktop\\metric_stream\\v0.022\\metadata_pipeline\\data\\'
parsed_contract = parser.from_file(contract_files)
pdf_doc = parsed_contract['content']


#print(contract_files)

# Generate sentence list

In [12]:
#dic_meta_term is list of terminology associated with metadata field
#dic_meta_category is mapping of metadata field to a metadata category (original metadata field provided to us)

metadata_file = pd.read_excel('C:\\Users\\Hello\\Desktop\\metric_stream\\v0.022\\metadata_pipeline\\metadata_documents\\metadata_library.xlsx')
dic_meta_term, dic_meta_original, dic_meta_cat, dic_meta_subcat = metadata_term_library(metadata_file)

In [13]:
#Remove newlines
#Remove sentence that are too short (usually 1 or 2 symbols)

sentence_list = sent_segment(pdf_doc)
sentence_list_clean = []

for i in range(0,len(sentence_list)):
    remove_newline = sentence_list[i].replace('\n', '')
    if len(sentence_list[i]) > 4:
        sentence_list_clean.append(remove_newline)
    else: pass
    
#Write sentence list dataframe to file

dic_sentence = {'Sentences' : sentence_list_clean} 
df_sentence = pd.DataFrame.from_dict(dic_sentence)

#df_sentence.head()
#df_sentence.to_csv('sentences0615.csv')

# Match sentences with Metadata terminology

In [20]:
#dataframe containing Metadata field, original term1, and the synonym term1
df_words = get_synonyms(dic_meta_term)


df_metadata0 = search_term(df_sentence, df_words)


df_metadata_2 = search_secondary(df_sentence, df_metadata0, dic_meta_term)

#df_metadata_2.to_csv('test2.csv')

# Match found metadata terms to original metadata field and category

In [21]:
df_combined_terms = combine_keyword_list(df_metadata_2)

In [16]:
# df_combined_terms.head()

In [22]:
### MAtch score (maybe)-> number of fields picked up (minimum 50% or something)

df_metadata_match = metadata_match(df_combined_terms, dic_meta_term, dic_meta_original, dic_meta_cat, dic_meta_subcat)
df_metadata_match.head()

Unnamed: 0,Metadata_Field,Original_Keyword,Searched_Keyword,Sentence_Number,Text,Secondary_Term,Secondary_Synonym,Secondary_Context,Complete_Term_List,Metadata_Matches,Metadata_Original,Metadata_Category,Metadata_Subcategory
0,99,99,ic,0,Page 1 of 17 SOFTWARE LICENSE AGREEMENT This...,[],[],"MetrieStream, Inc. By (Print Name) Title ...",[99],[99],[99],[Value],[nan]
1,99.8,99,ic,0,Page 1 of 17 SOFTWARE LICENSE AGREEMENT This...,[],[],"MetrieStream, Inc. By (Print Name) Title ...",[99],[99],[99],[Value],[nan]
2,99.9,99,ic,0,Page 1 of 17 SOFTWARE LICENSE AGREEMENT This...,[],[],"MetrieStream, Inc. By (Print Name) Title ...",[99],[99],[99],[Value],[nan]
3,99.99,99,ic,0,Page 1 of 17 SOFTWARE LICENSE AGREEMENT This...,[],[],"MetrieStream, Inc. By (Print Name) Title ...",[99],[99],[99],[Value],[nan]
4,Churn Effective Date,date,date,0,Page 1 of 17 SOFTWARE LICENSE AGREEMENT This...,[effective],[effective],"MetrieStream, Inc. By (Print Name) Title ...","[effective, date]",[Effective Date],[Effective Date],[Time],[Contact]


In [23]:
df_metadata_match.to_csv('test_metadata_match0615.csv')

# Simplify Dataframe

In [26]:
# df_metadata_simplified = df_metadata_match[['Metadata_Category','Metadata_Original','Metadata_Field','Secondary_Context']]
# df_metadata_simplified.head()

Unnamed: 0,Metadata_Category,Metadata_Original,Metadata_Field,Secondary_Context
0,[Value],[99],99,"MetrieStream, Inc. By (Print Name) Title ..."
1,[Value],[99],99.8,"MetrieStream, Inc. By (Print Name) Title ..."
2,[Value],[99],99.9,"MetrieStream, Inc. By (Print Name) Title ..."
3,[Value],[99],99.99,"MetrieStream, Inc. By (Print Name) Title ..."
4,[Time],[Effective Date],Churn Effective Date,"MetrieStream, Inc. By (Print Name) Title ..."


In [22]:
# import pandas as pd
# #df_metadata_simplified.to_csv('test2.csv')
# df_metadata_simplified = pd.read_csv('test2.csv')

In [40]:
# df_metadata_simplified.head()

Unnamed: 0,Metadata_Category,Metadata_Original,Metadata_Field,Secondary_Context
0,['Agreement'],['Agreement Type'],Amendment,1.3 Change in Scope.If Bank requests an expan...
1,['Agreement'],['Agreement Type'],Amendment,If Company requests a change in the scope of S...
2,['Agreement'],['Agreement Type'],Amendment,Company agrees to use reasonable efforts to id...
3,['Agreement'],['Agreement Type'],Amendment,"For purposes of understanding, these Deliverab..."
4,['Agreement'],['backup'],backup,Licensee shall not make or permit others to ma...


# Strong Match
look for exact match of metadata term

In [43]:
# #look for exact term in context

# def strong_match(df_metadata):
     
#     for i,j in df_metadata.iterrows():
#         if str(j['Metadata_Field']).lower() in str(j['Secondary_Context']).lower():
#             pass
#         else:
#             df_metadata.drop(i, inplace=True)
            
#     return df_metadata
            

In [47]:
# df_strongmatch = strong_match(df_metadata_simplified)
# df_strongmatch.to_csv('test3.csv')

# Remove duplicates using text Cosine distance
--- to be implemented later

In [33]:
"""
#cosine.distance of 0 means no difference in text
#therefore our threshold will be cosine.distance > value
#cosine.distance(text1,text2)

from textdistance import cosine

def remove_duplicate_text(df_metadata):
    
    #unique_metadata_field = list(df_metadata.Metadata_Field.unique())
    unique_metadata_field = ['breach']
    
    for mdf in unique_metadata_field:
        
        df_single_term = df_metadata[df_metadata.Metadata_Field == mdf]
        
    
        
        
    
    return df_single_term
"""    