In [1]:
#need to pip install tika
from tika import parser
import pandas as pd
import numpy as np

# Create Metadata Term Dictionary

In [2]:
#Return true if our current value is NaN

def isNan(a):
    return a != a

#generate dictionary 
#key is metadata field
#value is a list of term1, term2, term3
# we use this to match metadata fields to found terms in our sentence output document

def metadata_term_library(df_metadata):
    
    dic_metadata_terms = {}
    dic_metadata_original = {}
    dic_metadata_category = {}
    dic_metadata_subcategory = {}
    
    for i,j in df_metadata.iterrows():
        
        dic_metadata_original[j['Metadata']] = [j['Metadata_Original']]
        dic_metadata_category[j['Metadata']] = [j['Category']]
        dic_metadata_subcategory[j['Metadata']] = [j['Sub_Category']]

        dic_metadata_terms[j['Metadata']] = [j['Term1']]
                
        if not isNan(j['Term2']):
            dic_metadata_terms[j['Metadata']].append(j['Term2'])
        else:
            pass
        
        if not isNan(j['Term3']):
            dic_metadata_terms[j['Metadata']].append(j['Term3'])
        else:
            pass
        
    return(dic_metadata_terms, dic_metadata_original, dic_metadata_category, dic_metadata_subcategory)
          

# Simplified data structure for Term search

In [3]:
import spacy
from spacy import displacy
import en_core_web_sm
from spacy import displacy
from spacy.lang.en import English
from spacy.pipeline import EntityRuler 

def sent_segment(txt):
    """ sentence tokenization

    Parameters:
    txt : tex to tokenize into sentences
    Returns: list of sentences

    """

    # Load English tokenizer, tagger, parser, NER and word vectors
    nlp = English() 

    # A simple pipeline component, to allow custom sentence boundary detection logic 
    # that doesn’t require the dependency parse. It splits on punctuation by default
    sbd = nlp.create_pipe('sentencizer')

    # Add the component to the pipeline
    nlp.add_pipe(sbd)

    #nlp is used to create documents with linguistic annotations.
    doc = nlp(txt)   

    # create list of sentence tokens
    sents_list = []
    for sent in doc.sents:
        sents_list.append(sent.text)

    return sents_list 

In [4]:
#synonym finder

import pandas as pd
from nltk.corpus import wordnet
from itertools import chain

# #find synonyms based on the keyword and build into a dataframe
def get_synonyms(term_dic):

    #list of tuples containing (category, original term, synonym)
    term_tuples = []

    
    ####Issues passing number into wordnet. 
    L = []
    for key in term_dic:

        syn = wordnet.synsets(str(term_dic[key][0]))

            
        syn_list = [term_dic[key][0]]

        #flatten all lists by chain, remove duplicates by set
        lemmas = list(set(chain.from_iterable([w.lemma_names() for w in syn])))

        for i in lemmas[:3]:
            syn_list.append(i)

        syn_list = list(set(syn_list))

        for syn in syn_list:
            L.append((key, term_dic[key][0], syn))

            
    return (pd.DataFrame(L, columns=['Metadata_Field','Original_Term','Synonym_Term']))    


In [5]:
# #dataframe containing Metadata field, original term1, and the synonym term1

# df_term1 = get_synonyms(dic_meta_term)

In [6]:
# search proper keyword in the list of sentences
# if found, grab surrounding lines, combine, and add to 3rd dataframe

#sentence_df contains list of sentence segment in each row
#term_df contains the category, original term, and synonym term in each row
# Want to search the synonym term in sentence rows. Then if match found, add all those 4 columns to new dataframe


def search_term(sentence_df, term_df, doc_name):
    
    category_column_names = ['Document Name','Metadata_Field','Original_Keyword', 'Searched_Keyword', 'Sentence_Number','Text']
    df_metadata = pd.DataFrame(columns = category_column_names)

    array_metadata = df_metadata.values
    
    for i,j in sentence_df.iterrows():
        for m,n in term_df.iterrows():
            if str(n['Synonym_Term']) in j['Sentences'].lower():
                ####### do the iteration, search the term and add to dataframe
                
                df_found = pd.DataFrame([(doc_name, n['Metadata_Field'],n['Original_Term'], n['Synonym_Term'], i, j['Sentences'])])
                
                array_metadata = np.concatenate(\
                        (array_metadata, df_found.values), axis=0)

    df_metadata = pd.DataFrame(array_metadata, columns = category_column_names)
    
    return df_metadata


#currently adding things to an np.array continually. then at the end, add it to the data frame


In [7]:
#create context. takes current line of text and associated index. 
# lokos at the sentence dataframe and grabs correct rows.
# allows us to search multiple segments for each original term

#pass the original sentence dataframe and current row of the metadata dataframe
def create_context(sentence_df, metadata_df, idx):
    
    sentence_idx = metadata_df.iloc[idx]['Sentence_Number']
    
    try:
        context = sentence_df.iloc[sentence_idx - 1]['Sentences'] + \
        sentence_df.iloc[sentence_idx]['Sentences'] + \
        sentence_df.iloc[sentence_idx + 1]['Sentences']
    except:
        
        context = sentence_df.iloc[sentence_idx]['Sentences']
    
    return context

In [8]:
#Uses a secondary library of terms
# Keys in this library will match the 'original keyword' column in the metadata_df
# Iterate through associated list of terms
# If secondary term found, add it to new column
# adds to an existing dataframe with enough columns so just need to generate list and add to new column

#Need to add greater context to each line. grab sentence from sentence dataframe

def search_secondary(sentence_df, metadata_df, term_dic):
    
    secondary_term_outer = []
    secondary_syn_outer = []
    secondary_context_outer = []
    
    for i,j in metadata_df.iterrows():
        
        context = create_context(sentence_df, metadata_df, i)
        
        secondary_term_inner = [] 
        secondary_syn_inner = []
        secondary_context_inner = None
        term1 = term_dic[j['Metadata_Field']]
        
    
        if len(term1) > 1:

            for k in range(1,len(term1)):

                item = str(term1[k])

                syn = wordnet.synsets(item)

                syn_list = [item]

                #flatten all lists by chain, remove duplicates by set
                lemmas = list(set(chain.from_iterable([w.lemma_names() for w in syn])))                

                for m in lemmas[:3]:
                    syn_list.append(m)

                syn_list = list(set(syn_list))

                for syn in syn_list:

                    if syn in context.lower() and secondary_context_inner == None:
                        secondary_term_inner.append(item)
                        secondary_syn_inner.append(syn)
                        secondary_context_inner = context

                    elif syn in context.lower():
                        secondary_term_inner.append(item)
                        secondary_syn_inner.append(syn)

                    else:
                        pass
        else:
            secondary_context_inner = context


                                      
        secondary_term_outer.append(secondary_term_inner)
        secondary_syn_outer.append(secondary_syn_inner)
        secondary_context_outer.append(secondary_context_inner)
    
    metadata_df['Secondary_Term'] = secondary_term_outer
    metadata_df['Secondary_Synonym'] = secondary_syn_outer
    metadata_df['Secondary_Context'] = secondary_context_outer
    
    return metadata_df

In [9]:
#inputs metadata dictionary
#sentence found keyword dataframe
#iterate through sentence dataframe
# look at the combined terminology list of original_keyword and seondary_term
# see if metadata search temrs are contained in the list
# if all terms are in the sentence term list, then add the associated metadata term to the row as possible match

def combine_keyword_list(df_keyword):
    
    complete_term_list = []
    
    for i,j in df_keyword.iterrows():
        secondary_term_tolist = eval(str(j['Secondary_Term']))
        original_keyword_string = str(j['Original_Keyword'])
        
        #print(type(original_keyword_string))
        
        #print(secondary_term_tolist)
        secondary_term_tolist.append(original_keyword_string)
        complete_terms = secondary_term_tolist
        
        complete_term_list.append(complete_terms)
        
    df_keyword['Complete_Term_List'] = complete_term_list
    
    return df_keyword

##test
#combine_keyword_list(df_sentence_keywords)

In [10]:
# check each row to see if metadata fields are in the keywords list
# create new column in dataframe that shows all possible metadata matches

def metadata_match(df_keyword, dic_metadata, dic_original, dic_cat, dic_subcat):
    
    metadata_matches_outer = []
    metadata_original_match_outer = []
    metadata_category_outer = []
    metadata_subcategory_outer = []
    
    
    for i,j in df_keyword.iterrows():
        
        metadata_matches_inner = []
        original_match_inner = []
        category_match_inner = []
        subcategory_match_inner = []

                   
        term_list = j['Complete_Term_List']
        
        for key in dic_metadata:
            
            if all(elem in term_list for elem in dic_metadata[key]):
                metadata_matches_inner.append(key)
                original_match_inner.append(dic_original[key][0])
                category_match_inner.append(dic_cat[key][0])
                subcategory_match_inner.append(dic_subcat[key][0])
                                   
            else:
                pass
            
        metadata_matches_outer.append(metadata_matches_inner)
        metadata_original_match_outer.append(list(set(original_match_inner)))
        metadata_category_outer.append(list(set(category_match_inner)))
        metadata_subcategory_outer.append(list(set(subcategory_match_inner)))
    
    df_keyword['Metadata_Matches'] = metadata_matches_outer
    df_keyword['Metadata_Original'] = metadata_original_match_outer
    df_keyword['Metadata_Category'] = metadata_category_outer
    df_keyword['Metadata_Subcategory'] = metadata_subcategory_outer
    
    #remove lines with know metadata field match
    df_keyword = df_keyword[df_keyword['Metadata_Matches'].map(lambda d: len(d)) > 0]
    
    #print(df_keyword['Metadata_Matches'][1])
    return df_keyword
    

# Clean Data and NER functions

In [11]:
import spacy

#takes full metadata excel file and grabs a few categories for display.
#can add or remove columns as needed

def clean_df(df_metadata):
    df_metadata.head()
    #grab relevant columns
    df0 = df_metadata[['Document Name','Metadata_Category', 'Metadata_Original', 'Metadata_Matches', 'Secondary_Context']]
    
    
    #rename for clarity
    df0.columns = ['Document Name','Category', 'Original Metadata', 'Metadata Match', 'Context']
    
    
    #categories = list(df0['Category'].unique())

    df_clean = df0

    for i,j in df_clean.iterrows():
        metadata_cat_clean = str(j['Category']).replace('[', '').replace(']', '').replace("'", "")
        metadata_original_clean = str(j['Original Metadata']).replace('[', '').replace(']', '').replace("'", "")
        metadata_match_clean = str(j['Metadata Match']).replace('[', '').replace(']', '').replace("'", "")


        df_clean.at[i, 'Category'] = metadata_cat_clean
        df_clean.at[i, 'Original Metadata'] = metadata_original_clean
        df_clean.at[i, 'Metadata Match'] = metadata_match_clean
        
    return df_clean


#grab entities from context based on category
def entity_extraction(df, entity_category_dic):
    
    #nlp1 = spacy.load('en_core_web_sm')
    nlp1 = spacy.load('en_core_web_lg')
    entity_list_outer = []

    for i,j in df.iterrows():
        
        entity_list = []
        
        if j['Category'] in entity_category_dic:
            
            text1 = nlp1(j['Context'])
            
            for k in text1.ents:
                if k.label_ in entity_category_dic[j['Category']]:
                    entity_list.append(k.text)
        
        #identify uniques
        entity_list = list(set(entity_list))
        entity_list_outer.append(entity_list)
      
    df['Possible Matches'] = entity_list_outer   
    
    #df = df[df['Entities'].map(lambda d: len(d)) > 0]
    
    return df

# Process
Start with input dictionary containing category and keyword.  
Use keyword to search synonyms. Create a tuple of (category, keyword, synonym) and append those to a list.  
Iterate through list and search for synonym in text of df_sentence.  
Create a dataframe of ['category', 'keyword', 'synonym', 'text']  


Best algorithm for this? Right now looking at O(n_squared)
    - (number of terms) * (number of sentences)
    

In [12]:
from os import listdir
from os.path import isfile, join

#create contract list
contract_path = 'C:\\Users\\Hello\\Desktop\\metric_stream\\v0.024\\metadata_pipeline\\data\\'
contract_names = listdir(contract_path)
#remove .pdf Use for file name in csv export at end

contract_files = [contract_path + f for f in listdir(contract_path) if isfile(join(contract_path, f))]
file_name_list = [f for f in listdir(contract_path)]
contract_number = 2

file_name_list[contract_number]

'FHLB_Chicago_2016_Renewal.pdf'

In [13]:
#can loop through list for contract files
parsed_contract = parser.from_file(contract_files[contract_number])
current_file_name = file_name_list[contract_number]

pdf_doc = parsed_contract['content']


#print(contract_files)

# Generate sentence list

In [14]:
#dic_meta_term is list of terminology associated with metadata field
#dic_meta_category is mapping of metadata field to a metadata category (original metadata field provided to us)

metadata_file = pd.read_excel('C:\\Users\\Hello\\Desktop\\metric_stream\\v0.024\\metadata_pipeline\\metadata_documents\\metadata_library.xlsx')
dic_meta_term, dic_meta_original, dic_meta_cat, dic_meta_subcat = metadata_term_library(metadata_file)

In [15]:
#Remove newlines
#Remove sentence that are too short (usually 1 or 2 symbols)

sentence_list = sent_segment(pdf_doc)
sentence_list_clean = []

for i in range(0,len(sentence_list)):
    remove_newline = sentence_list[i].replace('\n', '')
    if len(sentence_list[i]) > 4:
        sentence_list_clean.append(remove_newline)
    else: pass
    
#Write sentence list dataframe to file

dic_sentence = {'Sentences' : sentence_list_clean} 
df_sentence = pd.DataFrame.from_dict(dic_sentence)

#df_sentence.head()
#df_sentence.to_csv('sentences0615.csv')

# Match sentences with Metadata terminology

In [16]:
#dataframe containing Metadata field, original term1, and the synonym term1
df_words = get_synonyms(dic_meta_term)


df_metadata0 = search_term(df_sentence, df_words, file_name_list[contract_number])


df_metadata_2 = search_secondary(df_sentence, df_metadata0, dic_meta_term)

#df_metadata_2.to_csv('test2.csv')

# Match found metadata terms to original metadata field and category

In [17]:
df_combined_terms = combine_keyword_list(df_metadata_2)

In [18]:
### MAtch score (maybe)-> number of fields picked up (minimum 50% or something)

df_metadata_match = metadata_match(df_combined_terms, dic_meta_term, dic_meta_original, dic_meta_cat, dic_meta_subcat)
#df_metadata_match.head()
#df_metadata_match.to_csv(f'results//{current_file_name}_sentences.csv')

# Named Entity Recognition

PERSON	People, including fictional.  
NORP	Nationalities or religious or political groups.  
FAC	Buildings, airports, highways, bridges, etc.  
ORG	Companies, agencies, institutions, etc.  
GPE	Countries, cities, states.  
LOC	Non-GPE locations, mountain ranges, bodies of water.  
PRODUCT	Objects, vehicles, foods, etc. (Not services.)  
EVENT	Named hurricanes, battles, wars, sports events, etc.  
WORK_OF_ART	Titles of books, songs, etc.  
LAW	Named documents made into laws.  
LANGUAGE	Any named language.  
DATE	Absolute or relative dates or periods.  
TIME	Times smaller than a day.  
PERCENT	Percentage, including ”%“.  
MONEY	Monetary values, including unit.  
QUANTITY	Measurements, as of weight or distance.  
ORDINAL	“first”, “second”, etc.  
CARDINAL	Numerals that do not fall under another type.  

In [19]:
#association dictionary of spacy NER to metadata category
dic_category_entity = {'Fee': ['MONEY'], 'Date':['DATE'], 'Person': ['PERSON'], 'Product': ['PRODUCT', 'QUANTITY'], \
                       'Agreement': ['LAW'], 'Financial': ['MONEY'], 'Fee': ['MONEY'], 'Address': ['GPE'],\
                      'Legal': ['LAW']}


In [20]:
#Run cleaner, entity extraction, and write to file
df_clean = clean_df(df_metadata_match)

df_ner_output = entity_extraction(df_clean, dic_category_entity)
df_ner_output.to_csv(f'results//out_{file_name_list[contract_number]}.csv')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
