# Incorporate spacy NER and/or dependency parser

PERSON	People, including fictional.  
NORP	Nationalities or religious or political groups.  
FAC	Buildings, airports, highways, bridges, etc.  
ORG	Companies, agencies, institutions, etc.  
GPE	Countries, cities, states.  
LOC	Non-GPE locations, mountain ranges, bodies of water.  
PRODUCT	Objects, vehicles, foods, etc. (Not services.)  
EVENT	Named hurricanes, battles, wars, sports events, etc.  
WORK_OF_ART	Titles of books, songs, etc.  
LAW	Named documents made into laws.  
LANGUAGE	Any named language.  
DATE	Absolute or relative dates or periods.  
TIME	Times smaller than a day.  
PERCENT	Percentage, including ”%“.  
MONEY	Monetary values, including unit.  
QUANTITY	Measurements, as of weight or distance.  
ORDINAL	“first”, “second”, etc.  
CARDINAL	Numerals that do not fall under another type.  

In [14]:
import pandas as pd
from os import listdir
from os.path import isfile, join


#create contract list
contract_path = 'C:\\Users\\Hello\\Desktop\\metric_stream\\v0.024\\metadata_pipeline\\results\\'
contract_names = listdir(contract_path)


contract_files = [contract_path + f for f in listdir(contract_path) if isfile(join(contract_path, f))]
file_name_list = [f for f in listdir(contract_path)]
contract_number = 1

file_name_list[contract_number]

'MetricStream - Chubb Renewal Order Form vJan-8-2020.pdf_sentences.csv'

In [15]:
#contract number in list. 
current_contract_file = contract_files[contract_number]
current_file = file_name_list[contract_number]


df_metadata = pd.read_csv(current_contract_file)
#df_metadata.head()

In [16]:
df0 = df_metadata[['Metadata_Category', 'Metadata_Original', 'Metadata_Matches', 'Secondary_Context']]
df0.columns = ['Category', 'Original Metadata', 'Metadata Match', 'Context']
df0.head()

Unnamed: 0,Category,Original Metadata,Metadata Match,Context
0,['Agreement'],['Agreement Type'],['Amendment'],Printed Name: ________________________ Prin...
1,['Agreement'],['Type of Support'],['Type of Support'],Printed Name: ________________________ Prin...
2,['Date'],['Effective Date'],['Effective Date'],Printed Name: ________________________ Prin...
3,['Date'],['Effective Date'],['Effective Date'],Printed Name: ________________________ Prin...
4,['Date'],['Effective Date'],['Effective Date'],Printed Name: ________________________ Prin...


In [17]:
#takes full metadata excel file and grabs a few categories for display.
#can add or remove columns as needed

def clean_df(df_metadata):
    
    #grab relevant columns
    df0 = df_metadata[['Metadata_Category', 'Metadata_Original', 'Metadata_Matches', 'Secondary_Context']]
    
    #rename for clarity
    df0.columns = ['Category', 'Original Metadata', 'Metadata Match', 'Context']
    
    categories = list(df0['Category'].unique())

    df_clean = df0

    for i,j in df_clean.iterrows():
        metadata_cat_clean = j['Category'].replace('[', '').replace(']', '').replace("'", "")
        metadata_original_clean = j['Original Metadata'].replace('[', '').replace(']', '').replace("'", "")
        metadata_match_clean = j['Metadata Match'].replace('[', '').replace(']', '').replace("'", "")


        df_clean.at[i, 'Category'] = metadata_cat_clean
        df_clean.at[i, 'Original Metadata'] = metadata_original_clean
        df_clean.at[i, 'Metadata Match'] = metadata_match_clean
        
    return df_clean

In [18]:
# core model
# text and entity labels
import spacy


#association dictionary of spacy NER to metadata category
dic_category_entity = {'Fee': 'MONEY', 'Time':'DATE', 'Person': 'PERSON', 'Product': 'PRODUCT', \
                       'Agreement': 'LAW', 'Financial': 'MONEY', 'Address': 'GPE'}



#grab entities from context based on category
def entity_extraction(df, entity_category_dic):
    
    nlp1 = spacy.load('en_core_web_sm')
    entity_list_outer = []

    for i,j in df.iterrows():
        
        entity_list = []
        
        if j['Category'] in entity_category_dic:
            
            text1 = nlp1(j['Context'])
            
            for i in text1.ents:
                if i.label_ == entity_category_dic[j['Category']]:
                    entity_list.append(i.text)
                    
        entity_list_outer.append(entity_list)
      
    df['Possible Matches'] = entity_list_outer   
    
    #df = df[df['Entities'].map(lambda d: len(d)) > 0]
    
    return df

In [19]:
#Run cleaner, entity extraction, and write to file

df_clean = clean_df(df_metadata)
k1 = entity_extraction(df_clean, dic_category_entity)
k1.to_csv(f'ner_results//out_{current_file}')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


In [14]:
import pandas as pd
from os import listdir
from os.path import isfile, join
import spacy

df = pd.read_csv(r'C:\Users\Hello\Desktop\metric_stream\v0.024\metadata_pipeline\results\out_MetricStream - Chubb Renewal Order Form vJan-8-2020.pdf.csv')
df = df.iloc[:,1:]

In [15]:
df.head()

Unnamed: 0,Category,Original Metadata,Metadata Match,Context,Possible Matches
0,Agreement,Agreement Type,Amendment,Printed Name: ________________________ Prin...,[]
1,Agreement,Type of Support,Type of Support,Printed Name: ________________________ Prin...,[]
2,Date,Effective Date,Effective Date,Printed Name: ________________________ Prin...,[]
3,Date,Effective Date,Effective Date,Printed Name: ________________________ Prin...,[]
4,Date,Term End Date,Term End Date,Printed Name: ________________________ Prin...,[]


In [13]:
df['Context'][2]

'   Printed Name: ________________________ Printed Name: ______________________________  Title: _______________________________ Title: _____________________________________  Signature: ___________________________ Signature: _________________________________  Date: _______________________________ Date: _____________________________________   DocuSign Envelope ID: 9DDE25DE-875D-496A-9C65-FDBB797ADDDBJanuary 20, 2020Sarah SlatterySVP FinanceRand GreenblattJanuary 21, 2020Salil JainJanuary 21, 2020Chief Client Officer\tEXHIBIT B-11\tOrder Form for MetricStream Software License\t\t2020-01-21T17:46:42-0800\tDigitally verifiable PDF exported from www.docusign.comSOFTWARE, SUPPORT AND SERVICES AGREEMENT Page 1 of 4 EXHIBIT B-11   Order Form for MetricStream Software License  Exhibit to the Software License, Support and Services Agreement and Amendment 1 to the Software License, Support and Services Agreement (“Agreement”) Between  MetricStream, Inc. (“Company”) and Chubb INA Holdings Inc., (f/

In [17]:
nlp0 = spacy.load('en_core_web_sm')
text0 = nlp0(df.Context[2])
for i in text0.ents:
    print(i, i.label_)

AttributeError: 'spacy.tokens.doc.Doc' object has no attribute 'label_'

In [None]:
import spacy

dic_category_entity = {'Fee': 'MONEY', 'Time':'DATE', 'Person': 'PERSON', 'Product': 'PRODUCT', \
                       'Agreement': 'LAW', 'Financial': 'MONEY', 'Address': 'GPE'}

def entity_extraction(df, entity_category_dic):
    
    nlp1 = spacy.load('en_core_web_sm')
    entity_list_outer = []

    for i,j in df.iterrows():
        
        entity_list = []
        
        if j['Category'] in entity_category_dic:
            
            text1 = nlp1(j['Context'])
            
            for i in text1.ents:
                if i.label_ == entity_category_dic[j['Category']]:
                    entity_list.append(i.text)
                    
        entity_list_outer.append(entity_list)
      
    df['Possible Matches'] = entity_list_outer   
    
    #df = df[df['Entities'].map(lambda d: len(d)) > 0]
    
    return df