In [21]:
### importing basic libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [22]:
# initializing NLTK and stop words

import nltk
from nltk.corpus import stopwords
# NOTE(review): StanfordNERTagger is imported but not used in the visible cells.
from nltk.tag.stanford import StanfordNERTagger
# English stop-word list from the NLTK corpus; not referenced again in the visible cells.
stop = stopwords.words('english')

In [3]:
### importing NLTK features for NER
from nltk import word_tokenize, pos_tag, ne_chunk

In [301]:
### opening and reading the desired file
file_name = '857737_2005-03-11_EMPLOYMENT AGREEMENT - WARREN CLAMEN.txt'


# Read the whole contract as text. The context manager closes the handle even
# if read() raises, and `doc` is only ever bound to the text, never to the
# file object.
with open("document-analytics-master/document-analytics-master/employment contracts/"+file_name, "r", encoding="utf8") as contract_file:
    doc = contract_file.read()

In [302]:
### removing new line characters and extra spaces

# Newlines become spaces and carriage returns are dropped; any remaining
# runs of whitespace are then collapsed into single spaces.
without_breaks = doc.replace('\n', ' ').replace('\r', '')
doc = ' '.join(without_breaks.split())
print(doc)

Exhibit 10.1 EMPLOYMENT AGREEMENT Agreement, dated as of February 14, 2005, by and between Candie's, Inc. (the "Company") and Warren Clamen ("Employee") (the "Parties"). WHEREAS, the Company wishes to hire the Employee, for the position of Executive Vice President, and the Employee has agreed to undertake and perform the obligations set forth in this Agreement, subject to the terms hereof. NOW, THEREFORE, in consideration of the promises, covenants and agreements set forth in this Agreement, the parties agree as follows: 1. Engagement of Employee; Duties. The Company hereby agrees to hire the Employee, on an exclusive basis, as Chief Financial Officer of the Company to perform the services mutually agreed to by the Parties and customary of a chief financial officer of a public company. Employee shall be an executive officer of the Company and shall report to the Chief Executive Officer of the Company. 2. Time. Employee shall devote substantially all of his professional time and best ef

In [304]:
# Tokenize, POS-tag and NE-chunk the whole document with NLTK.
tokenized_doc = nltk.word_tokenize(doc)
tagged_sentences = nltk.pos_tag(tokenized_doc)
ne_chunked_sents = nltk.ne_chunk(tagged_sentences)

# Collect (entity text, entity category) pairs. Top-level nodes without a
# `label` attribute are skipped; for the rest, the leaf words are joined
# with single spaces to form the entity text.
named_entities = [
    (' '.join(leaf[0] for leaf in node.leaves()), node.label())
    for node in ne_chunked_sents
    if hasattr(node, 'label')
]
print(named_entities)

[('EMPLOYMENT', 'ORGANIZATION'), ('Candie', 'GPE'), ('Inc.', 'GPE'), ('Warren Clamen', 'PERSON'), ('Company', 'ORGANIZATION'), ('Employee', 'ORGANIZATION'), ('Executive', 'ORGANIZATION'), ('Employee', 'ORGANIZATION'), ('THEREFORE', 'ORGANIZATION'), ('Employee', 'GPE'), ('Company', 'ORGANIZATION'), ('Employee', 'ORGANIZATION'), ('Company', 'ORGANIZATION'), ('Parties', 'ORGANIZATION'), ('Employee', 'PERSON'), ('Company', 'ORGANIZATION'), ('Company', 'ORGANIZATION'), ('Employee', 'PERSON'), ('Company', 'ORGANIZATION'), ('Term', 'PERSON'), ('Employee', 'ORGANIZATION'), ('Start Date', 'PERSON'), ('Employee', 'PERSON'), ('Company', 'ORGANIZATION'), ('Employee', 'PERSON'), ('Employee', 'ORGANIZATION'), ('Company', 'ORGANIZATION'), ('Term', 'ORGANIZATION'), ('Company', 'ORGANIZATION'), ('Company', 'ORGANIZATION'), ('Fringe Benefits', 'PERSON'), ('Employee', 'PERSON'), ('Company', 'ORGANIZATION'), ('Employee', 'PERSON'), ('Company', 'ORGANIZATION'), ('Company', 'ORGANIZATION'), ('Company', 'ORG

In [275]:
### recognizing PERSONs from NLTK entities

# Keep each PERSON name once, in order of first appearance.
per_nltk = []
for entity_text, entity_label in named_entities:
    if entity_label != 'PERSON' or entity_text in per_nltk:
        continue
    per_nltk.append(entity_text)

print(per_nltk)

['Warren Clamen', 'Employee', 'Term', 'Start Date', 'Fringe Benefits', 'Bonus', 'Neil Cole', 'Entire Agreement', 'Law', 'Neil Cole Title']


In [288]:
### lowering them
# Lower-case every name and drop the ones starting with 'emp' or 'agr'.
# A new list is built rather than calling remove() on the list being iterated:
# removing during iteration shifts the remaining items left, so the element
# right after each removal is never examined and matches can be left behind.
per_nltk = [
    lowered
    for lowered in (name.lower() for name in per_nltk)
    if not lowered.startswith(('emp', 'agr'))
]

print(per_nltk)

['warren clamen', 'term', 'start date', 'fringe benefits', 'bonus', 'neil cole', 'entire agreement', 'law', 'neil cole title']


In [274]:
### recognizing ORGANIZATIONS from NLTK entities
# Names are lower-cased *before* de-duplication so that case variants of the
# same entity collapse into a single entry. Filtering happens while the list
# is built, instead of calling remove() on a list that is being iterated
# (which skips the element following every removal).
org_nltk = []
for entity_text, entity_label in named_entities:
    if entity_label != 'ORGANIZATION':
        continue
    lowered = entity_text.lower()
    ### we'll ignore words with emp or agr as they're not useful.
    if lowered.startswith(('emp', 'agr')):
        continue
    if lowered not in org_nltk:
        org_nltk.append(lowered)

print(org_nltk)

['company', 'executive', 'therefore', 'parties', 'term', 'cobra', 'control', 'board', 'supreme court', 'county', 'district court', 'candie', 'inc', 'ceo']


## NOW USING SPACY FOR NER

In [245]:
# spaCy, its entity visualiser and the packaged English model used for the second NER pass.
import spacy
from spacy import displacy
# NOTE(review): Counter is imported but not used in the visible cells.
from collections import Counter
import en_core_web_sm
# Load the pipeline once; later cells call `nlp` on the contract text.
nlp = en_core_web_sm.load()
import re

In [277]:
# Re-read the raw contract text for the spaCy pass. The context manager closes
# the handle even if read() raises, and `doc` is only ever bound to the text,
# never to the file object.
with open("document-analytics-master/document-analytics-master/employment contracts/"+file_name, "r", encoding="utf8") as contract_file:
    doc = contract_file.read()

In [278]:
# Flatten the text: newlines -> spaces, carriage returns removed, then every
# run of whitespace squeezed down to one space.
flattened = doc.replace('\n', ' ').replace('\r', '')
doc = ' '.join(flattened.split())
print(doc)

Exhibit 10.1 EMPLOYMENT AGREEMENT Agreement, dated as of February 14, 2005, by and between Candie's, Inc. (the "Company") and Warren Clamen ("Employee") (the "Parties"). WHEREAS, the Company wishes to hire the Employee, for the position of Executive Vice President, and the Employee has agreed to undertake and perform the obligations set forth in this Agreement, subject to the terms hereof. NOW, THEREFORE, in consideration of the promises, covenants and agreements set forth in this Agreement, the parties agree as follows: 1. Engagement of Employee; Duties. The Company hereby agrees to hire the Employee, on an exclusive basis, as Chief Financial Officer of the Company to perform the services mutually agreed to by the Parties and customary of a chief financial officer of a public company. Employee shall be an executive officer of the Company and shall report to the Chief Executive Officer of the Company. 2. Time. Employee shall devote substantially all of his professional time and best ef

In [308]:
### NOW using SPACY for NER

from spacy import displacy

# Parse only while `doc` is still raw text, so re-running this cell does not
# feed an already-parsed document back into the pipeline.
if isinstance(doc, str):
    doc = nlp(doc)
# Render the recognised entities inline in the notebook.
displacy.render(doc, style='ent', jupyter=True)

In [309]:
### checking the document type

# The document is labelled "AMENDMENT" as soon as any sentence contains one of
# these marker phrases; otherwise it stays an "Employment Agreement".
amendment = ['Amendment to Employment Agreement','AMENDMENT', 'AMENDMENT TO EMPLOYMENT AGREEMENT']
has_amendment_marker = any(
    marker in str(sentence)
    for sentence in doc.sents
    for marker in amendment
)
doc_type = "AMENDMENT" if has_amendment_marker else "Employment Agreement"
print(doc_type)

Employment Agreement


In [310]:
# Show every spaCy entity as a (text, label) pair.
print([(entity.text, entity.label_) for entity in doc.ents])

[('EMPLOYMENT AGREEMENT Agreement', 'PERSON'), ('February 14, 2005', 'DATE'), ('Candie', 'PERSON'), ('Inc.', 'GPE'), ('the "Company"', 'ORG'), ('Warren Clamen', 'PERSON'), ('Company', 'ORG'), ('Employee', 'ORG'), ('1', 'CARDINAL'), ('Duties', 'DATE'), ('Employee', 'ORG'), ('Company', 'ORG'), ('Company', 'ORG'), ('2', 'CARDINAL'), ('Company', 'ORG'), ('3', 'CARDINAL'), ('Employee', 'ORG'), ('March 9, 2005', 'DATE'), ('the "Start Date"', 'WORK_OF_ART'), ('two years', 'DATE'), ('Party', 'ORG'), ('30 days', 'DATE'), ('one', 'CARDINAL'), ('4', 'CARDINAL'), ('the Employee $', 'ORG'), ('225,000', 'MONEY'), ('the first year of the Term', 'DATE'), ('no less than $240,000', 'MONEY'), ('the second year', 'DATE'), ('Company', 'ORG'), ('1,500', 'MONEY'), ('6', 'CARDINAL'), ('5', 'CARDINAL'), ('401(K', 'CARDINAL'), ('Company', 'ORG'), ('the Company', 'ORG'), ('6', 'CARDINAL'), ('Bonus', 'FAC'), ('up to 100%', 'PERCENT'), ('Company', 'ORG'), ('7', 'CARDINAL'), ('200,000', 'CARDINAL'), ('Company', 'OR

In [311]:
### recognizing ORGANIZATIONS and PERSONS by SPACY

# Route each entity into the list for its label, keeping only the first
# occurrence of every distinct text, in document order.
org = []
per = []
lists_by_label = {'ORG': org, 'PERSON': per}
for entity in doc.ents:
    target = lists_by_label.get(entity.label_)
    if target is not None and entity.text not in target:
        target.append(entity.text)

print(org, '\n')
print(per)

['the "Company"', 'Company', 'Employee', 'Party', 'the Employee $', 'the Company', 'Control', "the Company's", 'Board of Directors', 'the State of New York', 'The Company and Employee', 'the New York State Supreme Court', 'the United States District Court for Southern District', 'WHEREOF', 'INC'] 

['EMPLOYMENT AGREEMENT Agreement', 'Candie', 'Warren Clamen', 'Vacation', 'Neil Cole', 'Governing Law', 'Neil Cole Name', 'Neil Cole Title', 'Warren']


In [312]:
# Lower-case the spaCy PERSON names and drop the ones starting with 'emp'.
# A new list is built rather than calling remove() on the list being iterated,
# because removal during iteration skips the element after each removed item.
per = [
    lowered
    for lowered in (name.lower() for name in per)
    if not lowered.startswith('emp')
]
print(per)

['candie', 'warren clamen', 'vacation', 'neil cole', 'governing law', 'neil cole name', 'neil cole title', 'warren']


In [313]:
### finding the common PERSONS recognized by both NLTK and SPACY
# Walk the spaCy names in order and keep those NLTK also reported.
lst3 = []
for candidate in per:
    if candidate in per_nltk:
        lst3.append(candidate)
print(lst3)

['warren clamen', 'neil cole', 'neil cole title']


In [290]:
### Name of the employee
# The first name reported by both NER passes is taken as the employee, upper-cased.
employee = lst3[0].upper()
print(employee)

WARREN CLAMEN


In [291]:
### Finding the employer name.

# First word of the employee's name, lower-cased.
employee_first = employee.lower().split()[0]

# The last PERSON reported by NLTK is taken as the employer: the letter always
# ends with the employer's signature, so there is a good chance the employer
# name is recognised at the end.
# NOTE(review): in the recorded run this yields 'NEIL COLE TITLE', i.e. the
# signature-block label "Title" is glued onto the name - confirm whether that
# suffix should be stripped.
last_person = per_nltk[-1]
employer = last_person.upper()
employer

'NEIL COLE TITLE'

In [292]:
# Lower-case the spaCy ORG names and drop the ones starting with 'emp' or 'agr'.
# A new list is built rather than calling remove() on the list being iterated,
# because removal during iteration skips the element after each removed item.
org = [
    lowered
    for lowered in (name.lower() for name in org)
    if not lowered.startswith(('emp', 'agr'))
]

print(org)

['the "company"', 'company', 'party', 'the employee $', 'the company', 'control', "the company's", 'board of directors', 'the state of new york', 'the company and employee', 'the new york state supreme court', 'the united states district court for southern district', 'whereof', 'inc']


In [293]:
### Finding the common organizations recognized by both NLTK and Spacy.

# Keep, in their existing order, only the spaCy organisations that NLTK also found.
shared_orgs = []
for org_name in org:
    if org_name in org_nltk:
        shared_orgs.append(org_name)
org = shared_orgs
print(org)

['company', 'control', 'inc']


In [294]:
### Finding the job profile of the employee being hired.
# NOTE(review): 'Chief  Executive  Officer' contains double spaces; if the text
# was whitespace-normalised before parsing, that entry can never match - confirm
# whether it is intentional before changing it.
roles = ['Director of Operations', 'President and Chief Executive Officer', 'Chief  Executive  Officer','Senior Vice President','Vice President', 'Chief Financial Officer','Chief Human Resources Officer','Chairman of the Board of Directors','Chief Supply Chain Officer','Chief Creative Officer','Chief Procurement Officer','Contract CFO','Senior Engineer','Co-Chief Investment Officer']

# Collect matches in a list and join once at the end, so the result carries no
# trailing ', ' separator.
found_roles = []
for sent in doc.sents:
    sentence_text = str(sent)  # converted once per sentence, not once per role
    for candidate in roles:
        if candidate not in sentence_text:
            continue
        # Skip a title already covered by (i.e. contained in) one found earlier.
        if any(candidate in seen for seen in found_roles):
            continue
        # Most recently found title goes first, as before.
        found_roles.insert(0, candidate)

role = ', '.join(found_roles)
print(role)

Chief Financial Officer, Vice President, 


In [295]:
### Calculating the base salary per annum. We are taking the maximum here because yearly salary is always higher than monthly salary.

# A dollar amount: '$', optional whitespace, digits with optional thousands
# separators and an optional decimal part. Trailing punctuation such as the
# '.' or ',' ending a clause is not captured.
dollar_amount = re.compile(r'\$\s*(\d+(?:,\d{3})*(?:\.\d+)?)')

sal_list = []
for sent in doc.sents:
    sal_list.extend('$' + digits for digits in dollar_amount.findall(str(sent)))

# Compare by numeric value: max() on the raw strings is lexicographic and
# would rank '$90,000' above '$240,000'.
if sal_list:
    base_sal = max(sal_list, key=lambda amount: float(amount[1:].replace(',', '')))
else:
    # No dollar amount anywhere in the document: leave the field blank.
    base_sal = ''

print(base_sal)

$240,000


In [296]:
### Creating the pandas DataFrame with the data so obtained.
# One-row table: every column is a single-element list.
data = {
    'file': [doc_type],
    'Employer Name': [employer],
    'Employee Name': [employee],
    'Role/Tile of the employee': [role],
    'Base Salary(yearly)': [base_sal],
}
ner = pd.DataFrame(data)

In [297]:
# Plain-text dump of the one-row summary table.
print(ner)

  Base Salary(yearly)  Employee Name    Employer Name  \
0            $240,000  WARREN CLAMEN  NEIL COLE TITLE   

                   Role/Tile of the employee                  file  
0  Chief Financial Officer, Vice President,   Employment Agreement  


In [298]:
### making the csv

# Write the summary table to 'letter.csv', resolved against the current working directory.
ner.to_csv(path_or_buf=r'letter.csv')

### FEATURE EXTRACTION- MD SAMEEM ALI