<a href="https://colab.research.google.com/github/RichardMWarburton/ExploringCUAD/blob/Dev/Date%20Parsing.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Individual Clause Investigation (Agreement Date)




## The Data

CUAD: An Expert-Annotated NLP Dataset for Legal Contract Review

https://arxiv.org/abs/2103.06268

This code is an adaptation of the scrape.py file avaliable on the github repository for CUAD.  It has been adapted to run in Jypter notebooks and allow us to step throght the coding line by line.`

## 1: Import Packages & Define Useful Functions

In [128]:
from zipfile import ZipFile
import json
import os
from collections import Counter, defaultdict
import matplotlib.pyplot as plt
import re
from random import sample, choice
import numpy as np
import pandas as pd
import re
import string
from sklearn.manifold import TSNE
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import DBSCAN, KMeans
from sklearn.cluster import AgglomerativeClustering
from pprint import pprint

!pip install dateparser
import dateparser



In [129]:
def extract_zip(pth,data_pth = None):
    """Function to extract contents of a zip file to a specified location (wd if data_pth not passed)"""
    with ZipFile(pth, 'r') as zipObj:
       # Extract all the contents of zip file in different directory
       zipObj.extractall(data_pth)

## 2: Download repository and extract data

In [130]:
#Download CUAD git repository
if not os.path.exists('main.zip'):
  !wget --no-check-certificate https://github.com/TheAtticusProject/cuad/archive/refs/heads/main.zip
  !unzip -q main.zip

#If it has not already been extracted, extract the contents of data.zip
if not os.path.exists('cuad-main/data'):
  os.makedirs('cuad-main/data')

if not os.path.exists('cuad-main/data/CUADv1.json'):
  extract_zip('cuad-main/data.zip','cuad-main/data/')

#Download a manualy curated set of labels for the full CUAD data. 
if not os.path.exists('labels3.txt'):
  !wget https://raw.githubusercontent.com/RichardMWarburton/ExploringCUAD/main/labels3.txt

In [131]:
#Load CUADv1 JSON to data
with open('cuad-main/data/CUADv1.json','r') as infile:
    for line in infile:
        contract_data = json.loads(line)

### 2.1: Read in Label Data & Generate Look Up Dictionary

In [132]:
#Initate storage for labels look up (LU)
labels_LU = {}

#Read in labels data
with open('labels3.txt','r',encoding ='UTF-8') as infile:
  for line in infile:
    #Remove trailing special characters and split on tab
    data = line.strip().split(sep='\t')
    #Add name and label to labels_LU dictionary
    labels_LU[data[0]] = data[1]

The look up returns one error, most likely due to the accented E and a disparity of encoding.  This will be forced to 'Marketing Agreement' manually for now (EITHER SORT OR PROVIDE EXAMPLE)

### 2.2: Extract Raw Contract Data

In [133]:
#ser reg ex expression for characters to remove from contract contest
spec_chars = '\\n|\\t|\\t'

#Set number of contracts in data
num_contracts = len(contract_data['data'])

#Initate dictionary to store raw contract data
raw_contracts = defaultdict(list)

#for each contract
for i in range(num_contracts):
  #Append the title, contract text and character length of text to the raw_contracts dictionary
  raw_contracts['contract title'].append(contract_data['data'][i]['title'])
  raw_contracts['label'].append(labels_LU[contract_data['data'][i]['title']] if contract_data['data'][i]['title'] in labels_LU else 'marketing agreement' ) #<- manual error trap applied here (see below)
  
  #Parse raw text and process to remove breaks
  raw_text = contract_data['data'][i]['paragraphs'][0]['context']
  clean_text = re.sub(spec_chars,'',raw_text)

  #Split clean text in to sentances and tokens
  sentance_text = clean_text.split(sep = '. ')
  token_text = clean_text.split(sep = ' ')

  #Append text to the respective key in the raw_contracts dictionary
  raw_contracts['raw text'].append(raw_text)
  raw_contracts['clean text'].append(clean_text)
  raw_contracts['sentance text'].append(sentance_text)
  raw_contracts['token text'].append(token_text)
  
  #Add character, sentance and token counts to raw_contracts dictionary
  raw_contracts['character count'].append(len(raw_text))
  raw_contracts['sentance count'].append(len(sentance_text))
  raw_contracts['token count'].append(len(token_text))


### 2.3: Extract Clause Specific Data

In [134]:
#Define the number of clauses
num_clauses = 41

#initate dictioanry to store caluse data
clause_data = defaultdict(list)

#For each contract
for i in range(num_contracts):
  #for each clause
  for j in range(num_clauses):
    #for each found clause annotation
    for k in range(len(contract_data['data'][i]['paragraphs'][0]['qas'][j]['answers'])): 
      #Add the contract title
      clause_data['contract title'].append(contract_data['data'][i]['title'])
      clause_data['label'].append(labels_LU[contract_data['data'][i]['title']] if contract_data['data'][i]['title'] in labels_LU else 'marketing agreement' )  #<- manual error trap applied here
      clause_data['clause'].append(contract_data['data'][i]['paragraphs'][0]['qas'][j]['id'].split(sep='__')[1])
      clause_data['annotation'].append(contract_data['data'][i]['paragraphs'][0]['qas'][j]['answers'][k]['text'])
      clause_data['annotation start'].append(contract_data['data'][i]['paragraphs'][0]['qas'][j]['answers'][k]['answer_start'])
      clause_data['annotation length'].append(len(contract_data['data'][i]['paragraphs'][0]['qas'][j]['answers'][k]['text']))


In [135]:
np.unique(clause_data['clause'])

array(['Affiliate License-Licensee', 'Affiliate License-Licensor',
       'Agreement Date', 'Anti-Assignment', 'Audit Rights',
       'Cap On Liability', 'Change Of Control',
       'Competitive Restriction Exception', 'Covenant Not To Sue',
       'Document Name', 'Effective Date', 'Exclusivity',
       'Expiration Date', 'Governing Law', 'Insurance',
       'Ip Ownership Assignment', 'Irrevocable Or Perpetual License',
       'Joint Ip Ownership', 'License Grant', 'Liquidated Damages',
       'Minimum Commitment', 'Most Favored Nation',
       'No-Solicit Of Customers', 'No-Solicit Of Employees',
       'Non-Compete', 'Non-Disparagement', 'Non-Transferable License',
       'Notice Period To Terminate Renewal', 'Parties',
       'Post-Termination Services', 'Price Restrictions', 'Renewal Term',
       'Revenue/Profit Sharing', 'Rofr/Rofo/Rofn', 'Source Code Escrow',
       'Termination For Convenience', 'Third Party Beneficiary',
       'Uncapped Liability', 'Unlimited/All-You-Can-Eat

## 3: Cleaning data and extracting a single clause

In [136]:
#Initate dataframe of all clause data
clause_df = pd.DataFrame(clause_data)

#Convert to lower case
clause_df['annotation'] = clause_df['annotation'].apply(lambda x: x.lower())

#Remove any formating characters or multiple spaces and replace with a single space
clause_df['annotation'] = clause_df['annotation'].apply(lambda x: re.sub('\\t|\\r|\\n|[^\S]{2,}',' ',x))

#Remove punctuation from the string
clause_df['annotation'] = clause_df['annotation'].apply(lambda x: x.translate(str.maketrans('', '', string.punctuation)))

In [137]:
#Define clause of interest
clause_of_interest = 'Agreement Date'

#Limit df to clause of interest and extract annotations of itnerest
of_interest_data = clause_df[clause_df['clause'] == clause_of_interest]
annotations_of_interest = of_interest_data['annotation'].values

#Identify where there are multiple annotations per contract
titles,counts = np.unique(of_interest_data['contract title'],return_counts =True)
dups = titles[counts >= 2]

#Output Analysis
print('There are {} contracts with \'{}\' annotations'.format(*(titles.shape[0],clause_of_interest)))
print('There are {} contracts with more than one annotation'.format(dups.shape[0]))

There are 470 contracts with 'Agreement Date' annotations
There are 6 contracts with more than one annotation


From the above we can see that: 

1.   Contracts may have multiple annotations for the same clause
2.   Not all contracts have an annotation of interest

Provisionally, we will look to concatinate all such annotations for a contract in to one string.  This will then represent all the salient points for the contract and clause in question.

In [138]:
#output duplicate annotations anc contract titles
dup_df = of_interest_data[of_interest_data['contract title'].isin(dups)][['contract title','annotation']]

#print sample of duplicate annotations
for i in dup_df.index[:8]:
  print(dup_df.loc[i,'contract title'])
  print(repr(dup_df.loc[i,'annotation']))
  #print(dup_df.loc[i,'annotation'].split(sep=' '))
  print('\n')

OASYSMOBILE,INC_07_05_2001-EX-10.17-OUTSOURCING AGREEMENT
'31 day of july 2000'


OASYSMOBILE,INC_07_05_2001-EX-10.17-OUTSOURCING AGREEMENT
'july  2000'


GULFSOUTHMEDICALSUPPLYINC_12_24_1997-EX-4-AFFILIATE AGREEMENT
'agreed to and accepted as of december 14 1997'


GULFSOUTHMEDICALSUPPLYINC_12_24_1997-EX-4-AFFILIATE AGREEMENT
'this affiliate agreement is executed as of the 14th day of december 1997'


Apollo Endosurgery - Manufacturing and Supply Agreement
'effective date shall mean december 5 2014'


Apollo Endosurgery - Manufacturing and Supply Agreement
'this agreement as of the effective date'


BEYONDCOMCORP_08_03_2000-EX-10.2-CO-HOSTING AGREEMENT
'92198'


BEYONDCOMCORP_08_03_2000-EX-10.2-CO-HOSTING AGREEMENT
'september 21 1998'




**THE ABOVE COULD BE DISPLAYED BETTER**

In [139]:
#Initate memory for annotations within contracts
combined_annotations_list = defaultdict(list)
combined_annotations_string = {}

#For each annotation of interest found in the contract, 
#append annotation to a default dict list with contract as key
for i in of_interest_data.index:
  name = of_interest_data.loc[i,['contract title']].values[0]
  annotation = of_interest_data.loc[i,['annotation']].values[0]
  combined_annotations_list[name].append(annotation)

#Produce a singel string of all annotations found in specific contracts
for key in combined_annotations_list.keys():
  combined_annotations_string[key] = ' '.join(combined_annotations_list[key])

In [140]:
#Build array of contract names and concatenated annotations
contracts = np.array(list(combined_annotations_string.keys()))
combined_annotations = np.array(list(combined_annotations_string.values()))

In [141]:
combined_annotations.shape

(470,)

In [142]:
txt = combined_annotations[0]

In [143]:
import spacy
from spacy import displacy
from collections import Counter
import en_core_web_sm

In [144]:
nlp = en_core_web_sm.load()

In [145]:
doc = nlp(repr(txt))
print([(X.text, X.label_) for X in doc.ents])

[("'7th day of september 1999'", 'DATE')]


In [146]:
def append_type_match(x, value_list,matches=[]):
  if x.label_ in matches:
    value_list.append(x)

In [215]:
def ordinalize_num(txt):
  for token in txt.split(sep=' '):
    try:
      if int(token) <= 31:
        ord_val = num2words(token, lang="en", to="ordinal_num")
        txt = txt.replace(token,ord_val)
    except:
      continue
  return txt

In [217]:
date_feature_list = []
not_mapped = []
nlp = en_core_web_sm.load()


for annotation in combined_annotations:


  annotation_list = []  
  
  annotation = annotation.replace('t h','th')
  annotation = annotation.replace('s t','st')
  annotation = annotation.replace('day','')
  annotation = ordinalize_num(annotation)

  doc = nlp(repr(annotation))

  for X in doc.ents:
    if X.label_ in ['DATE']:

      annotation_list.append(X.text)
    else:
      not_mapped.append(annotation)
  date_feature_list.append(' '.join(annotation_list))


In [218]:
dt_list = np.array(date_feature_list)

pass_vals = list(map(dateparser.parse,dt_list))
matched = sum(list(map(lambda x: 1 if x == None else 0,pass_vals)))
print('Pass 1: {} of {} not matched ({:.2%})'.format(*(matched,len(pass_vals),matched/len(pass_vals))))

Pass 1: 54 of 470 not matched (11.49%)


In [219]:
dt_list = (list(map(lambda x: x.replace('day',''),dt_list)))

pass_vals = list(map(dateparser.parse,dt_list))
matched = sum(list(map(lambda x: 1 if x == None else 0,pass_vals)))
print('Pass 2: {} of {} not matched ({:.2%})'.format(*(matched,len(pass_vals),matched/len(pass_vals))))

Pass 2: 54 of 470 not matched (11.49%)


In [220]:
for i in range(len(pass_vals)):
  if pass_vals[i] == None:
    print(dt_list[i])

this 2nd january 2020'
12232019
'1st october 201st9'
'14th  of september 200'
september 200'
19th jan 19th98
29318
'31st  of july 2000 july  2000'

october 11996'

030105
december 14th 1997 december 1997'
32108
4282017

11410
august 9th 19th9th9th'
032406

1272020
1892008
32006
33116

november 19th 19th99'
482020
71811
february 20th 20th20th'
51712
this 15th july 1998'
april 2nd 2nd02nd0'
130705
october 1st 1st999
'january 20th 20th14'





31418
'1st august 201st9'

march 20th 20th20th'
may 8th 2014 may 2014'
march 121999'
march 20th 20th20th'

11402
050598
9282004
92904
'2nd april 2nd02nd0'
9242018


In [189]:
!pip install num2words
from num2words import num2words

Collecting num2words
  Downloading num2words-0.5.10-py3-none-any.whl (101 kB)
[?25l[K     |███▎                            | 10 kB 26.3 MB/s eta 0:00:01[K     |██████▌                         | 20 kB 25.2 MB/s eta 0:00:01[K     |█████████▊                      | 30 kB 18.7 MB/s eta 0:00:01[K     |█████████████                   | 40 kB 16.2 MB/s eta 0:00:01[K     |████████████████▏               | 51 kB 7.9 MB/s eta 0:00:01[K     |███████████████████▍            | 61 kB 7.8 MB/s eta 0:00:01[K     |██████████████████████▋         | 71 kB 8.9 MB/s eta 0:00:01[K     |█████████████████████████▉      | 81 kB 9.4 MB/s eta 0:00:01[K     |█████████████████████████████   | 92 kB 9.7 MB/s eta 0:00:01[K     |████████████████████████████████| 101 kB 5.2 MB/s 
Installing collected packages: num2words
Successfully installed num2words-0.5.10


30th of june 2016


'30th'