<a href="https://colab.research.google.com/github/RichardMWarburton/ExploringCUAD/blob/Dev/Date%20Parsing.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Individual Clause Investigation (Agreement Date)




## The Data

CUAD: An Expert-Annotated NLP Dataset for Legal Contract Review

https://arxiv.org/abs/2103.06268

This code is an adaptation of the scrape.py file avaliable on the github repository for CUAD.  It has been adapted to run in Jypter notebooks and allow us to step throght the coding line by line.`

## 1: Import Packages & Define Useful Functions

In [35]:
from zipfile import ZipFile
import json
import os
from collections import Counter, defaultdict
import matplotlib.pyplot as plt
import re
from random import sample, choice
import numpy as np
import pandas as pd
import re
import string
from sklearn.manifold import TSNE
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import DBSCAN, KMeans
from sklearn.cluster import AgglomerativeClustering
from pprint import pprint

import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')

!pip install dateparser
import dateparser

!pip install num2words
from num2words import num2words

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [None]:
def extract_zip(pth,data_pth = None):
    """Function to extract contents of a zip file to a specified location (wd if data_pth not passed)"""
    with ZipFile(pth, 'r') as zipObj:
       # Extract all the contents of zip file in different directory
       zipObj.extractall(data_pth)

## 2: Download repository and extract data

In [3]:
#Download CUAD git repository
if not os.path.exists('main.zip'):
  !wget --no-check-certificate https://github.com/TheAtticusProject/cuad/archive/refs/heads/main.zip
  !unzip -q main.zip

#If it has not already been extracted, extract the contents of data.zip
if not os.path.exists('cuad-main/data'):
  os.makedirs('cuad-main/data')

if not os.path.exists('cuad-main/data/CUADv1.json'):
  extract_zip('cuad-main/data.zip','cuad-main/data/')

#Download a manualy curated set of labels for the full CUAD data. 
if not os.path.exists('labels3.txt'):
  !wget https://raw.githubusercontent.com/RichardMWarburton/ExploringCUAD/main/labels3.txt

--2021-07-20 11:27:46--  https://github.com/TheAtticusProject/cuad/archive/refs/heads/main.zip
Resolving github.com (github.com)... 140.82.114.3
Connecting to github.com (github.com)|140.82.114.3|:443... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://codeload.github.com/TheAtticusProject/cuad/zip/refs/heads/main [following]
--2021-07-20 11:27:46--  https://codeload.github.com/TheAtticusProject/cuad/zip/refs/heads/main
Resolving codeload.github.com (codeload.github.com)... 140.82.113.10
Connecting to codeload.github.com (codeload.github.com)|140.82.113.10|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: unspecified [application/zip]
Saving to: ‘main.zip’

main.zip                [   <=>              ]  17.77M  19.8MB/s    in 0.9s    

2021-07-20 11:27:48 (19.8 MB/s) - ‘main.zip’ saved [18631176]

--2021-07-20 11:27:48--  https://raw.githubusercontent.com/RichardMWarburton/ExploringCUAD/main/labels3.txt
Resolving raw.githubusercont

In [4]:
#Load CUADv1 JSON to data
with open('cuad-main/data/CUADv1.json','r') as infile:
    for line in infile:
        contract_data = json.loads(line)

### 2.1: Read in Label Data & Generate Look Up Dictionary

In [5]:
#Initate storage for labels look up (LU)
labels_LU = {}

#Read in labels data
with open('labels3.txt','r',encoding ='UTF-8') as infile:
  for line in infile:
    #Remove trailing special characters and split on tab
    data = line.strip().split(sep='\t')
    #Add name and label to labels_LU dictionary
    labels_LU[data[0]] = data[1]

The look up returns one error, most likely due to the accented E and a disparity of encoding.  This will be forced to 'Marketing Agreement' manually for now (EITHER SORT OR PROVIDE EXAMPLE)

### 2.2: Extract Raw Contract Data

In [6]:
#ser reg ex expression for characters to remove from contract contest
spec_chars = '\\n|\\t|\\t'

#Set number of contracts in data
num_contracts = len(contract_data['data'])

#Initate dictionary to store raw contract data
raw_contracts = defaultdict(list)

#for each contract
for i in range(num_contracts):
  #Append the title, contract text and character length of text to the raw_contracts dictionary
  raw_contracts['contract title'].append(contract_data['data'][i]['title'])
  raw_contracts['label'].append(labels_LU[contract_data['data'][i]['title']] if contract_data['data'][i]['title'] in labels_LU else 'marketing agreement' ) #<- manual error trap applied here (see below)
  
  #Parse raw text and process to remove breaks
  raw_text = contract_data['data'][i]['paragraphs'][0]['context']
  clean_text = re.sub(spec_chars,'',raw_text)

  #Split clean text in to sentances and tokens
  sentance_text = clean_text.split(sep = '. ')
  token_text = clean_text.split(sep = ' ')

  #Append text to the respective key in the raw_contracts dictionary
  raw_contracts['raw text'].append(raw_text)
  raw_contracts['clean text'].append(clean_text)
  raw_contracts['sentance text'].append(sentance_text)
  raw_contracts['token text'].append(token_text)
  
  #Add character, sentance and token counts to raw_contracts dictionary
  raw_contracts['character count'].append(len(raw_text))
  raw_contracts['sentance count'].append(len(sentance_text))
  raw_contracts['token count'].append(len(token_text))


### 2.3: Extract Clause Specific Data

In [7]:
#Define the number of clauses
num_clauses = 41

#initate dictioanry to store caluse data
clause_data = defaultdict(list)

#For each contract
for i in range(num_contracts):
  #for each clause
  for j in range(num_clauses):
    #for each found clause annotation
    for k in range(len(contract_data['data'][i]['paragraphs'][0]['qas'][j]['answers'])): 
      #Add the contract title
      clause_data['contract title'].append(contract_data['data'][i]['title'])
      clause_data['label'].append(labels_LU[contract_data['data'][i]['title']] if contract_data['data'][i]['title'] in labels_LU else 'marketing agreement' )  #<- manual error trap applied here
      clause_data['clause'].append(contract_data['data'][i]['paragraphs'][0]['qas'][j]['id'].split(sep='__')[1])
      clause_data['annotation'].append(contract_data['data'][i]['paragraphs'][0]['qas'][j]['answers'][k]['text'])
      clause_data['annotation start'].append(contract_data['data'][i]['paragraphs'][0]['qas'][j]['answers'][k]['answer_start'])
      clause_data['annotation length'].append(len(contract_data['data'][i]['paragraphs'][0]['qas'][j]['answers'][k]['text']))


## 3: Cleaning data and extracting a single clause

In [8]:
#Initate dataframe of all clause data
clause_df = pd.DataFrame(clause_data)

#Convert to lower case
clause_df['annotation'] = clause_df['annotation'].apply(lambda x: x.lower())

#Remove any formating characters or multiple spaces and replace with a single space
clause_df['annotation'] = clause_df['annotation'].apply(lambda x: re.sub('\\t|\\r|\\n|[^\S]{2,}',' ',x))

#Remove punctuation from the string
clause_df['annotation'] = clause_df['annotation'].apply(lambda x: x.translate(str.maketrans('', '', string.punctuation)))

In [9]:
#Define clause of interest
clause_of_interest = 'Agreement Date'

#Limit df to clause of interest and extract annotations of itnerest
of_interest_data = clause_df[clause_df['clause'] == clause_of_interest]
annotations_of_interest = of_interest_data['annotation'].values

#Identify where there are multiple annotations per contract
titles,counts = np.unique(of_interest_data['contract title'],return_counts =True)
dups = titles[counts >= 2]

#Output Analysis
print('There are {} contracts with \'{}\' annotations'.format(*(titles.shape[0],clause_of_interest)))
print('There are {} contracts with more than one annotation'.format(dups.shape[0]))

There are 470 contracts with 'Agreement Date' annotations
There are 6 contracts with more than one annotation


In [10]:
#Initate memory for annotations within contracts
combined_annotations_list = defaultdict(list)
combined_annotations_string = {}

#For each annotation of interest found in the contract, 
#append annotation to a default dict list with contract as key
for i in of_interest_data.index:
  name = of_interest_data.loc[i,['contract title']].values[0]
  annotation = of_interest_data.loc[i,['annotation']].values[0]
  combined_annotations_list[name].append(annotation)

#Produce a singel string of all annotations found in specific contracts
for key in combined_annotations_list.keys():
  combined_annotations_string[key] = ' '.join(combined_annotations_list[key])

In [11]:
#Build array of contract names and concatenated annotations
contracts = np.array(list(combined_annotations_string.keys()))
combined_annotations = np.array(list(combined_annotations_string.values()))

## Section X: Parse Date Field

In [12]:
txt = combined_annotations[0]

In [13]:
import spacy
from spacy import displacy
from collections import Counter
import en_core_web_sm

In [37]:
#Define stopwords list
stop_words = stopwords.words('english')

In [15]:
def append_type_match(x, value_list,matches=[]):
  if x.label_ in matches:
    value_list.append(x)

In [16]:
def ordinalize_num(txt):
  for token in txt.split(sep=' '):
    try:
      if int(token) <= 31:
        ord_val = num2words(token, lang="en", to="ordinal_num")
        txt = txt.replace(token,ord_val)
    except:
      continue
  return txt

In [81]:
date_feature_list = []
not_mapped = []

nlp = en_core_web_sm.load()

for annotation in combined_annotations:

  annotation_list = []  
  #Clean annotaiton (typos etc)
  annotation = annotation.replace('  ',' ')
  annotation = ' '.join([word for word in annotation.split(sep=' ') if word not in stop_words])
  annotation = annotation.replace('t h','th')
  annotation = annotation.replace('s t','st')
  annotation = annotation.replace('day','')
  annotation = ordinalize_num(annotation)

  doc = nlp(repr(annotation))

  for X in doc.ents:
    if X.label_ in ['DATE']:
      annotation_list.append(X.text)
    else:
      annotation_list.append(X.text)
      not_mapped.append(annotation)

  date_feature_list.append(' '.join(annotation_list))


In [82]:
pass_vals = list(map(dateparser.parse,date_feature_list))
matched = sum(list(map(lambda x: 1 if x == None else 0,pass_vals)))
print('Pass 1: {} of {} not matched ({:.2%})'.format(*(matched,len(pass_vals),matched/len(pass_vals))))

Pass 1: 70 of 470 not matched (14.89%)


In [93]:
#Remove any years significantly in the future (assumed errors)
for i in range(len(pass_vals)):
  if pass_vals[i] != None:
    if pass_vals[i].year >2025:
      print(pass_vals[i])
      pass_vals[i] = None

In [109]:
bag_of_clauses = {}

for i in range(len(contracts)):
  if pass_vals[i] != None:
    bag_of_clauses[contracts[i]] = {clause_of_interest:pass_vals[i].timestamp()}

In [110]:
bag_of_clauses

{'2ThemartComInc_19990826_10-12G_EX-10.10_6700288_EX-10.10_Co-Branding Agreement_ Agency Agreement': {'Agreement Date': 929923200.0},
 'ACCURAYINC_09_01_2010-EX-10.31-DISTRIBUTOR AGREEMENT': {'Agreement Date': 1275955200.0},
 'ADAMSGOLFINC_03_21_2005-EX-10.17-ENDORSEMENT AGREEMENT': {'Agreement Date': 1105574400.0},
 'ADAPTIMMUNETHERAPEUTICSPLC_04_06_2017-EX-10.11-STRATEGIC ALLIANCE AGREEMENT': {'Agreement Date': 1474588800.0},
 'ADIANUTRITION,INC_04_01_2005-EX-10.D2-RESELLER AGREEMENT': {'Agreement Date': 1090281600.0},
 'ADMA BioManufacturing, LLC -  Amendment #3 to Manufacturing Agreement ': {'Agreement Date': 1513900800.0},
 'ADUROBIOTECH,INC_06_02_2020-EX-10.7-CONSULTING AGREEMENT': {'Agreement Date': 1590969600.0},
 'ADUROBIOTECH,INC_06_02_2020-EX-10.7-CONSULTING AGREEMENT(1)': {'Agreement Date': 1590969600.0},
 'AFSALABANCORPINC_08_01_1996-EX-1.1-AGENCY AGREEMENT': {'Agreement Date': 837820800.0},
 'AIRSPANNETWORKSINC_04_11_2000-EX-10.5-Distributor Agreement': {'Agreement Date':

In [108]:
len(list(bag_of_clauses.keys()))

396

array([1992, 1994, 1995, 1996, 1997, 1998, 1999, 2000, 2001, 2002, 2003,
       2004, 2005, 2006, 2007, 2008, 2009, 2010, 2011, 2012, 2013, 2014,
       2015, 2016, 2017, 2018, 2019, 2020, 2021])