# Exploratory Data Analysis

In [None]:
from zipfile import ZipFile
import json
import os
from collections import Counter
import matplotlib.pyplot as plt
import re
from random import sample, choice
import numpy as np
import pandas as pd

## The Data

CUAD: An Expert-Annotated NLP Dataset for Legal Contract Review

https://arxiv.org/abs/2103.06268

This code is an adaptation of the scrape.py file available in the GitHub repository for CUAD.  It has been adapted to run in Jupyter notebooks and allows us to step through the code line by line.

### Download repository and extract data

In [None]:
def extract_zip(pth,data_pth = None):
    """Unpack every member of the zip archive at `pth`.

    Parameters
    ----------
    pth : path to the zip file to read.
    data_pth : directory to extract into; when None the archive is
        extracted into the current working directory.

    Returns
    -------
    None
    """
    archive = ZipFile(pth, 'r')
    try:
        # Write all archive members beneath the target directory
        archive.extractall(path=data_pth)
    finally:
        # Always release the file handle, even if extraction fails
        archive.close()

In [None]:
# Download the CUAD git repository archive and unpack it, unless main.zip
# is already present in the working directory.
# NOTE(review): --no-check-certificate disables TLS certificate verification
# for this download - confirm this is intentional.
# NOTE(review): the unzip step only runs when main.zip was absent, so an
# existing main.zip without an unpacked cuad-main/ folder is not re-extracted.
if not os.path.exists('main.zip'):
  !wget --no-check-certificate https://github.com/TheAtticusProject/cuad/archive/refs/heads/main.zip
  !unzip -q main.zip

# Make sure the target folder for the extracted data exists
if not os.path.exists('cuad-main/data'):
  os.makedirs('cuad-main/data')

# If it has not already been extracted, extract the contents of data.zip
# (the presence of CUADv1.json is used as the "already extracted" marker)
if not os.path.exists('cuad-main/data/CUADv1.json'):
  extract_zip('cuad-main/data.zip','cuad-main/data/')



In [None]:
# Download a manually curated set of labels for the full CUAD data.
# The download is skipped when labels3.txt already exists in the working directory.
if not os.path.exists('labels3.txt'):
  !wget https://raw.githubusercontent.com/RichardMWarburton/ExploringCUAD/Exporatory-Analysis/labels3.txt

### Load CUADv1.json data

In [None]:
# Load the CUADv1 JSON file into the contract_data dictionary.
# json.load parses the whole file as a single JSON document, so the result
# does not depend on how the file happens to be split into lines (parsing
# line by line keeps only the last line and fails on multi-line JSON).
# The encoding is given explicitly so the read does not depend on the
# platform's default text encoding.
with open('cuad-main/data/CUADv1.json', 'r', encoding='utf-8') as infile:
    contract_data = json.load(infile)

#### Examples of data within the data dictionary

In [None]:
# List the top-level keys of the parsed CUAD JSON
top_level_keys = contract_data.keys()
print('The data JSON has the following keys:\n\n', top_level_keys)

The data JSON has the following keys:

 dict_keys(['version', 'data'])


The version key contains a single text reference outlining the version of the data

In [None]:
# Report the version string stored alongside the data
data_version = contract_data['version']
print('Version of data:', data_version)

Version of data: aok_v1.0


The data key contains a subsequent dictionary for each contract in the data set

In [None]:
# Count the records held under the 'data' key
contract_count = len(contract_data['data'])
print('There are {} data points in the \'data\' key inf the contract_data dictionary'.format(contract_count))

There are 510 data points in the 'data' key inf the contract_data dictionary


Each data point consists of a dictionary with two keys

In [None]:
# Pick one contract at random to serve as the running example.
# No seed is set in this cell, so m (and the outputs that depend on it)
# can differ from run to run.
num_contracts = len(contract_data['data'])
m = choice(range(num_contracts))

# Show which keys a single contract record exposes
example_contract = contract_data['data'][m]
print(example_contract.keys())

dict_keys(['title', 'paragraphs'])


Title consists of a single string outlining the title of the data point

In [None]:
# Print the title string of the selected contract
title_label = 'Title of data point {}:'.format(m).upper()
print(title_label, contract_data['data'][m]['title'])

TITLE OF DATA POINT 8: DovaPharmaceuticalsInc_20181108_10-Q_EX-10.2_11414857_EX-10.2_Promotion Agreement


The 'paragraphs' key contains a list with a single item.

In [None]:
# Summarise the container stored under the 'paragraphs' key of the selected contract
example_paragraphs = contract_data['data'][m]['paragraphs']

print('Summary of paragraphs key for data point {}:\n'.format(m))
print('Data type of \'paragraphs\' key:'.upper(), type(example_paragraphs))
print('Length type of \'paragraphs\' key:'.upper(), len(example_paragraphs))

Summary of paragraphs key for data point 8:

DATA TYPE OF 'PARAGRAPHS' KEY: <class 'list'>
LENGTH TYPE OF 'PARAGRAPHS' KEY: 1


For rigour, it can be quickly shown that the paragraph keys across all data points have length 1

In [None]:
# Tally how many contracts have each 'paragraphs' list length
len_count = Counter(
    len(contract_data['data'][i]['paragraphs'])
    for i in range(num_contracts)
)

# Output the tally (maps list length -> number of contracts)
print(len_count)

Counter({1: 510})


This list contains a single data point, which is a dictionary consisting of two keys

In [None]:
# Inspect the single item held in the 'paragraphs' list of the selected contract
print('Data type:'.upper(), type(contract_data['data'][m]['paragraphs'][0]))
# Pass the label and the keys as separate arguments so they are printed as
# text; wrapping them in an extra pair of parentheses prints a tuple repr.
print('Keys in paragraphs:'.upper(), contract_data['data'][m]['paragraphs'][0].keys())

DATA TYPE: <class 'dict'>
('KEYS IN PARAGRAPHS:', dict_keys(['qas', 'context']))


The 'context' key appears to hold the full text data for the contract, parsed into a clean format

In [None]:
# Preview the opening 2500 characters of the selected contract's text
example_paragraph = contract_data['data'][m]['paragraphs'][0]
example_paragraph['context'][:2500]

'Exhibit 10.2\n\n______________________________________________________________________________\n\nCO-PROMOTION AGREEMENT\n\nby and between\n\nDOVA PHARMACEUTICALS, INC.\n\nand\n\nVALEANT PHARMACEUTICALS NORTH AMERICA LLC\n\nSeptember 26, 2018\n\n______________________________________________________________________________\n\nCONFIDENTIAL TREATMENT HAS BEEN REQUESTED FOR PORTIONS OF THIS EXHIBIT. THE COPY FILED HEREWITH OMITS THE INFORMATION SUBJECT TO A CONFIDENTIALITY REQUEST. OMISSIONS ARE DESIGNATED [***]. A COMPLETE VERSION OF THIS EXHIBIT HAS BEEN FILED SEPARATELY WITH THE SECURITIES AND EXCHANGE COMMISSION.\n\nSource: DOVA PHARMACEUTICALS INC., 10-Q, 11/8/2018\n\n\n\n\n\nTABLE OF CONTENTS\n\nPage\n\nARTICLE 1 DEFINITIONS 1\n\nARTICLE 2 RIGHTS AND OBLIGATIONS 8\n\n2.1 Engagement; Grant of Rights. 8\n\n2.2 Retention of Rights. 9\n\n2.3 Non-Competition; Non-Solicitation. 9\n\n2.4 Dova Trademarks and Copyrights. 10\n\nARTICLE 3 JOINT STEERING COMMITTEE 11\n\n3.1 Formation of the JS

In [None]:
# Report the Python type stored under each key of the paragraph dictionary
for key, value in contract_data['data'][m]['paragraphs'][0].items():
    print('Data type of {}:'.format(key).upper(), type(value))

DATA TYPE OF QAS: <class 'list'>
DATA TYPE OF CONTEXT: <class 'str'>


In [None]:
# Count the entries in the 'qas' list of the selected contract
qas_count = len(contract_data['data'][m]['paragraphs'][0]['qas'])
print('There are {} items in the \'qas\' list in the data dictionary'.format(qas_count))

There are 41 items in the 'qas' list in the data dictionary


It can be easily shown that each data point has 41 items in the 'qas' key. 

In [None]:
# Tally how many contracts have each 'qas' list length
item_count = Counter(
    len(contract_data['data'][i]['paragraphs'][0]['qas'])
    for i in range(num_contracts)
)

# Output the tally (maps 'qas' length -> number of contracts)
print(item_count)

Counter({41: 510})


This is the total number of labels to be expected.  As we are not expecting to find all of the labels in each of the contracts, there will be an additional data point indicating if the label is found

In [None]:
# Number of 'qas' entries (clause questions) per contract
num_clauses = 41
# Fixed example clause index; use choice(range(num_clauses)) for a random one
n = 1

print('An example of clause {} in contract data point {}:\n'.format(n, m))

# Print every field of the chosen question/answer record
for key, value in contract_data['data'][m]['paragraphs'][0]['qas'][n].items():
    print(key.upper() + ':', value)

An example of clause 1 in contract data point 8:

ANSWERS: [{'text': 'Valeant', 'answer_start': 1131}, {'text': 'Dova Pharmaceuticals, Inc.', 'answer_start': 4972}, {'text': 'Dova', 'answer_start': 857}, {'text': 'Dova and Valeant are each referred to individually as a "Party" and together as the "Parties".', 'answer_start': 5130}, {'text': 'Valeant Pharmaceuticals North America LLC', 'answer_start': 5037}]
ID: DovaPharmaceuticalsInc_20181108_10-Q_EX-10.2_11414857_EX-10.2_Promotion Agreement__Parties
QUESTION: Highlight the parts (if any) of this contract related to "Parties" that should be reviewed by a lawyer. Details: The two or more parties who signed the contract
IS_IMPOSSIBLE: False


We see that the 'is_impossible' field provides a True/False view of whether the clause is absent: it is False when the clause was found and True when it was not.  This will allow us to reconcile the number of clauses in the data vs. the published paper

In [None]:
# Tally the 'is_impossible' flag once per (contract, clause) pair
clause_counter = Counter(
    contract_data['data'][i]['paragraphs'][0]['qas'][j]['is_impossible']
    for i in range(num_contracts)
    for j in range(num_clauses)
)

print(clause_counter)

Counter({True: 14208, False: 6702})


This count falls short of the 13,101 expected in the paper.  Investigating the 'answers' key we can see that it is possible for there to be multiple answers to each question (i.e. multiple labels per clause per contract)

In [None]:
# Display the list of answer annotations recorded for the example clause
example_qa = contract_data['data'][m]['paragraphs'][0]['qas'][n]
example_qa['answers']

[{'answer_start': 1131, 'text': 'Valeant'},
 {'answer_start': 4972, 'text': 'Dova Pharmaceuticals, Inc.'},
 {'answer_start': 857, 'text': 'Dova'},
 {'answer_start': 5130,
  'text': 'Dova and Valeant are each referred to individually as a "Party" and together as the "Parties".'},
 {'answer_start': 5037, 'text': 'Valeant Pharmaceuticals North America LLC'}]

Allowing for this in the count code, we get:

In [None]:
# Tally the 'is_impossible' flag once per individual answer, so a clause
# with several annotations contributes several counts and a clause with
# an empty 'answers' list contributes none
clause_counter = Counter(
    contract_data['data'][i]['paragraphs'][0]['qas'][j]['is_impossible']
    for i in range(num_contracts)
    for j in range(num_clauses)
    for _ in contract_data['data'][i]['paragraphs'][0]['qas'][j]['answers']
)

print(clause_counter)

Counter({False: 13823})


This total is closer to the expected total of 13,101 from the paper.  We will proceed with the total from this analysis; however, caution will be taken as there may be underlying issues behind the discrepancy that need addressing

**NOTE**: as the 'answers' list is empty when no clauses are found, the inner loop over k never runs for those questions, so the above will not count the impossible labels

In [None]:
print('An example of clause {} in contract data point {}:\n'.format(n, m))

# Print every field of the chosen question/answer record
for key, value in contract_data['data'][m]['paragraphs'][0]['qas'][n].items():
    print(key.upper() + ':', value)

An example of clause 1 in contract data point 8:

ANSWERS: [{'text': 'Valeant', 'answer_start': 1131}, {'text': 'Dova Pharmaceuticals, Inc.', 'answer_start': 4972}, {'text': 'Dova', 'answer_start': 857}, {'text': 'Dova and Valeant are each referred to individually as a "Party" and together as the "Parties".', 'answer_start': 5130}, {'text': 'Valeant Pharmaceuticals North America LLC', 'answer_start': 5037}]
ID: DovaPharmaceuticalsInc_20181108_10-Q_EX-10.2_11414857_EX-10.2_Promotion Agreement__Parties
QUESTION: Highlight the parts (if any) of this contract related to "Parties" that should be reviewed by a lawyer. Details: The two or more parties who signed the contract
IS_IMPOSSIBLE: False


The 'question' key provides a view of the question posed to the lawyer for annotation

In [None]:
# Show the question text attached to the example clause
example_qa = contract_data['data'][m]['paragraphs'][0]['qas'][n]
print(example_qa['question'])

Highlight the parts (if any) of this contract related to "Parties" that should be reviewed by a lawyer. Details: The two or more parties who signed the contract


The 'id' key provides a concatenation of the contract title and the clause in question

In [None]:
# Show the identifier attached to the example clause
example_qa = contract_data['data'][m]['paragraphs'][0]['qas'][n]
print(example_qa['id'])

DovaPharmaceuticalsInc_20181108_10-Q_EX-10.2_11414857_EX-10.2_Promotion Agreement__Parties


This can easily be split by noting that a double underscore '__' separates the values

In [None]:
# Split the identifier on the double underscore into its component parts
example_id = contract_data['data'][m]['paragraphs'][0]['qas'][n]['id']
print(example_id.split('__'))

['DovaPharmaceuticalsInc_20181108_10-Q_EX-10.2_11414857_EX-10.2_Promotion Agreement', 'Parties']


The answers key consists of a list of dictionaries.  If a clause is found in the contract the respective annotation will be added to this list.  If multiple annotations are made, the list will have a length > 1

**NOTE:** If no clauses are found, this will have length = 0

In [None]:
# Report the container type and number of entries under the 'answers' key
example_answers = contract_data['data'][m]['paragraphs'][0]['qas'][n]['answers']
print('Data Type:'.upper(), type(example_answers))
print('Data length:'.upper(), len(example_answers))

DATA TYPE: <class 'list'>
DATA LENGTH: 5


Each entry in this list (if one exists) consists of a dictionary with two keys

In [None]:
# Inspect the first answer entry: its type and the keys it carries
first_answer = contract_data['data'][m]['paragraphs'][0]['qas'][n]['answers'][0]
print(type(first_answer))
print(first_answer.keys())

<class 'dict'>
dict_keys(['text', 'answer_start'])


In [None]:
# Print each field of the first answer entry as "KEY: value"
for key, value in contract_data['data'][m]['paragraphs'][0]['qas'][n]['answers'][0].items():
    print(key.upper() + ':', value)

TEXT: Valeant
ANSWER_START: 1131


The 'text' key provides the annotated text extracted from the contract and the 'answer_start' key, the character start point in the 'context' data outlined above.  

The code below demonstrates that the text field above can be parsed from the context data using the answer_start data.  

In [None]:
# First annotation recorded for the example clause
first_answer = contract_data['data'][m]['paragraphs'][0]['qas'][n]['answers'][0]

# The annotation starts at 'answer_start' and runs for len('text') characters
extract_start = first_answer['answer_start']
extract_len = len(first_answer['text'])
extract_end = extract_start + extract_len

# Full context string of the same contract
contract_context = contract_data['data'][m]['paragraphs'][0]['context']

# Slice the annotated text back out of the context (shown as the cell output)
contract_context[extract_start:extract_end]

'Valeant'