# LEGAL CONTRACT CLASSIFICATION:

## STEP 1: Perform Text Extraction for Contract PDF

In [1]:
!pip install PyPDF2



In [2]:
import PyPDF2 as pdf

Below contract being extracted is an Assignment Order Contract

In [3]:
file = open('Contract-AO.pdf', 'rb')

In [4]:
file

<_io.BufferedReader name='Contract-AO.pdf'>

In [5]:
pdf_reader = pdf.PdfFileReader(file)

In [6]:
pdf_reader

<PyPDF2.pdf.PdfFileReader at 0x1c33d2a6088>

In [7]:
pdf_reader.getIsEncrypted()

False

In [8]:
pdf_reader.getNumPages()

1

In [9]:
page1 = pdf_reader.getPage(0)

In [10]:
page1.extractText()

'FAS\n-\nXXX\n-\n001\n \n \nCONFIDENTIAL\n \n \nPage \n1\n \nof \n1\n \n<Date>\n \n \nAssignment Order\n \n \n\nAssignment Order\n\n\nAO\n\nunder the terms and conditions of the \nProfessional Services Agreement \n\nbetween \n<\nCompany \nName>\n \n\nand \nForgeahead Solutions\n, Inc.\n \n\n<da\nte>\n. \nUnder this \nAO\n, \nConsultant\n \nwill \nprovide services as defined below in Work Activities.  The expected period of performance will be \nfrom \n<Start Date>\n \nthrough \n<End Date>\n.\n \nThis Assignment Order forms and shall always be deemed to have formed an integral pa\nrt of the \nAgreement. In the event of any conflict or inconsistency between the provisions of the Agreement \nand this Assignment Order, the provisions of this Assignment Order will prevail.\n \n \nProject Name\n \n<Project Name if any>\n \n \nAO\n# \n \nFor billing and accounting\n \npurposes, this \nAO\n \nwill be referred to as \nFAS\n-\nXXX\n-\n001\n \n \nWork Activities\n \nThe work activity in which \nC

## STEP 2: Convert Extracted Text to Lowercase String for Preprocessing

In [11]:
Page1 = str(page1.extractText())
Page1 = Page1.lower()

In [12]:
Page1

'fas\n-\nxxx\n-\n001\n \n \nconfidential\n \n \npage \n1\n \nof \n1\n \n<date>\n \n \nassignment order\n \n \n\nassignment order\n\n\nao\n\nunder the terms and conditions of the \nprofessional services agreement \n\nbetween \n<\ncompany \nname>\n \n\nand \nforgeahead solutions\n, inc.\n \n\n<da\nte>\n. \nunder this \nao\n, \nconsultant\n \nwill \nprovide services as defined below in work activities.  the expected period of performance will be \nfrom \n<start date>\n \nthrough \n<end date>\n.\n \nthis assignment order forms and shall always be deemed to have formed an integral pa\nrt of the \nagreement. in the event of any conflict or inconsistency between the provisions of the agreement \nand this assignment order, the provisions of this assignment order will prevail.\n \n \nproject name\n \n<project name if any>\n \n \nao\n# \n \nfor billing and accounting\n \npurposes, this \nao\n \nwill be referred to as \nfas\n-\nxxx\n-\n001\n \n \nwork activities\n \nthe work activity in which \nc

## STEP 3: Convert String to List Data Type so that we can iterate over the corpus

In [13]:
list1 = Page1.split()

In [14]:
list1

['fas',
 '-',
 'xxx',
 '-',
 '001',
 'confidential',
 'page',
 '1',
 'of',
 '1',
 '<date>',
 'assignment',
 'order',
 'assignment',
 'order',
 'ao',
 'under',
 'the',
 'terms',
 'and',
 'conditions',
 'of',
 'the',
 'professional',
 'services',
 'agreement',
 'between',
 '<',
 'company',
 'name>',
 'and',
 'forgeahead',
 'solutions',
 ',',
 'inc.',
 '<da',
 'te>',
 '.',
 'under',
 'this',
 'ao',
 ',',
 'consultant',
 'will',
 'provide',
 'services',
 'as',
 'defined',
 'below',
 'in',
 'work',
 'activities.',
 'the',
 'expected',
 'period',
 'of',
 'performance',
 'will',
 'be',
 'from',
 '<start',
 'date>',
 'through',
 '<end',
 'date>',
 '.',
 'this',
 'assignment',
 'order',
 'forms',
 'and',
 'shall',
 'always',
 'be',
 'deemed',
 'to',
 'have',
 'formed',
 'an',
 'integral',
 'pa',
 'rt',
 'of',
 'the',
 'agreement.',
 'in',
 'the',
 'event',
 'of',
 'any',
 'conflict',
 'or',
 'inconsistency',
 'between',
 'the',
 'provisions',
 'of',
 'the',
 'agreement',
 'and',
 'this',
 'assi

## STEP 4: Perform Stemming/ Lemmitization to Convert Strings in List to Root Form

In [15]:
from nltk.stem.snowball import SnowballStemmer
stemmer = SnowballStemmer(language='english')
stemmed_list = [stemmer.stem(token) for token in list1]
print(' '.join(stemmed_list))

fas - xxx - 001 confidenti page 1 of 1 <date> assign order assign order ao under the term and condit of the profession servic agreement between < compani name> and forgeahead solut , inc. <da te> . under this ao , consult will provid servic as defin below in work activities. the expect period of perform will be from <start date> through <end date> . this assign order form and shall alway be deem to have form an integr pa rt of the agreement. in the event of ani conflict or inconsist between the provis of the agreement and this assign order, the provis of this assign order will prevail. project name <project name if any> ao # for bill and account purposes, this ao will be refer to as fas - xxx - 001 work activ the work activ in which compani expect consult to be engag is categor as follows: <natur of servic > compris a team of : 1. <team size> work hour each month will consi t of the actual number of work day in that month and each week to consist of a maximum of 5 work day (monday thro

In [16]:
stemmed_list

['fas',
 '-',
 'xxx',
 '-',
 '001',
 'confidenti',
 'page',
 '1',
 'of',
 '1',
 '<date>',
 'assign',
 'order',
 'assign',
 'order',
 'ao',
 'under',
 'the',
 'term',
 'and',
 'condit',
 'of',
 'the',
 'profession',
 'servic',
 'agreement',
 'between',
 '<',
 'compani',
 'name>',
 'and',
 'forgeahead',
 'solut',
 ',',
 'inc.',
 '<da',
 'te>',
 '.',
 'under',
 'this',
 'ao',
 ',',
 'consult',
 'will',
 'provid',
 'servic',
 'as',
 'defin',
 'below',
 'in',
 'work',
 'activities.',
 'the',
 'expect',
 'period',
 'of',
 'perform',
 'will',
 'be',
 'from',
 '<start',
 'date>',
 'through',
 '<end',
 'date>',
 '.',
 'this',
 'assign',
 'order',
 'form',
 'and',
 'shall',
 'alway',
 'be',
 'deem',
 'to',
 'have',
 'form',
 'an',
 'integr',
 'pa',
 'rt',
 'of',
 'the',
 'agreement.',
 'in',
 'the',
 'event',
 'of',
 'ani',
 'conflict',
 'or',
 'inconsist',
 'between',
 'the',
 'provis',
 'of',
 'the',
 'agreement',
 'and',
 'this',
 'assign',
 'order,',
 'the',
 'provis',
 'of',
 'this',
 'assi

In [17]:
import nltk
nltk.download('wordnet')
from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()
lemmatized_list = [lemmatizer.lemmatize(token) for token in list1]
print(' '.join(lemmatized_list))

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Rohaan\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


fa - xxx - 001 confidential page 1 of 1 <date> assignment order assignment order ao under the term and condition of the professional service agreement between < company name> and forgeahead solution , inc. <da te> . under this ao , consultant will provide service a defined below in work activities. the expected period of performance will be from <start date> through <end date> . this assignment order form and shall always be deemed to have formed an integral pa rt of the agreement. in the event of any conflict or inconsistency between the provision of the agreement and this assignment order, the provision of this assignment order will prevail. project name <project name if any> ao # for billing and accounting purposes, this ao will be referred to a fa - xxx - 001 work activity the work activity in which company expects consultant to be engaged is categorized a follows: <nature of service > comprising a team of : 1. <team size> work hour each month will consis t of the actual number of 

In [18]:
lemmatized_list

['fa',
 '-',
 'xxx',
 '-',
 '001',
 'confidential',
 'page',
 '1',
 'of',
 '1',
 '<date>',
 'assignment',
 'order',
 'assignment',
 'order',
 'ao',
 'under',
 'the',
 'term',
 'and',
 'condition',
 'of',
 'the',
 'professional',
 'service',
 'agreement',
 'between',
 '<',
 'company',
 'name>',
 'and',
 'forgeahead',
 'solution',
 ',',
 'inc.',
 '<da',
 'te>',
 '.',
 'under',
 'this',
 'ao',
 ',',
 'consultant',
 'will',
 'provide',
 'service',
 'a',
 'defined',
 'below',
 'in',
 'work',
 'activities.',
 'the',
 'expected',
 'period',
 'of',
 'performance',
 'will',
 'be',
 'from',
 '<start',
 'date>',
 'through',
 '<end',
 'date>',
 '.',
 'this',
 'assignment',
 'order',
 'form',
 'and',
 'shall',
 'always',
 'be',
 'deemed',
 'to',
 'have',
 'formed',
 'an',
 'integral',
 'pa',
 'rt',
 'of',
 'the',
 'agreement.',
 'in',
 'the',
 'event',
 'of',
 'any',
 'conflict',
 'or',
 'inconsistency',
 'between',
 'the',
 'provision',
 'of',
 'the',
 'agreement',
 'and',
 'this',
 'assignment',


## STEP 5: Remove stopwords from corpus

In [19]:
nltk.download('stopwords')
from nltk.corpus import stopwords
stop = set(stopwords.words('english'))
", ".join(stop)

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Rohaan\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


"what, against, no, wasn't, mustn, shan, d, hers, my, being, did, weren, myself, was, am, wouldn't, doing, ve, ours, shouldn, s, but, ourselves, into, needn, me, isn't, mightn, wouldn, just, herself, doesn, her, had, an, now, ain, been, all, those, will, o, only, didn, for, them, while, hadn't, you'd, is, on, won't, that'll, at, by, after, weren't, m, haven't, down, once, during, can, until, you, in, more, this, be, or, than, t, some, couldn't, very, they, you've, then, own, were, you'll, are, to, haven, such, the, your, between, re, which, under, theirs, ll, when, shan't, from, she's, both, not, other, won, their, as, because, we, how, has, isn, nor, where, any, themselves, who, ma, aren, it, our, y, same, over, himself, mightn't, do, a, couldn, hasn't, don, it's, does, itself, whom, and, his, before, that, wasn, too, have, should've, so, these, up, with, aren't, out, she, again, why, few, mustn't, shouldn't, yourselves, most, he, yourself, further, should, its, yours, of, i, if, didn

In [20]:
preprocessedAO = [token for token in lemmatized_list if token not in stop]
" ".join(preprocessedAO)

'fa - xxx - 001 confidential page 1 1 <date> assignment order assignment order ao term condition professional service agreement < company name> forgeahead solution , inc. <da te> . ao , consultant provide service defined work activities. expected period performance <start date> <end date> . assignment order form shall always deemed formed integral pa rt agreement. event conflict inconsistency provision agreement assignment order, provision assignment order prevail. project name <project name any> ao # billing accounting purposes, ao referred fa - xxx - 001 work activity work activity company expects consultant engaged categorized follows: <nature service > comprising team : 1. <team size> work hour month consis actual number working day month week consist maximum 5 working day (monday friday) maximum 45 hours. company shall grant consultant f company declared holiday 21 day personal vacation calendar year would deduction payment / invoice. overtime separately identified charge pro - ra

In [21]:
preprocessedAO

['fa',
 '-',
 'xxx',
 '-',
 '001',
 'confidential',
 'page',
 '1',
 '1',
 '<date>',
 'assignment',
 'order',
 'assignment',
 'order',
 'ao',
 'term',
 'condition',
 'professional',
 'service',
 'agreement',
 '<',
 'company',
 'name>',
 'forgeahead',
 'solution',
 ',',
 'inc.',
 '<da',
 'te>',
 '.',
 'ao',
 ',',
 'consultant',
 'provide',
 'service',
 'defined',
 'work',
 'activities.',
 'expected',
 'period',
 'performance',
 '<start',
 'date>',
 '<end',
 'date>',
 '.',
 'assignment',
 'order',
 'form',
 'shall',
 'always',
 'deemed',
 'formed',
 'integral',
 'pa',
 'rt',
 'agreement.',
 'event',
 'conflict',
 'inconsistency',
 'provision',
 'agreement',
 'assignment',
 'order,',
 'provision',
 'assignment',
 'order',
 'prevail.',
 'project',
 'name',
 '<project',
 'name',
 'any>',
 'ao',
 '#',
 'billing',
 'accounting',
 'purposes,',
 'ao',
 'referred',
 'fa',
 '-',
 'xxx',
 '-',
 '001',
 'work',
 'activity',
 'work',
 'activity',
 'company',
 'expects',
 'consultant',
 'engaged',
 'c

## STEP 6: Do the similar process for other contract types in order to build your Database

Below contract being extracted is a Non Disclosure Agreement Contract

In [22]:
file = open('Contract-NDA.pdf', 'rb')
pdf_reader = pdf.PdfFileReader(file)
pdf_reader.getNumPages()
page1 = pdf_reader.getPage(0)
Page1 = str(page1.extractText())
Page1 = Page1.lower()
list1 = Page1.split()
from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()
lemmatized_list = [lemmatizer.lemmatize(token) for token in list1]
preprocessedNDA = [token for token in lemmatized_list if token not in stop]
" ".join(preprocessedNDA)

'confidentiality agreement effective date: agreement made forgeahead solution pvt . ltd . , suite 218, b1 - cerebrum park, kalyani nagar, pune 411014 (india) ffiliates forgeahead ( whereas , forgeahead company mutually desire engage discussion may lead business relationship involving companies; whereas , party course dealing may furnish "confidential information" defined paragraph 1 wish convey interest copyright therein other, make confidential information public common knowledge; disclosed third party, permit use reof except engage discussion now, therefore , consideration business discussions, disclosure confidential information future business relationship parties, hereby agreed follows: 1. confidential inform ation. purpose agreement, term "confidential information" shall mean following: information, business plan, concept, idea, know - how, process, technique, program, design, formula, algorithm work - - process, engineering, manu facturing, marketing, technical, financial data, 

In [23]:
preprocessedNDA

['confidentiality',
 'agreement',
 'effective',
 'date:',
 'agreement',
 'made',
 'forgeahead',
 'solution',
 'pvt',
 '.',
 'ltd',
 '.',
 ',',
 'suite',
 '218,',
 'b1',
 '-',
 'cerebrum',
 'park,',
 'kalyani',
 'nagar,',
 'pune',
 '411014',
 '(india)',
 'ffiliates',
 'forgeahead',
 '(',
 'whereas',
 ',',
 'forgeahead',
 'company',
 'mutually',
 'desire',
 'engage',
 'discussion',
 'may',
 'lead',
 'business',
 'relationship',
 'involving',
 'companies;',
 'whereas',
 ',',
 'party',
 'course',
 'dealing',
 'may',
 'furnish',
 '"confidential',
 'information"',
 'defined',
 'paragraph',
 '1',
 'wish',
 'convey',
 'interest',
 'copyright',
 'therein',
 'other,',
 'make',
 'confidential',
 'information',
 'public',
 'common',
 'knowledge;',
 'disclosed',
 'third',
 'party,',
 'permit',
 'use',
 'reof',
 'except',
 'engage',
 'discussion',
 'now,',
 'therefore',
 ',',
 'consideration',
 'business',
 'discussions,',
 'disclosure',
 'confidential',
 'information',
 'future',
 'business',
 'rel

Below contract being extracted is a Work Schedule Contract

In [24]:
file = open('Contract-WS.pdf', 'rb')
pdf_reader = pdf.PdfFileReader(file)
pdf_reader.getNumPages()
page1 = pdf_reader.getPage(0)
Page1 = str(page1.extractText())
Page1 = Page1.lower()
list1 = Page1.split()
from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()
lemmatized_list = [lemmatizer.lemmatize(token) for token in list1]
preprocessedWS = [token for token in lemmatized_list if token not in stop]
" ".join(preprocessedWS)

'fa - xxx - 001 confidential page 1 1 <date> work schedule letter serf work schedule sow term condition master service agreement <client name> client forgea head solution , inc. provider <date>. sow , provider provide service defined work activities. expected period performance <start date> <end date> . work schedule form shall always deemed formed integral part agreem ent. event conflict inconsistency provision agreement work schedule , provision work schedule prevail. project name <project name any> sow # billing accounting purposes, sow w ill referred fa - xxx - 001 work activity work activity client expects provider engaged categorized follows: <nature service > comprising team : 1. <team detail > project fee provider accrue earnings rat e <rate figure> (us dollar <in words> only) invoicing payment project fee & expense provider may submit invoice project fee end calendar month. provider may submit invoice reimbursement expense immediately n meeting expenses. payment made per term 

In [25]:
preprocessedWS

['fa',
 '-',
 'xxx',
 '-',
 '001',
 'confidential',
 'page',
 '1',
 '1',
 '<date>',
 'work',
 'schedule',
 'letter',
 'serf',
 'work',
 'schedule',
 'sow',
 'term',
 'condition',
 'master',
 'service',
 'agreement',
 '<client',
 'name>',
 'client',
 'forgea',
 'head',
 'solution',
 ',',
 'inc.',
 'provider',
 '<date>.',
 'sow',
 ',',
 'provider',
 'provide',
 'service',
 'defined',
 'work',
 'activities.',
 'expected',
 'period',
 'performance',
 '<start',
 'date>',
 '<end',
 'date>',
 '.',
 'work',
 'schedule',
 'form',
 'shall',
 'always',
 'deemed',
 'formed',
 'integral',
 'part',
 'agreem',
 'ent.',
 'event',
 'conflict',
 'inconsistency',
 'provision',
 'agreement',
 'work',
 'schedule',
 ',',
 'provision',
 'work',
 'schedule',
 'prevail.',
 'project',
 'name',
 '<project',
 'name',
 'any>',
 'sow',
 '#',
 'billing',
 'accounting',
 'purposes,',
 'sow',
 'w',
 'ill',
 'referred',
 'fa',
 '-',
 'xxx',
 '-',
 '001',
 'work',
 'activity',
 'work',
 'activity',
 'client',
 'expects'

In [28]:
file = open('Contract-MSA.pdf', 'rb')
pdf_reader = pdf.PdfFileReader(file)
count = pdf_reader.numPages
for i in range(count):
    page = pdf_reader.getPage(i)
    output = page.extractText()
output

'Page \n14\n \nof \n14\n \n14.7\n \nCounterparts. \nThis Agreement may be executed in several counterparts, each of which will be \ndeemed an orig\ninal, and all of which taken together will constitute one single Agreement between the \nParties with the same effect as if all the signatures were upon the same instrument.\n \n14.8\n \nEntire Agreement\n. This Agreement and all appendices and other exhibits attached\n \nhereto \nconstitute the complete and exclusive statement of the agreement between the Parties and supersedes all \nproposals, oral or written, and all other prior or contemporaneous communications and agreement \nbetween the Parties relating to the subject matt\ner herein. \n \n14.9\n \nForce Majeure.\n  \nIn the event that either Party is prevented from performing or is unable to \nperform any of its obligations under this Agreement (other than a payment obligation) due to any act of \nfire, casualty, flood, earthquake, war, epi\ndemic, destruction of production facilitie