In [1]:
import fitz 
import joblib  
from sklearn.feature_extraction.text import TfidfVectorizer  
import nltk  
from nltk.corpus import stopwords
import spacy 
import re  
from sklearn.metrics.pairwise import cosine_similarity
import difflib

In [2]:
# Load the trained model and the TF-IDF vectorizer from disk.
# Both files are expected in the notebook's working directory (relative paths).
# NOTE(review): joblib.load unpickles the file contents, which can execute
# arbitrary code -- only load artifacts from a trusted source.
model = joblib.load('ensemble_model.pkl')
vectorizer = joblib.load('ensemble_model_tfidf_vectorizer.pkl')

In [3]:
# Load spaCy model for English; `nlp` is used by clean_text() for lemmatization.
# The "en_core_web_sm" package must already be installed in the kernel's environment.
nlp = spacy.load("en_core_web_sm")

In [4]:
# Download stopwords from nltk if not already downloaded
# (the recorded output shows nltk reporting the package as already up-to-date).
nltk.download('stopwords')
# Kept as a set so clean_text() can test membership per word cheaply.
stop_words = set(stopwords.words('english'))

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\sukan\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [5]:
# Function to extract text from a PDF file
def extract_text_from_pdf(pdf_path):
    """Return the plain text of every page of a PDF, concatenated in page order.

    Args:
        pdf_path: Path of the PDF file to open with PyMuPDF (``fitz``).

    Returns:
        A single string made of each page's ``get_text("text")`` output,
        joined without any extra separator.
    """
    doc = fitz.open(pdf_path)
    try:
        # Collect per-page text and join once instead of growing a string
        # with repeated ``+=``.
        page_texts = [
            doc.load_page(page_num).get_text("text")
            for page_num in range(len(doc))
        ]
    finally:
        # Always release the underlying file handle, even if a page fails
        # to load; the original version never closed the document.
        doc.close()
    return "".join(page_texts)

In [19]:
# Function to split text into paragraphs
def split_text_into_paragraphs(text):
    """Split text into paragraphs on blank (whitespace-only) lines.

    A paragraph boundary is a newline, optional whitespace, and another
    newline. This covers the ``"\\n \\n"`` separator seen in the extracted
    PDF text as well as ``"\\n\\n"``, ``"\\r\\n\\r\\n"`` and blank lines
    containing tabs or several spaces, which the previous exact-match
    pattern missed.

    Args:
        text: The full document text.

    Returns:
        A list of paragraphs with surrounding whitespace stripped; chunks
        that are empty after stripping are dropped. Line breaks inside a
        paragraph are preserved.
    """
    chunks = re.split(r'\n\s*\n', text)
    return [chunk.strip() for chunk in chunks if chunk.strip()]

In [8]:
# Function to clean and preprocess text
def clean_text(text):
    """Normalize raw text before it is vectorized.

    Steps, in order: lowercase, drop every character that is neither a word
    character nor whitespace, collapse whitespace runs to single spaces,
    remove words found in the module-level ``stop_words`` set, then replace
    each remaining token with its spaCy lemma (via the module-level ``nlp``).

    Args:
        text: The raw text to clean.

    Returns:
        The lemmas joined by single spaces.
    """
    lowered = text.lower()
    without_punctuation = re.sub(r'[^\w\s]', '', lowered)
    single_spaced = re.sub(r'\s+', ' ', without_punctuation)
    kept_words = [
        candidate
        for candidate in single_spaced.split()
        if candidate not in stop_words
    ]
    # Lemmatize the stopword-free text with spaCy.
    parsed = nlp(" ".join(kept_words))
    return " ".join(token.lemma_ for token in parsed)

In [9]:
# Function to annotate paragraphs with predicted clause types
def annotate_paragraphs(paragraphs):
    """Pair each paragraph with the clause type predicted for it.

    Each paragraph is cleaned with ``clean_text``, vectorized with the
    module-level ``vectorizer`` and classified with the module-level
    ``model``, one paragraph at a time.

    Args:
        paragraphs: Iterable of paragraph strings.

    Returns:
        A list of ``(original_paragraph, label)`` tuples in input order,
        where ``label`` is the first predicted value with ``.capitalize()``
        applied.
    """
    def _predict_label(paragraph):
        # Vectorize a one-element batch and take its single prediction.
        features = vectorizer.transform([clean_text(paragraph)])
        return model.predict(features)[0].capitalize()

    return [(paragraph, _predict_label(paragraph)) for paragraph in paragraphs]

In [10]:
def highlight_mismatches(text1, text2):
    """Mark the words that differ between two texts with ``<mark>`` tags.

    Both texts are split on whitespace and compared word by word with
    ``difflib.Differ``. Words common to both appear unchanged in both
    results; words only in ``text1`` are wrapped in ``<mark>...</mark>`` in
    the first result, and words only in ``text2`` are wrapped in the second.

    Args:
        text1: The first text.
        text2: The second text.

    Returns:
        A ``(highlighted_text1, highlighted_text2)`` tuple of strings whose
        words are separated by single spaces.
    """
    left_parts = []
    right_parts = []

    for entry in difflib.Differ().compare(text1.split(), text2.split()):
        tag, token = entry[:2], entry[2:]
        if tag == '  ':
            # Word present in both texts.
            left_parts.append(token)
            right_parts.append(token)
        elif tag == '- ':
            left_parts.append(f"<mark>{token}</mark>")
        elif tag == '+ ':
            right_parts.append(f"<mark>{token}</mark>")
        # Any other entry (e.g. Differ's '? ' hint lines) is ignored.

    return " ".join(left_parts).strip(), " ".join(right_parts).strip()

In [11]:
# Paths to your PDF files for both contracts (relative to the working directory).
# NOTE(review): the annotated-paragraph outputs recorded further down show
# Intel restricted-stock-unit amendments rather than a joint-venture agreement,
# so those outputs appear to come from a run with different paths -- re-run
# the notebook top to bottom to refresh them.
pdf_path_first_contract = 'Joint_Venture_1.pdf'
pdf_path_second_contract = 'Joint_Venture_2.pdf'

In [20]:
# Extract text from the first contract
first_contract_text = extract_text_from_pdf(pdf_path_first_contract)
# Bare expression: displays the raw extracted string (including "\n" escapes)
# so the paragraph separators can be inspected.
first_contract_text

'JOINT VENTURE AGREEMENT AMENDMENT \n \nThis Amendment ("Amendment") is made and entered into as of July 12, \n2025, by and between: \n \nCompany A: \nGoogle LLC \n1600 Amphitheatre Parkway \nMountain View, CA 94043 \n("Google") \nCompany B: \nMicrosoft Corporation \nOne Microsoft Way \nRedmond, WA 98052 \n("Microsoft") \n \nWHEREAS, Google and Microsoft (collectively, the "Parties") entered \ninto a Joint Venture Agreement dated July 12, 2024 (the "Original \nAgreement"); \n \nWHEREAS, the Parties wish to amend certain terms of the Original \nAgreement as provided herein. \n \nNOW, THEREFORE, in consideration of the mutual covenants and \npromises herein contained, the Joint Venture Agreement Parties agree as \nfollows: \n \n1. FORMATION OF JOINT VENTURE: \n1.1  Term:  The  term  of  the  Joint  Venture  is  hereby  extended  for  an  \nadditional  period  of  three(3) years, ending on July 12, 2032, unless \nterminated earlier in accordance with this Agreement. \n \n2. CONTRIBUTIONS:

In [21]:
# Preview how the first contract's text is split into paragraphs (result is not stored).
split_text_into_paragraphs(first_contract_text)

['JOINT VENTURE AGREEMENT AMENDMENT',
 'This Amendment ("Amendment") is made and entered into as of July 12, \n2025, by and between:',
 'Company A: \nGoogle LLC \n1600 Amphitheatre Parkway \nMountain View, CA 94043 \n("Google") \nCompany B: \nMicrosoft Corporation \nOne Microsoft Way \nRedmond, WA 98052 \n("Microsoft")',
 'WHEREAS, Google and Microsoft (collectively, the "Parties") entered \ninto a Joint Venture Agreement dated July 12, 2024 (the "Original \nAgreement");',
 'WHEREAS, the Parties wish to amend certain terms of the Original \nAgreement as provided herein.',
 'NOW, THEREFORE, in consideration of the mutual covenants and \npromises herein contained, the Joint Venture Agreement Parties agree as \nfollows:',
 '1. FORMATION OF JOINT VENTURE: \n1.1  Term:  The  term  of  the  Joint  Venture  is  hereby  extended  for  an  \nadditional  period  of  three(3) years, ending on July 12, 2032, unless \nterminated earlier in accordance with this Agreement.',
 "2. CONTRIBUTIONS: \n2.1

In [10]:
# Extract text from the first contract
# (repeats the extraction done in the earlier inspection cell, so this cell
# does not depend on that one having been run).
first_contract_text = extract_text_from_pdf(pdf_path_first_contract)

# Split text into paragraphs for the first contract
first_contract_paragraphs = split_text_into_paragraphs(first_contract_text)

# Annotate paragraphs with predicted clause types for the first contract;
# the result is a list of (paragraph_text, clause_type) tuples.
annotated_first_contract_paragraphs = annotate_paragraphs(first_contract_paragraphs)

In [11]:
# Extract text from the second contract
second_contract_text = extract_text_from_pdf(pdf_path_second_contract)

# Split text into paragraphs for the second contract
second_contract_paragraphs = split_text_into_paragraphs(second_contract_text)

# Annotate paragraphs with predicted clause types for the second contract;
# the result is a list of (paragraph_text, clause_type) tuples.
annotated_second_contract_paragraphs = annotate_paragraphs(second_contract_paragraphs)

In [12]:
# Sanity check: annotate_paragraphs() returns a plain list.
type(annotated_second_contract_paragraphs)

list

In [13]:
# Show each annotated paragraph of the first contract with its 1-based position
print("Annotated Paragraphs for the First Contract:")
for idx, (text, clause_type) in enumerate(annotated_first_contract_paragraphs, start=1):
    # One call per paragraph; the trailing empty item produces the blank separator line.
    print(
        f"Paragraph {idx}:",
        f"Text: {text}",
        f"Predicted Clause Type: {clause_type}",
        "",
        sep="\n",
    )

Annotated Paragraphs for the First Contract:
Paragraph 1:
Text: First Amendment to Restricted Stock Unit
Agreement (Strategic Growth PSUs) between
Intel and Patrick Gelsinger, dated November 18,
2022
Contract Categories: Business Finance - Stock Agreements
EX-10.2 3 d310344dex102.htm EX-10.2 EX-10.2
Exhibit 10.2
Strategic Growth PSUs
INTEL CORPORATION
2021 INDUCEMENT PLAN
FIRST AMENDMENT TO
RESTRICTED STOCK UNIT AGREEMENT
(for Performance-Based Restricted Stock Units (or “PSUs”))
This First Amendment (this “Amendment”) to the Restricted Stock Unit Agreement by and between
Patrick Gelsinger (“you”) and Intel Corporation (the “Corporation”), which provided for the grant of 457,789
PSUs, effective as of February  15, 2021 (the “Strategic Growth PSU Award Agreement”), is made by and
between you and the Corporation, effective as of November  18, 2022 (the “Amendment Effective Date”).
Capitalized terms contained herein but not defined herein shall have the meanings ascribed to them in the 20

In [14]:
# Show each annotated paragraph of the second contract with its 1-based position
print("Annotated Paragraphs for the Second Contract:")
for idx, (text, clause_type) in enumerate(annotated_second_contract_paragraphs, start=1):
    # One call per paragraph; the trailing empty item produces the blank separator line.
    print(
        f"Paragraph {idx}:",
        f"Text: {text}",
        f"Predicted Clause Type: {clause_type}",
        "",
        sep="\n",
    )

Annotated Paragraphs for the Second Contract:
Paragraph 1:
Text: First Amendment to Restricted Stock Unit
Agreement (Outperformance PSUs) between
Intel and Patrick Gelsinger, dated November 18,
2022
Contract Categories: Business Finance - Stock Agreements
EX-10.3 4 d310344dex103.htm EX-10.3 EX-10.3
Exhibit 10.3
Outperformance PSUs
INTEL CORPORATION
2021 INDUCEMENT PLAN
FIRST AMENDMENT TO
RESTRICTED STOCK UNIT AGREEMENT
(for Performance-Based Restricted Stock Units (or “PSUs”))
This First Amendment (this “Amendment”) to the Restricted Stock Unit Agreement by and between
Patrick Gelsinger (“you”) and Intel Corporation (the “Corporation”), which provided for the grant of 3,275,199
PSUs, effective as of February  15, 2021 (the “Outperformance PSU Award Agreement”), is made by and
between you and the Corporation, effective as of November  18, 2022 (the “Amendment Effective Date”).
Capitalized terms contained herein but not defined herein shall have the meanings ascribed to them in the 2021


In [15]:
# Function to calculate cosine similarity between two texts
def calculate_similarity(text1, text2):
    """Return the cosine similarity of two texts in TF-IDF space.

    Each text is preprocessed with ``clean_text`` and projected with the
    module-level ``vectorizer`` before the comparison.

    Args:
        text1: The first text.
        text2: The second text.

    Returns:
        The single similarity value taken from the 1x1 result of
        ``cosine_similarity``.
    """
    first_vector, second_vector = (
        vectorizer.transform([clean_text(raw)]) for raw in (text1, text2)
    )
    return cosine_similarity(first_vector, second_vector)[0][0]

In [16]:
# Inspect the raw (paragraph_text, clause_type) tuples for the first contract.
annotated_first_contract_paragraphs

[('First Amendment to Restricted Stock Unit\nAgreement (Strategic Growth PSUs) between\nIntel and Patrick Gelsinger, dated November 18,\n2022\nContract Categories: Business Finance - Stock Agreements\nEX-10.2 3 d310344dex102.htm EX-10.2 EX-10.2\nExhibit 10.2\nStrategic Growth PSUs\nINTEL CORPORATION\n2021 INDUCEMENT PLAN\nFIRST AMENDMENT TO\nRESTRICTED STOCK UNIT AGREEMENT\n(for Performance-Based Restricted Stock Units (or “PSUs”))\nThis First Amendment (this “Amendment”) to the Restricted Stock Unit Agreement by and between\nPatrick Gelsinger (“you”) and Intel Corporation (the “Corporation”), which provided for the grant of 457,789\nPSUs, effective as of February\xa0 15, 2021 (the “Strategic Growth PSU Award Agreement”), is made by and\nbetween you and the Corporation, effective as of November\xa0 18, 2022 (the “Amendment Effective Date”).\nCapitalized terms contained herein but not defined herein shall have the meanings ascribed to them in the 2021\nInducement Plan or the Strategic G

In [32]:
# Minimum cosine similarity (exclusive) for two paragraphs to be reported as a match.
SIMILARITY_THRESHOLD = 0.5

similar_paragraphs = []
max_len_first = len(annotated_first_contract_paragraphs)
max_len_second = len(annotated_second_contract_paragraphs)

# Compare every paragraph of the first contract with every paragraph of the
# second one. Each kept row has the layout:
#   (first_index, first_text, first_clause_type,
#    second_index, second_text, second_clause_type, similarity)
# where the indices are 1-based paragraph numbers.
for idx1, (text1, clause_type1) in enumerate(annotated_first_contract_paragraphs, start=1):
    for idx2, (text2, clause_type2) in enumerate(annotated_second_contract_paragraphs, start=1):
        similarity = calculate_similarity(text1, text2)
        if similarity > SIMILARITY_THRESHOLD:
            # Build the word-level diff only for pairs that are actually kept;
            # computing it for every pair was wasted work.
            highlighted_text1, highlighted_text2 = highlight_mismatches(text1, text2)
            similar_paragraphs.append(
                (idx1, highlighted_text1, clause_type1, idx2, highlighted_text2, clause_type2, similarity)
            )

# Add paragraphs from the first contract that have no matches.
# The matched indices are collected once so the check does not rescan the
# whole result list for every paragraph.
matched_first_indices = {row[0] for row in similar_paragraphs}
for idx1, (text1, clause_type1) in enumerate(annotated_first_contract_paragraphs, start=1):
    if idx1 not in matched_first_indices:
        similar_paragraphs.append((idx1, text1, clause_type1, None, None, None, None))

# Sort the list to maintain serial order of the first contract's paragraphs.
# Every row has a first-contract index at this point, and the sort is stable,
# so rows sharing an index keep their second-contract order.
similar_paragraphs.sort(key=lambda row: row[0])

In [33]:
# Spot-check the second row of the comparison table.
similar_paragraphs[1]

(2,
 'Removal of Interim Vesting Date; Change to Cliff-Vesting.',
 'Vesting',
 2,
 'Removal of Interim Vesting Date; Change to Cliff-Vesting.',
 'Vesting',
 1.0)

In [34]:
# Collect second-contract paragraphs that do not appear in any matched row.
# Rows use the same 7-field layout as similar_paragraphs, with the
# first-contract fields and the similarity left as None.
matched_second_indices = {row[3] for row in similar_paragraphs}
second_non_matched_paras = [
    (
        None,
        None,
        None,
        position + 1,
        annotated_second_contract_paragraphs[position][0],
        annotated_second_contract_paragraphs[position][1],
        None,
    )
    for position in range(max_len_second)
    if position + 1 not in matched_second_indices
]

In [35]:
# Spot-check the second unmatched row from the second contract.
second_non_matched_paras[1]

(None, None, None, 10, '/s/ Dion Weisler', 'Other', None)

In [36]:
# Inspect the comparison table before the unmatched second-contract rows are merged in.
similar_paragraphs

[(1,
  'First Amendment to Restricted Stock Unit Agreement <mark>(Strategic</mark> <mark>Growth</mark> PSUs) between Intel and Patrick Gelsinger, dated November 18, 2022 Contract Categories: Business Finance - Stock Agreements <mark>EX-10.2</mark> <mark>3</mark> <mark>d310344dex102.htm</mark> <mark>EX-10.2</mark> <mark>EX-10.2</mark> Exhibit <mark>10.2</mark> <mark>Strategic</mark> <mark>Growth</mark> PSUs INTEL CORPORATION 2021 INDUCEMENT PLAN FIRST AMENDMENT TO RESTRICTED STOCK UNIT AGREEMENT (for Performance-Based Restricted Stock Units (or “PSUs”)) This First Amendment (this “Amendment”) to the Restricted Stock Unit Agreement by and between Patrick Gelsinger (“you”) and Intel Corporation (the “Corporation”), which provided for the grant of <mark>457,789</mark> PSUs, effective as of February 15, 2021 (the <mark>“Strategic</mark> <mark>Growth</mark> PSU Award Agreement”), is made by and between you and the Corporation, effective as of November 18, 2022 (the “Amendment Effective Date”

In [37]:
# Merge the unmatched second-contract rows into similar_paragraphs (in place).
# NOTE: this mutates similar_paragraphs, so re-running the cell inserts the
# same rows again.
for unmatched_row in second_non_matched_paras:
    for position, existing_row in enumerate(similar_paragraphs):
        # Rows without a second-contract index (field 3 is None) cannot be
        # used as an anchor, so they are passed over.
        if existing_row[3] is None:
            continue

        # Place the row just before the first entry whose second-contract
        # index is larger than its own.
        if unmatched_row[3] < existing_row[3]:
            similar_paragraphs.insert(position, unmatched_row)
            break
    else:
        # No larger second-contract index was found: the row goes at the end.
        similar_paragraphs.append(unmatched_row)

In [38]:
# Inspect the comparison table after the unmatched second-contract rows were merged in.
similar_paragraphs

[(1,
  'First Amendment to Restricted Stock Unit Agreement <mark>(Strategic</mark> <mark>Growth</mark> PSUs) between Intel and Patrick Gelsinger, dated November 18, 2022 Contract Categories: Business Finance - Stock Agreements <mark>EX-10.2</mark> <mark>3</mark> <mark>d310344dex102.htm</mark> <mark>EX-10.2</mark> <mark>EX-10.2</mark> Exhibit <mark>10.2</mark> <mark>Strategic</mark> <mark>Growth</mark> PSUs INTEL CORPORATION 2021 INDUCEMENT PLAN FIRST AMENDMENT TO RESTRICTED STOCK UNIT AGREEMENT (for Performance-Based Restricted Stock Units (or “PSUs”)) This First Amendment (this “Amendment”) to the Restricted Stock Unit Agreement by and between Patrick Gelsinger (“you”) and Intel Corporation (the “Corporation”), which provided for the grant of <mark>457,789</mark> PSUs, effective as of February 15, 2021 (the <mark>“Strategic</mark> <mark>Growth</mark> PSU Award Agreement”), is made by and between you and the Corporation, effective as of November 18, 2022 (the “Amendment Effective Date”

In [39]:
# # Generate a report with highlighted differences
# highlight_mismatches(similar_paragraphs[0][1], similar_paragraphs[0][4])[1]