In [1]:
pip install spacy



Extract keywords

In [None]:
# # Step 1: Install spaCy and download English model
# !pip install -U spacy
# !python -m spacy download en_core_web_sm

# Step 2: Import required libraries
import spacy
import xml.etree.ElementTree as ET
from google.colab import files

# Step 3: Load spaCy English model
nlp = spacy.load("en_core_web_sm")

# Step 4: Read the file named teext.txt
with open("ISOIEC19086-22018.txt", "r", encoding="utf-8") as file:
    text = file.read()

# Step 5: Process the text with spaCy
doc = nlp(text)

# Step 6: Extract keywords (nouns and proper nouns, excluding stop words and punctuation)
keywords = {token.text.lower() for token in doc if token.pos_ in ("NOUN", "PROPN") and not token.is_stop and token.is_alpha}

# Step 7: Build XML structure
root = ET.Element("keywords")
for kw in sorted(keywords):
    ET.SubElement(root, "keyword").text = kw

# Step 8: Write to XML file
tree = ET.ElementTree(root)
output_filename = "keywords.xml"
tree.write(output_filename, encoding="utf-8", xml_declaration=True)

# Step 9: Download the XML file
files.download(output_filename)


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

**OUTPUT:**

```
<keywords>
<keyword>accordance</keyword>
<keyword>addresses</keyword>
<keyword>adherence</keyword>
<keyword>agreement</keyword>
<keyword>agreements</keyword>
<keyword>ambiguities</keyword>
<keyword>amendments</keyword>
<keyword>applications</keyword>
<keyword>approach</keyword>
<keyword>approval</keyword>
<keyword>assessment</keyword>
<keyword>attention</keyword>
<keyword>availability</keyword>
<keyword>barriers</keyword>
<keyword>benefit</keyword>
<keyword>bodies</keyword>
<keyword>body</keyword>
<keyword>business</keyword>
<keyword>cases</keyword>
<keyword>challenges</keyword>
<keyword>characteristic</keyword>
<keyword>clarity</keyword>
```



extract entities: Named Entity Recognition (NER) feature.

🔍 spaCy NER Entities Examples:
PERSON – People names

ORG – Organizations

GPE – Countries, cities, states

DATE, TIME, MONEY, etc.

In [None]:
# Step 3: Load spaCy English model
nlp = spacy.load("en_core_web_sm")

# Step 4: Read the file named teext.txt
with open("ISOIEC19086-22018.txt", "r", encoding="utf-8") as file:
    text = file.read()

# Step 5: Process the text with spaCy
doc = nlp(text)

# Step 6: Extract named entities (grouped by label)
entities_by_label = {}
for ent in doc.ents:
    label = ent.label_
    if label not in entities_by_label:
        entities_by_label[label] = set()
    entities_by_label[label].add(ent.text.strip())

# Step 7: Build XML structure
root = ET.Element("named_entities")

for label, entities in sorted(entities_by_label.items()):
    label_elem = ET.SubElement(root, label)
    for entity in sorted(entities):
        ET.SubElement(label_elem, "entity").text = entity

# Step 8: Write to XML file
tree = ET.ElementTree(root)
output_filename = "entities.xml"
tree.write(output_filename, encoding="utf-8", xml_declaration=True)

# Step 9: Download the XML file
files.download(output_filename)


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

**Output:**

```
<named_entities>
<CARDINAL>
<entity>1</entity>
<entity>1 to 5</entity>
<entity>2</entity>
<entity>3</entity>
<entity>3.1</entity>
<entity>3.10</entity>
<entity>3.2</entity>
<entity>3.3</entity>
<entity>3.4</entity>
<entity>3.5</entity>
<entity>3.6</entity>
<entity>3.7</entity>
<entity>one</entity>
</CARDINAL>
<DATE>
<entity>19086</entity>
<entity>19086-1</entity>
<entity>19086-1:2016</entity>
<entity>19086-3</entity>
<entity>19086-4</entity>
<entity>October 2004</entity>
</DATE>
<LANGUAGE>
<entity>English</entity>
</LANGUAGE>
<ORDINAL>
<entity>first</entity>
<entity>second</entity>
</ORDINAL>
<ORG>
<entity>CSP</entity>
<entity>Foreword ISO</entity>
<entity>IEC</entity>
<entity>IEC Electropedia</entity>
<entity>ISO</entity>
<entity>ISO/IEC</entity>
<entity>ISO/IEC 17788</entity>
<entity>ISO/IEC 80000-1:2009</entity>
<entity>ISO/IEC JTC 1</entity>
<entity>ISO/IEC/IEEE</entity>
<entity>ITU</entity>
<entity>International Standards</entity>
<entity>Subcommittee SC 38</entity>
<entity>Technical Committee ISO/IEC JTC1, Information technology</entity>
<entity>WTO</entity>
<entity>http://www.iso.org/obp</entity>
<entity>the International Organization for Standardization</entity>
<entity>the Technical Barriers to Trade (TBT</entity>
<entity>the World Trade Organization</entity>
</ORG>
<PERCENT>
<entity>100 %</entity>
</PERCENT>
<PERSON>
<entity>Cloud Computing</entity>
<entity>XML Schema Part</entity>
</PERSON>
<PRODUCT>
<entity>SOs</entity>
</PRODUCT>
<QUANTITY>
<entity>3.6 metric</entity>
</QUANTITY>
<WORK_OF_ART>
<entity>Datatypes Second Edition</entity>
</WORK_OF_ART>
</named_entities>
```



In [None]:
import re
import xml.etree.ElementTree as ET

# Load the text
with open("ISOIEC19086-22018.txt", "r") as file:
    lines = file.readlines()

# Create root element for XML
root = ET.Element("requirements")

# Define basic patterns to match requirements
patterns = [
    (r"latency.*?(less than|under|<)\s*(\d+\.?\d*)\s*ms", "latency"),
    (r"availability.*?(\d+\.?\d*)\s*%", "availability"),
    (r"jitter.*?(up to|less than|<)\s*(\d+\.?\d*)\s*ms", "jitter")
]

# Process each line
for line in lines:
    print(f"Processing line: {line.strip()}")
    for pattern, req_type in patterns:
        match = re.search(pattern, line, re.IGNORECASE)
        if match:
            print(f" Matched {req_type}: {match.groupdict()}")
            req_elem = ET.SubElement(root, "requirement")
            ET.SubElement(req_elem, "type").text = req_type

            # Safely extract named groups
            value = match.groupdict().get("value")
            operator = match.groupdict().get("operator")

            if value:
                ET.SubElement(req_elem, "constraint").text = value
            else:
                print(" No 'value' group found for match:", match.group())

            if operator:
                req_elem.set("operator", operator)

# Save to XML
tree = ET.ElementTree(root)
tree.write("output.xml", encoding="utf-8", xml_declaration=True)

print(" XML generated as 'output.xml'")


Processing line: Foreword
Processing line: ISO (the International Organization for Standardization) is a worldwide federation of national standards bodies (ISO member bodies). The work of preparing International Standards is normally carried out through ISO technical committees. Each member body interested in a subject for which a technical committee has been established has the right to be represented on that committee. International organizations, governmental and non-governmental, in liaison with ISO, also take part in the work. In the field of information technology, ISO and IEC have established a joint technical committee, ISO/IEC JTC 1
Processing line: The procedures used to develop this document and those intended for its further maintenance are described in the ISO/IEC Directives, Part 1. In particular, the different approval criteria needed for the different types of ISO documents should be noted. This document was drafted in accordance with the editorial rules of the ISO/IEC 

**Output**

```
<requirements>
<requirement>
<type>availability</type>
</requirement>
</requirements>
```



NER with SLAtext

In [None]:

# Step 2: Import required libraries
import spacy
import xml.etree.ElementTree as ET
from google.colab import files
# Step 3: Load spaCy English model
nlp = spacy.load("en_core_web_sm")

# Step 4: Read the file named teext.txt
with open("SLAtext.txt", "r", encoding="utf-8") as file:
    text = file.read()

# Step 5: Process the text with spaCy
doc = nlp(text)

# Step 6: Extract named entities (grouped by label)
entities_by_label = {}
for ent in doc.ents:
    label = ent.label_
    if label not in entities_by_label:
        entities_by_label[label] = set()
    entities_by_label[label].add(ent.text.strip())

# Step 7: Build XML structure
root = ET.Element("named_entities")

for label, entities in sorted(entities_by_label.items()):
    label_elem = ET.SubElement(root, label)
    for entity in sorted(entities):
        ET.SubElement(label_elem, "entity").text = entity

# Step 8: Write to XML file
tree = ET.ElementTree(root)
output_filename = "entitiesSLA.xml"
tree.write(output_filename, encoding="utf-8", xml_declaration=True)

# Step 9: Download the XML file
files.download(output_filename)


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
# Step 3: Load spaCy English model
nlp = spacy.load("en_core_web_sm")

# Step 4: Read the file named teext.txt
with open("SLAtext.txt", "r", encoding="utf-8") as file:
    text = file.read()

# Step 5: Process the text with spaCy
doc = nlp(text)

# Step 6: Extract keywords (nouns and proper nouns, excluding stop words and punctuation)
keywords = {token.text.lower() for token in doc if token.pos_ in ("NOUN", "PROPN") and not token.is_stop and token.is_alpha}

# Step 7: Build XML structure
root = ET.Element("keywords")
for kw in sorted(keywords):
    ET.SubElement(root, "keyword").text = kw

# Step 8: Write to XML file
tree = ET.ElementTree(root)
output_filename = "keywordsSLA.xml"
tree.write(output_filename, encoding="utf-8", xml_declaration=True)

# Step 9: Download the XML file
files.download(output_filename)

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

# **Example**

In [2]:
import spacy

# Load the spaCy language model
nlp = spacy.load('en_core_web_sm')  # You can choose a different model as needed


sla_text = """
The service provider shall ensure 99.9% uptime for the Data Backup service.
In case of downtime, the response time should not exceed 4 hours.
After execution of this Agreement, "Dan" shall pay the full purchase price to "Jerome" in the amount of 3.14 EUR upon demand by "Jerome".
In case of delayed delivery except for Force Majeure cases, "Dan" (the Seller) shall pay to "Steve" (the Buyer) for every 2 days of delay penalty amounting to 10.5% of the total value of the Equipment whose delivery has been delayed. Any fractional part of a days is to be considered a full days. The total amount of penalty shall not however, exceed 55% of the total value of the Equipment involved in late delivery. If the delay is more than 15 days, the Buyer is entitled to terminate this Contract. All Equipment values are based on EUR and all penalty payments will be paid in USD at its equivalent amount in EUR.
The conversion rate between the currencies is based upon "the prevailing exchange rate at a major United States bank".
"""

# Process the SLA text
doc = nlp(sla_text)

for token in doc:
    print(f"Token: {token.text}, POS: {token.pos_}, Dependency: {token.dep_}")
for ent in doc.ents:
    print(f"Entity: {ent.text}, Label: {ent.label_}")


Token: 
, POS: SPACE, Dependency: dep
Token: The, POS: DET, Dependency: det
Token: service, POS: NOUN, Dependency: compound
Token: provider, POS: NOUN, Dependency: nsubj
Token: shall, POS: AUX, Dependency: aux
Token: ensure, POS: VERB, Dependency: ROOT
Token: 99.9, POS: NUM, Dependency: nummod
Token: %, POS: NOUN, Dependency: compound
Token: uptime, POS: NOUN, Dependency: dobj
Token: for, POS: ADP, Dependency: prep
Token: the, POS: DET, Dependency: det
Token: Data, POS: PROPN, Dependency: compound
Token: Backup, POS: PROPN, Dependency: compound
Token: service, POS: NOUN, Dependency: pobj
Token: ., POS: PUNCT, Dependency: punct
Token: 
, POS: SPACE, Dependency: dep
Token: In, POS: ADP, Dependency: prep
Token: case, POS: NOUN, Dependency: pobj
Token: of, POS: ADP, Dependency: prep
Token: downtime, POS: NOUN, Dependency: pobj
Token: ,, POS: PUNCT, Dependency: punct
Token: the, POS: DET, Dependency: det
Token: response, POS: NOUN, Dependency: compound
Token: time, POS: NOUN, Dependency: ns

In [5]:
keywords = {"latency", "throughput", "reliability", "availability", "jitter", "packet", "loss", "qos", "acceptance", "notifies", "Business Days", "requirements", "obligations", "agreement" }

for token in doc:
    if token.text.lower() in keywords:
        print(f"\nKeyword: {token.text}")
        for child in token.children:
            print(f"  -> Child: {child.text} ({child.dep_}, {child.pos_})")
        for ancestor in token.ancestors:
            print(f"  <- Ancestor: {ancestor.text} ({ancestor.dep_}, {ancestor.pos_})")



Keyword: Agreement
  -> Child: this (det, DET)
  <- Ancestor: of (prep, ADP)
  <- Ancestor: execution (pobj, NOUN)
  <- Ancestor: After (prep, ADP)
  <- Ancestor: pay (ROOT, VERB)


In [6]:
for ent in doc.ents:
    print(f"Entity: {ent.text}, Label: {ent.label_}")


Entity: 99.9%, Label: PERCENT
Entity: Data Backup, Label: ORG
Entity: 4 hours, Label: TIME
Entity: Dan, Label: PERSON
Entity: Jerome, Label: WORK_OF_ART
Entity: 3.14, Label: CARDINAL
Entity: EUR, Label: ORG
Entity: Jerome, Label: PERSON
Entity: Force Majeure, Label: ORG
Entity: Dan, Label: PERSON
Entity: Seller, Label: PERSON
Entity: Steve, Label: PERSON
Entity: Buyer, Label: PERSON
Entity: every 2 days, Label: DATE
Entity: 10.5%, Label: PERCENT
Entity: a full days, Label: DATE
Entity: 55%, Label: PERCENT
Entity: more than 15 days, Label: DATE
Entity: Buyer, Label: PERSON
Entity: EUR, Label: ORG
Entity: USD, Label: ORG
Entity: EUR, Label: ORG
Entity: United States, Label: GPE


In [13]:
#Use spaCy's Matcher to define patterns that match
#specific phrases or structures within the text, such as service names, availability percentages, and response times.


from spacy.matcher import Matcher

# Initialize the matcher with the shared vocabulary
matcher = Matcher(nlp.vocab)

# Define patterns for availability and response time
availability_pattern = [{'LIKE_NUM': True}, {'TEXT': '%'}, {'LOWER': 'uptime'}]
response_time_pattern = [{'LOWER': 'response'}, {'LOWER': 'time'}, {'LOWER': 'should'}, {'LOWER': 'not'}, {'LOWER': 'exceed'}, {'LIKE_NUM': True}, {'LOWER': 'hours'}]

# Add patterns to the matcher
matcher.add('AVAILABILITY', [availability_pattern])
matcher.add('RESPONSE_TIME', [response_time_pattern])

# Apply the matcher to the doc
matches = matcher(doc)

# Initialize variables
availability = None
response_time = None

# Extract matched spans
for match_id, start, end in matches:
    span = doc[start:end]
    match_label = nlp.vocab.strings[match_id]
    if match_label == 'AVAILABILITY':
        availability = span.text
    elif match_label == 'RESPONSE_TIME':
        response_time = span.text

print(f"Availability: {availability}")
print(f"Response Time: {response_time}")



Availability: 99.9% uptime
Response Time: response time should not exceed 4 hours


In [20]:
from spacy.matcher import Matcher

matcher = Matcher(nlp.vocab)

# Existing patterns
availability_pattern = [{'LIKE_NUM': True}, {'TEXT': '%'}, {'LOWER': 'uptime'}]
response_time_pattern = [{'LOWER': 'response'}, {'LOWER': 'time'}, {'LOWER': 'should'}, {'LOWER': 'not'}, {'LOWER': 'exceed'}, {'LIKE_NUM': True}, {'LOWER': 'hours'}]

matcher.add('AVAILABILITY', [availability_pattern])
matcher.add('RESPONSE_TIME', [response_time_pattern])

# New keyword patterns
keywords = [
    "latency", "throughput", "reliability", "availability", "jitter",
    "packet", "loss", "qos", "acceptance", "notifies", "name", "license", "agreement", "sla", "licensor", "company",
    "business days", "requirements", "obligations", "usd", "dollars", "date"
]

for keyword in keywords:
    words = keyword.lower().split()
    pattern = [{'LOWER': word.strip(",")} for word in words]
    matcher.add(keyword.upper().replace(" ", "_"), [pattern])

# Apply the matcher
matches = matcher(doc)

# Print matches
for match_id, start, end in matches:
    span = doc[start:end]
    print(f"Matched '{span.text}' with label '{nlp.vocab.strings[match_id]}' at position {start}-{end}")

Matched '99.9% uptime' with label 'AVAILABILITY' at position 6-9
Matched 'response time should not exceed 4 hours' with label 'RESPONSE_TIME' at position 22-29
Matched 'Agreement' with label 'AGREEMENT' at position 35-36
Matched 'USD' with label 'USD' at position 187-188


In [24]:
# Optional: define custom overrides for certain proper nouns
custom_lemma_map = {
    "Licensor": "license",
    "Licensee": "license"
}

# Print tokens and their lemmas
for token in doc:
    if token.text in custom_lemma_map:
        lemma = custom_lemma_map[token.text]
    else:
        lemma = token.lemma_
    print(f"Token: {token.text} -> Lemma: {lemma}")


Token: 
 -> Lemma: 

Token: The -> Lemma: the
Token: service -> Lemma: service
Token: provider -> Lemma: provider
Token: shall -> Lemma: shall
Token: ensure -> Lemma: ensure
Token: 99.9 -> Lemma: 99.9
Token: % -> Lemma: %
Token: uptime -> Lemma: uptime
Token: for -> Lemma: for
Token: the -> Lemma: the
Token: Data -> Lemma: Data
Token: Backup -> Lemma: Backup
Token: service -> Lemma: service
Token: . -> Lemma: .
Token: 
 -> Lemma: 

Token: In -> Lemma: in
Token: case -> Lemma: case
Token: of -> Lemma: of
Token: downtime -> Lemma: downtime
Token: , -> Lemma: ,
Token: the -> Lemma: the
Token: response -> Lemma: response
Token: time -> Lemma: time
Token: should -> Lemma: should
Token: not -> Lemma: not
Token: exceed -> Lemma: exceed
Token: 4 -> Lemma: 4
Token: hours -> Lemma: hour
Token: . -> Lemma: .
Token: 
 -> Lemma: 

Token: After -> Lemma: after
Token: execution -> Lemma: execution
Token: of -> Lemma: of
Token: this -> Lemma: this
Token: Agreement -> Lemma: agreement
Token: , -> Lemma

In [21]:
# Structure the extracted information into an XML format using Python's xml.etree.ElementTree module.

import xml.etree.ElementTree as ET

# Create the root element
sla = ET.Element('SLA')

# Create a service element
service = ET.SubElement(sla, 'Service')

# Add service name
service_name = ET.SubElement(service, 'Name')
service_name.text = 'Data Backup'  # This can be extracted similarly using spaCy

# Add availability
if availability:
    availability_elem = ET.SubElement(service, 'Availability')
    availability_elem.text = availability

# Add response time
if response_time:
    response_time_elem = ET.SubElement(service, 'ResponseTime')
    response_time_elem.text = response_time

# Generate the XML string
xml_str = ET.tostring(sla, encoding='unicode')
print(xml_str)


<SLA><Service><Name>Data Backup</Name><Availability>99.9% uptime</Availability><ResponseTime>response time should not exceed 4 hours</ResponseTime></Service></SLA>


In [22]:
# Assuming xml_str contains your XML content as a string
with open('slaexample_output.xml', 'w', encoding='utf-8') as f:
    f.write(xml_str)


In [23]:
import xml.etree.ElementTree as ET

# Assuming 'sla' is your root Element
tree = ET.ElementTree(sla)
tree.write('slaexample_output.xml', encoding='utf-8', xml_declaration=True)


# **Spacy on accord project**

In [None]:
import spacy

# Load the spaCy language model
nlp = spacy.load('en_core_web_sm')  # You can choose a different model as needed


sla_text = """
Acceptance of Delivery.
"Party A" will be deemed to have completed its delivery obligations if in "Party B"'s opinion, the "Widgets" satisfies the Acceptance Criteria, and "Party B" notifies "Party A" in writing that it is accepting the "Widgets".

Inspection and Notice.
"Party B" will have 10 Business Days to inspect and evaluate the "Widgets" on the delivery date before notifying "Party A" that it is either accepting or rejecting the "Widgets".

Acceptance Criteria.
The "Acceptance Criteria" are the specifications the "Widgets" must meet for "Party A" to comply with its requirements and obligations under this agreement, detailed in "Attachment X", attached to this agreement.
"""

# Process the SLA text
doc = nlp(sla_text)

for token in doc:
    print(f"Token: {token.text}, POS: {token.pos_}, Dependency: {token.dep_}")
for ent in doc.ents:
    print(f"Entity: {ent.text}, Label: {ent.label_}")


Token: 
, POS: SPACE, Dependency: dep
Token: Acceptance, POS: PROPN, Dependency: ROOT
Token: of, POS: ADP, Dependency: prep
Token: Delivery, POS: PROPN, Dependency: pobj
Token: ., POS: PUNCT, Dependency: punct
Token: 
, POS: SPACE, Dependency: dep
Token: ", POS: PUNCT, Dependency: punct
Token: Party, POS: PROPN, Dependency: compound
Token: A, POS: PROPN, Dependency: nsubjpass
Token: ", POS: PUNCT, Dependency: punct
Token: will, POS: AUX, Dependency: aux
Token: be, POS: AUX, Dependency: auxpass
Token: deemed, POS: VERB, Dependency: ccomp
Token: to, POS: PART, Dependency: aux
Token: have, POS: AUX, Dependency: aux
Token: completed, POS: VERB, Dependency: xcomp
Token: its, POS: PRON, Dependency: poss
Token: delivery, POS: NOUN, Dependency: compound
Token: obligations, POS: NOUN, Dependency: dobj
Token: if, POS: SCONJ, Dependency: mark
Token: in, POS: ADP, Dependency: prep
Token: ", POS: PUNCT, Dependency: punct
Token: Party, POS: PROPN, Dependency: compound
Token: B, POS: PROPN, Dependenc

In [None]:
keywords = {"latency", "throughput", "reliability", "availability", "jitter", "packet", "loss", "qos", "acceptance", "notifies", "Business Days", "requirements", "obligations", "agreement" }

for token in doc:
    if token.text.lower() in keywords:
        print(f"\nKeyword: {token.text}")
        for child in token.children:
            print(f"  -> Child: {child.text} ({child.dep_}, {child.pos_})")
        for ancestor in token.ancestors:
            print(f"  <- Ancestor: {ancestor.text} ({ancestor.dep_}, {ancestor.pos_})")



Keyword: Acceptance
  -> Child: 
 (dep, SPACE)
  -> Child: of (prep, ADP)
  -> Child: . (punct, PUNCT)

Keyword: obligations
  -> Child: its (poss, PRON)
  -> Child: delivery (compound, NOUN)
  <- Ancestor: completed (xcomp, VERB)
  <- Ancestor: deemed (ccomp, VERB)
  <- Ancestor: satisfies (ROOT, VERB)

Keyword: Acceptance
  <- Ancestor: Criteria (dobj, PROPN)
  <- Ancestor: satisfies (ROOT, VERB)

Keyword: notifies
  -> Child: " (punct, PUNCT)
  -> Child: B (nmod, PROPN)
  -> Child: " (punct, PUNCT)
  -> Child: " (punct, PUNCT)
  -> Child: A (appos, PROPN)
  -> Child: " (punct, PUNCT)
  <- Ancestor: satisfies (ROOT, VERB)

Keyword: Acceptance
  <- Ancestor: Criteria (pobj, PROPN)
  <- Ancestor: . (punct, PUNCT)
  <- Ancestor: have (ROOT, VERB)

Keyword: Acceptance
  <- Ancestor: Criteria (nsubj, PROPN)
  <- Ancestor: are (ROOT, AUX)

Keyword: requirements
  -> Child: its (poss, PRON)
  -> Child: and (cc, CCONJ)
  -> Child: obligations (conj, NOUN)
  -> Child: under (prep, ADP)
  <- 

In [None]:
for ent in doc.ents:
    print(f"Entity: {ent.text}, Label: {ent.label_}")


Entity: the Acceptance Criteria, Label: ORG
Entity: Notice, Label: ORG
Entity: 10 Business Days, Label: DATE
Entity: Party A, Label: WORK_OF_ART
Entity: The "Acceptance Criteria, Label: WORK_OF_ART
Entity: Attachment X, Label: WORK_OF_ART


In [None]:
from spacy.matcher import PhraseMatcher

matcher = PhraseMatcher(nlp.vocab)
terms = ["service uptime", "maximum", "delivery", "service"]
patterns = [nlp.make_doc(text) for text in terms]
matcher.add("SLA_TERMS", patterns)

matches = matcher(doc)
for match_id, start, end in matches:
    span = doc[start:end]
    print(f"Matched phrase: {span.text}")


Matched phrase: delivery
Matched phrase: delivery


# **large English NLP model provided by spaCy**

In [None]:
!python -m spacy download en_core_web_lg

Collecting en-core-web-lg==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_lg-3.8.0/en_core_web_lg-3.8.0-py3-none-any.whl (400.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m400.7/400.7 MB[0m [31m3.9 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: en-core-web-lg
Successfully installed en-core-web-lg-3.8.0
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_lg')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.


In [None]:
# nlp = spacy.load('en_core_web_lg')  # Must use a large model with word vectors
# target = nlp("latency")[0]

# for token in doc:
#     if token.has_vector and token.similarity(target) > 0.6:
#         print(f"Related to latency: {token.text}, Similarity: {token.similarity(target):.2f}")

# Step 3: Now your similarity code will work
doc = nlp("The latency must be below 5 milliseconds to meet SLA.")
target = nlp("latency")[0]

for token in doc:
    if token.has_vector and token.similarity(target) > 0.6:
        print(f"Related to latency: {token.text}, Similarity: {token.similarity(target):.2f}")


Related to latency: latency, Similarity: 1.00


In [None]:
import spacy

# Load the large English model with word vectors
nlp = spacy.load("en_core_web_lg")

# SLA-related keywords to compare against
keywords = [
    "latency", "throughput", "reliability", "availability", "jitter",
    "packet", "loss", "qos", "acceptance", "notifies",
    "Business Days", "requirements", "obligations", "agreement"
]

# Convert keywords to spaCy tokens (handle phrases properly)
keyword_tokens = {kw: nlp(kw)[0] if len(nlp(kw)) == 1 else nlp(kw) for kw in keywords}

# Text to process
doc = nlp("""
# The latency must be below 5 milliseconds to meet SLA.
# Party B will have 10 Business Days to evaluate the service availability and notify Party A.
Acceptance of Delivery.
"Party A" will be deemed to have completed its delivery obligations if in "Party B"'s opinion, the "Widgets" satisfies the Acceptance Criteria, and "Party B" notifies "Party A" in writing that it is accepting the "Widgets".

Inspection and Notice.
"Party B" will have 10 Business Days to inspect and evaluate the "Widgets" on the delivery date before notifying "Party A" that it is either accepting or rejecting the "Widgets".

Acceptance Criteria.
The "Acceptance Criteria" are the specifications the "Widgets" must meet for "Party A" to comply with its requirements and obligations under this agreement, detailed in "Attachment X", attached to this agreement.
""")

# Compare each token in the doc to each keyword
for token in doc:
    if token.has_vector:
        for kw, kw_token in keyword_tokens.items():
            if isinstance(kw_token, spacy.tokens.Token):
                # Single-word keyword
                similarity = token.similarity(kw_token)
            else:
                # Multi-word keyword (e.g., "Business Days") — average similarity
                similarity = sum(token.similarity(t) for t in kw_token) / len(kw_token)

            if similarity > 0.6:
                print(f"Doc token '{token.text}' is similar to keyword '{kw}' (Similarity: {similarity:.2f})")


Doc token 'latency' is similar to keyword 'latency' (Similarity: 1.00)
Doc token 'latency' is similar to keyword 'throughput' (Similarity: 0.63)
Doc token 'latency' is similar to keyword 'jitter' (Similarity: 0.65)
Doc token 'Business' is similar to keyword 'Business Days' (Similarity: 0.65)
Doc token 'Days' is similar to keyword 'Business Days' (Similarity: 0.65)
Doc token 'availability' is similar to keyword 'availability' (Similarity: 1.00)
Doc token 'notify' is similar to keyword 'notifies' (Similarity: 0.75)
Doc token 'Acceptance' is similar to keyword 'acceptance' (Similarity: 1.00)
Doc token 'obligations' is similar to keyword 'obligations' (Similarity: 1.00)
Doc token 'Acceptance' is similar to keyword 'acceptance' (Similarity: 1.00)
Doc token 'Criteria' is similar to keyword 'requirements' (Similarity: 0.67)
Doc token 'notifies' is similar to keyword 'notifies' (Similarity: 1.00)
Doc token 'accepting' is similar to keyword 'acceptance' (Similarity: 0.68)
Doc token 'Business' i

In [None]:
import spacy

# Load the large English model with word vectors
nlp = spacy.load("en_core_web_lg")

# SLA-related keywords to compare against
keywords = [
    "latency", "throughput", "reliability", "availability", "jitter",
    "packet", "loss", "qos", "acceptance", "notifies",
    "Business Days", "requirements", "obligations", "agreement"
]

# Convert keywords to spaCy tokens (handle phrases properly)
keyword_tokens = {kw: nlp(kw)[0] if len(nlp(kw)) == 1 else nlp(kw) for kw in keywords}

# Text to process
Acceptance_of_Delivery_Clause = nlp("""
# The latency must be below 5 milliseconds to meet SLA.
# Party B will have 10 Business Days to evaluate the service availability and notify Party A.
Acceptance of Delivery.
"Party A" will be deemed to have completed its delivery obligations if in "Party B"'s opinion, the "Widgets" satisfies the Acceptance Criteria, and "Party B" notifies "Party A" in writing that it is accepting the "Widgets".

Inspection and Notice.
"Party B" will have 10 Business Days to inspect and evaluate the "Widgets" on the delivery date before notifying "Party A" that it is either accepting or rejecting the "Widgets".

Acceptance Criteria.
The "Acceptance Criteria" are the specifications the "Widgets" must meet for "Party A" to comply with its requirements and obligations under this agreement, detailed in "Attachment X", attached to this agreement.
""")

# Compare each token in the doc Acceptance_of_Delivery_Clause to each keyword
for token in Acceptance_of_Delivery_Clause:
    if token.has_vector:
        for kw, kw_token in keyword_tokens.items():
            if isinstance(kw_token, spacy.tokens.Token):
                # Single-word keyword
                similarity = token.similarity(kw_token)
            else:
                # Multi-word keyword (e.g., "Business Days") — average similarity
                similarity = sum(token.similarity(t) for t in kw_token) / len(kw_token)

            if similarity > 0.6:
                print(f"Doc token '{token.text}' is similar to keyword '{kw}' (Similarity: {similarity:.2f})")


Doc token 'latency' is similar to keyword 'latency' (Similarity: 1.00)
Doc token 'latency' is similar to keyword 'throughput' (Similarity: 0.63)
Doc token 'latency' is similar to keyword 'jitter' (Similarity: 0.65)
Doc token 'Business' is similar to keyword 'Business Days' (Similarity: 0.65)
Doc token 'Days' is similar to keyword 'Business Days' (Similarity: 0.65)
Doc token 'availability' is similar to keyword 'availability' (Similarity: 1.00)
Doc token 'notify' is similar to keyword 'notifies' (Similarity: 0.75)
Doc token 'Acceptance' is similar to keyword 'acceptance' (Similarity: 1.00)
Doc token 'obligations' is similar to keyword 'obligations' (Similarity: 1.00)
Doc token 'Acceptance' is similar to keyword 'acceptance' (Similarity: 1.00)
Doc token 'Criteria' is similar to keyword 'requirements' (Similarity: 0.67)
Doc token 'notifies' is similar to keyword 'notifies' (Similarity: 1.00)
Doc token 'accepting' is similar to keyword 'acceptance' (Similarity: 0.68)
Doc token 'Business' i

In [None]:
import spacy
import xml.etree.ElementTree as ET

# Load the large English model with word vectors
nlp = spacy.load("en_core_web_lg")

# SLA-related keywords to compare against
keywords = [
    "latency", "throughput", "reliability", "availability", "jitter",
    "packet", "loss", "qos", "acceptance", "notifies", "name", "License", "agreement", "SLA", "Licensor", "Company",
    "Business Days", "requirements", "obligations", "agreement", "usd", "dollars", "date"
]

# Convert keywords to spaCy tokens (handle phrases properly)
keyword_tokens = {kw: nlp(kw)[0] if len(nlp(kw)) == 1 else nlp(kw) for kw in keywords}

# Text to process
Copyright_Clause = nlp("""
Copyright License Agreement

This COPYRIGHT LICENSE AGREEMENT (the "Agreement"), dated as of 01/01/2018 (the "Effective Date"), is made by and between "Me" ("Licensee"), a "NY" "Company" with offices located at "1 Broadway", and "Myself" ("Licensor"), a "NY" "Company" with offices located at "2 Broadway".

WHEREAS, Licensor solely and exclusively owns or controls the Work (as defined below) and wishes to grant to Licensee a license to the Work, and Licensee wishes to obtain a license to the Work for the uses and purposes described herein, each subject to the terms and conditions set forth herein.

NOW, THEREFORE, in consideration of the mutual covenants, terms, and conditions set forth herein, and for other good and valuable consideration, the receipt and sufficiency of which are hereby acknowledged, the parties agree as follows:

License.

Grant of Rights. Subject to the terms and conditions of this Agreement, Licensor hereby grants to Licensee and its affiliates during the Term (as defined below) an exclusive, transferable right and license in the "United States" (the "Territory"), to reproduce, publicly perform, display, transmit, and distribute the Work, including translate, alter, modify, and create derivative works of the Work, through all media now known or hereinafter developed for purposes of "stuff". The "Work" is defined as "other stuff".

Permissions. Licensor has obtained from all persons and entities who are, or whose trademark or other property is, identified, depicted, or otherwise referred to in the Work, such written and signed licenses, permissions, waivers, and consents (collectively, "Permissions" and each, individually, a "Permission"), including those relating to publicity, privacy, and any intellectual property rights, as are or reasonably may be expected to be necessary for Licensee to exercise its rights in the Work as permitted under this Agreement, without incurring any payment or other obligation to, or otherwise violating any right of, any such person or entity.

Copyright Notices. Licensee shall ensure that its use of the Work is marked with the appropriate copyright notices specified by Licensor in a reasonably prominent position in the order and manner provided by Licensor. Licensee shall abide by the copyright laws and what are considered to be sound practices for copyright notice provisions in the Territory. Licensee shall not use any copyright notices that conflict with, confuse, or negate the notices Licensor provides and requires hereunder.

{{#clause paymentClause}} Payment. As consideration in full for the rights granted herein, Licensee shall pay Licensor a one-time fee in the amount of "one hundred US Dollars" (100.0 USD) upon execution of this Agreement, payable as follows: "bank transfer". {{/clause}}

General.

Interpretation. For purposes of this Agreement, (a) the words "include," "includes," and "including" are deemed to be followed by the words "without limitation"; (b) the word "or" is not exclusive; and (c) the words "herein," "hereof," "hereby," "hereto," and "hereunder" refer to this Agreement as a whole. This Agreement is intended to be construed without regard to any presumption or rule requiring construction or interpretation against the party drafting an instrument or causing any instrument to be drafted.

Entire Agreement. This Agreement, including and together with any related attachments, constitutes the sole and entire agreement of the parties with respect to the subject matter contained herein, and supersedes all prior and contemporaneous understandings, agreements, representations, and warranties, both written and oral, with respect to such subject matter.

Severability. If any term or provision of this Agreement is invalid, illegal, or unenforceable in any jurisdiction, such invalidity, illegality, or unenforceability will not affect the enforceability of any other term or provision of this Agreement, or invalidate or render unenforceable such term or provision in any other jurisdiction. [Upon a determination that any term or provision is invalid, illegal, or unenforceable, [the parties shall negotiate in good faith to/the court may] modify this Agreement to effect the original intent of the parties as closely as possible in order that the transactions contemplated hereby be consummated as originally contemplated to the greatest extent possible.]

Assignment. Licensee may freely assign or otherwise transfer all or any of its rights, or delegate or otherwise transfer all or any of its obligations or performance, under this Agreement without Licensor's consent. This Agreement is binding upon and inures to the benefit of the parties hereto and their respective permitted successors and assigns.
""")

# Compare each token in the doc Copyright_Clause to each keyword
for token in Copyright_Clause:
    if token.has_vector:
        for kw, kw_token in keyword_tokens.items():
            if isinstance(kw_token, spacy.tokens.Token):
                # Single-word keyword
                similarity = token.similarity(kw_token)
            else:
                # Multi-word keyword (e.g., "Business Days") — average similarity
                similarity = sum(token.similarity(t) for t in kw_token) / len(kw_token)

            if similarity > 0.6:
                print(f"Doc token '{token.text}' is similar to keyword '{kw}' (Similarity: {similarity:.2f})")

Doc token 'License' is similar to keyword 'License' (Similarity: 1.00)
Doc token 'Agreement' is similar to keyword 'agreement' (Similarity: 1.00)
Doc token 'LICENSE' is similar to keyword 'License' (Similarity: 1.00)
Doc token 'AGREEMENT' is similar to keyword 'agreement' (Similarity: 1.00)
Doc token 'Agreement' is similar to keyword 'agreement' (Similarity: 1.00)
Doc token 'Date' is similar to keyword 'date' (Similarity: 1.00)
Doc token 'Licensee' is similar to keyword 'Licensor' (Similarity: 0.65)
Doc token 'Company' is similar to keyword 'Company' (Similarity: 1.00)
Doc token 'Licensor' is similar to keyword 'Licensor' (Similarity: 1.00)
Doc token 'Company' is similar to keyword 'Company' (Similarity: 1.00)
Doc token 'Licensor' is similar to keyword 'Licensor' (Similarity: 1.00)
Doc token 'Licensee' is similar to keyword 'Licensor' (Similarity: 0.65)
Doc token 'license' is similar to keyword 'License' (Similarity: 1.00)
Doc token 'Licensee' is similar to keyword 'Licensor' (Similari

In [None]:
for token in Copyright_Clause:
    print(f"Token: {token.text} -> Lemma: {token.lemma_}")


Token: 
 -> Lemma: 

Token: Copyright -> Lemma: Copyright
Token: License -> Lemma: License
Token: Agreement -> Lemma: Agreement
Token: 

 -> Lemma: 


Token: This -> Lemma: this
Token: COPYRIGHT -> Lemma: copyright
Token: LICENSE -> Lemma: LICENSE
Token: AGREEMENT -> Lemma: AGREEMENT
Token: ( -> Lemma: (
Token: the -> Lemma: the
Token: " -> Lemma: "
Token: Agreement -> Lemma: Agreement
Token: " -> Lemma: "
Token: ) -> Lemma: )
Token: , -> Lemma: ,
Token: dated -> Lemma: date
Token: as -> Lemma: as
Token: of -> Lemma: of
Token: 01/01/2018 -> Lemma: 01/01/2018
Token: ( -> Lemma: (
Token: the -> Lemma: the
Token: " -> Lemma: "
Token: Effective -> Lemma: effective
Token: Date -> Lemma: date
Token: " -> Lemma: "
Token: ) -> Lemma: )
Token: , -> Lemma: ,
Token: is -> Lemma: be
Token: made -> Lemma: make
Token: by -> Lemma: by
Token: and -> Lemma: and
Token: between -> Lemma: between
Token: " -> Lemma: "
Token: Me -> Lemma: I
Token: " -> Lemma: "
Token: ( -> Lemma: (
Token: " -> Lemma: "
Toke

In [None]:
if token.text.lower() in ["licensor", "licensee"]:
    print(f"Custom lemma for '{token.text}' → 'license'")


In [None]:
import spacy

# Load the large English model
nlp = spacy.load("en_core_web_lg")

# Define legal text
legal_text = """
Copyright License Agreement

This COPYRIGHT LICENSE AGREEMENT (the "Agreement"), dated as of 01/01/2018 (the "Effective Date"), is made by and between "Me" ("Licensee"), a "NY" "Company" with offices located at "1 Broadway", and "Myself" ("Licensor"), a "NY" "Company" with offices located at "2 Broadway".

WHEREAS, Licensor solely and exclusively owns or controls the Work (as defined below) and wishes to grant to Licensee a license to the Work, and Licensee wishes to obtain a license to the Work for the uses and purposes described herein, each subject to the terms and conditions set forth herein.

NOW, THEREFORE, in consideration of the mutual covenants, terms, and conditions set forth herein, and for other good and valuable consideration, the receipt and sufficiency of which are hereby acknowledged, the parties agree as follows:

License.

Grant of Rights. Subject to the terms and conditions of this Agreement, Licensor hereby grants to Licensee and its affiliates during the Term (as defined below) an exclusive, transferable right and license in the "United States" (the "Territory"), to reproduce, publicly perform, display, transmit, and distribute the Work, including translate, alter, modify, and create derivative works of the Work, through all media now known or hereinafter developed for purposes of "stuff". The "Work" is defined as "other stuff".

Permissions. Licensor has obtained from all persons and entities who are, or whose trademark or other property is, identified, depicted, or otherwise referred to in the Work, such written and signed licenses, permissions, waivers, and consents (collectively, "Permissions" and each, individually, a "Permission"), including those relating to publicity, privacy, and any intellectual property rights, as are or reasonably may be expected to be necessary for Licensee to exercise its rights in the Work as permitted under this Agreement, without incurring any payment or other obligation to, or otherwise violating any right of, any such person or entity.

Copyright Notices. Licensee shall ensure that its use of the Work is marked with the appropriate copyright notices specified by Licensor in a reasonably prominent position in the order and manner provided by Licensor. Licensee shall abide by the copyright laws and what are considered to be sound practices for copyright notice provisions in the Territory. Licensee shall not use any copyright notices that conflict with, confuse, or negate the notices Licensor provides and requires hereunder.

Payment. As consideration in full for the rights granted herein, Licensee shall pay Licensor a one-time fee in the amount of "one hundred US Dollars" (100.0 USD) upon execution of this Agreement, payable as follows: "bank transfer".

General.

Interpretation. For purposes of this Agreement, (a) the words "include," "includes," and "including" are deemed to be followed by the words "without limitation"; (b) the word "or" is not exclusive; and (c) the words "herein," "hereof," "hereby," "hereto," and "hereunder" refer to this Agreement as a whole. This Agreement is intended to be construed without regard to any presumption or rule requiring construction or interpretation against the party drafting an instrument or causing any instrument to be drafted.

Entire Agreement. This Agreement, including and together with any related attachments, constitutes the sole and entire agreement of the parties with respect to the subject matter contained herein, and supersedes all prior and contemporaneous understandings, agreements, representations, and warranties, both written and oral, with respect to such subject matter.

Severability. If any term or provision of this Agreement is invalid, illegal, or unenforceable in any jurisdiction, such invalidity, illegality, or unenforceability will not affect the enforceability of any other term or provision of this Agreement, or invalidate or render unenforceable such term or provision in any other jurisdiction. [Upon a determination that any term or provision is invalid, illegal, or unenforceable, [the parties shall negotiate in good faith to/the court may] modify this Agreement to effect the original intent of the parties as closely as possible in order that the transactions contemplated hereby be consummated as originally contemplated to the greatest extent possible.]

Assignment. Licensee may freely assign or otherwise transfer all or any of its rights, or delegate or otherwise transfer all or any of its obligations or performance, under this Agreement without Licensor's consent. This Agreement is binding upon and inures to the benefit of the parties hereto and their respective permitted successors and assigns.
"""

# Process the text with spaCy
doc = nlp(legal_text)

# Optional: define custom overrides for certain proper nouns
custom_lemma_map = {
    "Licensor": "license",
    "Licensee": "license"
}

# Print tokens and their lemmas
for token in doc:
    if token.text in custom_lemma_map:
        lemma = custom_lemma_map[token.text]
    else:
        lemma = token.lemma_
    print(f"Token: {token.text} -> Lemma: {lemma}")


Token: 
 -> Lemma: 

Token: Copyright -> Lemma: Copyright
Token: License -> Lemma: License
Token: Agreement -> Lemma: Agreement
Token: 

 -> Lemma: 


Token: This -> Lemma: this
Token: COPYRIGHT -> Lemma: copyright
Token: LICENSE -> Lemma: LICENSE
Token: AGREEMENT -> Lemma: AGREEMENT
Token: ( -> Lemma: (
Token: the -> Lemma: the
Token: " -> Lemma: "
Token: Agreement -> Lemma: Agreement
Token: " -> Lemma: "
Token: ) -> Lemma: )
Token: , -> Lemma: ,
Token: dated -> Lemma: date
Token: as -> Lemma: as
Token: of -> Lemma: of
Token: 01/01/2018 -> Lemma: 01/01/2018
Token: ( -> Lemma: (
Token: the -> Lemma: the
Token: " -> Lemma: "
Token: Effective -> Lemma: effective
Token: Date -> Lemma: date
Token: " -> Lemma: "
Token: ) -> Lemma: )
Token: , -> Lemma: ,
Token: is -> Lemma: be
Token: made -> Lemma: make
Token: by -> Lemma: by
Token: and -> Lemma: and
Token: between -> Lemma: between
Token: " -> Lemma: "
Token: Me -> Lemma: I
Token: " -> Lemma: "
Token: ( -> Lemma: (
Token: " -> Lemma: "
Toke