In [1]:
!pip install spacy pandas
!python -m spacy download en_core_web_sm

Collecting en-core-web-sm==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0-py3-none-any.whl (12.8 MB)
     ---------------------------------------- 0.0/12.8 MB ? eta -:--:--
     ---------------------------------------- 0.0/12.8 MB ? eta -:--:--
     ---------------------------------------- 0.0/12.8 MB ? eta -:--:--
     ---------------------------------------- 0.0/12.8 MB ? eta -:--:--
     ---------------------------------------- 0.0/12.8 MB ? eta -:--:--
     ---------------------------------------- 0.0/12.8 MB ? eta -:--:--
     ---------------------------------------- 0.0/12.8 MB ? eta -:--:--
     ---------------------------------------- 0.0/12.8 MB ? eta -:--:--
     ---------------------------------------- 0.0/12.8 MB ? eta -:--:--
     ---------------------------------------- 0.0/12.8 MB ? eta -:--:--
     ---------------------------------------- 0.0/12.8 MB ? eta -:--:--
     ---------------------------

In [6]:
import pandas as pd
import spacy
from spacy.tokens import DocBin
from tqdm import tqdm
import warnings
warnings.filterwarnings('ignore')

In [7]:
# Read the CUAD clause table from disk into a DataFrame, then display its
# column index so the available clause / answer headers can be inspected.
file_path = "master_clauses.csv"
df = pd.read_csv(filepath_or_buffer=file_path)
df.columns

Index(['Filename', 'Document Name', 'Document Name-Answer', 'Parties',
       'Parties-Answer', 'Agreement Date', 'Agreement Date-Answer',
       'Effective Date', 'Effective Date-Answer', 'Expiration Date',
       'Expiration Date-Answer', 'Renewal Term', 'Renewal Term-Answer',
       'Notice Period To Terminate Renewal',
       'Notice Period To Terminate Renewal- Answer', 'Governing Law',
       'Governing Law-Answer', 'Most Favored Nation',
       'Most Favored Nation-Answer', 'Competitive Restriction Exception',
       'Competitive Restriction Exception-Answer', 'Non-Compete',
       'Non-Compete-Answer', 'Exclusivity', 'Exclusivity-Answer',
       'No-Solicit Of Customers', 'No-Solicit Of Customers-Answer',
       'No-Solicit Of Employees', 'No-Solicit Of Employees-Answer',
       'Non-Disparagement', 'Non-Disparagement-Answer',
       'Termination For Convenience', 'Termination For Convenience-Answer',
       'Rofr/Rofo/Rofn', 'Rofr/Rofo/Rofn-Answer', 'Change Of Control',
      

In [15]:
import spacy
from spacy.tokens import DocBin
from tqdm import tqdm

# Strip surrounding whitespace from the headers so the suffix matching below
# is not defeated by stray spaces.
df.columns = df.columns.str.strip()

# Pair each clause-text column with its answer column.
# The header is split on its LAST hyphen and the suffix is compared after
# stripping, so a header written with a space after the hyphen
# ('Notice Period To Terminate Renewal- Answer') is paired as well, while
# hyphens inside the clause name ('Non-Compete') are preserved.
column_pairs = []
for col in df.columns:
    base_col, sep, suffix = col.rpartition("-")
    base_col = base_col.strip()
    if sep and suffix.strip() == "Answer" and base_col in df.columns:
        column_pairs.append((base_col, col))

# Build spaCy-style training tuples: (text, {"entities": [(start, end, label)]}).
TRAIN_DATA = []
for base_col, ans_col in column_pairs:
    # Consistent NER label derived from the header, e.g. 'EFFECTIVE_DATE'.
    label = base_col.upper().replace(" ", "_")

    for raw_context, raw_answer in zip(df[base_col], df[ans_col]):
        # Test for missing values BEFORE converting to str: str(NaN) is the
        # literal text "nan", which pd.isna() does not treat as missing and
        # which would then be searched for (and found) inside ordinary words.
        if pd.isna(raw_context) or pd.isna(raw_answer):
            continue

        context = str(raw_context)
        # Surrounding whitespace is removed so the span covers only the
        # answer text itself.
        answer = str(raw_answer).strip()

        # An empty answer would "match" at offset 0 and yield a zero-length
        # span; "no" marks an absent clause.
        # NOTE(review): a "Yes" answer is still searched as a substring —
        # confirm whether yes/no flag columns should be excluded entirely.
        if not answer or answer.lower() == "no":
            continue

        # Case-insensitive search for the answer inside the clause text.
        # NOTE(review): offsets come from the lower-cased text; this assumes
        # lower() does not change the string length — verify for non-ASCII data.
        start = context.lower().find(answer.lower())
        if start == -1:
            continue

        end = start + len(answer)
        entities = [(start, end, label)]
        TRAIN_DATA.append((context, {"entities": entities}))

print(f"✅ Total training examples: {len(TRAIN_DATA)}")
# Guard the preview so an empty result does not raise IndexError.
if TRAIN_DATA:
    print("Example:", TRAIN_DATA[0])


✅ Total training examples: 892
Example: ("['MARKETING AFFILIATE AGREEMENT']", {'entities': [(2, 31, 'DOCUMENT_NAME')]})


In [16]:
from collections import Counter
# Tally how many entity spans carry each label across the training examples.
label_counts = Counter()
for _text, annotation in TRAIN_DATA:
    # Each span is a (start, end, label) triple; only the label is counted.
    label_counts.update(span[2] for span in annotation["entities"])

print("Label counts in training data:")
print(label_counts)


Label counts in training data:
Counter({'DOCUMENT_NAME': 473, 'GOVERNING_LAW': 390, 'AGREEMENT_DATE': 14, 'RENEWAL_TERM': 6, 'EXPIRATION_DATE': 5, 'EFFECTIVE_DATE': 2, 'PARTIES': 1, 'LICENSE_GRANT': 1})


In [8]:
import spacy
from spacy.tokens import DocBin
from tqdm import tqdm

# A blank English pipeline is used only to tokenize the raw text into Docs.
nlp = spacy.blank("en")
doc_bin = DocBin()
skipped = 0  # examples dropped because none of their spans could be aligned

for text, annot in tqdm(TRAIN_DATA):
    doc = nlp.make_doc(text)
    ents = []
    for start, end, label in annot["entities"]:
        # char_span yields None when the character offsets do not line up
        # with token boundaries; the truthiness test also rejects an empty span.
        span = doc.char_span(start, end, label=label)
        if span:
            ents.append(span)

    # If the example had annotations but none survived alignment, do not
    # store it: it would be saved with no entity at all and would presumably
    # be read as a negative example for text that does contain one.
    if annot["entities"] and not ents:
        skipped += 1
        continue

    doc.ents = ents
    doc_bin.add(doc)

# Save the training data
doc_bin.to_disk("train_data.spacy")
print(" train_data.spacy saved successfully.")
if skipped:
    print(f"Skipped {skipped} example(s) whose entity offsets did not align with token boundaries.")


100%|███████████████████████████████████████████████████████████████████████████████████████████| 892/892 [00:00<00:00, 1445.60it/s]


 train_data.spacy saved successfully.


In [1]:
import spacy

# Load the pipeline stored under output/model-best and run it on one
# hand-written sentence to see which entities it extracts.
nlp = spacy.load("output/model-best")

text = "This agreement shall be governed by the laws of California and becomes effective on May 5, 2020."
doc = nlp(text)

# One line per predicted entity: "<surface text> -> <label>".
for ent in doc.ents:
    print(f"{ent.text} -> {ent.label_}")


California -> GOVERNING_LAW
