In [4]:
import pandas as pd

# 1. Load the master CSV
df_master = pd.read_csv(r'C:\Repositories\USA_Project\Graph-Test\master_clauses.csv')

# 2. Exact column names for our first six categories:
context_cols = [
    "Parties",
    "Agreement Date",
    "Effective Date",
    "Expiration Date",
    "Renewal Term",
    "Notice Period To Terminate Renewal"
]

# Note the precise answer column names (matching the CSV):
answer_cols = [
    "Parties-Answer",
    "Agreement Date-Answer",
    "Effective Date-Answer",
    "Expiration Date-Answer",
    "Renewal Term-Answer",
    "Notice Period To Terminate Renewal- Answer"
]

# 3. Subset to these columns + Filename
cols_to_keep = ["Filename"] + context_cols + answer_cols
df_sub = df_master[cols_to_keep].copy()

# 4. Filter to rows where at least one answer is non-empty/positive
mask = pd.Series(False, index=df_sub.index)
for ans in answer_cols:
    mask |= df_sub[ans].notna() & ~df_sub[ans].isin(["No", "[]", ""])
df_mini = df_sub[mask].reset_index(drop=True)

# 5. Inspect and save
print(f"Filtered to {len(df_mini)} contracts out of {len(df_master)} total.")
print(df_mini.head())

df_mini.to_csv(r'C:\Repositories\USA_Project\Graph-Test\filtered_master_clauses.csv', index=False)
print("Saved filtered CSV to /mnt/data/mini_cuad_master_first6.csv")


Filtered to 509 contracts out of 510 total.
                                            Filename  \
0  CybergyHoldingsInc_20140520_10-Q_EX-10.27_8605...   
1  EuromediaHoldingsCorp_20070215_10SB12G_EX-10.B...   
2  FulucaiProductionsLtd_20131223_10-Q_EX-10.9_83...   
3  GopageCorp_20140221_10-K_EX-10.1_8432966_EX-10...   
4  IdeanomicsInc_20160330_10-K_EX-10.26_9512211_E...   

                                             Parties  \
0  ['BIRCH FIRST GLOBAL INVESTMENTS INC.', 'MA', ...   
1  ['EuroMedia Holdings Corp.', 'Rogers', 'Rogers...   
2  ['Producer', 'Fulucai Productions Ltd.', 'Conv...   
3  ['PSiTech Corporation', 'Licensor', 'Licensee'...   
4  ['YOU ON DEMAND HOLDINGS, INC.', 'Licensor', '...   

                           Agreement Date  \
0  ['8th day of May 2014', 'May 8, 2014']   
1                      ['July 11 , 2006']   
2                   ['November 15, 2012']   
3                        ['Feb 10, 2014']   
4                   ['December 21, 2015']   

           

In [5]:
import pandas as pd

# 1. Load the filtered master CSV
df_filtered = pd.read_csv(r'C:\Repositories\USA_Project\Graph-Test\filtered_master_clauses.csv')

# 2. Define the six clause context columns exactly
context_cols = [
    "Parties",
    "Agreement Date",
    "Effective Date",
    "Expiration Date",
    "Renewal Term",
    "Notice Period To Terminate Renewal"
]

# 3. Explode into one row per non-empty snippet
rows = []
for doc_idx, row in df_filtered.reset_index(drop=True).iterrows():
    for cat in context_cols:
        snippet = row[cat]
        if pd.notna(snippet) and snippet not in ["", "No", "[]"]:
            rows.append({
                "doc_idx": doc_idx,
                "filename": row["Filename"],
                "category": cat,
                "snippet_text": snippet.strip()
            })

snips_df = pd.DataFrame(rows)

# 4. Inspect and save
print(f"Extracted {len(snips_df)} snippets:")
print(snips_df.head())

snips_df.to_csv(r'C:\Repositories\USA_Project\Graph-Test\mini_cuad_snippets.csv', index=False)
print("Snippet‐level CSV saved to /mnt/data/mini_cuad_snippets.csv")


Extracted 2069 snippets:
   doc_idx                                           filename  \
0        0  CybergyHoldingsInc_20140520_10-Q_EX-10.27_8605...   
1        0  CybergyHoldingsInc_20140520_10-Q_EX-10.27_8605...   
2        0  CybergyHoldingsInc_20140520_10-Q_EX-10.27_8605...   
3        0  CybergyHoldingsInc_20140520_10-Q_EX-10.27_8605...   
4        0  CybergyHoldingsInc_20140520_10-Q_EX-10.27_8605...   

          category                                       snippet_text  
0          Parties  ['BIRCH FIRST GLOBAL INVESTMENTS INC.', 'MA', ...  
1   Agreement Date             ['8th day of May 2014', 'May 8, 2014']  
2   Effective Date  ['This agreement shall begin upon the date of ...  
3  Expiration Date  ['This agreement shall begin upon the date of ...  
4     Renewal Term  ['This agreement shall begin upon the date of ...  
Snippet‐level CSV saved to /mnt/data/mini_cuad_snippets.csv


In [6]:
import os
import pandas as pd
from sentence_transformers import SentenceTransformer

# 1. Load snippet table
snips = pd.read_csv(r'C:\Repositories\USA_Project\Graph-Test\mini_cuad_snippets.csv')

# 2. Initialize a legal SBERT model
model = SentenceTransformer('Stern5497/sbert-legal-xlm-roberta-base')

# 3. Embed snippets
snip_texts = snips['snippet_text'].tolist()
snip_embs  = model.encode(snip_texts, show_progress_bar=True)
snips['embedding'] = list(snip_embs)
snips.to_pickle(r'C:\Repositories\USA_Project\Graph-Test\mini_cuad_snippets_emb.pkl')
print(f"Encoded {len(snip_embs)} snippets.")

# 4. Embed full contracts
master   = pd.read_csv(r'C:\Repositories\USA_Project\Graph-Test\filtered_master_clauses.csv')
files    = master['Filename'].unique()
BASE_TXT_DIR = r'C:\Repositories\USA_Project\Graph-Test\CUAD_v1\full_contract_txt'

doc_texts = []
for fname in files:
    txt_name = fname.replace('.pdf', '.txt')
    txt_path = os.path.join(BASE_TXT_DIR, txt_name)
    try:
        with open(txt_path, encoding='utf-8', errors='ignore') as f:
            doc_texts.append(f.read())
    except FileNotFoundError:
        print(f"⚠️ Missing {txt_path}, adding empty text")
        doc_texts.append("")

doc_embs = model.encode(doc_texts, show_progress_bar=True)
docs_df  = pd.DataFrame({'Filename': files, 'embedding': list(doc_embs)})
docs_df.to_pickle(r'C:\Repositories\USA_Project\Graph-Test\mini_cuad_docs_emb.pkl')
print(f"Encoded {len(doc_embs)} documents.")



  from .autonotebook import tqdm as notebook_tqdm
To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
Batches: 100%|██████████| 65/65 [03:26<00:00,  3.18s/it]


Encoded 2069 snippets.
⚠️ Missing C:\Repositories\USA_Project\Graph-Test\CUAD_v1\full_contract_txt\HarpoonTherapeuticsInc_20200312_10-K_EX-10.18_12051356_EX-10.18_Development Agreement.PDF, adding empty text
⚠️ Missing C:\Repositories\USA_Project\Graph-Test\CUAD_v1\full_contract_txt\SLINGERBAGINC_05_27_2020-EX-10.7-CONSULTING AGREEMENT.PDF, adding empty text
⚠️ Missing C:\Repositories\USA_Project\Graph-Test\CUAD_v1\full_contract_txt\KIROMICBIOPHARMA,INC_05_11_2020-EX-10.23-CONSULTING AGREEMENT.PDF, adding empty text
⚠️ Missing C:\Repositories\USA_Project\Graph-Test\CUAD_v1\full_contract_txt\SPHERE3DCORP_06_24_2020-EX-10.12-CONSULTING AGREEMENT.PDF, adding empty text
⚠️ Missing C:\Repositories\USA_Project\Graph-Test\CUAD_v1\full_contract_txt\GLOBALTECHNOLOGIESLTD_06_08_2020-EX-10.16-CONSULTING AGREEMENT.PDF, adding empty text
⚠️ Missing C:\Repositories\USA_Project\Graph-Test\CUAD_v1\full_contract_txt\EMERALDHEALTHTHERAPEUTICSINC_06_10_2020-EX-4.5-CONSULTING AGREEMENT - DR. GAETANO MOREL

Batches: 100%|██████████| 16/16 [02:45<00:00, 10.36s/it]

Encoded 509 documents.



