In [7]:
from datasets import load_dataset

# Load dataset
dataset = load_dataset("lex_glue", "ledgar")["train"]  # LEDGAR only has train split

# Inspect label distribution
from collections import Counter
label_counts = Counter(dataset["label"])
print("Available labels:", label_counts)

Available labels: Counter({47: 3167, 65: 2493, 26: 2427, 38: 2340, 79: 1808, 85: 1469, 2: 1467, 7: 1327, 41: 1224, 89: 1166, 88: 1112, 51: 1104, 87: 1089, 58: 1034, 20: 1012, 45: 1009, 46: 976, 19: 960, 49: 936, 97: 878, 92: 866, 68: 856, 96: 768, 61: 741, 43: 677, 75: 664, 11: 656, 74: 621, 10: 620, 31: 590, 13: 588, 48: 587, 42: 578, 76: 569, 29: 566, 83: 563, 9: 549, 18: 541, 99: 517, 98: 499, 59: 479, 53: 473, 24: 469, 33: 462, 67: 459, 95: 451, 63: 445, 84: 444, 22: 444, 16: 434, 80: 419, 66: 409, 23: 406, 12: 397, 15: 389, 55: 386, 6: 385, 86: 380, 54: 366, 64: 363, 28: 354, 32: 354, 91: 353, 4: 352, 17: 349, 71: 345, 93: 341, 62: 327, 77: 327, 81: 326, 0: 326, 39: 324, 73: 323, 90: 317, 57: 313, 35: 308, 52: 304, 44: 300, 27: 278, 50: 271, 30: 263, 40: 258, 60: 255, 1: 252, 36: 244, 82: 238, 34: 229, 56: 202, 37: 201, 69: 190, 5: 178, 21: 175, 25: 168, 94: 131, 70: 125, 78: 118, 3: 106, 72: 47, 8: 31, 14: 23})


In [8]:
# Pick two common labels that exist in the dataset
LABEL1, LABEL2 = "GOVERNING LAW", "AMENDMENT"  # Adjust based on your inspection

# Filter and binarize
filtered = dataset.filter(lambda x: x["label"] in [LABEL1, LABEL2])
filtered = filtered.map(lambda x: {
    "text": x["text"],
    "label": 0 if x["label"] == LABEL1 else 1  # Convert to binary
})

print(f"Filtered dataset size: {len(filtered)}")

Filter: 100%|██████████| 60000/60000 [00:00<00:00, 274453.83 examples/s]

Filtered dataset size: 0





In [10]:
# If filtered is empty, use this synthetic dataset
import pandas as pd
from datasets import Dataset  # Import Dataset class

if len(filtered) == 0:
    print("Using synthetic data as fallback")
    data = {
        "text": [
            "This Agreement shall be governed by New York law",  # 0
            "Party A shall indemnify Party B for all losses",     # 1
            "The term 'Confidential Information' means...",       # 0
            "Vendor must deliver goods by January 1"             # 1
        ],
        "label": [0, 1, 0, 1]
    }
    filtered = Dataset.from_pandas(pd.DataFrame(data))

Using synthetic data as fallback


In [None]:
# Split dataset
split = filtered.train_test_split(test_size=0.2)

# Tokenization (CPU-optimized)
# from transformers import DistilBertTokenizer
# tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-uncased")
from transformers import DistilBertForSequenceClassification
model = DistilBertForSequenceClassification.from_pretrained("distilbert-base-uncased")




def tokenize(batch):
    return tokenizer(
        batch["text"],
        padding="max_length",
        truncation=True,
        max_length=128
    )


tokenized = split.map(tokenize, batched=True)

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


In [None]:


# Training
from transformers import Trainer, TrainingArguments

training_args = TrainingArguments(
    output_dir="./results",
    per_device_train_batch_size=4,  # Reduced for CPU
    num_train_epochs=2,             # Fewer epochs for CPU
    evaluation_strategy="epoch",
    logging_steps=50,
    report_to="none"                # Disable MLflow logging
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized["train"],
    eval_dataset=tokenized["test"],
)
trainer.train()

Map: 100%|██████████| 3/3 [00:00<00:00, 648.50 examples/s]
Map: 100%|██████████| 1/1 [00:00<00:00, 256.58 examples/s]


TypeError: TrainingArguments.__init__() got an unexpected keyword argument 'evaluation_strategy'

In [4]:
from datasets import load_dataset

# Load LEDGAR dataset correctly
dataset = load_dataset("lex_glue", "ledgar")

# Access the train split (LEDGAR only has train split)
train_dataset = dataset["train"]

# Shuffle and select subset properly
subset = train_dataset.shuffle(seed=42).select(range(2000))

In [6]:
# Get the actual label names from the dataset
print(subset.features["label"].names)  # Check available labels

# We'll use these two common classes for binary classification
label_map = {
    "Obligation": 1,    # High risk
    "Definition": 0     # Low risk
}

# Filter and map labels
def map_label(example):
    return {"label": label_map.get(example["label"]), 
            "text": example["text"]}

filtered_dataset = subset.filter(
    lambda x: x["label"] in ["Obligation", "Definition"]
).map(map_label)

# Verify
print(filtered_dataset[0])  # Should show text with 0 or 1 label

['Adjustments', 'Agreements', 'Amendments', 'Anti-Corruption Laws', 'Applicable Laws', 'Approvals', 'Arbitration', 'Assignments', 'Assigns', 'Authority', 'Authorizations', 'Base Salary', 'Benefits', 'Binding Effects', 'Books', 'Brokers', 'Capitalization', 'Change In Control', 'Closings', 'Compliance With Laws', 'Confidentiality', 'Consent To Jurisdiction', 'Consents', 'Construction', 'Cooperation', 'Costs', 'Counterparts', 'Death', 'Defined Terms', 'Definitions', 'Disability', 'Disclosures', 'Duties', 'Effective Dates', 'Effectiveness', 'Employment', 'Enforceability', 'Enforcements', 'Entire Agreements', 'Erisa', 'Existence', 'Expenses', 'Fees', 'Financial Statements', 'Forfeitures', 'Further Assurances', 'General', 'Governing Laws', 'Headings', 'Indemnifications', 'Indemnity', 'Insurances', 'Integration', 'Intellectual Property', 'Interests', 'Interpretations', 'Jurisdictions', 'Liens', 'Litigations', 'Miscellaneous', 'Modifications', 'No Conflicts', 'No Defaults', 'No Waivers', 'Non-

Filter: 100%|██████████| 2000/2000 [00:00<00:00, 33414.49 examples/s]


IndexError: Invalid key: 0 is out of bounds for size 0

In [2]:
from datasets import load_dataset

dataset = load_dataset("lex_glue", "contract_analysis")  # 5,000 labeled clauses
dataset = dataset.shuffle(seed=42).select(range(1000))  # Smaller subset for demo

ValueError: BuilderConfig 'contract_analysis' not found. Available: ['case_hold', 'ecthr_a', 'ecthr_b', 'eurlex', 'ledgar', 'scotus', 'unfair_tos']