In [None]:
# Install required packages
!pip install transformers datasets torch pandas scikit-learn

import pandas as pd
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
from datasets import Dataset
from sklearn.model_selection import train_test_split
import pickle
import logging

# Setup logging
logging.basicConfig(filename='finetune.log', level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

# Define clause_names (from your provided list)
clause_names = [
    'Affiliate License-Licensee', 'Affiliate License-Licensor', 'Agreement Date', 'Anti-Assignment',
    'Audit Rights', 'Cap On Liability', 'Change Of Control', 'Competitive Restriction Exception',
    'Covenant Not To Sue', 'Document Name', 'Effective Date', 'Exclusivity', 'Expiration Date',
    'Governing Law', 'Insurance', 'Ip Ownership Assignment', 'Irrevocable Or Perpetual License',
    'Joint Ip Ownership', 'License Grant', 'Liquidated Damages', 'Minimum Commitment',
    'Most Favored Nation', 'No-Solicit Of Customers', 'No-Solicit Of Employees', 'Non-Compete',
    'Non-Disparagement', 'Non-Transferable License', 'Notice Period To Terminate Renewal',
    'Parties', 'Post-Termination Services', 'Price Restrictions', 'Renewal Term',
    'Revenue/Profit Sharing', 'Rofr/Rofo/Rofn', 'Source Code Escrow', 'Termination For Convenience',
    'Third Party Beneficiary', 'Uncapped Liability', 'Unlimited/All-You-Can-Eat-License',
    'Volume Restriction', 'Warranty Duration', 'Uncategorized'
]

# Load dataset
DATASET_PATH = './clause_dataset.csv'
try:
    df = pd.read_csv(DATASET_PATH)
    df['labels'] = df['labels'].apply(lambda x: [float(i) for i in eval(x)])  # Convert to float32 list
    dataset = Dataset.from_pandas(df)
except FileNotFoundError:
    logging.error(f"Dataset file {DATASET_PATH} not found.")
    raise

# Split into train and validation (80-20 split)
train_df, val_df = train_test_split(df, test_size=0.2, random_state=42)
train_dataset = Dataset.from_pandas(train_df)
val_dataset = Dataset.from_pandas(val_df)

# Initialize tokenizer and model
tokenizer = AutoTokenizer.from_pretrained('nlpaueb/legal-bert-base-uncased')
model = AutoModelForSequenceClassification.from_pretrained(
    'nlpaueb/legal-bert-base-uncased',
    num_labels=len(clause_names),
    problem_type='multi_label_classification'
)

# Move model to CPU explicitly (remove this if GPU is available)
device = torch.device('cpu')
model.to(device)

# Tokenize dataset
def tokenize_function(examples):
    return tokenizer(examples['text'], padding='max_length', truncation=True, max_length=128)

train_dataset = train_dataset.map(tokenize_function, batched=True)
val_dataset = val_dataset.map(tokenize_function, batched=True)
train_dataset.set_format('torch', columns=['input_ids', 'attention_mask', 'labels'])
val_dataset.set_format('torch', columns=['input_ids', 'attention_mask', 'labels'])

# Define metrics
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = (torch.sigmoid(torch.tensor(logits)) > 0.5).int()
    accuracy = (predictions == labels).float().mean().item()
    return {'accuracy': accuracy}

# Set training arguments
training_args = TrainingArguments(
    output_dir='./legalbert-finetuned',
    num_train_epochs=5,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    learning_rate=2e-5,
    eval_strategy='epoch',  # Updated from evaluation_strategy
    save_strategy='epoch',
    load_best_model_at_end=True,
    metric_for_best_model='accuracy',
    logging_dir='./logs',
    logging_steps=10,
    weight_decay=0.01,
    no_cuda=True,  # Explicitly disable CUDA for CPU-only
    dataloader_pin_memory=False,  # Disable pin_memory for CPU
)

# Initialize trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics,
)

# Train model with error handling
try:
    trainer.train()
except Exception as e:
    logging.error(f"Training failed: {str(e)}")
    raise

# Save model and tokenizer to .pkl
MODEL_PKL_PATH = './legalbert_finetuned.pkl'
with open(MODEL_PKL_PATH, 'wb') as f:
    pickle.dump({'model': model, 'tokenizer': tokenizer}, f)
logging.info(f"Fine-tuned LegalBERT and tokenizer saved to {MODEL_PKL_PATH}")



Some weights of BertForSequenceClassification were not initialized from the model checkpoint at nlpaueb/legal-bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map:   0%|          | 0/101 [00:00<?, ? examples/s]

Map:   0%|          | 0/26 [00:00<?, ? examples/s]

  return forward_call(*args, **kwargs)


Epoch,Training Loss,Validation Loss,Accuracy
1,0.6128,0.513442,0.967949
2,0.4393,0.384303,0.973443
3,0.3736,0.319057,0.973443
4,0.3207,0.289853,0.973443
5,0.3075,0.282421,0.973443


  return forward_call(*args, **kwargs)
  return forward_call(*args, **kwargs)
  return forward_call(*args, **kwargs)
  return forward_call(*args, **kwargs)


In [2]:
!pip install datasets transformers scikit-learn spacy pandas
!python -m spacy download en_core_web_sm

Collecting en-core-web-sm==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0-py3-none-any.whl (12.8 MB)
     ---------------------------------------- 0.0/12.8 MB ? eta -:--:--
     --------- ------------------------------ 2.9/12.8 MB 15.3 MB/s eta 0:00:01
     ------------------ --------------------- 6.0/12.8 MB 14.8 MB/s eta 0:00:01
     ---------------------------- ----------- 9.2/12.8 MB 14.6 MB/s eta 0:00:01
     ------------------------------- ------- 10.5/12.8 MB 14.5 MB/s eta 0:00:01
     --------------------------------------  12.6/12.8 MB 12.0 MB/s eta 0:00:01
     --------------------------------------- 12.8/12.8 MB 11.6 MB/s eta 0:00:00
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')
