In [1]:
import pandas as pd

# Read the preprocessed clause-classification table from disk
csv_path = '../data/clause_classification.csv'
df = pd.read_csv(csv_path)

# Preview the first rows, then show how many rows carry each label value
print(df.head())
label_counts = df['label'].value_counts()
print(label_counts)


                                                text  \
0  EXHIBIT 10.6\n\n                              ...   
1  EXHIBIT 10.6\n\n                              ...   
2  EXHIBIT 10.6\n\n                              ...   
3  EXHIBIT 10.6\n\n                              ...   
4  EXHIBIT 10.6\n\n                              ...   

                                              clause  label  
0  Highlight the parts (if any) of this contract ...      1  
1  Highlight the parts (if any) of this contract ...      1  
2  Highlight the parts (if any) of this contract ...      1  
3  Highlight the parts (if any) of this contract ...      1  
4  Highlight the parts (if any) of this contract ...      1  
label
0    14208
1     6702
Name: count, dtype: int64


In [2]:
# Use only a portion for fast training during development.
# The request is capped at the number of available rows: DataFrame.sample
# raises ValueError when asked for more rows than exist (without replacement),
# so a smaller CSV would otherwise break this cell.
# NOTE: this is a plain random sample; nothing here stratifies by label.
N_DEV_SAMPLES = 10000
df = df.sample(min(N_DEV_SAMPLES, len(df)), random_state=42).reset_index(drop=True)


In [3]:
from datasets import Dataset

# Convert the pandas DataFrame into a Hugging Face Dataset
dataset = Dataset.from_pandas(df)

# Hold out 20% of the rows for evaluation.
# The seed is fixed because train_test_split shuffles before splitting;
# without it every kernel restart yields a different train/test partition,
# which makes evaluation numbers incomparable between runs.
dataset = dataset.train_test_split(test_size=0.2, seed=42)


In [4]:
# Tokenize text with the BERT tokenizer

from transformers import AutoTokenizer

# Tokenizer that belongs to the uncased BERT base checkpoint
tokenizer_checkpoint = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(tokenizer_checkpoint)

def tokenize(batch):
    """Tokenize the 'text' entries of a batch with the module-level tokenizer.

    Every sequence is truncated and padded to exactly 512 tokens.

    Args:
        batch: mapping that provides the raw strings under the 'text' key.

    Returns:
        Whatever the tokenizer returns for the given texts.
    """
    # NOTE(review): only 'text' is encoded; the 'clause' column is not passed
    # to the tokenizer -- confirm that this is intended.
    # NOTE(review): truncation keeps just the first 512 tokens of each text.
    texts = batch['text']
    encoded = tokenizer(
        texts,
        padding="max_length",
        truncation=True,
        max_length=512,
    )
    return encoded

# Encode both splits in batches, then drop the raw string columns so that
# only the tokenizer outputs and the label column remain
tokenized_dataset = (
    dataset
    .map(tokenize, batched=True)
    .remove_columns(['text', 'clause'])
)

# Return torch tensors when rows are accessed
tokenized_dataset.set_format("torch")


Map:   0%|          | 0/8000 [00:00<?, ? examples/s]

Map:   0%|          | 0/2000 [00:00<?, ? examples/s]

In [5]:
# Load BERT model for classification

from transformers import AutoModelForSequenceClassification, set_seed

# The classification head is not part of the bert-base-uncased checkpoint and
# is randomly initialised at load time. Seed the RNGs first so the starting
# weights are the same on every run.
set_seed(42)

# Two output classes, matching the 0/1 values of the 'label' column
model = AutoModelForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=2)


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [6]:
import transformers

# Show which transformers release this notebook is running against
transformers_version = transformers.__version__
print(transformers_version)


4.52.3


In [7]:
# Train the model using the Trainer API

from transformers import Trainer, TrainingArguments

training_args = TrainingArguments(
    # Where checkpoints and logs are written
    output_dir="../models/clause_classifier",
    logging_dir="../logs",
    logging_steps=100,
    # Training schedule and batch sizes
    num_train_epochs=2,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    # Evaluate and checkpoint once per epoch; the best checkpoint is reloaded
    # when training finishes
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
)

# The held-out 'test' split doubles as the evaluation set during training
train_split = tokenized_dataset['train']
eval_split = tokenized_dataset['test']

# NOTE: no compute_metrics is passed, so only losses are reported per epoch
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_split,
    eval_dataset=eval_split,
)

trainer.train()


Epoch,Training Loss,Validation Loss


KeyboardInterrupt: 

In [None]:
import sys

# Show which Python interpreter the kernel is running
interpreter_path = sys.executable
print(interpreter_path)


In [None]:
import accelerate

# Show the installed accelerate version
accelerate_version = accelerate.__version__
print(accelerate_version)
