<a href="https://colab.research.google.com/github/Shyam3624/PAI-LAB/blob/main/Legal%20Text%20Classification%20with%20BERT.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import numpy as np
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizer, BertForSequenceClassification
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.utils.class_weight import compute_class_weight
import torch.optim as optim
from tqdm import tqdm

In [None]:
# Path of the CSV on the Colab filesystem.
dataset_path = '/content/legal_text_classification.csv'
df = pd.read_csv(dataset_path)

# Discard every row that has a missing value in any column (mutates df).
df.dropna(inplace=True)

# Fit the encoder on the outcome column, then store the integer ids next to it.
# The fitted encoder is kept around: its classes_ define the number of labels.
label_encoder = LabelEncoder().fit(df['case_outcome'])
df['case_outcome_encoded'] = label_encoder.transform(df['case_outcome'])

# Rows per encoded class, counted on the whole frame (no split has happened yet).
class_counts = df.loc[:, 'case_outcome_encoded'].value_counts()
print("Class distribution in training data:\n", class_counts)

Class distribution in training data:
 case_outcome_encoded
3    12110
8     4363
1     2438
7     2252
4     1699
5     1018
6      603
9      112
2      108
0      106
Name: count, dtype: int64


In [None]:
# Hold out 20% of the rows for validation. Stratifying on the encoded label
# keeps the class proportions of the full frame in both parts.
train_df, val_df = train_test_split(
    df,
    test_size=0.2,
    stratify=df['case_outcome_encoded'],
    random_state=42,
)

# Random oversampling of the training split only: every class is resampled
# (with replacement) up to the size of the largest class. val_df is untouched.
majority_class_size = train_df['case_outcome_encoded'].value_counts().max()

# One seeded generator shared by all groups makes the resampled frame
# reproducible across runs without reusing the same draw for every class.
oversample_rng = np.random.RandomState(42)

# Iterating the groups directly (instead of DataFrameGroupBy.apply) avoids the
# deprecation warning raised when the callable also sees the grouping column.
# Groups come out in sorted label order, matching the previous row layout.
train_df_balanced = pd.concat(
    [
        class_rows.sample(majority_class_size, replace=True, random_state=oversample_rng)
        for _, class_rows in train_df.groupby('case_outcome_encoded')
    ],
    ignore_index=True,
)

balanced_class_counts = train_df_balanced['case_outcome_encoded'].value_counts()
print("Class distribution after oversampling:\n", balanced_class_counts)

Class distribution after oversampling:
 case_outcome_encoded
0    9688
1    9688
2    9688
3    9688
4    9688
5    9688
6    9688
7    9688
8    9688
9    9688
Name: count, dtype: int64


  .apply(lambda x: x.sample(majority_class_size, replace=True)).reset_index(drop=True)


In [None]:
class LegalDataset(Dataset):
    """Map-style dataset that tokenizes one case per item for classification.

    Args:
        data: DataFrame providing a ``case_text`` column and a
            ``case_outcome_encoded`` column; rows are addressed by position.
        tokenizer: Callable tokenizer that accepts the keyword arguments used
            in ``__getitem__`` and returns ``input_ids`` / ``attention_mask``
            tensors.
        max_length: Length every encoded sequence is padded or truncated to.
        stride: Stored on the instance only. No sliding-window tokenization is
            performed, so text beyond ``max_length`` tokens is truncated.
    """

    def __init__(self, data, tokenizer, max_length, stride):
        self.data = data
        self.tokenizer = tokenizer
        self.max_length = max_length
        self.stride = stride

    def __len__(self):
        """Return the number of rows in the wrapped frame."""
        return len(self.data)

    def __getitem__(self, index):
        """Tokenize the case at position ``index``.

        Returns:
            dict with 1-D ``input_ids`` and ``attention_mask`` tensors of
            length ``max_length`` and a scalar ``labels`` tensor (torch.long).
        """
        # Select the column first, then the position: this reads a single cell
        # instead of materialising the whole row as a Series for every lookup.
        case_text = str(self.data['case_text'].iloc[index])
        case_outcome = int(self.data['case_outcome_encoded'].iloc[index])

        # Calling the tokenizer directly is the supported replacement for the
        # deprecated encode_plus(); the arguments and result are the same.
        encoding = self.tokenizer(
            case_text,
            add_special_tokens=True,
            max_length=self.max_length,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt'
        )

        return {
            # return_tensors='pt' adds a leading batch axis; flatten drops it.
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'labels': torch.tensor(case_outcome, dtype=torch.long)
        }

In [None]:
# Hub identifier of the pretrained checkpoint used for both tokenizer and model.
legalbert_model_name = 'nlpaueb/legal-bert-base-uncased'

# The classification head gets one output per class known to the label encoder.
num_labels = len(label_encoder.classes_)

tokenizer = BertTokenizer.from_pretrained(legalbert_model_name)
# The classifier weights are not part of the checkpoint and start out freshly
# initialised, so the model has to be fine-tuned before its predictions mean anything.
model = BertForSequenceClassification.from_pretrained(
    legalbert_model_name,
    num_labels=num_labels,
)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/222k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.02k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at nlpaueb/legal-bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# Tokenization: sequences are padded/truncated to max_length tokens. stride is
# handed to LegalDataset, which stores it but does not use it when tokenizing.
max_length, stride = 256, 64

# Training schedule: samples per step and number of passes over the training set.
batch_size, epochs = 16, 1

# AdamW settings.
learning_rate, weight_decay = 2e-5, 0.01

In [None]:
# Training uses the oversampled frame; validation keeps the original class mix.
train_dataset = LegalDataset(
    data=train_df_balanced,
    tokenizer=tokenizer,
    max_length=max_length,
    stride=stride,
)
val_dataset = LegalDataset(
    data=val_df,
    tokenizer=tokenizer,
    max_length=max_length,
    stride=stride,
)

# Only the training loader is shuffled; validation batches keep frame order.
train_loader = DataLoader(dataset=train_dataset, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(dataset=val_dataset, batch_size=batch_size)

In [None]:
# 'balanced' weights are n_samples / (n_classes * class_count) for each class.
# NOTE(review): they are computed on the oversampled frame, whose classes were
# printed with identical counts, so every weight should come out as 1.0.
# Confirm whether the weights were meant to be derived from train_df instead.
balanced_labels = train_df_balanced['case_outcome_encoded']
class_weights = torch.tensor(
    compute_class_weight(
        class_weight='balanced',
        classes=np.unique(balanced_labels),
        y=balanced_labels,
    ),
    dtype=torch.float,
)

In [None]:
# Cross-entropy with per-class weights.
# NOTE(review): train_epoch and eval_model take the loss from the model output
# (outputs.loss) and never reference loss_fn - confirm whether it should be used.
loss_fn = torch.nn.CrossEntropyLoss(weight=class_weights)

# AdamW over every model parameter, with decoupled weight decay.
optimizer = optim.AdamW(
    params=model.parameters(),
    lr=learning_rate,
    weight_decay=weight_decay,
)

In [None]:
# One scheduler step is taken per optimizer step, so the schedule spans every
# batch of every epoch.
total_steps = len(train_loader) * epochs

# Linear decay from the configured learning rate down to zero over training.
# The factors are spelled out because LinearLR's defaults (start_factor=1/3,
# end_factor=1.0) would instead ramp the rate *up* for the whole run.
scheduler = torch.optim.lr_scheduler.LinearLR(
    optimizer,
    start_factor=1.0,
    end_factor=0.0,
    total_iters=total_steps,
)

In [None]:
def train_epoch(model, data_loader, optimizer, device, scheduler):
    """Run one training pass over ``data_loader`` and update the model.

    Args:
        model: Module called with ``input_ids``, ``attention_mask`` and
            ``labels``; its output must expose ``loss`` and ``logits``.
        data_loader: Yields dict batches with those three keys and exposes
            ``dataset`` for the sample count.
        optimizer: Optimizer stepped once per batch.
        device: Device the batch tensors are moved to.
        scheduler: Learning-rate scheduler stepped once per batch.

    Returns:
        Tuple ``(accuracy, mean_loss)``: correct predictions divided by
        ``len(data_loader.dataset)``, and the mean of the per-batch losses.
    """
    model = model.train()
    batch_losses = []
    n_correct = 0

    for batch in tqdm(data_loader):
        # Move exactly the tensors the model consumes onto the target device.
        inputs = {
            key: batch[key].to(device)
            for key in ('input_ids', 'attention_mask', 'labels')
        }

        optimizer.zero_grad()
        outputs = model(**inputs)

        # Backward pass, then parameter update, then learning-rate update.
        outputs.loss.backward()
        optimizer.step()
        scheduler.step()

        batch_losses.append(outputs.loss.item())
        hits = outputs.logits.argmax(dim=1) == inputs['labels']
        n_correct += hits.sum().item()

    return n_correct / len(data_loader.dataset), np.mean(batch_losses)

In [None]:
def eval_model(model, data_loader, device):
    """Score the model on ``data_loader`` without computing gradients.

    Args:
        model: Module called with ``input_ids``, ``attention_mask`` and
            ``labels``; its output must expose ``loss`` and ``logits``.
        data_loader: Yields dict batches with those three keys and exposes
            ``dataset`` for the sample count.
        device: Device the batch tensors are moved to.

    Returns:
        Tuple ``(accuracy, mean_loss, y_true, y_preds)``: correct predictions
        divided by ``len(data_loader.dataset)``, the mean of the per-batch
        losses, and the true / predicted label ids collected in batch order.
    """
    model = model.eval()
    batch_losses = []
    n_correct = 0
    y_true = []
    y_preds = []

    with torch.no_grad():
        for batch in tqdm(data_loader):
            # Move exactly the tensors the model consumes onto the target device.
            inputs = {
                key: batch[key].to(device)
                for key in ('input_ids', 'attention_mask', 'labels')
            }
            outputs = model(**inputs)
            batch_losses.append(outputs.loss.item())

            targets = inputs['labels']
            preds = outputs.logits.argmax(dim=1)
            n_correct += (preds == targets).sum().item()

            # Collect on the CPU as numpy values so the lists outlive the batch.
            y_preds.extend(preds.cpu().numpy())
            y_true.extend(targets.cpu().numpy())

    accuracy = n_correct / len(data_loader.dataset)
    return accuracy, np.mean(batch_losses), y_true, y_preds

In [None]:
# Use the GPU when torch can see one, otherwise stay on the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
model = model.to(device)

# Each epoch: one training pass followed by one validation pass. y_true and
# y_preds from the most recent validation pass stay available after the loop.
for epoch in range(epochs):
    print(f'\nEpoch {epoch + 1}/{epochs}')

    train_acc, train_loss = train_epoch(
        model, train_loader, optimizer, device, scheduler
    )
    print(f'Train loss: {train_loss:.3f}, accuracy: {train_acc:.3f}')

    val_acc, val_loss, y_true, y_preds = eval_model(
        model, val_loader, device
    )
    print(f'Validation loss: {val_loss:.3f}, accuracy: {val_acc:.3f}')


Epoch 1/1


100%|██████████| 6055/6055 [1:28:17<00:00,  1.14it/s]


Train loss: 1.027, accuracy: 0.625


100%|██████████| 311/311 [01:45<00:00,  2.95it/s]

Validation loss: 1.718, accuracy: 0.395



