In [None]:
# Install required libraries
!pip install transformers datasets torch

In [None]:
# Import necessary libraries
import torch
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
from datasets import load_dataset, Dataset
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

In [None]:
import os

# Folder containing the text files
folder_path = 'data/raw/acts'

# Contents of every .txt act in the folder, one string per file.
# os.listdir() returns names in arbitrary order, so the names are sorted to
# keep the order of `documents` (and everything derived from it) stable
# between runs.
documents = []
for filename in sorted(os.listdir(folder_path)):
    act_path = os.path.join(folder_path, filename)
    # Only regular files with a .txt suffix are treated as acts.
    if filename.endswith(".txt") and os.path.isfile(act_path):
        with open(act_path, 'r', encoding='utf-8') as act_file:
            documents.append(act_file.read())

# Fail here with a clear message instead of later inside the tokenizer/model.
if not documents:
    raise FileNotFoundError(f"No .txt files found in {folder_path!r}")


In [None]:
# Build a Hugging Face dataset with a 'text' column from a single act,
# tokenize it, and split it into train/test parts.
import pandas as pd
from datasets import Dataset
from transformers import BertTokenizer

# tokenize_function below needs a tokenizer, so it is loaded in this cell
# rather than relying on a later cell having been executed first.
tokenizer = BertTokenizer.from_pretrained('nlpaueb/legal-bert-base-uncased')

# The act is free text (unsupervised data), not a CSV file, so it is read
# line by line: every non-empty line becomes one row of the 'text' column.
act_path = 'data/raw/acts/section_Admiralty Court Act, 1861.txt'
with open(act_path, 'r', encoding='utf-8') as act_file:
    passages = [line.strip() for line in act_file if line.strip()]

data = pd.DataFrame({'text': passages})
dataset = Dataset.from_pandas(data)


# Tokenization
def tokenize_function(examples):
    """Tokenize a batch of rows.

    Args:
        examples: batch passed in by `Dataset.map(..., batched=True)`;
            its 'text' entry holds the strings to tokenize.

    Returns:
        The tokenizer output for the batch, padded to the tokenizer's
        maximum length and truncated where longer.
    """
    return tokenizer(examples['text'], padding="max_length", truncation=True)


# Tokenize the dataset
tokenized_dataset = dataset.map(tokenize_function, batched=True)

# Train/test split: 80% train, 20% test
train_dataset, test_dataset = tokenized_dataset.train_test_split(test_size=0.2).values()


In [None]:
from transformers import BertTokenizer

# Tokenizer that ships with the Legal-BERT checkpoint
tokenizer = BertTokenizer.from_pretrained('nlpaueb/legal-bert-base-uncased')

# Encode all acts as one batch of PyTorch tensors: each act is cut to at most
# 512 tokens and shorter ones are padded to the longest sequence in the batch.
encoded_inputs = tokenizer(
    documents,
    max_length=512,
    truncation=True,
    padding=True,
    return_tensors="pt",
)


In [None]:
from transformers import BertModel

# Plain Legal-BERT encoder loaded from the pre-trained checkpoint
model = BertModel.from_pretrained('nlpaueb/legal-bert-base-uncased')

# Inference only, so gradient tracking is switched off for the forward pass.
with torch.no_grad():
    encoder_outputs = model(**encoded_inputs)

# Feature vectors: hidden states of the last encoder layer
embeddings = encoder_outputs.last_hidden_state


In [None]:
from transformers import BertForMaskedLM, Trainer, TrainingArguments

# Legal-BERT with a masked-language-modeling (MLM) head
model_mlm = BertForMaskedLM.from_pretrained('nlpaueb/legal-bert-base-uncased')

# Settings for the fine-tuning run: where results and logs are written,
# how often to log, and how long / in what batch size to train.
training_args = TrainingArguments(
    output_dir='./results',
    logging_dir='./logs',
    logging_steps=10,
    num_train_epochs=3,
    per_device_train_batch_size=8,
)

# Dataset wrapper around the encoded inputs
from torch.utils.data import Dataset, DataLoader


class LegalDataset(Dataset):
    """Map-style dataset over a mapping of field name -> indexable values.

    Item `idx` is a dict holding, for every field in `encodings`, the
    `idx`-th entry of that field. The length is the number of entries
    under the 'input_ids' field.
    """

    def __init__(self, encodings):
        # Mapping such as the tokenizer output (must contain 'input_ids').
        self.encodings = encodings

    def __len__(self):
        return len(self.encodings['input_ids'])

    def __getitem__(self, idx):
        sample = {}
        for field, values in self.encodings.items():
            sample[field] = values[idx]
        return sample

from transformers import DataCollatorForLanguageModeling

# Create Dataset and DataLoader
train_dataset = LegalDataset(encoded_inputs)
# Kept for manual inspection; the Trainer below builds its own loader from
# `train_dataset` and does not use this one.
train_loader = DataLoader(train_dataset, batch_size=8)

# The encoded inputs carry no `labels`, and without labels the MLM model
# returns no loss for the Trainer to optimize. The collator creates them for
# every batch: it randomly masks 15% of the tokens and sets `labels` so the
# model is trained to recover the masked tokens.
mlm_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=True,
    mlm_probability=0.15,
)

# Initialize the Trainer for MLM
trainer = Trainer(
    model=model_mlm,
    args=training_args,
    train_dataset=train_dataset,
    data_collator=mlm_collator,
)

# Fine-tune the model using MLM
trainer.train()


In [None]:
def query_legal_advisor(query):
    """Run `query` through the fine-tuned MLM model and decode its predictions.

    Args:
        query: text to feed to the model (truncated to 512 tokens).

    Returns:
        The string obtained by decoding, for the first sequence in the batch,
        the highest-scoring vocabulary token at every input position.

    NOTE(review): a masked-language-modeling head scores one token per input
    position, so the result is presumably close to the query itself (special
    tokens included) rather than a generated answer -- confirm this is the
    intended behavior.
    """
    # Training leaves the model in train mode; switch to eval mode so dropout
    # is disabled and the same query yields the same output.
    model_mlm.eval()

    inputs = tokenizer(query, return_tensors='pt', padding=True, truncation=True, max_length=512)
    # The model may have been moved to an accelerator during training, so the
    # inputs must be placed on the model's device before the forward pass.
    inputs = inputs.to(model_mlm.device)

    with torch.no_grad():
        predictions = model_mlm(**inputs).logits

    predicted_ids = torch.argmax(predictions, dim=-1)[0]
    return tokenizer.decode(predicted_ids)

# Example legal query
user_query = "What is the punishment for theft under the criminal law?"
response = query_legal_advisor(user_query)
print(f'Response: {response}')
