In [1]:
pip install transformers tensorflow torch

Note: you may need to restart the kernel to use updated packages.

Collecting transformers
  Downloading transformers-4.45.2-py3-none-any.whl.metadata (44 kB)
Collecting torch
  Downloading torch-2.4.1-cp312-cp312-win_amd64.whl.metadata (27 kB)
Collecting filelock (from transformers)
  Downloading filelock-3.16.1-py3-none-any.whl.metadata (2.9 kB)
Collecting huggingface-hub<1.0,>=0.23.2 (from transformers)
  Downloading huggingface_hub-0.25.2-py3-none-any.whl.metadata (13 kB)
Collecting safetensors>=0.4.1 (from transformers)
  Downloading safetensors-0.4.5-cp312-none-win_amd64.whl.metadata (3.9 kB)
Collecting tokenizers<0.21,>=0.20 (from transformers)
  Downloading tokenizers-0.20.1-cp312-none-win_amd64.whl.metadata (6.9 kB)
Collecting sympy (from torch)
  Downloading sympy-1.13.3-py3-none-any.whl.metadata (12 kB)
Collecting networkx (from torch)
  Downloading networkx-3.4.1-py3-none-any.whl.metadata (6.3 kB)
Collecting mpmath<1.4,>=1.1.0 (from sympy->torch)
  Downloading mpmath-1.3.0-p

In [4]:
pip install tf-keras

Note: you may need to restart the kernel to use updated packages.


In [5]:
import json
import tensorflow as tf
from transformers import BertTokenizer, TFBertForSequenceClassification, AdamW

# Load the JSON data with constitutional text

def load_data(json_path, text_field="cleaned_text", label_field="label"):
    """Load page texts and their labels from a JSON file.

    The file must contain an object with a "pages" list of dicts. A page that
    lacks ``text_field`` is skipped entirely (text and label), so the two
    returned lists always have the same length and stay index-aligned.

    Args:
        json_path: Path of the JSON file to read.
        text_field: Key holding the text of a page.
        label_field: Key holding the label of a page; pages without it get label 0.

    Returns:
        A ``(texts, labels)`` tuple of equally long lists.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the file is not valid JSON.
        KeyError: If the top-level object has no "pages" entry.
    """
    # Read as UTF-8 explicitly: the platform default (e.g. cp1252 on Windows)
    # can fail on, or silently garble, non-ASCII characters in the text.
    with open(json_path, "r", encoding="utf-8") as f:
        data = json.load(f)

    texts, labels = [], []
    for page in data["pages"]:
        if text_field not in page:
            # Skip the label too, otherwise texts and labels drift out of step.
            continue
        texts.append(page[text_field])
        labels.append(page.get(label_field, 0))  # Use default label 0 if not present

    return texts, labels

# Tokenize the text data
def encode_texts(texts, tokenizer, max_length=128):
    """Tokenize every text to a fixed length and join the results per field.

    Each text is passed to ``tokenizer.encode_plus`` with padding to
    ``max_length`` and truncation enabled, requesting TensorFlow tensors.
    The per-text results are then concatenated along axis 0.

    Args:
        texts: Iterable of strings to encode.
        tokenizer: Object exposing ``encode_plus`` (e.g. a BERT tokenizer).
        max_length: Length every sequence is padded or truncated to.

    Returns:
        Dict with keys "input_ids" and "attention_mask", each the axis-0
        concatenation of the corresponding per-text tensors.
    """
    def _encode_one(text):
        # One text at a time, always padded/truncated to exactly max_length.
        return tokenizer.encode_plus(
            text,
            max_length=max_length,
            padding="max_length",
            truncation=True,
            return_tensors="tf",
            return_attention_mask=True,
        )

    per_text = [_encode_one(text) for text in texts]

    return {
        field: tf.concat([item[field] for item in per_text], axis=0)
        for field in ("input_ids", "attention_mask")
    }





In [6]:
# Load constitutional data
json_path='../data/final_clean.json'  # Update path
texts, labels = load_data(json_path)

# Fail early with a readable message instead of an opaque TensorFlow error further down.
if not texts:
    raise ValueError(f"No pages with text were loaded from {json_path}")
if len(texts) != len(labels):
    raise ValueError(
        f"Got {len(texts)} texts but {len(labels)} labels; they must be aligned one-to-one"
    )

# Initialize BERT tokenizer
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

# Encode text data
inputs = encode_texts(texts, tokenizer)
labels = tf.convert_to_tensor(labels)

# Prepare TensorFlow Dataset
BATCH_SIZE = 16  # Adjust batch size
SHUFFLE_SEED = 42  # Fixed seed so the shuffle order is reproducible across runs
dataset = (
    tf.data.Dataset.from_tensor_slices((inputs, labels))
    .shuffle(len(texts), seed=SHUFFLE_SEED)  # buffer covers the whole dataset
    .batch(BATCH_SIZE)
)


To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


In [8]:
# Load BERT model for binary classification
model = TFBertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=2)

# Set up optimizer and loss function
from transformers import create_optimizer

# Size the learning-rate schedule from the real amount of training instead of a
# guessed step count. With a fixed 1000/100 setting, a small dataset can finish
# training while the learning rate is still in its warm-up ramp.
epochs = 3  # keep in sync with the epochs value passed to model.fit
steps_per_epoch = len(dataset)  # number of batches in one pass over the data
num_train_steps = max(1, steps_per_epoch * epochs)
num_warmup_steps = num_train_steps // 10  # same 10% warm-up ratio as the old 100/1000

# Define the optimizer for the Hugging Face Transformers model
optimizer, _ = create_optimizer(
    init_lr=2e-5,  # Initial learning rate
    num_train_steps=num_train_steps,  # Total optimizer steps over all epochs
    num_warmup_steps=num_warmup_steps,  # Steps spent ramping the learning rate up
)
# The model outputs raw logits, hence from_logits=True.
loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
model.compile(optimizer=optimizer, loss=loss, metrics=["accuracy"])


All PyTorch model weights were used when initializing TFBertForSequenceClassification.

Some weights or buffers of the TF 2.0 model TFBertForSequenceClassification were not initialized from the PyTorch model and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [9]:
# Fine-tune the model on constitutional data
# Runs `epochs` passes over `dataset`; no validation data is passed, so any
# metric reported here is measured on the training data only.
epochs = 3
# The value returned by fit() is kept in `history` for later inspection.
history = model.fit(dataset, epochs=epochs)


Epoch 1/3


Epoch 2/3
Epoch 3/3


In [10]:
# Persist the fine-tuned model and its tokenizer side by side in one directory
output_dir = "fine_tuned_law_bot_model"
model.save_pretrained(output_dir)
tokenizer.save_pretrained(output_dir)


('fine_tuned_law_bot_model\\tokenizer_config.json',
 'fine_tuned_law_bot_model\\special_tokens_map.json',
 'fine_tuned_law_bot_model\\vocab.txt',
 'fine_tuned_law_bot_model\\added_tokens.json')

In [12]:
# Reload the fine-tuned model and tokenizer from the directory they were saved to
checkpoint_dir = "fine_tuned_law_bot_model"
model = TFBertForSequenceClassification.from_pretrained(checkpoint_dir)
tokenizer = BertTokenizer.from_pretrained(checkpoint_dir)

# Prediction helper built on the notebook-level `tokenizer` and `model`
def predict(text):
    """Return the index of the highest-scoring class for ``text``.

    The text is tokenized with the notebook-level ``tokenizer`` (padded or
    truncated to 128 tokens), passed through the notebook-level ``model``,
    and the arg-max over axis 1 of the logits is taken.

    Args:
        text: The input string to classify.

    Returns:
        The first element of the arg-max result, i.e. the predicted class
        index for the single input.
    """
    encoded = tokenizer(
        text,
        max_length=128,
        padding="max_length",
        truncation=True,
        return_tensors="tf",
    )
    class_ids = tf.argmax(model(encoded).logits, axis=1)
    return class_ids.numpy()[0]

# Test prediction
# Smoke test: prints the class index predicted for one sample question.
print(predict("what is punishment for stealing"))


Some layers from the model checkpoint at fine_tuned_law_bot_model were not used when initializing TFBertForSequenceClassification: ['dropout_75']
- This IS expected if you are initializing TFBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFBertForSequenceClassification were initialized from the model checkpoint at fine_tuned_law_bot_model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertForSequenceClassification for predictions without further training.


0
