In [1]:
import json

# Load the JSON data of the constitution
with open("../data/final_clean.json", "r") as file:
    constitution_data = json.load(file)

# Function to create Q&A pairs based on context
def create_qa_pairs(context):
    # Example questions based on the context
    questions = [
        "What does this section state?",
        "What rights are mentioned in this section?",
        "Who is responsible for the enforcement of this section?",
        "What procedures are outlined in this section?"
    ]
    
    # Create Q&A pairs
    qa_pairs = []
    for question in questions:
        answer_start = context.lower().find("shall")  # Example heuristic for finding answers
        answer = context[answer_start:answer_start + 100] if answer_start != -1 else "Not found"
        qa_pairs.append({
            "question": question,
            "answers": [{"text": answer, "answer_start": answer_start}],
            "id": f"q_{hash(question)}"  # Unique ID for each Q&A
        })
    
    return qa_pairs

# Segment constitution into structured data
structured_data = {"data": [{"title": "Constitution", "paragraphs": []}]}

# Iterate over each page in the constitution
for page in constitution_data["pages"]:
    context = page["cleaned_text"]  # Using cleaned text for Q&A creation
    qa_pairs = create_qa_pairs(context)
    
    # Append the context and Q&A pairs
    structured_data["data"][0]["paragraphs"].append({
        "context": context,
        "qas": qa_pairs
    })

# Save the structured dataset
with open("../data/structured_constitution_data.json", "w") as file:
    json.dump(structured_data, file, indent=4)

print("../data/Structured Q&A dataset created and saved.")


../data/Structured Q&A dataset created and saved.


In [2]:
pip install transformers datasets


Note: you may need to restart the kernel to use updated packages.


In [3]:
import json
from datasets import Dataset

# Load the structured Q&A dataset
with open("../data/structured_constitution_data.json", "r") as file:
    qa_data = json.load(file)

# Prepare data for training
train_data = []
for paragraph in qa_data["data"]:
    for qas in paragraph["paragraphs"]:
        context = qas["context"]
        for qa in qas["qas"]:
            question = qa["question"]
            answer = qa["answers"][0]["text"]
            answer_start = qa["answers"][0]["answer_start"]
            train_data.append({
                "question": question,
                "context": context,
                "answer": answer,
                "answer_start": answer_start
            })

# Convert to Hugging Face dataset
dataset = Dataset.from_list(train_data)


  from .autonotebook import tqdm as notebook_tqdm


In [4]:
from transformers import BertTokenizerFast



# Use the Fast tokenizer
tokenizer = BertTokenizerFast.from_pretrained("bert-base-uncased")

# Define your tokenization function
def tokenize_function(examples):
    return tokenizer(
        examples["question"],
        examples["context"],
        truncation=True,
        padding="max_length",
        max_length=512,
        return_offsets_mapping=True
    )

# Tokenize the dataset
tokenized_dataset = dataset.map(tokenize_function, batched=True)



Map: 100%|█████████████████████████████████████| 1480/1480 [00:01<00:00, 1376.06 examples/s]


In [5]:
import numpy as np

def find_start_end_positions(examples):
    start_positions = []
    end_positions = []

    for i in range(len(examples['input_ids'])):
        answer_start = examples['offset_mapping'][i][0][0]
        answer_end = answer_start + len(examples['answer'][i])
        start_positions.append(answer_start)
        end_positions.append(answer_end)

    examples['start_positions'] = start_positions
    examples['end_positions'] = end_positions
    return examples

tokenized_dataset = tokenized_dataset.map(find_start_end_positions, batched=True)


Map: 100%|██████████████████████████████████████| 1480/1480 [00:02<00:00, 593.77 examples/s]


In [6]:
pip install --upgrade accelerate

Note: you may need to restart the kernel to use updated packages.


In [21]:
pip install --upgrade transformers[torch]


Note: you may need to restart the kernel to use updated packages.


In [10]:
pip install tensorflow


Note: you may need to restart the kernel to use updated packages.


In [13]:
from transformers import TFBertForQuestionAnswering, Trainer, TrainingArguments

# Load the model
model = TFBertForQuestionAnswering.from_pretrained("bert-base-uncased")

# Define training arguments
training_args = TrainingArguments(
    output_dir="./results",
    eval_strategy="epoch",  # Use "eval_strategy" if you're using a newer version
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    num_train_epochs=3,
)

# Create a Trainer (This will automatically handle the framework type)
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset,
)

# Train the model
trainer.train()


All PyTorch model weights were used when initializing TFBertForQuestionAnswering.

Some weights or buffers of the TF 2.0 model TFBertForQuestionAnswering were not initialized from the PyTorch model and are newly initialized: ['qa_outputs.weight', 'qa_outputs.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


AttributeError: 'TFBertForQuestionAnswering' object has no attribute 'to'