In [3]:
import json

# Load the JSON data of the constitution
with open("../data/final_clean.json", "r") as file:
    constitution_data = json.load(file)

# Function to create Q&A pairs based on context
def create_qa_pairs(context):
    # Example questions based on the context
    questions = [
        "What does this section state?",
        "What rights are mentioned in this section?",
        "Who is responsible for the enforcement of this section?",
        "What procedures are outlined in this section?"
    ]
    
    # Create Q&A pairs
    qa_pairs = []
    for question in questions:
        answer_start = context.lower().find("shall")  # Example heuristic for finding answers
        answer = context[answer_start:answer_start + 100] if answer_start != -1 else "Not found"
        qa_pairs.append({
            "question": question,
            "answers": [{"text": answer, "answer_start": answer_start}],
            "id": f"q_{hash(question)}"  # Unique ID for each Q&A
        })
    
    return qa_pairs

# Segment constitution into structured data
structured_data = {"data": [{"title": "Constitution", "paragraphs": []}]}

# Iterate over each page in the constitution
for page in constitution_data["pages"]:
    context = page["cleaned_text"]  # Using cleaned text for Q&A creation
    qa_pairs = create_qa_pairs(context)
    
    # Append the context and Q&A pairs
    structured_data["data"][0]["paragraphs"].append({
        "context": context,
        "qas": qa_pairs
    })

# Save the structured dataset
with open("../data/structured_constitution_data.json", "w") as file:
    json.dump(structured_data, file, indent=4)

print("../data/Structured Q&A dataset created and saved.")


../data/Structured Q&A dataset created and saved.


In [4]:
# ! pip install datasets


In [5]:
import json
from datasets import Dataset

# Load the structured Q&A dataset
with open("../data/structured_constitution_data.json", "r") as file:
    qa_data = json.load(file)

# Prepare data for training
train_data = []
for paragraph in qa_data["data"]:
    for qas in paragraph["paragraphs"]:
        context = qas["context"]
        for qa in qas["qas"]:
            question = qa["question"]
            answer = qa["answers"][0]["text"]
            answer_start = qa["answers"][0]["answer_start"]
            train_data.append({
                "question": question,
                "context": context,
                "answer": answer,
                "answer_start": answer_start
            })

# Convert to Hugging Face dataset
dataset = Dataset.from_list(train_data)


In [6]:
from transformers import BertTokenizerFast



# Use the Fast tokenizer
tokenizer = BertTokenizerFast.from_pretrained("bert-base-uncased")

# Define your tokenization function
def tokenize_function(examples):
    return tokenizer(
        examples["question"],
        examples["context"],
        truncation=True,
        padding="max_length",
        max_length=512,
        return_offsets_mapping=True
    )

# Tokenize the dataset
tokenized_dataset = dataset.map(tokenize_function, batched=True)



Map:   0%|          | 0/1480 [00:00<?, ? examples/s]

In [14]:
def find_start_end_positions(examples):
    start_positions = []
    end_positions = []

    for i, offsets in enumerate(examples['offset_mapping']):
        start_char = examples['answers']['answer_start'][i]  # Start of the answer
        end_char = start_char + len(examples['answers']['text'][i])  # End of the answer

        # Initialize start/end tokens as None (will be updated if found)
        start_token_idx = None
        end_token_idx = None

        # Find the token corresponding to the character start/end positions
        for idx, (start_offset, end_offset) in enumerate(offsets):
            if start_offset <= start_char < end_offset:
                start_token_idx = idx
            if start_offset <= end_char <= end_offset:
                end_token_idx = idx
                break

        # Append the token start and end positions
        start_positions.append(start_token_idx)
        end_positions.append(end_token_idx)

    examples['start_positions'] = start_positions
    examples['end_positions'] = end_positions
    return examples

# Apply the function to the dataset
tokenized_dataset = tokenized_dataset.map(find_start_end_positions, batched=True)


Map:   0%|          | 0/1480 [00:00<?, ? examples/s]

KeyError: 'answers'

In [8]:
# pip install --upgrade accelerate

In [9]:
# pip install --upgrade transformers[torch]


In [None]:
# pip install tensorflow


In [11]:
# ! pip install transformers

In [15]:
import tensorflow as tf
from transformers import TFBertForQuestionAnswering
from datasets import load_dataset
from transformers import DefaultDataCollator

# Load and tokenize the dataset
# Assuming your dataset is already tokenized and formatted
# You can adjust `load_dataset` to fit your specific dataset
dataset = load_dataset("squad")

# Define the data collator
data_collator = DefaultDataCollator(return_tensors="tf")

# Convert the dataset to TensorFlow format
train_dataset = dataset["train"].to_tf_dataset(
    columns=["input_ids", "attention_mask", "start_positions", "end_positions"],
    label_cols=["start_positions", "end_positions"],
    shuffle=True,
    batch_size=8,
    collate_fn=data_collator
)

# Load the TensorFlow BERT model for Question Answering
model = TFBertForQuestionAnswering.from_pretrained("bert-base-uncased")

# Compile the model
optimizer = tf.keras.optimizers.Adam(learning_rate=2e-5)
model.compile(optimizer=optimizer, loss=model.compute_loss)

# Train the model using model.fit()
model.fit(train_dataset, epochs=3)


README.md:   0%|          | 0.00/7.62k [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/14.5M [00:00<?, ?B/s]

validation-00000-of-00001.parquet:   0%|          | 0.00/1.82M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/87599 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/10570 [00:00<?, ? examples/s]

ValueError: Column input_ids not found in dataset!

In [13]:
import tensorflow as tf
from transformers import TFBertForQuestionAnswering
from transformers import BertTokenizer

# Load the TensorFlow model and tokenizer
model = TFBertForQuestionAnswering.from_pretrained("bert-base-uncased")
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

# Compile the model with a loss function and optimizer
optimizer = tf.keras.optimizers.Adam(learning_rate=2e-5)
model.compile(optimizer=optimizer, loss=model.compute_loss)

# Prepare the dataset (replace with your tokenized dataset)
train_dataset = tokenized_dataset  # Ensure it's a TensorFlow dataset (tf.data.Dataset)

# Train the model using model.fit()
model.fit(train_dataset.shuffle(1000).batch(8), epochs=3)


All PyTorch model weights were used when initializing TFBertForQuestionAnswering.

Some weights or buffers of the TF 2.0 model TFBertForQuestionAnswering were not initialized from the PyTorch model and are newly initialized: ['qa_outputs.weight', 'qa_outputs.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Batching examples:   0%|          | 0/1480 [00:00<?, ? examples/s]

ValueError: Failed to find data adapter that can handle input: <class 'datasets.arrow_dataset.Dataset'>, <class 'NoneType'>