In [None]:
pip install transformers pandas tensorflow

In [None]:
import pandas as pd
import tensorflow as tf
from transformers import BertTokenizer, TFBertForQuestionAnswering, BertConfig, pipeline

# Load the training data. Both columns are read as text so numeric-looking
# cells are not parsed into ints/floats, and rows missing a question or an
# answer are dropped: len() and the tokenizer both require real strings.
dataset = pd.read_csv(
    "data/training_data.csv",
    delimiter=';',
    dtype={"question": str, "answer": str},
).dropna(subset=["question", "answer"])
questions = dataset["question"].tolist()
answers = dataset["answer"].tolist()

# Character length of each answer (a character count, not a token count).
answer_lengths = [len(answer) for answer in answers]

# `model_max_length` is the tokenizer setting that `truncation=True` falls
# back to when the call itself is given no explicit `max_length`.
tokenizer = BertTokenizer.from_pretrained("bert-large-uncased", model_max_length=512)

# Encode every (question, answer) pair as one sequence, padded to the longest
# pair in the batch and truncated to the tokenizer's maximum length.
inputs = tokenizer(questions, answers, padding=True, truncation=True, return_tensors="tf")

# Training objective: Adam with a small learning rate, and a sparse
# cross-entropy loss applied to the raw (un-normalised) logits.
loss_fn = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
optimizer = tf.keras.optimizers.Adam(learning_rate=1e-5)

# Load the checkpoint's configuration, set the position-embedding size on it,
# then build the question-answering model from that configuration.
config = BertConfig.from_pretrained("bert-large-uncased")
config.max_position_embeddings = 512
model = TFBertForQuestionAnswering.from_pretrained(
    "bert-large-uncased",
    config=config,
)

model.compile(loss=loss_fn, optimizer=optimizer)

input_ids = inputs["input_ids"]
attention_mask = inputs["attention_mask"]
# Segment ids: 0 for "[CLS] question [SEP]" and for padding, 1 for
# "answer [SEP]". The model needs them to tell the two segments apart.
token_type_ids = inputs["token_type_ids"]

# Span labels must be token indices inside the encoded sequence, because the
# loss treats every token position as a class. Character lengths can exceed
# the sequence length and index 0 is the [CLS] token, so neither is a valid
# answer boundary. Derive the span from the segment ids instead.
# NOTE(review): this trains the model to locate the answer inside a
# question+answer input; confirm that matches how the model is used later.
answer_segment_sizes = tf.reduce_sum(token_type_ids, axis=1)

# First token of segment 1 (argmax returns the smallest index on ties).
start_positions = tf.argmax(token_type_ids, axis=1, output_type=tf.int32)

# Last answer token: skip the closing [SEP] that ends segment 1. The maximum
# keeps end >= start when the answer tokenizes to nothing.
end_positions = tf.maximum(
    start_positions + answer_segment_sizes - 2,
    start_positions,
)

model.fit(
    {
        "input_ids": input_ids,
        "attention_mask": attention_mask,
        "token_type_ids": token_type_ids,
    },
    {"start_logits": start_positions, "end_logits": end_positions},
    epochs=3,  # Adjust the number of epochs as needed
)

# Write the fine-tuned weights and configuration to a local directory.
model.save_pretrained("bert_qa_model")