In [2]:
!pip install datasets

Collecting datasets
  Downloading datasets-3.6.0-py3-none-any.whl.metadata (19 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2025.3.0,>=2023.1.0 (from fsspec[http]<=2025.3.0,>=2023.1.0->datasets)
  Downloading fsspec-2025.3.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.6.0-py3-none-any.whl (491 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m491.5/491.5 kB[0m [31m13.0 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m12.8 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2025.3.0-py3-none-any.whl 

In [12]:
pip install wandb



In [13]:
# Fine-tuning BERT on wildfire/fire safety Q/A (SQuAD-style) using HuggingFace
from transformers import BertTokenizerFast, BertForQuestionAnswering, TrainingArguments, Trainer
from datasets import load_dataset, Dataset
import pandas as pd
import torch

In [14]:
# Load CSV dataset
csv_path = "real_wildfire_qa_200.csv"  # update path if needed
df = pd.read_csv(csv_path)

# Convert to HuggingFace Dataset format
hf_dataset = Dataset.from_pandas(df)

# Load tokenizer and model (no login required)
model_checkpoint = "bert-base-uncased"
tokenizer = BertTokenizerFast.from_pretrained(model_checkpoint)
model = BertForQuestionAnswering.from_pretrained(model_checkpoint)

Some weights of BertForQuestionAnswering were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [15]:
# Function to prepare tokenized inputs with start/end positions
def preprocess(example):
    inputs = tokenizer(
        example["question"],
        example["context"],
        truncation=True,
        padding="max_length",
        max_length=384,
        return_offsets_mapping=True,
        return_tensors="pt"
    )
    answer = example["answer"]
    context = example["context"]
    start_char = context.find(answer)
    end_char = start_char + len(answer)

    if start_char == -1:
        inputs["start_positions"] = torch.tensor([0])
        inputs["end_positions"] = torch.tensor([0])
    else:
        offsets = inputs["offset_mapping"][0]
        start_token = end_token = 0
        for idx, (start, end) in enumerate(offsets):
            if start <= start_char < end:
                start_token = idx
            if start < end_char <= end:
                end_token = idx
                break
        inputs["start_positions"] = torch.tensor([start_token])
        inputs["end_positions"] = torch.tensor([end_token])

    inputs.pop("offset_mapping")
    return {key: val.squeeze(0) for key, val in inputs.items()}

In [16]:
# Preprocess dataset
encoded_dataset = hf_dataset.map(preprocess)

# Training configuration
training_args = TrainingArguments(
    output_dir="bert_qa_model",
    eval_strategy="no",
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    num_train_epochs=4,
    weight_decay=0.01,
    logging_dir="logs",
    save_strategy="epoch",
    report_to='none'
)

Map:   0%|          | 0/200 [00:00<?, ? examples/s]

In [17]:
import os
os.environ["WANDB_DISABLED"] = "true"

In [18]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=encoded_dataset
)

In [19]:
# Train the model
trainer.train()

Step,Training Loss


TrainOutput(global_step=100, training_loss=0.7432527923583985, metrics={'train_runtime': 97.4504, 'train_samples_per_second': 8.209, 'train_steps_per_second': 1.026, 'total_flos': 156778054041600.0, 'train_loss': 0.7432527923583985, 'epoch': 4.0})

In [20]:
# Save final model
model.save_pretrained("./bert_qa_final")
tokenizer.save_pretrained("./bert_qa_final")

('./bert_qa_final/tokenizer_config.json',
 './bert_qa_final/special_tokens_map.json',
 './bert_qa_final/vocab.txt',
 './bert_qa_final/added_tokens.json',
 './bert_qa_final/tokenizer.json')

In [22]:
from transformers import BertForQuestionAnswering, BertTokenizerFast
import torch

# Load fine-tuned model
model = BertForQuestionAnswering.from_pretrained("./bert_qa_final")
tokenizer = BertTokenizerFast.from_pretrained("./bert_qa_final")

# Example input
context = "Wildfires are uncontrolled fires that often start in forests or grasslands. They can spread rapidly due to wind and dry vegetation. Human negligence is a leading cause of wildfires."
question = "How to avoid wildfires?"

# Encode input
inputs = tokenizer(question, context, return_tensors="pt")

# Get predictions
with torch.no_grad():
    outputs = model(**inputs)
    start_idx = torch.argmax(outputs.start_logits)
    end_idx = torch.argmax(outputs.end_logits) + 1
    answer = tokenizer.convert_tokens_to_string(tokenizer.convert_ids_to_tokens(inputs['input_ids'][0][start_idx:end_idx]))

print("Q:", question)
print("A:", answer)

Q: How to avoid wildfires?
A: human negligence
