# Intersection Conflict Detection Notebook

This notebook demonstrates loading the synthetic vehicle dataset, preparing inputs and labels, tokenizing, and setting up fine-tuning of an open-source LLM (e.g., LLaMA 2-7B) for traffic conflict detection.

In [None]:
# Install necessary packages (run once)
!pip install transformers datasets peft accelerate torch

In [None]:
import inspect

import pandas as pd
import torch
from datasets import Dataset
from transformers import AutoTokenizer, AutoModelForCausalLM, TrainingArguments, Trainer

## 1. Load the dataset

In [None]:
# Read the synthetic vehicle scenarios from CSV and preview the first rows
DATASET_CSV = 'data/generated_dataset.csv'
data = pd.read_csv(DATASET_CSV)
data.head()

## 2. Prepare input-output pairs

In [None]:
def prepare_input(row):
    """Render one scenario row as the natural-language prompt fed to the model.

    Args:
        row: Any object exposing the attributes ``vehicle_id``, ``lane``,
            ``speed``, ``destination``, ``distance_to_intersection`` and
            ``direction`` (e.g. a DataFrame row passed by ``apply(axis=1)``).

    Returns:
        A single-line description of the vehicle state.
    """
    segments = (
        f'Vehicle {row.vehicle_id} in lane {row.lane}',
        f'moving {row.speed} km/h towards {row.destination},',
        f'distance to intersection {row.distance_to_intersection} m,',
        f'direction {row.direction}',
    )
    return ' '.join(segments)

def prepare_output(row):
    """Render the target text the model should generate for a scenario row.

    Args:
        row: Any object exposing the attributes ``is_conflict`` and
            ``decisions``.

    Returns:
        A string of the form ``'Conflict: <is_conflict>. Decision: <decisions>'``.
    """
    conflict_flag = row.is_conflict
    decision = row.decisions
    return 'Conflict: {}. Decision: {}'.format(conflict_flag, decision)

# Build the prompt and target text columns row by row (input first, then output)
for column, builder in (('input_text', prepare_input), ('output_text', prepare_output)):
    data[column] = data.apply(builder, axis=1)

## 3. Convert to Hugging Face Dataset

In [None]:
# Keep only the two text columns. preserve_index=False stops a non-default
# pandas index from leaking into the dataset as an extra column.
hf_dataset = Dataset.from_pandas(data[['input_text', 'output_text']], preserve_index=False)
# Fixed seed so the 90/10 train/test split is identical on every run.
hf_dataset = hf_dataset.train_test_split(test_size=0.1, seed=42)
hf_dataset

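## 4. Load a free open-source LLM

Note: `meta-llama/Llama-2-7b-hf` is a gated model on the Hugging Face Hub. Accept the license on the model page and authenticate (e.g. `huggingface-cli login`) before running this cell, or set `model_name` to an ungated model.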

In [None]:
# Using LLaMA 2-7B (or any smaller compatible model)
model_name = 'meta-llama/Llama-2-7b-hf'
tokenizer = AutoTokenizer.from_pretrained(model_name)
# LLaMA tokenizers define no padding token, and padding to a fixed length
# raises without one. Reuse EOS as the pad token when none is configured.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token
model = AutoModelForCausalLM.from_pretrained(model_name, device_map='auto')

## 5. Tokenize the dataset

In [None]:
MAX_LENGTH = 256      # every example is truncated/padded to this many tokens
IGNORE_INDEX = -100   # label value skipped by the causal-LM cross-entropy loss

# Padding requires a pad token; fall back to EOS when the tokenizer has none.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token


def tokenize_fn(batch):
    """Turn a batch of (input_text, output_text) pairs into causal-LM features.

    A decoder-only model needs ``labels`` aligned token-for-token with
    ``input_ids``. Each example is therefore the prompt tokens followed by the
    answer tokens (plus EOS). Prompt and padding positions are set to
    ``IGNORE_INDEX`` so the loss is computed on the answer only.

    Args:
        batch: Mapping with ``'input_text'`` and ``'output_text'`` lists of str.

    Returns:
        Dict with ``input_ids``, ``attention_mask`` and ``labels``, each a list
        of ``MAX_LENGTH``-long integer lists (right-padded).
    """
    # The prompt keeps the tokenizer's special tokens so it matches what
    # `tokenizer(prompt)` yields at inference time. The answer is a raw continuation.
    prompt_ids = tokenizer(batch['input_text'])['input_ids']
    answer_ids = tokenizer(batch['output_text'], add_special_tokens=False)['input_ids']
    eos_suffix = [tokenizer.eos_token_id] if tokenizer.eos_token_id is not None else []

    encoded = {'input_ids': [], 'attention_mask': [], 'labels': []}
    for prompt, answer in zip(prompt_ids, answer_ids):
        answer = answer + eos_suffix  # teach the model where the answer stops
        input_ids = (prompt + answer)[:MAX_LENGTH]
        labels = ([IGNORE_INDEX] * len(prompt) + answer)[:MAX_LENGTH]
        n_pad = MAX_LENGTH - len(input_ids)
        encoded['input_ids'].append(input_ids + [tokenizer.pad_token_id] * n_pad)
        encoded['attention_mask'].append([1] * len(input_ids) + [0] * n_pad)
        encoded['labels'].append(labels + [IGNORE_INDEX] * n_pad)
    return encoded

tokenized_ds = hf_dataset.map(tokenize_fn, batched=True)

## 6. Set up training

In [None]:
def _accepts_kwarg(func, name):
    """Return True when ``func`` declares a parameter called ``name``."""
    return name in inspect.signature(func).parameters


# Newer transformers releases renamed `evaluation_strategy` to `eval_strategy`
# and Trainer's `tokenizer` to `processing_class`. Pick whichever keyword the
# installed version declares so the notebook runs on both old and new releases.
eval_strategy_kwarg = (
    'eval_strategy'
    if _accepts_kwarg(TrainingArguments.__init__, 'eval_strategy')
    else 'evaluation_strategy'
)
tokenizer_kwarg = (
    'processing_class'
    if _accepts_kwarg(Trainer.__init__, 'processing_class')
    else 'tokenizer'
)

training_args = TrainingArguments(
    output_dir='./llm_vehicle_model',
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    gradient_accumulation_steps=4,  # effective batch size of 8 per device
    eval_steps=200,
    save_strategy='steps',
    save_steps=500,
    num_train_epochs=1,
    learning_rate=5e-5,
    # fp16 mixed precision is only supported on CUDA; fall back to fp32 elsewhere
    fp16=torch.cuda.is_available(),
    logging_steps=50,
    report_to='none',
    **{eval_strategy_kwarg: 'steps'},
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_ds['train'],
    eval_dataset=tokenized_ds['test'],
    **{tokenizer_kwarg: tokenizer},
)

## 7. Start training (fine-tuning)

In [None]:
# Run the fine-tuning loop; the returned training summary is shown as the cell output
train_result = trainer.train()
train_result

## 8. Evaluate model predictions

In [None]:
example = 'Vehicle V7657 in lane 6 moving 62 km/h towards A, distance to intersection 319 m, direction south'
# The model may still be in training mode after trainer.train(); switch to
# eval mode so dropout (if the model has any) is disabled during generation.
model.eval()
inputs = tokenizer(example, return_tensors='pt').to(model.device)
output_tokens = model.generate(**inputs, max_new_tokens=50)
# Decodes the prompt followed by the generated continuation
print(tokenizer.decode(output_tokens[0], skip_special_tokens=True))