In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Lazily walk the Kaggle input mount and print the full path of every file found.
input_paths = (
    os.path.join(root, name)
    for root, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Install the extra packages this notebook imports below: bitsandbytes (BitsAndBytesConfig
# quantization), peft (LoraConfig / get_peft_model) and trl (SFTConfig / SFTTrainer).
# NOTE(review): versions are not pinned, so results may differ between runs — consider pinning.
!pip3 install bitsandbytes peft trl

In [None]:
import os
# Set WANDB_DISABLED before the training libraries are imported
# (presumably to keep Weights & Biases logging off — confirm against the installed versions).
os.environ["WANDB_DISABLED"] = "true"

from datasets import load_dataset, Dataset
import torch
from transformers import Qwen3VLForConditionalGeneration, Qwen3VLProcessor, BitsAndBytesConfig
from peft import LoraConfig, get_peft_model
from trl import SFTConfig, SFTTrainer
from PIL import Image

import warnings
# Silences every Python warning for the rest of the session, including deprecation notices.
warnings.filterwarnings("ignore")

In [None]:
# Pick the accelerator once; later cells branch on this value.
device = "cuda" if torch.cuda.is_available() else "cpu"
print(device)

MODEL_ID = "Qwen/Qwen3-VL-8B-Instruct"
EPOCHS = 1

BATCH_SIZE = 1  # low due to lack of hardware to support
GRADIENT_CHECKPOINTING = True  # improves memory efficiency, but increases computation time
USE_REENTRANT = False
OPTIM = "paged_adamw_32bit"
LEARNING_RATE = 2e-5
LOGGING_STEPS = 50
EVAL_STEPS = 50
SAVE_STEPS = 50
EVAL_STRATEGY = "steps"
SAVE_STRATEGY = "steps"
METRIC_FOR_BEST_MODEL = "eval_loss"
LOAD_BEST_MODEL_AT_END = True
MAX_GRAD_NORM = 1
WARMUP_STEPS = 0
# The examples are already chat-formatted and batched by a custom collator, so the
# trainer must not try to prepare them itself. The key has to be spelled exactly
# "skip_prepare_dataset"; the previous "skip_prepare_datset" was silently ignored.
DATASET_KWARGS = {"skip_prepare_dataset": True}
REMOVE_UNUSED_COLUMNS = False  # Necessary for VLMS
MAX_SEQ_LEN = 128
# NOTE(review): hard-coded row count; the loading cells of this notebook only
# materialise a handful of rows, so confirm this still matches the data in use.
DATASET_LENGTH = 283
NUM_STEPS = (DATASET_LENGTH // BATCH_SIZE) * EPOCHS
print(NUM_STEPS)

In [None]:
system_message = """You are a multimodal driving-scene reasoning assistant for autonomous driving systems.

You are provided with:
- Multiple synchronized camera images from different viewpoints (e.g., front, rear, left, right).
- LiDAR data describing the 3D structure of the scene.
- A natural-language question about the driving environment.

Your task is to answer the question by jointly reasoning over ALL available modalities.

You must:
- Fuse information across all camera views and the LiDAR data to form a coherent understanding of the scene.
- Use camera images to identify visual attributes such as objects, signs, signals, weather, lighting, and semantics.
- Use LiDAR data to reason about 3D structure, distance, relative position, size, motion cues, and occlusions.
- Cross-check information between modalities when possible.
- Prioritize safety-aware and conservative reasoning when uncertainty exists.

You must NOT:
- Assume viewpoints, objects, or measurements not supported by the inputs.
- Hallucinate depth, distance, or object presence without LiDAR or clear visual evidence.
- Ignore any modality unless it is missing or explicitly empty.

Answering rules:
- Answer concisely and directly.
- Ground every conclusion in observable evidence from the inputs.
- Do not mention internal reasoning, model details, or the prompt itself."""


def format_data(sample):
    """
    Build a chat-style multimodal example from one dataset row.

    The result always starts with the shared system prompt, followed by a user
    turn whose content lists, in order: one image entry per available camera,
    an optional LiDAR entry, and finally the question text. An assistant turn
    holding the reference answer is appended when the row provides one.

    Args:
        sample (dict): One dataset row. "question" is required; the camera
            columns, "LIDAR_TOP" and "answer" are each used only when present
            and not None.

    Returns:
        dict: {"messages": [...]} with the system, user and (optionally)
        assistant messages.
    """
    # Fixed presentation order of the camera views.
    camera_order = (
        "CAM_FRONT",
        "CAM_FRONT_RIGHT",
        "CAM_BACK_RIGHT",
        "CAM_BACK",
        "CAM_BACK_LEFT",
        "CAM_FRONT_LEFT",
    )

    # One image entry per camera that exists in the row and is not None.
    parts = [
        {
            "type": "image",
            "image": Image.fromarray(np.array(sample[view]), 'RGB'),
            "view": view,
        }
        for view in camera_order
        if view in sample and sample[view] is not None
    ]

    # The LiDAR payload is forwarded untouched under a custom "lidar" entry type.
    # NOTE(review): whether the model's chat template consumes "lidar" entries is
    # not visible from this cell — confirm.
    has_lidar = "LIDAR_TOP" in sample and sample["LIDAR_TOP"] is not None
    if has_lidar:
        parts.append({
            "type": "lidar",
            "lidar": sample["LIDAR_TOP"],
            "view": "LIDAR_TOP",
        })

    # The question always comes last in the user turn.
    parts.append({"type": "text", "text": sample["question"]})

    conversation = [
        {"role": "system", "content": system_message},
        {"role": "user", "content": parts},
    ]

    # Reference answer is only available (and only needed) for supervised rows.
    has_answer = "answer" in sample and sample["answer"] is not None
    if has_answer:
        conversation.append({"role": "assistant", "content": sample["answer"]})

    return {"messages": conversation}


In [None]:
# Stream the "night" configuration's train split instead of downloading it in full.
train_stream = load_dataset(
    path="KevinNotSmile/nuscenes-qa-mini",
    name="night",
    split="train",
    streaming=True,
)


def _first_train_rows():
    """Yield the first three rows of the streamed train split."""
    yield from train_stream.take(3)


# Materialise those rows as a regular, indexable Dataset.
train_dataset = Dataset.from_generator(_first_train_rows)
print(len(train_dataset))

In [None]:
# Stream the "night" configuration's validation split instead of downloading it in full.
eval_stream = load_dataset(
    path="KevinNotSmile/nuscenes-qa-mini",
    name="night",
    split="validation",
    streaming=True,
)


def _first_eval_rows():
    """Yield the first row of the streamed validation split."""
    yield from eval_stream.take(1)


# Materialise that row as a regular, indexable Dataset.
eval_dataset = Dataset.from_generator(_first_eval_rows)
print(len(eval_dataset))

In [None]:
# Print the summary repr of each split, each followed by a 30-dash divider.
for split in (train_dataset, eval_dataset):
    print(split)
    print('-' * 30)

In [None]:
import math

# Convert every raw row into the chat-formatted structure.
formatted_train_rows = [format_data(row) for row in train_dataset]
eval_dataset_formated = [format_data(row) for row in eval_dataset]

# The first 66% (rounded up) of the formatted train rows stay in training;
# everything after that index becomes the held-out test slice.
split_at = math.ceil(len(formatted_train_rows) * 0.66)
train_dataset_formated = formatted_train_rows[:split_at]
test_dataset_formated = formatted_train_rows[split_at:]

for subset in (train_dataset_formated, eval_dataset_formated, test_dataset_formated):
    print(len(subset))

In [None]:
# Sanity check: for the first example of each split, the sixth entry of the user
# turn is expected to be the "CAM_FRONT_LEFT" image.
for examples in (train_dataset_formated, eval_dataset_formated):
    user_parts = examples[0]['messages'][1]['content']
    print(user_parts[5]['view'])

In [None]:
# Extra from_pretrained arguments; stays empty on CPU so the model loads with defaults.
load_kwargs = {}
if device == "cuda":
    # 4-bit NF4 quantization with double quantization and bfloat16 compute.
    bnb_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_use_double_quant=True,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_compute_dtype=torch.bfloat16,
    )
    load_kwargs = {"device_map": "auto", "quantization_config": bnb_config}

model = Qwen3VLForConditionalGeneration.from_pretrained(MODEL_ID, **load_kwargs)

# The processor is loaded from the same checkpoint; its tokenizer is set to pad on the right.
processor = Qwen3VLProcessor.from_pretrained(MODEL_ID)
processor.tokenizer.padding_side = "right"


In [None]:
# First held-out example; used below to compare the model's answer against the reference.
# Its "messages" list holds the system turn, the user turn (image/LiDAR/text entries)
# and, when the row had an answer, the assistant turn.
sample_data = test_dataset_formated[0]



In [None]:
def text_generator(sample_data, add_generation_prompt=True):
    """
    Generate the model's answer for one formatted sample and pair it with the
    reference answer.

    Only the system and user turns are rendered into the prompt; the assistant
    turn, if any, is used solely as the reference answer.

    Args:
        sample_data (dict): Must contain "messages" (system turn, user turn and,
            optionally, an assistant turn).
        add_generation_prompt (bool): Forwarded to the processor's chat template.
            - False for training
            - True for inference

    Returns:
        tuple: (generated_text, actual_answer). generated_text contains only the
        newly generated tokens, not the echoed prompt. actual_answer is None when
        the sample has no assistant turn.
    """
    messages = sample_data["messages"]

    text = processor.apply_chat_template(
        messages[0:2],
        tokenize=False,
        add_generation_prompt=add_generation_prompt
    )

    # Select images by entry type rather than by position: a row with a missing
    # camera has fewer than six image entries, and a fixed slice would then pick
    # up the LiDAR/text entries, which have no "image" key.
    image_inputs = [
        part["image"]
        for part in messages[1]["content"]
        if part.get("type") == "image"
    ]

    inputs = processor(
        text=[text],
        images=[image_inputs] if image_inputs else None,
        return_tensors='pt'
    )
    inputs = inputs.to(device)

    generated_ids = model.generate(**inputs, max_new_tokens=MAX_SEQ_LEN)

    # generate() returns prompt tokens followed by the new tokens; drop the prompt
    # so the decoded text is just the answer.
    prompt_length = inputs["input_ids"].shape[1]
    new_token_ids = generated_ids[:, prompt_length:]

    output_text = processor.batch_decode(
        new_token_ids, skip_special_tokens=True
    )

    # Inference-only samples carry no assistant turn.
    actual_answer = messages[2]["content"] if len(messages) > 2 else None

    return output_text[0], actual_answer

generated_text, actual_answer = text_generator(sample_data)
print(f"Generated Answer: {generated_text}")
print(f"Actual Answer: {actual_answer}")

In [None]:
# Show a compact per-example summary instead of dumping every example in full
# (the raw dump includes each image object and the whole LiDAR payload).
print(f"{len(train_dataset_formated)} formatted training example(s)")
for index, example in enumerate(train_dataset_formated):
    roles = [message["role"] for message in example["messages"]]
    part_types = [part["type"] for part in example["messages"][1]["content"]]
    print(index, roles, part_types)

In [None]:
# LoRA hyper-parameters: rank-8 adapters on the q_proj / v_proj modules.
lora_settings = dict(
    r=8,
    lora_alpha=16,
    lora_dropout=0.1,
    bias='none',
    target_modules=["q_proj", "v_proj"],
    task_type="CAUSAL_LM",
)
peft_config = LoraConfig(**lora_settings)

# Report the parameter count before wrapping, then the trainable share after wrapping.
base_parameter_count = model.num_parameters()
print(f"Before adapter paramters: {base_parameter_count}")

# NOTE(review): presumably this attaches the adapters to `model` itself rather than
# to a copy — confirm, since `model` is reused by later cells.
peft_model = get_peft_model(model, peft_config)
peft_model.print_trainable_parameters()


In [None]:
# Collect every training hyper-parameter from the configuration cell into one SFTConfig.
training_args = SFTConfig(
    output_dir="./output",
    num_train_epochs=EPOCHS,
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,
    gradient_checkpointing=GRADIENT_CHECKPOINTING,
    # USE_REENTRANT was defined in the configuration cell but never passed on;
    # wire it in so the chosen checkpointing mode actually takes effect.
    gradient_checkpointing_kwargs={"use_reentrant": USE_REENTRANT},
    learning_rate=LEARNING_RATE,
    logging_steps=LOGGING_STEPS,
    eval_steps=EVAL_STEPS,
    eval_strategy=EVAL_STRATEGY,
    save_strategy=SAVE_STRATEGY,
    save_steps=SAVE_STEPS,
    metric_for_best_model=METRIC_FOR_BEST_MODEL,
    load_best_model_at_end=LOAD_BEST_MODEL_AT_END,
    max_grad_norm=MAX_GRAD_NORM,
    warmup_steps=WARMUP_STEPS,
    dataset_kwargs=DATASET_KWARGS,
    max_length=MAX_SEQ_LEN,
    # The custom collator needs the raw "messages" column, so it must not be dropped.
    remove_unused_columns=REMOVE_UNUSED_COLUMNS,
    optim=OPTIM,
)

In [None]:
collate_sample = [train_dataset_formated[0]]

def collate_fn(examples):
    """
    Turn a list of formatted chat examples into one padded training batch.

    Args:
        examples (list): Each item is either a dict with a "messages" list or the
            bare "messages" list itself (system turn, user turn, assistant turn).

    Returns:
        The processor's batch with an added "labels" entry: a copy of "input_ids"
        in which padding tokens and image placeholder tokens are set to -100.
    """
    # Accept both {"messages": [...]} dicts and bare message lists so the collator
    # works regardless of which shape the trainer is given.
    conversations = [
        example["messages"] if isinstance(example, dict) else example
        for example in examples
    ]

    texts = [
        processor.apply_chat_template(messages, tokenize=False)
        for messages in conversations
    ]

    # Select images by entry type rather than by position: a row with a missing
    # camera has fewer than six image entries, and a fixed slice would then pick
    # up the LiDAR/text entries, which have no "image" key.
    image_inputs = [
        [part["image"] for part in messages[1]["content"] if part.get("type") == "image"]
        for messages in conversations
    ]

    batch = processor(
        text=texts, images=image_inputs, return_tensors="pt", padding=True,
    )

    # Labels start as a copy of the inputs; -100 marks positions excluded from the loss.
    labels = batch["input_ids"].clone()
    labels[labels == processor.tokenizer.pad_token_id] = -100

    # Image placeholder tokens carry no text to predict, so mask them too when the
    # processor exposes its image token.
    image_token = getattr(processor, "image_token", None)
    if image_token is not None:
        image_token_id = processor.tokenizer.convert_tokens_to_ids(image_token)
        labels[labels == image_token_id] = -100

    # Store the masked copy; assigning input_ids here would discard the masking above.
    batch["labels"] = labels

    return batch

collated_data = collate_fn(collate_sample)
print(collated_data.keys())

In [None]:
# Build the supervised fine-tuning trainer.
# The datasets are passed as the formatted {"messages": [...]} examples themselves:
# collate_fn reads example["messages"], so handing it the bare message lists would
# fail on the first batch.
# NOTE(review): an earlier cell already calls get_peft_model(model, peft_config) on
# this same model object; confirm that passing peft_config here does not attach the
# adapters a second time.
trainer = SFTTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset_formated,
    eval_dataset=eval_dataset_formated,
    data_collator=collate_fn,
    peft_config=peft_config,
    processing_class=processor.tokenizer,
)

In [None]:
# Baseline: run the evaluation loop once before any training step and show its metrics.
print("Initial Evaluation")
metric = trainer.evaluate()
print(metric)

# Run the fine-tuning loop configured by training_args.
print("Training")
trainer.train()
