In [None]:
class_id_to_name = {
    "0": "air_conditioner",
    "1": "car_horn",
    "2": "children_playing",
    "3": "dog_bark",
    "4": "drilling",
    "5": "engine_idling",
    "6": "gun_shot",
    "7": "jackhammer",
    "8": "siren",
    "9": "street_music"
}


In [2]:
import os
import json

# Define the base directory where the folds are located
base_dir = "../UrbanSound-Spectrogram"

# Define the output file
output_file = "urbansound_dataset.jsonl"

# Open the output file for writing
with open(output_file, "w") as outfile:
    # Iterate through each fold directory
    for fold in range(1, 11):
        fold_dir = os.path.join(base_dir, f"fold{fold}")
        # Iterate through each file in the fold directory
        for filename in os.listdir(fold_dir):
            if filename.endswith(".png"):
                # Extract classID from the filename
                class_id = filename.split("-")[1]
                # Map classID to class name
                label = class_id_to_name.get(class_id, "unknown")
                # Construct the full path to the image
                image_path = os.path.join(fold_dir, filename)
                # Create the data entry
                data_entry = {
                    "image": image_path,
                    "text": "Classify the sound in this spectrogram.",
                    "label": label
                }
                # Write the JSON line to the output file
                outfile.write(json.dumps(data_entry) + "\n")


FileNotFoundError: [Errno 2] No such file or directory: '../UrbanSound-Spectrogram/fold2'

In [6]:
# Load model directly
import torch
from transformers import AutoProcessor, AutoModelForVisualQuestionAnswering
from PIL import Image

processor = AutoProcessor.from_pretrained("Salesforce/blip-vqa-base")
model = AutoModelForVisualQuestionAnswering.from_pretrained("Salesforce/blip-vqa-base", torch_dtype="auto").to("cuda", torch.float16)

raw_image = Image.open("../UrbanSound-Spectrogram/fold1/98223-7-2-0.png").convert('RGB')

question = "Classify the sound in this spectrogram ? Is it an air conditioner, car horn, children playing, dog bark, drilling, engine idling, gun shot, jackhammer, siren or street music ?"
inputs = processor(raw_image, question, return_tensors="pt").to("cuda", torch.float16)

output = model.generate(**inputs)
print(output)
print(processor.decode(output[0], skip_special_tokens=True))



tensor([[30522,  3751,   102]], device='cuda:0')
electric


In [None]:
from datasets import load_dataset
import torch
from transformers import AutoProcessor, AutoModelForVisualQuestionAnswering, TrainingArguments
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training, TaskType
from PIL import Image
import numpy as np
from tqdm.auto import tqdm
import torch.nn.functional as F
from torch.utils.data import DataLoader

# Load the dataset
dataset = load_dataset('json', data_files='urbansound_dataset.jsonl', split='train')

# Split the dataset into training and validation sets
dataset = dataset.train_test_split(test_size=0.2)
train_dataset = dataset['train']
val_dataset = dataset['test']

print(f"Training examples: {len(train_dataset)}")
print(f"Validation examples: {len(val_dataset)}")

# Load the processor and model
processor = AutoProcessor.from_pretrained("Salesforce/blip-vqa-base")
model = AutoModelForVisualQuestionAnswering.from_pretrained(
    "Salesforce/blip-vqa-base", 
    torch_dtype=torch.float16
)

# Define LoRA Configuration
lora_config = LoraConfig(
    r=16,  # dimension of the low-rank matrices
    lora_alpha=32,  # scaling factor
    target_modules=["query", ""],  # layers to apply LoRA
    lora_dropout=0.05,
    bias="none",
    task_type=TaskType.QUESTION_ANS
)

# Prepare model for LoRA fine-tuning
model = prepare_model_for_kbit_training(model)
model = get_peft_model(model, lora_config)
model.print_trainable_parameters()  # Prints the percentage of trainable parameters

# Define data preprocessing function
def preprocess_function(examples):
    questions = ["Classify the sound in this spectrogram."] * len(examples['image'])
    images = [Image.open(image_path).convert('RGB') for image_path in examples['image']]
    labels = examples['label']
    
    # Encode the inputs
    inputs = processor(
        images=images, 
        text=questions, 
        return_tensors="pt", 
        padding="max_length",  # Use max_length padding
        max_length=32,  # Set a fixed max length for input ids
        truncation=True
    )
    
    # Encode the labels with fixed length
    encoded_labels = processor.tokenizer(
        labels, 
        return_tensors="pt", 
        padding="max_length", 
        max_length=8,  # Set a fixed max length for labels
        truncation=True
    ).input_ids
    
    # Remove special tokens (BOS/EOS)
    encoded_labels = encoded_labels[:, 1:-1] 
    
    # Create attention mask for labels
    label_attention_mask = (encoded_labels != 0).long()
    
    inputs["labels"] = encoded_labels
    inputs["label_attention_mask"] = label_attention_mask
    
    return inputs
    
    


# Preprocess the datasets
train_dataset = train_dataset.map(preprocess_function, batched=True, batch_size=8)
val_dataset = val_dataset.map(preprocess_function, batched=True, batch_size=8)

# Set format for PyTorch
train_dataset.set_format(type='torch', columns=['pixel_values', 'input_ids', 'attention_mask', 'labels', 'label_attention_mask'])
val_dataset.set_format(type='torch', columns=['pixel_values', 'input_ids', 'attention_mask', 'labels', 'label_attention_mask'])

# Create DataLoaders
train_dataloader = DataLoader(train_dataset, batch_size=4, shuffle=True)
val_dataloader = DataLoader(val_dataset, batch_size=4)

# Training arguments
training_args = TrainingArguments(
    output_dir="./blip-vqa-lora-urbansound",
    learning_rate=5e-5,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    num_train_epochs=3,
    weight_decay=0.01,
    # evaluation_strategy="epoch",
    # save_strategy="epoch",
    # load_best_model_at_end=True,
    push_to_hub=False,
    fp16=True,  # Use mixed precision training
)

# Define custom trainer to handle the sequence generation
class CustomTrainer:
    def __init__(self, model, args, train_dataloader, val_dataloader, optimizer):
        self.model = model
        self.args = args
        self.train_dataloader = train_dataloader
        self.val_dataloader = val_dataloader
        self.optimizer = optimizer
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        self.model.to(self.device)
    
    def train(self):
        self.model.train()
        for epoch in range(int(self.args.num_train_epochs)):
            print(f"Epoch {epoch+1}/{self.args.num_train_epochs}")
            total_loss = 0
            progress_bar = tqdm(self.train_dataloader)
            
            for batch in progress_bar:
                batch = {k: v.to(self.device) for k, v in batch.items()}
                self.optimizer.zero_grad()
                
                # Forward pass
                outputs = self.model(
                    pixel_values=batch["pixel_values"],
                    input_ids=batch["input_ids"],
                    attention_mask=batch["attention_mask"],
                    labels=batch["labels"]
                )
                
                loss = outputs.loss
                total_loss += loss.item()
                
                # Backward pass
                loss.backward()
                self.optimizer.step()
                
                progress_bar.set_description(f"Loss: {loss.item():.4f}")
            
            avg_loss = total_loss / len(self.train_dataloader)
            print(f"Average training loss: {avg_loss:.4f}")
            
            # Evaluation
            eval_results = self.evaluate()
            print(f"Validation Loss: {eval_results['loss']:.4f}, Accuracy: {eval_results['accuracy']:.4f}")
            
            # Save checkpoint
            torch.save(self.model.state_dict(), f"{self.args.output_dir}/model_epoch_{epoch+1}.pt")
    
    def evaluate(self):
        self.model.eval()
        total_loss = 0
        all_preds = []
        all_labels = []
        
        with torch.no_grad():
            for batch in tqdm(self.val_dataloader, desc="Evaluating"):
                batch = {k: v.to(self.device) for k, v in batch.items()}
                
                outputs = self.model(
                    pixel_values=batch["pixel_values"],
                    input_ids=batch["input_ids"],
                    attention_mask=batch["attention_mask"],
                    labels=batch["labels"]
                )
                
                loss = outputs.loss
                total_loss += loss.item()
                
                # Generate predictions
                generated_ids = self.model.generate(
                    pixel_values=batch["pixel_values"],
                    input_ids=batch["input_ids"],
                    attention_mask=batch["attention_mask"],
                    max_length=20
                )
                
                pred_texts = processor.batch_decode(generated_ids, skip_special_tokens=True)
                label_texts = processor.batch_decode(batch["labels"], skip_special_tokens=True)
                
                all_preds.extend(pred_texts)
                all_labels.extend(label_texts)
        
        # Calculate accuracy
        correct = sum(1 for pred, label in zip(all_preds, all_labels) if pred.strip() == label.strip())
        accuracy = correct / len(all_labels)
        
        return {
            "loss": total_loss / len(self.val_dataloader),
            "accuracy": accuracy
        }

# Initialize optimizer
optimizer = torch.optim.AdamW(model.parameters(), lr=training_args.learning_rate)

# Create and run the trainer
trainer = CustomTrainer(
    model=model,
    args=training_args,
    train_dataloader=train_dataloader,
    val_dataloader=val_dataloader,
    optimizer=optimizer
)

# Start training
trainer.train()

# Save the final model
model.save_pretrained(f"{training_args.output_dir}/final_model")

# Test the model on a sample image
def test_model(image_path, question):
    # Load the fine-tuned model
    fine_tuned_model = AutoModelForVisualQuestionAnswering.from_pretrained(
        f"{training_args.output_dir}/final_model",
        torch_dtype=torch.float16
    ).to("cuda")
    
    raw_image = Image.open(image_path).convert('RGB')
    inputs = processor(raw_image, question, return_tensors="pt").to("cuda", torch.float16)
    
    output = fine_tuned_model.generate(**inputs)
    answer = processor.decode(output[0], skip_special_tokens=True)
    
    return answer

# Test the model on a sample
sample_image = "../UrbanSound-Spectrogram/fold1/98223-7-2-0.png"
question = "Classify the sound in this spectrogram."
prediction = test_model(sample_image, question)
print(f"Prediction: {prediction}")

Training examples: 698
Validation examples: 175
trainable params: 1,179,648 || all params: 385,852,220 || trainable%: 0.3057


Map: 100%|██████████| 698/698 [07:07<00:00,  1.63 examples/s]
Map: 100%|██████████| 175/175 [01:47<00:00,  1.63 examples/s]


Epoch 1/3


  0%|          | 0/175 [00:00<?, ?it/s]


RuntimeError: stack expects each tensor to be equal size, but got [4] at entry 0 and [3] at entry 3

In [9]:
# Add this code to identify target modules in the model
def find_target_modules(model):
    """Helper function to find potential target modules for LoRA."""
    target_modules = set()
    for name, module in model.named_modules():
        if isinstance(module, torch.nn.Linear):
            if 'query' in name or 'key' in name or 'value' in name:
                parent_name = '.'.join(name.split('.')[:-1])
                target_modules.add(parent_name)
                print(f"Found attention module: {name}")
    
    # Also print some example module names to help identify patterns
    print("\nSample of module names (first 20):")
    for i, (name, _) in enumerate(model.named_modules()):
        if i < 20:
            print(name)
        else:
            break
    
    return target_modules

# Check the model architecture to identify the correct target modules
find_target_modules(model)

# Now update the LoRA configuration with correct target modules
# For BLIP models, the target modules are likely to be something like:
lora_config = LoraConfig(
    r=16,  # dimension of the low-rank matrices
    lora_alpha=32,  # scaling factor
    target_modules=["query", "value"],  # Update these based on the output above
    lora_dropout=0.05,
    bias="none",
    task_type=TaskType.CAUSAL_LM
)

# Prepare model for LoRA fine-tuning
model = prepare_model_for_kbit_training(model)
model = get_peft_model(model, lora_config)
model.print_trainable_parameters()

Found attention module: text_encoder.encoder.layer.0.attention.self.query
Found attention module: text_encoder.encoder.layer.0.attention.self.key
Found attention module: text_encoder.encoder.layer.0.attention.self.value
Found attention module: text_encoder.encoder.layer.0.crossattention.self.query
Found attention module: text_encoder.encoder.layer.0.crossattention.self.key
Found attention module: text_encoder.encoder.layer.0.crossattention.self.value
Found attention module: text_encoder.encoder.layer.1.attention.self.query
Found attention module: text_encoder.encoder.layer.1.attention.self.key
Found attention module: text_encoder.encoder.layer.1.attention.self.value
Found attention module: text_encoder.encoder.layer.1.crossattention.self.query
Found attention module: text_encoder.encoder.layer.1.crossattention.self.key
Found attention module: text_encoder.encoder.layer.1.crossattention.self.value
Found attention module: text_encoder.encoder.layer.2.attention.self.query
Found attention 