In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Lazily enumerate every file below the Kaggle input mount and echo its full path,
# in the same directory-by-directory order that os.walk produces.
input_root = '/kaggle/input'
input_file_paths = (
    os.path.join(folder, name)
    for folder, _subdirs, names in os.walk(input_root)
    for name in names
)
for input_file_path in input_file_paths:
    print(input_file_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/music-test/music_test/S1m40_03.png
/kaggle/input/music-test/music_test/S1m21_05.png
/kaggle/input/music-test/music_test/S1m12_06.png
/kaggle/input/music-test/music_test/S1m61_02.png
/kaggle/input/music-test/music_test/S1m39_05.png
/kaggle/input/music-test/music_test/S1m20_00.png
/kaggle/input/music-test/music_test/S1m67_07.png
/kaggle/input/music-test/music_test/S1m52_04.png
/kaggle/input/music-test/music_test/S1m21_07.png
/kaggle/input/music-test/music_test/S1m28_06.png
/kaggle/input/music-test/music_test/S1m73_04.png
/kaggle/input/music-test/music_test/S1m25_00.png
/kaggle/input/music-test/music_test/S1m34_07.png
/kaggle/input/music-test/music_test/S1m04_06.png
/kaggle/input/music-test/music_test/S1m78_05.png
/kaggle/input/music-test/music_test/S1m70_04.png
/kaggle/input/music-test/music_test/S1m74_05.png
/kaggle/input/music-test/music_test/S1m07_07.png
/kaggle/input/music-test/music_test/S1m20_06.png
/kaggle/input/music-test/music_test/S1m10_00.png
/kaggle/input/music-

In [5]:
import torch
import torch.nn as nn
import pandas as pd
from PIL import Image
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms, models
import os

# --- 1. Dataset Configuration ---
class MusicScaleDataset(Dataset):
    """
    Map-style dataset that pairs each music spectrogram image with its integer label.

    The ground-truth CSV is read positionally: column 0 holds the image
    filename (relative to ``img_dir``) and column 1 holds the class label.
    """
    def __init__(self, csv_file, img_dir, transform=None):
        # Folder that contains the image files named in the CSV
        self.img_dir = img_dir
        # Optional callable applied to every loaded PIL image
        self.transform = transform
        # Ground-truth table (filename in column 0, label in column 1)
        self.data_frame = pd.read_csv(csv_file)

    def __len__(self):
        # One sample per CSV row
        return len(self.data_frame)

    def __getitem__(self, idx):
        table = self.data_frame

        # Resolve the row's filename against the image folder and load it as RGB
        image_path = os.path.join(self.img_dir, table.iloc[idx, 0])
        image = Image.open(image_path).convert('RGB')

        # Column 1 carries the label; cast it to a plain Python int
        label = int(table.iloc[idx, 1])

        # Run the transform only when one was supplied
        return (self.transform(image) if self.transform else image), label

In [6]:
# --- 2. Image Preprocessing ---
# Define the transformation pipeline for the ResNet model
# ImageNet per-channel statistics used for normalization
imagenet_mean = [0.485, 0.456, 0.406]
imagenet_std = [0.229, 0.224, 0.225]

# Preprocessing for the ResNet model: 224x224 resize -> tensor -> normalization
preprocessing_steps = [
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
    transforms.Normalize(mean=imagenet_mean, std=imagenet_std),
]
transform = transforms.Compose(preprocessing_steps)

# Training set: labels from the truth CSV, images from the Kaggle train folder
train_csv_path = '/kaggle/input/music-scale-recognition-by-cnn/train_truth.csv'
train_img_dir = '/kaggle/input/music-train/music_train'
train_dataset = MusicScaleDataset(
    csv_file=train_csv_path,
    img_dir=train_img_dir,
    transform=transform,
)

# Shuffled mini-batches of 32 samples
train_loader = DataLoader(dataset=train_dataset, shuffle=True, batch_size=32)

In [8]:
# --- 3. Model Definition (Transfer Learning) ---
from torchvision.models import ResNet18_Weights

# Start from the default pre-trained ResNet-18 checkpoint
model = models.resnet18(weights=ResNet18_Weights.DEFAULT)

# Swap the final fully connected layer for a fresh 88-way head (piano notes)
num_ftrs = model.fc.in_features
model.fc = nn.Linear(in_features=num_ftrs, out_features=88)

# Prefer CUDA when it is available, otherwise stay on the CPU, then relocate the model
device_name = "cuda" if torch.cuda.is_available() else "cpu"
device = torch.device(device_name)
model = model.to(device)
print("Device detected:", device)

Device detected: cpu


In [10]:
# --- 4. Training Parameters ---
import torch.optim as optim
# Multi-class classification objective
criterion = nn.CrossEntropyLoss()
# Adam over every model parameter, learning rate 1e-4
optimizer = optim.Adam(params=model.parameters(), lr=1e-4)

In [11]:
# --- 5. Training Loop ---
import torch.optim as optim

def train(epochs):
    """
    Train the notebook-level ``model`` on ``train_loader``.

    Uses the globals defined in earlier cells: ``model``, ``train_loader``,
    ``device``, ``optimizer`` and ``criterion``.

    Args:
        epochs: Number of full passes over ``train_loader``.

    Returns:
        list[float]: Average per-batch loss of each epoch, in order. These are
        the same values that are printed, so they can be plotted afterwards.

    Raises:
        ValueError: If ``train_loader`` yields no batches during an epoch, in
            which case an average loss cannot be computed.
    """
    model.train() # Set the model to training mode
    epoch_losses = []
    for epoch in range(epochs):
        running_loss = 0.0
        # Count batches while iterating instead of relying on len(train_loader):
        # this avoids a division by zero on an empty loader and also works for
        # loaders that do not define __len__.
        num_batches = 0
        for images, labels in train_loader:
            # Transfer data to GPU/CPU
            images, labels = images.to(device), labels.to(device)

            optimizer.zero_grad()             # Reset gradients from the previous step
            outputs = model(images)           # Forward pass
            loss = criterion(outputs, labels) # Calculate loss
            loss.backward()                   # Backward pass (backpropagation)
            optimizer.step()                  # Update weights

            running_loss += loss.item()
            num_batches += 1

        if num_batches == 0:
            raise ValueError(
                "train_loader produced no batches; check the dataset and batch settings"
            )

        # Log average loss per epoch
        average_loss = running_loss / num_batches
        epoch_losses.append(average_loss)
        print(f"Epoch {epoch+1}, Average Loss: {average_loss:.4f}")

    return epoch_losses

In [12]:
# --- 6. Inference and Submission Generation ---
def predict_and_save(test_img_dir, sample_csv, output_csv='CNN_submission.csv'):
    """
    Predict a class for every image listed in ``sample_csv`` and write a submission CSV.

    Uses the globals defined in earlier cells: ``model``, ``transform`` and ``device``.

    Args:
        test_img_dir: Directory that contains the test images.
        sample_csv: Path to a CSV with a ``filename`` column; predictions are
            produced in exactly this row order.
        output_csv: Destination path of the submission file. Defaults to
            ``'CNN_submission.csv'`` in the current working directory.

    Returns:
        None. The sample table, with its ``category`` column set to the
        predicted class indices, is written to ``output_csv``.
    """
    model.eval() # Set the model to evaluation mode
    sample_df = pd.read_csv(sample_csv)
    predictions = []

    with torch.no_grad(): # Disable gradient calculation for efficiency
        # Predict labels following the exact order of the sample CSV
        for filename in sample_df['filename']:
            img_path = os.path.join(test_img_dir, filename)
            # The context manager closes the underlying file as soon as the
            # pixel data has been copied into the converted RGB image.
            with Image.open(img_path) as raw_img:
                img = raw_img.convert('RGB')
            img = transform(img).unsqueeze(0).to(device) # Prepare batch dimension

            output = model(img)
            # Index of the highest-scoring class for the single image in the batch
            predictions.append(output.argmax(dim=1).item())

    # Save the final results to a CSV for Kaggle submission
    sample_df['category'] = predictions
    sample_df.to_csv(output_csv, index=False)
    print(f"Submission file '{output_csv}' generated successfully!")

In [None]:
# --- Execution ---
# Kaggle locations of the test images and of the sample table that fixes the row order
test_image_folder = '/kaggle/input/music-test/music_test'
sample_table_csv = '/kaggle/input/music-scale-recognition-by-cnn/sample_truth.csv'

# Ten passes over the training data, then write predictions for the test set
train(epochs=10)
predict_and_save(test_img_dir=test_image_folder, sample_csv=sample_table_csv)