<h1>1. Import Libraries</h1>

In [17]:
import torch
import torch.nn as nn
import torch.optim as optim
from torchvision import transforms, models
from torch.utils.data import Dataset, DataLoader
from PIL import Image
import pandas as pd
import os
import time
import numpy as np
import os
import cv2

<h1>2. Data Preprocessing</h1>

In [2]:
# Function to load and preprocess the dataset
def load_data(csv_file, img_dir, transform=None):
    data = pd.read_csv(csv_file)
    images, physical_features, labels = [], [], []

    for idx in range(len(data)):
        img_id = str(data.iloc[idx]['id'])
        img_path = os.path.join(img_dir, f"{img_id}.jpg")
        image = Image.open(img_path).convert("RGB")
        
        if transform:
            image = transform(image)
        
        stable_height = int(data.iloc[idx]['stable_height']) - 1  # zero-based class index
        block_type = data.iloc[idx]['type']
        cam_angle = data.iloc[idx]['cam_angle']

        images.append(image)
        physical_features.append([block_type, cam_angle])
        labels.append(stable_height)
    
    return images, torch.tensor(physical_features, dtype=torch.float32), torch.tensor(labels)


<h1>3. Model Definition</h1>

In [3]:
import torch
import torch.nn as nn
from torchvision import models

# Define the model
class CustomResNetWithFeatures(nn.Module):
    def __init__(self, num_types, num_classes):
        super(CustomResNetWithFeatures, self).__init__()
        self.resnet = models.resnet18(pretrained=True)  # Using ResNet18
        self.resnet.fc = nn.Linear(self.resnet.fc.in_features, 128)  # Modify ResNet output to 128 features

        # Embedding for block type
        self.type_embedding = nn.Embedding(num_types, 10)  # Embedding for block types
        
        # Fully connected layers for classification
        self.fc1 = nn.Linear(128 + 10 + 1, 64)  # Combine image features, block type embedding, and cam_angle
        self.fc2 = nn.Linear(64, num_classes)  # Final output for classification (logits for each class)

    def forward(self, x, block_type, cam_angle):
        x = self.resnet(x)  # Pass through ResNet
        type_embed = self.type_embedding(block_type)  # Get block type embeddings
        cam_angle = cam_angle.unsqueeze(1)  # Ensure cam_angle has correct shape
        
        # Concatenate features
        combined = torch.cat((x, type_embed, cam_angle), dim=1)
        
        x = torch.relu(self.fc1(combined))  # Pass through the first fully connected layer
        logits = self.fc2(x)  # Get final class logits
        return logits

# Update get_model function to return the model
def get_model(num_types, num_classes):
    return CustomResNetWithFeatures(num_types, num_classes)


<h1>4. Training Function</h1>

In [4]:
def train_model(csv_file_path, images_dir_path, num_types, num_classes=6, num_epochs=6, batch_size=16, model_save_path="model.pth"):
    # Define image transformations
    transform = transforms.Compose([
        transforms.RandomRotation(15),
        transforms.RandomResizedCrop(224, scale=(0.8, 1.0)),
        transforms.ColorJitter(brightness=0.1, contrast=0.1, saturation=0.1, hue=0.1),
        transforms.ToTensor(),
        transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
    ])
    
    # Load data
    images, physical_features, labels = load_data(csv_file_path, images_dir_path, transform)
    dataset = list(zip(images, physical_features, labels))
    
    # Use DataLoader without the need to stack tensors manually
    dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True)

    # Initialize model and optimizer
    model = get_model(num_types, num_classes)
    criterion = nn.CrossEntropyLoss()  # For classification
    optimizer = optim.Adam(model.parameters(), lr=0.0002, weight_decay=1e-4)

    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model.to(device)

    # Training loop
    for epoch in range(num_epochs):
        running_loss = 0.0
        running_correct = 0
        model.train()

        for batch in dataloader:
            images, physical_features, stable_heights = batch
            images = images.to(device)  # No need to stack, already a tensor
            block_type = (physical_features[:, 0] - 1).long().to(device)
            cam_angle = physical_features[:, 1].to(device)
            stable_heights = stable_heights.to(device)

            optimizer.zero_grad()
            logits = model(images, block_type, cam_angle)
            loss = criterion(logits, stable_heights)
            loss.backward()
            optimizer.step()

            running_loss += loss.item()
            _, predicted_classes = torch.max(logits, 1)
            running_correct += (predicted_classes == stable_heights).sum().item()

        accuracy = running_correct / len(dataloader.dataset)
        print(f"Epoch [{epoch+1}/{num_epochs}], Loss: {running_loss/len(dataloader.dataset):.4f}, Accuracy: {accuracy:.4f}")

        # Save the model after every epoch
        torch.save(model.state_dict(), model_save_path)
        print(f"Model saved after Epoch {epoch+1} to {model_save_path}")
    
    return model

<h1>5. Saving and Loading Model</h1>

In [5]:
# Function to save the model
def save_model(model, model_path):
    torch.save({'model_state_dict': model.state_dict()}, model_path)

# Function to load the model
def load_model(model_path, num_types):
    model = get_model(num_types, num_classes=6)
    checkpoint = torch.load(model_path)
    model.load_state_dict(checkpoint['model_state_dict'])
    model.eval()  # Set to evaluation mode
    return model


<h1>6. Generating Predictions</h1>

In [6]:
def generate_predictions(test_images_dir, model_path, output_csv_file, num_types=2):
    model = load_model(model_path, num_types)
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model.to(device)

    predictions = []

    for image_name in os.listdir(test_images_dir):
        img_id = os.path.splitext(image_name)[0]
        image_path = os.path.join(test_images_dir, image_name)
        
        image = load_data(image_path).to(device)
        block_type = torch.tensor(1).to(device)
        cam_angle = torch.tensor(0.0).to(device)

        with torch.no_grad():
            output = model(image.unsqueeze(0), block_type.unsqueeze(0), cam_angle.unsqueeze(0))
            predicted_class = output.argmax(dim=1).item() + 1

        predictions.append({'id': img_id, 'stable_height': predicted_class})

    # Save predictions to CSV
    pd.DataFrame(predictions).to_csv(output_csv_file, index=False)


<h1>7. Running the Training</h1>

In [7]:
def load_model(model_save_path, num_types, num_classes):
    # Initialize the model
    model = get_model(num_types, num_classes)
    
    # Load the saved model state dict
    model.load_state_dict(torch.load(model_save_path))
    model.eval()  # Set the model to evaluation mode (important for inference)
    
    return model


In [8]:
csv_file_path = '../COMP90086_2024_Project_train/train.csv'  # Path to your training CSV
images_dir_path = '../COMP90086_2024_Project_train/train'  # Path to your training images
model = train_model(csv_file_path, images_dir_path, num_types=2, num_classes=6, model_save_path='model.pth')




Epoch [1/6], Loss: 0.0904, Accuracy: 0.3764
Model saved after Epoch 1 to model.pth
Epoch [2/6], Loss: 0.0753, Accuracy: 0.5102
Model saved after Epoch 2 to model.pth
Epoch [3/6], Loss: 0.0661, Accuracy: 0.5854
Model saved after Epoch 3 to model.pth
Epoch [4/6], Loss: 0.0553, Accuracy: 0.6589
Model saved after Epoch 4 to model.pth
Epoch [5/6], Loss: 0.0441, Accuracy: 0.7371
Model saved after Epoch 5 to model.pth
Epoch [6/6], Loss: 0.0325, Accuracy: 0.8143
Model saved after Epoch 6 to model.pth


In [11]:
print(model)

CustomResNetWithFeatures(
  (resnet): ResNet(
    (conv1): Conv2d(3, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)
    (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (relu): ReLU(inplace=True)
    (maxpool): MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)
    (layer1): Sequential(
      (0): BasicBlock(
        (conv1): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
        (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (relu): ReLU(inplace=True)
        (conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
        (bn2): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      )
      (1): BasicBlock(
        (conv1): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
        (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine

In [12]:
model_ = load_model('model.pth', num_types=2, num_classes=6)

  model.load_state_dict(torch.load(model_save_path))


In [20]:
def load_images_from_folder(folder_path, ids, image_size=(224, 224)):
    images = []
    for id in ids:
        filename = str(int(id)) + '.jpg'

        # Load the image using OpenCV
        img_path = os.path.join(folder_path, filename)
        img = cv2.imread(img_path)

        # Resize the image to the target size (224, 224)
        img = cv2.resize(img, image_size)

        # Convert BGR (OpenCV default) to RGB if needed (TensorFlow models use RGB)
        img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)

        # Normalize pixel values to the range [0, 1]
        img_array = img / 255.0

        images.append(img_array)

    # Convert the list of images to a NumPy array with shape (n, 224, 224, 3)
    return np.array(images)

folder_path = '../COMP90086_2024_Project_test/test'
test = np.genfromtxt('../COMP90086_2024_Project_test/test.csv', delimiter=',', skip_header=1)
ids = np.array(test, dtype=int)
test_image_data = load_images_from_folder(folder_path, ids)

# Predict stable height on the test set

[    95    706   2854 ... 998916 999235 999651]


IndexError: too many indices for array: array is 1-dimensional, but 2 were indexed