<h1>1. Import Libraries</h1>

In [25]:
import os
import pandas as pd
from PIL import Image

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader
from torchvision import models, transforms

<h1>2. Data Preprocessing</h1>

In [26]:
# Function to load and preprocess the dataset
def load_data(csv_file, img_dir, transform=None):
    """Load every image listed in a CSV file together with its features and label.

    The CSV must provide the columns ``id``, ``stable_height``, ``type`` and
    ``cam_angle``. Each image is read from ``<img_dir>/<id>.jpg`` and converted
    to RGB.

    Args:
        csv_file: Path to the CSV file describing the samples.
        img_dir: Directory containing one ``<id>.jpg`` file per CSV row.
        transform: Optional callable applied once to each RGB image as it is
            loaded. Any randomness in the transform is therefore sampled a
            single time per image, at load time.

    Returns:
        A tuple ``(images, physical_features, labels)``: ``images`` is a list
        of the (optionally transformed) images, ``physical_features`` is a
        float32 tensor whose rows are ``[type, cam_angle]``, and ``labels`` is
        an int64 tensor holding ``stable_height - 1``.
    """
    data = pd.read_csv(csv_file)
    images, physical_features, labels = [], [], []

    # Walk the columns in lockstep instead of pulling whole rows with ``iloc``:
    # a row Series is upcast to one common dtype, so a single float column
    # would turn an integer id into "123.0" and break the file lookup.
    columns = zip(data['id'], data['stable_height'], data['type'], data['cam_angle'])
    for img_id, stable_height, block_type, cam_angle in columns:
        img_path = os.path.join(img_dir, f"{img_id}.jpg")

        # ``convert`` returns a fully loaded copy, so the underlying file can
        # be closed as soon as the conversion has finished.
        with Image.open(img_path) as img_file:
            image = img_file.convert("RGB")

        if transform is not None:
            image = transform(image)

        images.append(image)
        physical_features.append([float(block_type), float(cam_angle)])
        labels.append(int(stable_height) - 1)  # zero-based class index

    return (
        images,
        torch.tensor(physical_features, dtype=torch.float32),
        torch.tensor(labels, dtype=torch.long),
    )


<h1>3. Model Definition</h1>

In [27]:
# Define the model
class CustomResNetWithFeatures(nn.Module):
    """ResNet-18 image encoder fused with block-type and camera-angle inputs.

    The ResNet head is replaced so the backbone emits a 128-dimensional image
    feature. That feature is concatenated with a 10-dimensional learned
    embedding of the block type and the scalar camera angle, then passed
    through a small two-layer classifier that outputs one logit per class.

    Args:
        num_types: Number of distinct block types (size of the embedding table).
        num_classes: Number of output classes.
        pretrained: When True (the default, matching the previous behaviour),
            initialise the backbone with ImageNet weights. Pass False to skip
            the download, e.g. when a saved state dict is loaded right after.
    """

    IMAGE_FEATURES = 128  # size of the image feature produced by the backbone
    TYPE_EMBED_DIM = 10   # size of the block-type embedding
    HIDDEN_UNITS = 64     # width of the hidden classifier layer

    def __init__(self, num_types, num_classes, pretrained=True):
        super().__init__()

        # ``weights=`` replaces the deprecated ``pretrained=True`` flag;
        # IMAGENET1K_V1 is the checkpoint that flag used to select.
        weights = models.ResNet18_Weights.IMAGENET1K_V1 if pretrained else None
        self.resnet = models.resnet18(weights=weights)
        # Swap the ImageNet classification head for a feature projection.
        self.resnet.fc = nn.Linear(self.resnet.fc.in_features, self.IMAGE_FEATURES)

        # Learned embedding for the categorical block type.
        self.type_embedding = nn.Embedding(num_types, self.TYPE_EMBED_DIM)

        # Classifier over [image features | type embedding | cam_angle].
        self.fc1 = nn.Linear(self.IMAGE_FEATURES + self.TYPE_EMBED_DIM + 1, self.HIDDEN_UNITS)
        self.fc2 = nn.Linear(self.HIDDEN_UNITS, num_classes)

    def forward(self, x, block_type, cam_angle):
        """Return class logits for a batch.

        Args:
            x: Image batch fed to the ResNet backbone.
            block_type: Integer tensor of indices into the type embedding,
                one per sample.
            cam_angle: 1-D tensor with one camera angle per sample.

        Returns:
            Tensor of raw logits, one row per sample and one column per class.
        """
        image_features = self.resnet(x)
        type_embed = self.type_embedding(block_type)
        # Turn the per-sample scalar into a column so it can be concatenated.
        cam_angle = cam_angle.unsqueeze(1)

        combined = torch.cat((image_features, type_embed, cam_angle), dim=1)

        hidden = torch.relu(self.fc1(combined))
        logits = self.fc2(hidden)
        return logits

<h1>4. Training Function</h1>

In [28]:
def train_model(csv_file_path, images_dir_path, num_types, num_classes=6, num_epochs=10, batch_size=16, model_save_path="model.pth"):
    """Train a CustomResNetWithFeatures classifier and checkpoint it every epoch.

    Args:
        csv_file_path: CSV file describing the training samples.
        images_dir_path: Directory holding the training images.
        num_types: Number of distinct block types.
        num_classes: Number of stable-height classes.
        num_epochs: Number of passes over the training data.
        batch_size: Mini-batch size.
        model_save_path: Prefix for checkpoint files; epoch ``k`` is written
            to ``f"{model_save_path}_{k}.pth"``.

    Returns:
        The trained model, left on the device used for training.
    """
    # Augmentation + ImageNet normalisation.
    # NOTE(review): load_data applies this transform once while loading, and
    # the results are stored in a list, so each image keeps the same random
    # augmentation for every epoch.
    transform = transforms.Compose([
        transforms.RandomRotation(15),
        transforms.RandomResizedCrop(224, scale=(0.8, 1.0)),
        transforms.ColorJitter(brightness=0.1, contrast=0.1, saturation=0.1, hue=0.1),
        transforms.ToTensor(),
        transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
    ])

    # Load everything into memory and let the DataLoader batch the tuples.
    images, physical_features, labels = load_data(csv_file_path, images_dir_path, transform)
    dataset = list(zip(images, physical_features, labels))
    dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True)
    num_samples = len(dataset)

    # Model, loss and optimiser.
    model = CustomResNetWithFeatures(num_types, num_classes)
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(model.parameters(), lr=0.0002, weight_decay=1e-4)

    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model.to(device)

    # Make sure the checkpoint directory exists before the first save.
    save_dir = os.path.dirname(model_save_path)
    if save_dir:
        os.makedirs(save_dir, exist_ok=True)

    for epoch in range(num_epochs):
        running_loss = 0.0
        running_correct = 0
        model.train()

        for batch_images, batch_features, stable_heights in dataloader:
            batch_images = batch_images.to(device)
            # CSV block types are shifted by one to become embedding indices.
            block_type = (batch_features[:, 0] - 1).long().to(device)
            cam_angle = batch_features[:, 1].to(device)
            stable_heights = stable_heights.to(device)

            optimizer.zero_grad()
            logits = model(batch_images, block_type, cam_angle)
            loss = criterion(logits, stable_heights)
            loss.backward()
            optimizer.step()

            # The criterion returns the batch mean; weight it by the batch
            # size so that dividing by the sample count below yields the true
            # per-sample average, even when the last batch is smaller.
            running_loss += loss.item() * batch_images.size(0)
            predicted_classes = logits.argmax(dim=1)
            running_correct += (predicted_classes == stable_heights).sum().item()

        epoch_loss = running_loss / num_samples
        accuracy = running_correct / num_samples
        print(f"Epoch [{epoch+1}/{num_epochs}], Loss: {epoch_loss:.4f}, Accuracy: {accuracy:.4f}")

        # Save a checkpoint after every epoch and report the file actually written.
        checkpoint_path = f"{model_save_path}_{epoch+1}.pth"
        torch.save(model.state_dict(), checkpoint_path)
        print(f"Model saved after Epoch {epoch+1} to {checkpoint_path}")

    return model

<h1>5. Running the Training</h1>

In [29]:
def load_model(model_save_path, num_types, num_classes):
    """Rebuild the model and restore its weights from a saved state dict.

    Args:
        model_save_path: Path to a file written with
            ``torch.save(model.state_dict(), ...)``.
        num_types: Number of block types the checkpoint was trained with.
        num_classes: Number of output classes the checkpoint was trained with.

    Returns:
        The model with the loaded weights, in evaluation mode, on the CPU.
    """
    model = CustomResNetWithFeatures(num_types, num_classes)

    # map_location="cpu" lets a checkpoint saved on a GPU load on a machine
    # without one. weights_only=True restricts unpickling to tensors and plain
    # containers, which is all a state dict holds.
    state_dict = torch.load(model_save_path, map_location="cpu", weights_only=True)
    model.load_state_dict(state_dict)
    model.eval()  # inference mode for layers such as BatchNorm

    return model


In [30]:
# Locations of the training data (relative paths).
images_dir_path = '../COMP90086_2024_Project_train/train'  # directory with the training images
csv_file_path = '../COMP90086_2024_Project_train/train.csv'  # CSV listing the training samples
# Uncomment the next line to start training:
# model = train_model(csv_file_path, images_dir_path, num_types=2, num_classes=6, model_save_path='models/model')

<h1>Prediction time</h1>

In [32]:
# 2. Preprocess the image
def preprocess_image(image_path):
    """Load one image file and turn it into a normalised single-image batch.

    Args:
        image_path: Path to the image file.

    Returns:
        The transformed image tensor with a leading batch dimension of size 1.
    """
    transform = transforms.Compose([
        transforms.Resize((224, 224)),  # same spatial size as the training crops
        transforms.ToTensor(),
        # Same normalisation statistics as the training transform.
        transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
    ])

    # ``convert`` returns a fully loaded copy, so the file handle can be
    # closed immediately instead of lingering until garbage collection.
    with Image.open(image_path) as img_file:
        image = img_file.convert('RGB')

    # unsqueeze(0) adds the batch dimension expected by the model.
    return transform(image).unsqueeze(0)

# 3. Make a prediction
def predict_image(model, image_tensor, block_type, cam_angle):
    """Predict the stable-height class for a single preprocessed image.

    Args:
        model: Module called as ``model(image, block_type, cam_angle)`` that
            returns per-class logits.
        image_tensor: Image batch containing one image.
        block_type: Block-type index; it is passed to the model unchanged.
        cam_angle: Camera angle as a number.

    Returns:
        The predicted class as an int, shifted by one so the first class is 1.
    """
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model = model.to(device)
    # Always predict in evaluation mode: a model that is still in training
    # mode would apply train-time layer behaviour (and update BatchNorm
    # statistics) during prediction.
    model.eval()

    image_tensor = image_tensor.to(device)
    block_type = torch.tensor([block_type]).to(device).long()
    cam_angle = torch.tensor([cam_angle]).to(device).float()

    with torch.no_grad():  # no gradients are needed for inference
        logits = model(image_tensor, block_type, cam_angle)

    # argmax yields a zero-based index; add 1 to report a one-based class.
    return torch.argmax(logits, dim=1).item() + 1

In [36]:
# 4. Example usage
# Model configuration; must match the values used when the checkpoint was trained.
num_types = 2  # number of block types
num_classes = 6  # number of stable_height classes

# Input locations.
model_path = 'models/model_5.pth'  # saved model state dict
folder_path = '../COMP90086_2024_Project_test/test'  # directory with the test images

# Restore the trained model.
model = load_model(model_path, num_types=num_types, num_classes=num_classes)

# Collected results: one image id and one predicted class per test image.
predictions = dict(id=[], stable_height=[])

# NOTE(review): stray notebook output (looks like a source line echoed by a warning), not code:
#   model.load_state_dict(torch.load(model_save_path))


In [37]:
# Predict a stable height for every JPEG in the test folder.
# sorted() makes the output order reproducible; os.listdir order is arbitrary.
for image in sorted(os.listdir(folder_path)):
    # splitext removes only the extension. str.strip(".jpg") would instead
    # strip any leading/trailing '.', 'j', 'p' or 'g' characters from the name.
    image_id, extension = os.path.splitext(image)
    if extension.lower() != ".jpg":
        continue  # skip directory entries that are not JPEG images

    image_tensor = preprocess_image(os.path.join(folder_path, image))

    # Placeholder features: block_type=1 and cam_angle=0.0 for every image.
    # NOTE(review): train_model subtracts 1 from the CSV block type before the
    # embedding lookup, while predict_image passes this value through as-is;
    # confirm which index is intended here.
    predicted_class = predict_image(model, image_tensor, block_type=1, cam_angle=0.0)

    # Record the result for the submission file.
    predictions["id"].append(image_id)
    predictions["stable_height"].append(predicted_class)

In [38]:
# Write the collected predictions to pred.csv without the DataFrame index column.
df = pd.DataFrame(data=predictions)
df.to_csv(path_or_buf="pred.csv", index=False)