In [1]:
!pip install transformers accelerate scikit-learn wandb

Collecting transformers
  Downloading transformers-4.49.0-py3-none-any.whl (10.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m10.0/10.0 MB[0m [31m23.7 MB/s[0m eta [36m0:00:00[0m00:01[0m0:01[0m
Collecting accelerate
  Downloading accelerate-1.4.0-py3-none-any.whl (342 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m342.1/342.1 KB[0m [31m69.3 MB/s[0m eta [36m0:00:00[0m
Collecting scikit-learn
  Downloading scikit_learn-1.6.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (13.5 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.5/13.5 MB[0m [31m64.6 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
Collecting wandb
  Downloading wandb-0.19.7-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (20.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m20.8/20.8 MB[0m [31m55.3 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
Collecting tokenizers<0.22,>=0.21
  Downloading tokenizers-0.21

In [2]:
import requests
import numpy as np
import random
import pickle
import torch
import transformers
import torchvision
import accelerate
import torch.nn as nn
import torchvision.models as models
from torchvision.models import ResNet50_Weights
from torch.utils.data import Dataset
from torchvision import transforms
from torch.utils.data import random_split
from transformers import Trainer, TrainingArguments
import os
from torch.utils.data import DataLoader
from torchvision.datasets import VisionDataset
from PIL import Image
import numpy as np
import pickle
import wandb

In [3]:
# Show whether a CUDA GPU is visible, then pick the matching compute device.
print(torch.cuda.is_available())
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

True


In [None]:
from getpass import getpass

# Keep the API key out of the notebook source: reuse a key that is already
# exported in the environment, otherwise ask for it (input is not echoed).
# The previous unconditional assignment of "" wiped out any valid key.
if not os.environ.get("WANDB_API_KEY"):
    os.environ["WANDB_API_KEY"] = getpass("Weights & Biases API key: ")
os.environ["WANDB_NOTEBOOK_NAME"] = "GIZ_model_trainer"

In [None]:
import os
from getpass import getpass

# Keep the Hugging Face token out of the notebook source: reuse a token that
# is already exported in the environment, otherwise ask for it (not echoed).
# HF_TOKEN is always defined after this cell, as later cells read it.
if not os.environ.get("HF_TOKEN"):
    os.environ["HF_TOKEN"] = getpass("Hugging Face token: ")

Initialize data

In [None]:
from huggingface_hub import hf_hub_download

# Fetch the zipped dataset from its Hugging Face dataset repository into the
# current working directory. The call is the cell's last expression, so its
# return value is shown as the cell output.
hf_hub_download(
    repo_id="Sunbird/GIZ-buildingsprediction-small",
    filename="GIZ_model_data.zip",
    repo_type="dataset",
    local_dir="./",
)

GIZ_model_data.zip:   0%|          | 0.00/1.07G [00:00<?, ?B/s]

'GIZ_model_data.zip'

In [None]:
import zipfile

# Unpack the downloaded archive next to the notebook.
output_dir = "./"  # Destination directory for the extracted files
os.makedirs(output_dir, exist_ok=True)

with zipfile.ZipFile("GIZ_model_data.zip", mode="r") as zip_ref:
    zip_ref.extractall(path=output_dir)

print(f"Files extracted to {output_dir}")

In [6]:
class CustomImageDataset(VisionDataset):
    """Dataset of image groups paired with a scalar target and coordinates.

    A sample is identified by a base name ``<base>``. All of its files live in
    the same directory somewhere under ``root``:

    - ``<base>_0.jpg`` ... ``<base>_3.jpg``: the images of the group. Each one
      is loaded as RGB and the results are concatenated along the channel axis
      into a single tensor (4 images x 3 channels = 12 channels).
    - ``<base>_target.txt``: a text file whose content parses as one float.

    The (latitude, longitude) pair of a sample is looked up by ``<base>`` in
    the geospatial metadata pickle.
    """

    # Number of images that make up one sample (``_0`` .. ``_3``).
    # Adjust if a sample consists of more than 4 images.
    NUM_IMAGES = 4
    # Suffix of the first image of a group; used to discover samples on disk.
    FIRST_IMAGE_SUFFIX = "_0.jpg"

    def __init__(self, root, geo_metadata_path, transform=None, target_transform=None):
        """
        Args:
            root (str): Path to the directory containing the images and targets.
            geo_metadata_path (str): Path to the pickle file containing geospatial metadata.
            transform (callable, optional): Optional transform applied to the
                concatenated multi-channel tensor (not to the individual images).
            target_transform (callable, optional): Optional transform to be applied on the target.

        Raises:
            RuntimeError: If the geospatial metadata cannot be loaded.

        Notes (from the original author; not verifiable from this file):
            - The dataset is split into training and test sets.
            - The training set consists of 80,000 locations, while the test set contains 20,000 locations.
            - Geospatial data (latitude and longitude) is loaded from the provided pickle file.
            - Future improvements could include filtering or grouping data based on geographic regions or using coordinates as input features.
        """
        super().__init__(root, transform=transform, target_transform=target_transform)
        # Built once and reused for every image instead of once per image load.
        self._to_tensor = transforms.ToTensor()
        self.data = self._load_data()
        self.geo_metadata = self._load_geo_metadata(geo_metadata_path)

    def _load_data(self):
        """Recursively scans the directory structure and pairs image groups with their targets.

        Returns:
            list[tuple[list[str], str, str]]: One ``(image_paths, target_path,
            base_name)`` entry per complete sample. Groups with a missing image
            or target file are reported with a warning and skipped.
        """
        suffix = self.FIRST_IMAGE_SUFFIX
        data = []
        for dirpath, dirnames, filenames in os.walk(self.root):
            # os.walk yields entries in arbitrary filesystem order. Sorting the
            # sub-directories in place and the file names keeps the mapping
            # from sample index to sample stable across machines and runs.
            dirnames.sort()
            for filename in sorted(filenames):
                if not filename.endswith(suffix):
                    continue

                # Strip only the trailing suffix; splitting on it would cut the
                # name short if the suffix also occurred earlier in the name.
                base_name = filename[: -len(suffix)]
                images = [
                    os.path.join(dirpath, f"{base_name}_{i}.jpg")
                    for i in range(self.NUM_IMAGES)
                ]
                target_file = os.path.join(dirpath, f"{base_name}_target.txt")

                # Check for missing files
                missing_files = [img for img in images if not os.path.exists(img)]
                if missing_files:
                    print(f"Warning: Missing image files: {missing_files}")
                    continue

                if not os.path.exists(target_file):
                    print(f"Warning: Missing target file: {target_file}")
                    continue

                data.append((images, target_file, base_name))
        return data

    def _load_geo_metadata(self, geo_metadata_path):
        """Loads geospatial metadata from the pickle file.

        The pickle is expected to be an iterable of ``(lat, lon, idx)`` triples.

        Returns:
            dict[str, tuple]: Maps ``str(idx)`` to ``(lat, lon)``.

        Raises:
            RuntimeError: If the file cannot be read or has an unexpected layout.
        """
        try:
            with open(geo_metadata_path, 'rb') as f:
                # SECURITY: pickle.load can execute arbitrary code. Only load
                # metadata files that come from a trusted source.
                geo_data = pickle.load(f)
            return {str(idx): (lat, lon) for lat, lon, idx in geo_data}
        except Exception as e:
            raise RuntimeError(f"Error loading geospatial metadata: {e}") from e

    def __len__(self):
        """Returns the number of complete samples found under ``root``."""
        return len(self.data)

    def __getitem__(self, idx):
        """Loads one sample.

        Returns:
            dict: ``"x"`` is the concatenated image tensor (after ``transform``
            if one was given), ``"labels"`` is the target as a float tensor
            with a trailing dimension added, and ``"coords"`` is
            ``[latitude, longitude]``.

        Raises:
            RuntimeError: If an image or the target file cannot be read, or if
                no geospatial metadata exists for the sample.
        """
        try:
            images, target_file, base_name = self.data[idx]

            # Load and concatenate images into a single 12-channel tensor
            channels = []
            for img_path in images:
                try:
                    # The context manager closes the underlying file promptly;
                    # convert() returns an independent in-memory RGB image.
                    with Image.open(img_path) as img:
                        rgb = img.convert("RGB")  # Ensure 3-channel image
                except Exception as e:
                    raise RuntimeError(f"Error loading image {img_path}: {e}") from e

                channels.append(self._to_tensor(rgb))  # (3, H, W)

            input_tensor = torch.cat(channels, dim=0)  # (12, H, W)

            # Apply transforms (including normalization) to final 12-channel tensor
            if self.transform:
                input_tensor = self.transform(input_tensor)

            # Load target
            try:
                with open(target_file, 'r') as f:
                    target = float(f.read().strip())
            except Exception as e:
                raise RuntimeError(f"Error loading target file {target_file}: {e}") from e

            if self.target_transform:
                target = self.target_transform(target)

            # Retrieve geospatial metadata
            geo_metadata = self.geo_metadata.get(base_name)
            if geo_metadata is None:
                raise RuntimeError(f"Geospatial metadata not found for index {base_name}")

            latitude, longitude = geo_metadata

            return {
                "x": input_tensor,
                "labels": torch.tensor(target, dtype=torch.float).unsqueeze(-1),
                "coords": [latitude, longitude]
            }

        except Exception as e:
            # Log which index failed (useful with multi-worker loaders), then
            # let the original exception propagate unchanged.
            print(f"Error in __getitem__ for index {idx}: {e}")
            raise

    def extra_repr(self):
        """Returns the extra text that ``VisionDataset`` includes in its repr."""
        return f"Root: {self.root}, Number of samples: {len(self)}"

In [8]:
# Build the training set without normalization. Images are converted to
# tensors when a sample is fetched, so raw pixel statistics can be measured.
train_dataset = CustomImageDataset(
    "./content/exp_set_30_40",
    "./train_set_coords.pkl",
    transform=None,
)

# Batched, unshuffled iteration over the un-normalized training samples.
train_loader = DataLoader(
    dataset=train_dataset,
    batch_size=32,
    num_workers=8,
    shuffle=False,
)

Compute per-channel means and standard deviations, then normalize

In [9]:
def compute_mean_std(loader):
    """
    Computes the per-channel mean and standard deviation of a dataset.

    Statistics are accumulated in a single pass as running sums of x and x**2
    over the batch, height and width axes.

    Args:
        loader (Iterable): Yields dict batches whose ``"x"`` entry is a tensor
            of shape (batch, channels, height, width), e.g. a PyTorch DataLoader.

    Returns:
        mean (torch.Tensor): float32 tensor of shape (channels,) with the mean per channel.
        std (torch.Tensor): float32 tensor of shape (channels,) with the
            (population) standard deviation per channel.

    Raises:
        ValueError: If a batch is not 4-dimensional, or if the loader yields no pixels.
    """
    sum_channels = None
    sum_squares_channels = None
    num_pixels = 0

    print("Computing mean and std...")

    for batch in loader:
        images = batch["x"]  # Extract images from dictionary

        # Ensure images have correct shape: (batch, channels, height, width)
        if images.dim() != 4:
            raise ValueError(f"Expected images of shape (batch, channels, height, width), got {images.shape}")

        batch_size, num_channels, height, width = images.shape

        # Accumulate in float64: float32 running sums lose precision once
        # hundreds of millions of pixel values have been added up.
        if sum_channels is None:
            sum_channels = torch.zeros(num_channels, dtype=torch.float64)
            sum_squares_channels = torch.zeros(num_channels, dtype=torch.float64)

        images = images.to(dtype=torch.float64)
        sum_channels += images.sum(dim=[0, 2, 3])  # Sum over batch, height, width
        sum_squares_channels += (images ** 2).sum(dim=[0, 2, 3])  # Sum of squares

        # Update total pixel count
        num_pixels += batch_size * height * width

    if num_pixels == 0:
        raise ValueError("Cannot compute mean and std: the loader yielded no pixels.")

    mean = sum_channels / num_pixels
    # E[x^2] - E[x]^2 can come out marginally negative through rounding for
    # (near-)constant channels; clamp so the square root never yields NaN.
    variance = (sum_squares_channels / num_pixels - mean ** 2).clamp(min=0.0)
    std = torch.sqrt(variance)

    # Hand back float32 tensors, matching the dtype callers received before.
    return mean.to(torch.float32), std.to(torch.float32)

# Per-channel statistics of the un-normalized training images.
mean, std = compute_mean_std(train_loader)

# Echo both vectors so the values can be checked by eye.
print(
    f"Computed Mean: {mean.tolist()}",
    f"Computed Std: {std.tolist()}",
    sep="\n",
)

Computing mean and std...
Computed Mean: [0.1217270940542221, 0.11625519394874573, 0.11000549048185349, 0.011712568812072277, 0.011117534711956978, 0.6160487532615662, 0.5525528192520142, 0.1383216381072998, 0.1876230090856552, 0.07891623675823212, 0.30977097153663635, 0.1511278599500656]
Computed Std: [0.05446215718984604, 0.03039601258933544, 0.03132794052362442, 0.07931023091077805, 0.07740972191095352, 0.04288787022233009, 0.05698736011981964, 0.21387967467308044, 0.14159177243709564, 0.17825061082839966, 0.2660183608531952, 0.13877056539058685]


In [10]:
# Channel-wise normalization built from the statistics measured on the training set.
final_transform = transforms.Compose(
    [transforms.Normalize(mean=mean.tolist(), std=std.tolist())]
)

# Training set, rebuilt so that every sample is normalized when it is fetched.
train_dataset = CustomImageDataset(
    "./content/exp_set_30_40",
    "./train_set_coords.pkl",
    transform=final_transform,
)

# Test set, normalized with the training mean/std rather than its own.
test_dataset = CustomImageDataset(
    "./content/exp_test_set_10_11250",
    "./test_set_coords.pkl",
    transform=final_transform,
)

Load and Train Model

In [14]:
import torch
import torch.nn as nn
import numpy as np
from torch.utils.data import DataLoader
from transformers import Trainer, TrainingArguments, EarlyStoppingCallback, AdamW, get_linear_schedule_with_warmup
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from torchvision import models, transforms
from transformers import PretrainedConfig

# Define model
def get_modified_resnet50(num_channels=12):
    """Builds a ResNet-50 regressor that accepts ``num_channels`` input channels.

    Starts from the default pretrained torchvision ResNet-50 and replaces two layers:

    - ``conv1`` is swapped for a new convolution with ``num_channels`` input
      channels and otherwise the same output channels, kernel size, stride and
      padding. The new layer is freshly Kaiming-initialized, so the pretrained
      first-layer weights are not reused.
    - ``fc`` is swapped for a linear layer with a single output (one regression value).

    Args:
        num_channels (int): Number of channels of the input tensor. Defaults to 12.

    Returns:
        torch.nn.Module: The modified ResNet-50.
    """
    model = models.resnet50(weights=ResNet50_Weights.DEFAULT)

    # Modify the first convolution layer
    original_conv = model.conv1
    model.conv1 = nn.Conv2d(
        num_channels,
        original_conv.out_channels,
        kernel_size=original_conv.kernel_size,
        stride=original_conv.stride,
        padding=original_conv.padding,
        # nn.Conv2d takes a bool here. Forwarding the bias tensor itself only
        # worked because this layer's bias is None; a real tensor would fail.
        bias=original_conv.bias is not None,
    )
    # Initialize the new conv layer
    nn.init.kaiming_normal_(model.conv1.weight, mode='fan_out', nonlinearity='relu')

    # Modify the final fully connected layer for regression
    num_features = model.fc.in_features
    model.fc = nn.Linear(num_features, 1)  # Single regression output

    return model

# Build the 12-channel regressor and place it on the selected device.
model = get_modified_resnet50()
model = model.to(device)

# A plain torchvision model has no `config`; attach a minimal one in that case.
if not hasattr(model, "config"):
    model.config = PretrainedConfig()
    # The Trainer expects a `_name_or_path` attribute on the config.
    model.config._name_or_path = "GIZ-building-regression-model"

# Custom trainer that optimizes a Huber-style (SmoothL1) regression loss
class RegressionTrainer(transformers.Trainer):
    """Trainer whose loss is SmoothL1 between the model output and ``inputs["labels"]``.

    The model is called with the image tensor only (``inputs["x"]``); any other
    entries of the batch are ignored.
    """

    def compute_loss(self, model, inputs, return_outputs=False, num_items_in_batch=None):
        """Computes the mean SmoothL1 loss for one batch.

        Args:
            model (torch.nn.Module): Model called as ``model(images)``.
            inputs (dict): Batch with an ``"x"`` tensor of images and a
                ``"labels"`` tensor of targets.
            return_outputs (bool): If True, also return the raw model outputs.
            num_items_in_batch: Accepted for compatibility with the Trainer
                API; unused, because the loss is averaged over the batch.

        Returns:
            torch.Tensor or tuple: The scalar loss, or ``(loss, outputs)`` when
            ``return_outputs`` is True.
        """
        # Follow the device the model actually lives on instead of relying on a
        # notebook-level global, so this class works wherever the model is placed.
        model_device = next(model.parameters()).device

        images = inputs["x"].to(model_device)
        labels = inputs["labels"].to(model_device)

        # Forward pass
        outputs = model(images)

        # Huber (SmoothL1) loss; reduction='mean' averages over all elements,
        # so `loss` is a scalar tensor.
        loss_fct = nn.SmoothL1Loss(reduction='mean')
        loss = loss_fct(outputs, labels)
        return (loss, outputs) if return_outputs else loss


REPO_ID = "Sunbird/GIZ-building-regression-model"

# Step 6: Define Training Arguments
training_args = TrainingArguments(
    output_dir="./results",
    eval_strategy="steps",
    save_strategy="steps",
    # Training length is governed by max_steps alone. A positive max_steps
    # overrides num_train_epochs, so the former num_train_epochs=5 had no
    # effect and was removed to avoid suggesting a 5-epoch run.
    max_steps=5000,
    learning_rate=1e-3,
    label_names=["labels"],
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    dataloader_num_workers=8,
    weight_decay=1e-4,
    logging_dir="./logs",
    report_to="wandb",
    logging_steps=250,
    eval_steps=250,
    # NOTE(review): save_steps is left at its default while eval_steps=250.
    # Presumably an evaluation that falls between two saves cannot be restored
    # as the best model - consider save_steps=eval_steps; confirm against docs.
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    greater_is_better=False,  # Lower evaluation loss (SmoothL1) is better
    # The deprecated no_cuda=False was dropped; it is the default, so the
    # Trainer still uses the GPU when one is available.
    # Hub-related arguments:
    push_to_hub=True,
    hub_model_id=REPO_ID,
    # An unset or empty HF_TOKEN becomes None, so that the locally cached Hub
    # login is used instead of sending an empty token.
    hub_token=os.environ.get("HF_TOKEN") or None,
)

# Schedule length: warm up linearly over the first 10% of all training steps.
num_training_steps = training_args.max_steps
num_warmup_steps = int(0.1 * num_training_steps)

# AdamW over all model parameters, reusing the hyper-parameters from training_args.
optimizer = torch.optim.AdamW(
    model.parameters(),
    lr=training_args.learning_rate,
    weight_decay=training_args.weight_decay,
)

# Linear warmup followed by linear decay of the learning rate.
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=num_warmup_steps,
    num_training_steps=num_training_steps,
)

# Stop once the monitored metric has failed to improve for 5 consecutive evaluations.
early_stopping = EarlyStoppingCallback(
    early_stopping_patience=5,
    early_stopping_threshold=0.0,
)

# Step 8: Assemble the trainer around the modified ResNet50 regressor.
# NOTE(review): the test set is also used as the evaluation set here, so early
# stopping and best-model selection are driven by test data.
trainer = RegressionTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    optimizers=(optimizer, scheduler),
    callbacks=[early_stopping],
)

# Step 9: Run training; the returned summary is displayed as the cell output.
trainer.train()

Step,Training Loss,Validation Loss
250,0.8907,1.041689
500,0.8205,0.935882
750,0.7945,0.961074
1000,0.795,0.702216
1250,0.7826,0.655841
1500,0.7284,0.647074
1750,0.7738,0.682396
2000,0.7517,0.847725
2250,0.7264,0.844478
2500,0.7324,0.677844


TrainOutput(global_step=2750, training_loss=0.7731716031161222, metrics={'train_runtime': 388.7383, 'train_samples_per_second': 411.588, 'train_steps_per_second': 12.862, 'total_flos': 0.0, 'train_loss': 0.7731716031161222, 'epoch': 8.78594249201278})

In [15]:
# Upload the trained model to the Hub repository set via `hub_model_id` in the
# training arguments; the returned commit info is displayed as the cell output.
trainer.push_to_hub()

CommitInfo(commit_url='https://huggingface.co/Sunbird/GIZ-building-regression-model/commit/96a0ce97b6638c11a18d44b813a571aa31d8fc66', commit_message='End of training', commit_description='', oid='96a0ce97b6638c11a18d44b813a571aa31d8fc66', pr_url=None, repo_url=RepoUrl('https://huggingface.co/Sunbird/GIZ-building-regression-model', endpoint='https://huggingface.co', repo_type='model', repo_id='Sunbird/GIZ-building-regression-model'), pr_revision=None, pr_num=None)

In [None]:
import torch

# Smoke test: push one random input through the trained network.
# Switch to evaluation mode first.
model.eval()

# Dummy batch of shape (batch_size=1, channels=12, height=224, width=224),
# moved to the same device as the model.
dummy_input = torch.randn(1, 12, 224, 224)
dummy_input = dummy_input.to(device)

# Gradients are not needed for a plain forward pass.
with torch.no_grad():
    output = model(dummy_input)

print("Model output:", output)

Model output: tensor([[0.0483]], device='cuda:0')
