In [3]:
import os
import time
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from torch.utils.data import Subset
from torchvision.io import read_image
import torchvision.io as io
import torchvision.models as models
import torch.nn.functional as F
from torchvision.models import resnet18, resnet101
from torchvision.models import ResNet18_Weights, ResNet101_Weights
from tqdm import tqdm
from sklearn.model_selection import train_test_split



In [10]:
df = pd.read_csv("train.csv")
df.head()

Unnamed: 0,latitude,longitude,image_name,country_code,country,continent,region,alpha-2,sub-region,is_augmented,aumentation_source_image_name
0,45.60322,15.538784,3388.png,HR,Croatia,Europe,Croatia,HR,Southern Europe,False,3388.png
1,7.382881,3.67338,2963.png,NG,Nigeria,Africa,Nigeria,NG,Sub-Saharan Africa,False,2963.png
2,1.40036,103.8941,14138.png,MY,Malaysia,Asia,Malaysia,MY,South-eastern Asia,False,14138.png
3,53.94939,-7.84681,23442.png,IE,Ireland,Europe,Ireland,IE,Northern Europe,False,23442.png
4,32.1864,34.8645,20450.png,IL,Israel,Asia,Israel,IL,Western Asia,False,20450.png


In [17]:
####################################
# 1) Class for using a dataset, including limiting the number of images used
####################################
class CustomDataset(Dataset):
    def __init__(self, csv_file, root_dir, transform=None, limit=None, augmented_data=False):
        self.data = pd.read_csv(csv_file)
        if not augmented_data and "is_augmented" in self.data.columns:
            self.data = self.data[self.data['is_augmented']==False]

        if limit:  # Limit the dataset to a small number of observations
            self.data = self.data.head(limit)
        self.root_dir = root_dir
        self.transform = transform

        # Mapping sub-region strings to integer labels
        self.subregion_mapping = {
            subregion: idx
            for idx, subregion in enumerate(self.data['sub-region'].unique())
        }
        self.data['subregion_label'] = self.data['sub-region'].map(self.subregion_mapping)

        self.missing_files = []  # List to log missing files

        print(f"Dataset initialized with {len(self.data)} samples.")
        print(f"Sub-regions mapped: {self.subregion_mapping}")

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        img_path = os.path.join(self.root_dir, self.data.iloc[idx]['image_name'])
        if not os.path.exists(img_path):
            self.missing_files.append(img_path)
            return None  # Skip this sample

        image = io.read_image(img_path)
        if self.transform:
            image = self.transform(image)
        label = self.data.iloc[idx]['subregion_label']

        # Debug: print shape for the first item
        # if idx == 0:
        #     print(f"Sample image shape: {image.shape}, Label: {label}")

        return image, label


def collate_fn(batch):
    # Filter out None values
    batch = [item for item in batch if item is not None]
    if len(batch) == 0:
        return None
    return torch.utils.data.default_collate(batch)

####################################
# 2) TRAIN FUNCTION (with softmax)
####################################
def train(model, train_loader, optimizer, criterion, device):
    model.train()
    train_loss = 0
    correct = 0

    # We'll store raw predictions for final column, plus softmax probabilities
    predictions = []
    probabilities_list = []

    with tqdm(train_loader, desc="Training", unit="batch") as pbar:
        for batch in pbar:
            if batch is None:  # Skip if batch is empty
                continue
            inputs, labels = batch
            inputs, labels = inputs.to(device), labels.to(device)

            optimizer.zero_grad()
            outputs = model(inputs)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()

            train_loss += loss.item()

            # Predictions
            _, preds = outputs.max(1)
            correct += preds.eq(labels).sum().item()
            predictions.extend(preds.cpu().numpy())

            # Softmax probabilities
            probs = torch.softmax(outputs, dim=1)
            probabilities_list.append(probs.detach().cpu().numpy())

            pbar.set_postfix(loss=loss.item())

    # Combine probabilities from all batches into one array
    probabilities_array = np.concatenate(probabilities_list, axis=0)

    train_accuracy = correct / len(train_loader.dataset)
    avg_loss = train_loss / len(train_loader)

    return avg_loss, train_accuracy, predictions, probabilities_array

####################################
# 3) TEST FUNCTION 
####################################
def test(model, test_loader, criterion, device):
    model.eval()
    test_loss = 0
    correct = 0

    predictions = []
    probabilities_list = []

    with tqdm(test_loader, desc="Testing", unit="batch") as pbar:
        for batch in pbar:
            if batch is None:
                continue

            inputs, labels = batch
            inputs, labels = inputs.to(device), labels.to(device)

            with torch.no_grad():
                outputs = model(inputs)
                loss = criterion(outputs, labels)

            test_loss += loss.item()

            # Predictions
            _, preds = outputs.max(1)
            correct += preds.eq(labels).sum().item()
            predictions.extend(preds.cpu().numpy())

            # Softmax probabilities
            probs = torch.softmax(outputs, dim=1)
            probabilities_list.append(probs.detach().cpu().numpy())

            pbar.set_postfix(loss=loss.item())

    # Combine probabilities
    probabilities_array = np.concatenate(probabilities_list, axis=0)

    test_accuracy = correct / len(test_loader.dataset)
    avg_loss = test_loss / len(test_loader)

    return avg_loss, test_accuracy, predictions, probabilities_array

####################################
# 4) SAVE EPOCH FUNCTION
####################################
def save_epoch(
    model_name,
    model,
    train_loss,
    test_loss,
    train_accuracy,
    test_accuracy,
    train_predictions,
    test_predictions,
    train_probabilities,     
    test_probabilities,      
    train_df,
    test_df,
    subregion_mapping,
    save_weights=False
):
    os.makedirs(f'models/{model_name}', exist_ok=True)

    # Metrics DataFrame
    metrics_df = pd.DataFrame({
        'train_loss': [train_loss],
        'test_loss': [test_loss],
        'train_accuracy': [train_accuracy],
        'test_accuracy': [test_accuracy]
    })

    # Reverse mapping for human-readable sub-region names
    reverse_mapping = {v: k for k, v in subregion_mapping.items()}

    # Single best-label predictions
    train_df['model_prediction'] = [reverse_mapping[pred] for pred in train_predictions]
    test_df['model_prediction'] = [reverse_mapping[pred] for pred in test_predictions]

    # Add a probability column for each sub-region (e.g., "prob_<sub-region>")
    for class_idx, class_name in reverse_mapping.items():
        col_name_train = f'prob_{class_name}'
        col_name_test = f'prob_{class_name}'

        # For each row in train_probabilities, store the probability for class_idx
        train_df[col_name_train] = [row[class_idx] for row in train_probabilities]
        # For test
        test_df[col_name_test] = [row[class_idx] for row in test_probabilities]

    # Save model weights if required
    if save_weights:
        torch.save(model.state_dict(), f'models/{model_name}/model.pth')

    # Save metrics and predictions to CSV
    metrics_df.to_csv(f"models/{model_name}/metrics.csv", index=False)
    train_df.to_csv(f"models/{model_name}/train_predictions.csv", index=False)
    test_df.to_csv(f"models/{model_name}/test_predictions.csv", index=False)

####################################
# 6) TRAIN LOOP 
####################################
def train_loop(
    root_dir,
    num_epochs=2,
    batch_size=2,
    learning_rate=0.001,
    weight_decay=0.0001,
    limit=None,
    resnet_=resnet18,
    weights_=ResNet18_Weights,
    freeze = False,
    augmented_data = False,
):
    # Prompt user for model name
    model_name = input("Give model name: ")

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print(f"Using device: {device}")

    # Data transformations
    weights = weights_.DEFAULT
    transform = weights.transforms()

    # Dataset
    train_set = CustomDataset(
        csv_file="train.csv",
        root_dir=root_dir,
        transform=transform,
        limit=limit,
        augmented_data=augmented_data
    )
    test_set = CustomDataset(
        csv_file="test.csv",
        root_dir=root_dir,
        transform=transform,
        limit=limit
    )


    # # Split indices for train/test
    # train_idx, test_idx = train_test_split(
    #     range(len(dataset)), test_size=0.2, random_state=42
    # )
    # train_set = Subset(dataset, train_idx)
    # test_set = Subset(dataset, test_idx)

    train_loader = DataLoader(
        train_set,
        batch_size=batch_size,
        shuffle=True,
        num_workers=0,
        pin_memory=True,
        collate_fn=collate_fn
    )
    test_loader = DataLoader(
        test_set,
        batch_size=batch_size,
        shuffle=False,
        num_workers=0,
        pin_memory=True,
        collate_fn=collate_fn
    )

    # train/test DataFrames
    train_df = train_set.data.reset_index(drop=True)
    test_df = test_set.data.reset_index(drop=True)

    print(f"Training dataset size: {len(train_set)}")
    print(f"Testing dataset size: {len(test_set)}")

    # Log missing files if any - this may be no longer necessary now that the file is fixed. 
    if train_set.missing_files:
        print(f"Missing files: {len(train_set.missing_files)}")
        with open('missing_files.log', 'w') as f:
            for file in train_set.missing_files:
                f.write(f"{file}\n")

    if test_set.missing_files:
        print(f"Missing files: {len(test_set.missing_files)}")
        with open('missing_files.log', 'w') as f:
            for file in test_set.missing_files:
                f.write(f"{file}\n")

    # Load chosen ResNet
    resnet = resnet_(weights=weights_.DEFAULT)

    # Freeze all layers except the last
    if(freeze == True):
        for param in resnet.parameters():
            param.requires_grad = False

    
    # Modify the final FC layer
    num_features = resnet.fc.in_features
    resnet.fc = nn.Linear(num_features, len(train_set.subregion_mapping))
    resnet.to(device)

    # Loss function & Optimizer
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(resnet.fc.parameters(), lr=learning_rate, weight_decay=weight_decay)

    training_times = []

    for epoch in range(num_epochs):
        print(f"\nEpoch {epoch + 1}/{num_epochs}")
        start_time = time.time()

        # Train step (returns probabilities)
        train_loss, train_accuracy, train_predictions, train_probabilities = train(
            resnet, train_loader, optimizer, criterion, device
        )
        # Test step (returns probabilities)
        test_loss, test_accuracy, test_predictions, test_probabilities = test(
            resnet, test_loader, criterion, device
        )

        epoch_time = time.time() - start_time
        training_times.append({'epoch': epoch + 1, 'training_time': epoch_time})

        # Save everything for this epoch
        save_epoch(
            model_name, 
            resnet,
            train_loss, 
            test_loss,
            train_accuracy, 
            test_accuracy,
            train_predictions,
            test_predictions,
            train_probabilities,   
            test_probabilities,    
            train_df, 
            test_df, 
            train_set.subregion_mapping,
            save_weights=(epoch == num_epochs - 1)  # only save on last epoch
        )

        print(f"Train Loss: {train_loss:.4f}, Train Accuracy: {train_accuracy:.4f}")
        print(f"Test Loss: {test_loss:.4f}, Test Accuracy: {test_accuracy:.4f}")
        print(f"Epoch {epoch + 1} training time: {epoch_time:.2f} seconds")

    # Save training times
    training_times_df = pd.DataFrame(training_times)
    training_times_df.to_csv(f"models/{model_name}/training_times.csv", index=False)

    print(f"\nTraining complete! Model and metrics saved to /models/{model_name}.")




train_loop(
    root_dir='Streetview_Image_Dataset/processed',
    num_epochs=2,
    batch_size=8,  # for example
    resnet_=resnet18, 
    weights_=ResNet18_Weights,
    limit=120,
    freeze=False,
    augmented_data=False,
)

Using device: cuda
Dataset initialized with 120 samples.
Sub-regions mapped: {'Southern Europe': 0, 'Sub-Saharan Africa': 1, 'South-eastern Asia': 2, 'Northern Europe': 3, 'Western Asia': 4, 'Latin America and the Caribbean': 5, 'Australia and New Zealand': 6, 'Northern America': 7, 'Eastern Europe': 8, 'Western Europe': 9, 'Southern Asia': 10, 'Eastern Asia': 11}
Dataset initialized with 120 samples.
Sub-regions mapped: {'Eastern Europe': 0, 'Latin America and the Caribbean': 1, 'Northern America': 2, 'Sub-Saharan Africa': 3, 'Southern Europe': 4, 'South-eastern Asia': 5, 'Eastern Asia': 6, 'Western Europe': 7, 'Western Asia': 8, 'Southern Asia': 9, 'Australia and New Zealand': 10, 'Northern Europe': 11}
Training dataset size: 120
Testing dataset size: 120

Epoch 1/2


Training: 100%|██████████| 15/15 [00:01<00:00, 10.21batch/s, loss=2.66]
Testing: 100%|██████████| 15/15 [00:01<00:00, 12.71batch/s, loss=3.02]


ValueError: Length of values (114) does not match length of index (120)

In [5]:
df = pd.read_csv("coords_processed.csv")


In [6]:
df["sub-region"].value_counts()

Latin America and the Caribbean    4955
Northern America                   3141
Eastern Europe                     2949
Northern Europe                    2181
Western Europe                     2026
Sub-Saharan Africa                 1946
Southern Europe                    1678
Australia and New Zealand          1658
South-eastern Asia                 1406
Eastern Asia                       1290
Western Asia                       1004
Southern Asia                       886
Name: sub-region, dtype: int64