In [1]:
import os
import pandas as pd
from sklearn.model_selection import train_test_split
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset
from torchvision import io
from torchvision.models import resnet18, ResNet18_Weights, resnet50, resnet101, ResNet101_Weights, ResNet50_Weights
from tqdm import tqdm  # For progress bars


In [None]:
class CustomDataset(Dataset):
    """Image-classification dataset driven by a CSV of image names and sub-regions.

    The CSV must contain an ``image_name`` column (joined onto ``root_dir`` to
    locate the file) and a ``sub-region`` column. Every distinct ``sub-region``
    value receives an integer class index, stored in ``subregion_mapping`` and
    written per row into the ``subregion_label`` column of ``data``.

    Args:
        csv_file: Path of the CSV file to read.
        root_dir: Directory that ``image_name`` values are joined onto.
        transform: Optional callable applied to each decoded image tensor.
        limit: If truthy, keep only the first ``limit`` rows of the CSV.

    Attributes:
        data: The (possibly truncated) DataFrame plus ``subregion_label``.
        subregion_mapping: Dict mapping sub-region value -> class index.
        missing_files: Paths that were requested but not found on disk,
            each recorded once.
    """

    def __init__(self, csv_file, root_dir, transform=None, limit=None):
        self.data = pd.read_csv(csv_file)
        if limit:  # Limit the dataset to a small number of observations
            # .copy() detaches the slice from the full frame so that adding the
            # label column below does not trigger a chained-assignment warning.
            self.data = self.data.head(limit).copy()
        self.root_dir = root_dir
        self.transform = transform

        # NOTE(review): unique() keeps NaN, so rows without a sub-region form
        # a class of their own -- confirm this is intended.
        self.subregion_mapping = {
            subregion: idx
            for idx, subregion in enumerate(self.data['sub-region'].unique())
        }
        self.data['subregion_label'] = self.data['sub-region'].map(self.subregion_mapping)

        self.missing_files = []       # Ordered log of missing paths (no duplicates)
        self._missing_seen = set()    # Fast membership check for the log above
        self._sample_shown = False    # Ensures the debug sample prints only once

        print(f"Dataset initialized with {len(self.data)} samples.")
        print(f"Sub-regions mapped: {self.subregion_mapping}")

    def __len__(self):
        """Return the number of rows kept from the CSV."""
        return len(self.data)

    def __getitem__(self, idx):
        """Return ``(image, label)`` for row ``idx``, or ``None`` if the file is missing.

        A ``None`` return is meant to be filtered out by the collate function.
        """
        # Column-first access avoids materialising a whole row Series per lookup.
        img_path = os.path.join(self.root_dir, self.data['image_name'].iloc[idx])
        if not os.path.exists(img_path):
            # Record each missing path once, even if it is requested every epoch.
            if img_path not in self._missing_seen:
                self._missing_seen.add(img_path)
                self.missing_files.append(img_path)
            return None  # Skip this sample

        # Force 3 channels so grayscale / RGBA files still batch together and
        # match the input the ResNet transforms expect.
        image = io.read_image(img_path, mode=io.ImageReadMode.RGB)
        if self.transform:
            image = self.transform(image)
        label = self.data['subregion_label'].iloc[idx]

        if idx == 0 and not self._sample_shown:  # Show one sample for debugging
            self._sample_shown = True
            print(f"Sample image shape: {image.shape}, Label: {label}")

        return image, label


# Collate helper that tolerates samples the dataset could not load
def collate_fn(batch):
    """Collate ``batch`` after discarding ``None`` entries.

    Returns ``None`` when no usable sample remains, otherwise whatever
    ``torch.utils.data.default_collate`` produces for the surviving samples.
    """
    usable = list(filter(lambda sample: sample is not None, batch))
    if not usable:
        return None
    return torch.utils.data.default_collate(usable)


# Training function
def train(model, train_loader, optimizer, criterion, device):
    """Run one optimisation pass over ``train_loader``.

    Batches that arrive as ``None`` (nothing loadable in them) are skipped and
    do not count towards the reported metrics.

    Args:
        model: Module to optimise; switched to training mode.
        train_loader: Iterable yielding ``(inputs, labels)`` batches or ``None``.
        optimizer: Optimiser stepped once per processed batch.
        criterion: Loss callable taking ``(outputs, labels)``.
        device: Device the batch tensors are moved to.

    Returns:
        Tuple ``(train_loss, train_accuracy, predictions)``:
        the mean of the per-batch losses over processed batches, the fraction
        of processed samples predicted correctly, and the predicted class
        indices as a flat list in the order the loader produced them.
        Both metrics are ``0.0`` when no batch was processed.
    """
    model.train()
    running_loss, correct = 0.0, 0
    samples_seen, batches_seen = 0, 0
    predictions = []

    with tqdm(train_loader, desc="Training", unit="batch") as pbar:
        for batch in pbar:
            if batch is None:  # Skip if batch is empty
                continue
            inputs, labels = batch
            inputs, labels = inputs.to(device), labels.to(device)
            optimizer.zero_grad()
            outputs = model(inputs)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()

            batch_loss = loss.item()
            running_loss += batch_loss
            _, preds = outputs.max(1)
            correct += preds.eq(labels).sum().item()

            # Count what was really processed: the loader's nominal sizes also
            # include skipped samples/batches and would deflate the metrics.
            samples_seen += labels.size(0)
            batches_seen += 1

            predictions.extend(preds.cpu().numpy())  # Store predictions

            pbar.set_postfix(loss=batch_loss)

    train_accuracy = correct / samples_seen if samples_seen else 0.0
    train_loss = running_loss / batches_seen if batches_seen else 0.0
    return train_loss, train_accuracy, predictions


# Testing function
def test(model, test_loader, criterion, device):
    model.eval()
    test_loss, correct = 0, 0
    predictions = []

    with tqdm(test_loader, desc="Testing", unit="batch") as pbar:
        for batch in pbar:
            if batch is None:  # Skip if batch is empty
                continue

            inputs, labels = batch
            inputs, labels = inputs.to(device), labels.to(device)

            with torch.no_grad():  # Ensure gradients are not computed
                outputs = model(inputs)
                loss = criterion(outputs, labels)

            test_loss += loss.item()
            _, preds = outputs.max(1)
            correct += preds.eq(labels).sum().item()

            predictions.extend(preds.cpu().numpy())  # Store predictions

            pbar.set_postfix(loss=loss.item())

    test_accuracy = correct / len(test_loader.dataset)
    test_loss /= len(test_loader)

    return test_loss, test_accuracy, predictions

def save_epoch(model_name, model, train_loss, test_loss, train_accuracy, test_accuracy, train_predictions, test_predictions, train_df, test_df, subregion_mapping, save_weights=False):
    """Write one epoch's metrics and predictions under ``models/<model_name>/``.

    The files ``metrics.csv``, ``train_predictions.csv`` and
    ``test_predictions.csv`` are overwritten on every call; ``model.pth`` is
    written only when ``save_weights`` is true.

    ``train_df`` and ``test_df`` are modified in place: each gains (or has
    replaced) a ``model_prediction`` column holding the sub-region name for
    every prediction. Predictions are attached positionally, so each list must
    have one entry per row of its frame, in the frame's row order.

    Args:
        model_name: Sub-directory of ``models/`` to write into.
        model: Module whose ``state_dict()`` is saved when requested.
        train_loss, test_loss, train_accuracy, test_accuracy: Scalar metrics.
        train_predictions, test_predictions: Predicted class indices.
        train_df, test_df: Frames the predictions are attached to and saved from.
        subregion_mapping: Dict mapping sub-region name -> class index.
        save_weights: Whether to also save the model weights.
    """
    out_dir = f'models/{model_name}'
    os.makedirs(out_dir, exist_ok=True)

    # Single-row frame holding this epoch's metrics
    metric_values = (
        ('train_loss', train_loss),
        ('test_loss', test_loss),
        ('train_accuracy', train_accuracy),
        ('test_accuracy', test_accuracy),
    )
    metrics_df = pd.DataFrame({name: [value] for name, value in metric_values})

    # Invert the mapping so class indices can be turned back into names
    index_to_name = dict(zip(subregion_mapping.values(), subregion_mapping.keys()))

    # Attach human-readable predictions to both frames (train first, then test)
    for frame, predicted in ((train_df, train_predictions), (test_df, test_predictions)):
        frame['model_prediction'] = [index_to_name[class_idx] for class_idx in predicted]

    # Weights are optional; metrics and predictions are always written
    if save_weights:
        torch.save(model.state_dict(), f'{out_dir}/model.pth')

    metrics_df.to_csv(f"{out_dir}/metrics.csv", index=False)
    train_df.to_csv(f"{out_dir}/train_predictions.csv", index=False)
    test_df.to_csv(f"{out_dir}/test_predictions.csv", index=False)


def _restore_row_order(permutation, predictions):
    """Undo a shuffle: place ``predictions[i]`` at position ``permutation[i]``."""
    if len(predictions) != len(permutation):
        raise ValueError(
            f"Got {len(predictions)} predictions for {len(permutation)} samples; "
            "some samples were skipped while loading, so predictions cannot be "
            "matched back to their rows."
        )
    ordered = [None] * len(permutation)
    for position, prediction in zip(permutation, predictions):
        ordered[position] = prediction
    return ordered


# Main training loop with adjusted DataFrame handling
def train_loop(csv_path, root_dir, num_epochs=2, batch_size=2, learning_rate=0.001, weight_decay=0.0001, limit=None, resnet_=resnet18, weights_=ResNet18_Weights, model_name=None):
    """Train a new classification head on a frozen, pretrained ResNet.

    Rows whose image file does not exist are listed in ``missing_files.log``
    and left out of the 80/20 train/test split, so every remaining row has
    exactly one prediction per epoch. After each epoch the metrics and
    per-row predictions are written via ``save_epoch``; weights are saved
    after the last epoch only.

    Args:
        csv_path: CSV file passed to ``CustomDataset``.
        root_dir: Directory containing the images named in the CSV.
        num_epochs: Number of passes over the training split.
        batch_size: Batch size for both loaders.
        learning_rate: Adam learning rate.
        weight_decay: Adam weight decay.
        limit: Optional cap on the number of CSV rows used.
        resnet_: torchvision ResNet constructor accepting ``weights=``.
        weights_: Weights enum matching ``resnet_``; its ``DEFAULT`` member
            supplies both the pretrained weights and the input transforms.
        model_name: Output sub-directory under ``models/``. When ``None`` the
            name is requested interactively, as before.
    """
    if model_name is None:
        model_name = input("Give model name: ")

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print(f"Using device: {device}")

    # Data transformations
    weights = weights_.DEFAULT
    transform = weights.transforms()

    dataset = CustomDataset(csv_file=csv_path, root_dir=root_dir, transform=transform, limit=limit)

    # Find missing images up front. Checking dataset.missing_files here would
    # always see an empty list because no sample has been loaded yet, and any
    # row without an image would later leave the prediction lists shorter than
    # the DataFrames they are saved with.
    usable_idx, missing_paths = [], []
    for row_idx, image_name in enumerate(dataset.data['image_name']):
        img_path = os.path.join(root_dir, image_name)
        if os.path.exists(img_path):
            usable_idx.append(row_idx)
        else:
            missing_paths.append(img_path)

    # Log missing files
    if missing_paths:
        print(f"Missing files: {len(missing_paths)}")
        with open('missing_files.log', 'w') as f:
            f.writelines(f"{path}\n" for path in missing_paths)

    # Dataset and DataLoaders
    train_idx, test_idx = train_test_split(usable_idx, test_size=0.2, random_state=42)
    train_set = torch.utils.data.Subset(dataset, train_idx)
    test_set = torch.utils.data.Subset(dataset, test_idx)

    loader_kwargs = dict(
        batch_size=batch_size,
        shuffle=False,
        num_workers=0,
        pin_memory=device.type == "cuda",  # pinning only helps host-to-GPU copies
        collate_fn=collate_fn,
    )
    test_loader = DataLoader(test_set, **loader_kwargs)

    # Separate train and test DataFrames
    train_df = dataset.data.iloc[train_idx].reset_index(drop=True)
    test_df = dataset.data.iloc[test_idx].reset_index(drop=True)

    print(f"Training dataset size: {len(train_set)}")
    print(f"Testing dataset size: {len(test_set)}")

    # Load the pretrained ResNet and freeze every existing parameter
    resnet = resnet_(weights=weights)
    for param in resnet.parameters():
        param.requires_grad = False

    # Replace the final layer; the new layer's parameters are trainable
    num_features = resnet.fc.in_features
    resnet.fc = nn.Linear(num_features, len(dataset.subregion_mapping))  # Number of sub-regions
    resnet.to(device)

    # Loss function and optimizer (only the new head is optimised)
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(resnet.fc.parameters(), lr=learning_rate, weight_decay=weight_decay)

    for epoch in range(num_epochs):
        print(f"\nEpoch {epoch + 1}/{num_epochs}")

        # Shuffle with an explicit permutation instead of shuffle=True so the
        # predictions can be put back into train_df's row order afterwards.
        epoch_order = torch.randperm(len(train_set)).tolist()
        train_loader = DataLoader(torch.utils.data.Subset(train_set, epoch_order), **loader_kwargs)

        train_loss, train_accuracy, shuffled_predictions = train(resnet, train_loader, optimizer, criterion, device)
        test_loss, test_accuracy, test_predictions = test(resnet, test_loader, criterion, device)
        train_predictions = _restore_row_order(epoch_order, shuffled_predictions)
        save_weights = epoch == num_epochs - 1

        # Save epoch results
        save_epoch(model_name, resnet, train_loss, test_loss, train_accuracy, test_accuracy,
                   train_predictions, test_predictions, train_df, test_df, dataset.subregion_mapping, save_weights)

        print(f"Train Loss: {train_loss:.4f}, Train Accuracy: {train_accuracy:.4f}")
        print(f"Test Loss: {test_loss:.4f}, Test Accuracy: {test_accuracy:.4f}")

    print(f"\nTraining complete! Model and metrics saved to models/{model_name}.")


# Example run: ResNet-50 backbone, 2 epochs, batches of 128.
# 'coords_processed.csv' and 'Streetview_Image_Dataset/' are resolved relative
# to the current working directory and must exist before this cell is run.
train_loop(
    csv_path='coords_processed.csv',
    root_dir='Streetview_Image_Dataset/',
    num_epochs=2,
    batch_size=128,
    resnet_=resnet50,
    weights_=ResNet50_Weights,
)

Using device: cuda
Dataset initialized with 25229 samples.
Sub-regions mapped: {'Latin America and the Caribbean': 0, 'Australia and New Zealand': 1, 'Western Asia': 2, 'Sub-Saharan Africa': 3, 'Western Europe': 4, 'Southern Asia': 5, 'South-eastern Asia': 6, 'Eastern Europe': 7, 'Eastern Asia': 8, 'Northern Europe': 9, 'Southern Europe': 10, 'Northern America': 11, 'Central Asia': 12, nan: 13, 'Northern Africa': 14, 'Melanesia': 15}
Training dataset size: 20183
Testing dataset size: 5046

Epoch 1/2


Training:   8%|▊         | 12/158 [00:10<02:04,  1.17batch/s, loss=2.29]

6