In [1]:
import pandas as pd
from tqdm import tqdm
import reverse_geocoder as rg
import pycountry
import pycountry_convert as pc
from concurrent.futures import ThreadPoolExecutor
from sklearn.model_selection import train_test_split
import numpy as np
from collections import Counter
import random
from torchvision import io
from torchvision.transforms import v2
import os
from multiprocessing import Pool
from functools import partial
import uuid

In [4]:
SOURCE = 'Streetview_Image_Dataset/'
DATA_PATH = SOURCE 

CSV_NAME = 'coordinates.csv'
OUTPUT_PATH = SOURCE + 'processed/'

In [5]:
# column_names = ["latitude", "longitude"]
df = pd.read_csv(DATA_PATH + CSV_NAME,)
df['image_name'] = df.index.astype('str') + '.png'
df.head()

Unnamed: 0,latitude,longitude,image_name
0,20.824885,-98.499517,0.png
1,-3.451752,-54.563937,1.png
2,-23.496464,-47.460542,2.png
3,-16.548678,-72.852778,3.png
4,-35.01087,140.064397,4.png


In [7]:
# ---------------------------
# 1. Helper functions for offline country/continent lookups
# ---------------------------
def latlon_to_country_code_batch(coords):
    """
    Batch process reverse geocoding.
    """
    try:
        results = rg.search(coords)  # Batch process
        return [res['cc'] for res in results]
    except Exception as e:
        print(f"Batch reverse geocode failed. Error: {e}")
        return [None] * len(coords)

def alpha2_to_country_name(alpha2):
    """
    Converts ISO alpha-2 code to the official country name.
    """
    country = pycountry.countries.get(alpha_2=alpha2)
    return country.name if country else None

def alpha2_to_continent(alpha2):
    """
    Converts an ISO alpha-2 code to the name of the continent.
    """
    try:
        continent_code = pc.country_alpha2_to_continent_code(alpha2)
        continent_map = {
            "AF": "Africa",
            "NA": "North America",
            "SA": "South America",
            "OC": "Oceania",
            "AS": "Asia",
            "EU": "Europe",
            "AN": "Antarctica"
        }
        return continent_map.get(continent_code, None)
    except:
        return None
    

# ---------------------------
# 2. Process entire dataset with parallel processing
# ---------------------------
def process_chunk(chunk):
    """
    Processes a chunk of the DataFrame, performing reverse geocoding
    and mapping country/continent information.
    """
    coords = list(zip(chunk["latitude"], chunk["longitude"]))
    # Perform batch reverse geocoding
    country_codes = latlon_to_country_code_batch(coords)
    chunk["country_code"] = country_codes
    # Convert country codes to country names
    chunk["country"] = [alpha2_to_country_name(cc) for cc in country_codes]
    # Convert country codes to continents
    chunk["continent"] = [alpha2_to_continent(cc) for cc in country_codes]
    chunk["region"] = chunk["country"]
    return chunk

In [8]:
num_threads = 4
chunk_size = max(1, len(df) // num_threads)  # Ensure chunk_size is at least 1
chunks = [df.iloc[i:i + chunk_size].copy() for i in range(0, len(df), chunk_size)]

# Process chunks in parallel
with ThreadPoolExecutor(max_workers=num_threads) as executor:
    results = list(executor.map(process_chunk, chunks))
    
df = pd.concat(results, ignore_index=True)

all_df = pd.read_csv('all.csv')

df = df.merge(all_df[['alpha-2', 'sub-region']], left_on='country_code', right_on='alpha-2', how='left')

Loading formatted geocoded file...
Loading formatted geocoded file...
Loading formatted geocoded file...
Loading formatted geocoded file...


In [9]:
df.head()

Unnamed: 0,latitude,longitude,image_name,country_code,country,continent,region,alpha-2,sub-region
0,20.824885,-98.499517,0.png,MX,Mexico,North America,Mexico,MX,Latin America and the Caribbean
1,-3.451752,-54.563937,1.png,BR,Brazil,South America,Brazil,BR,Latin America and the Caribbean
2,-23.496464,-47.460542,2.png,BR,Brazil,South America,Brazil,BR,Latin America and the Caribbean
3,-16.548678,-72.852778,3.png,PE,Peru,South America,Peru,PE,Latin America and the Caribbean
4,-35.01087,140.064397,4.png,AU,Australia,Oceania,Australia,AU,Australia and New Zealand


In [10]:
region_counts = df.groupby('sub-region').size().sort_values()
region_counts

sub-region
Melanesia                             3
Central Asia                         41
Northern Africa                      44
Southern Asia                       886
Western Asia                       1004
Eastern Asia                       1290
South-eastern Asia                 1406
Australia and New Zealand          1658
Southern Europe                    1678
Sub-Saharan Africa                 1946
Western Europe                     2026
Northern Europe                    2181
Eastern Europe                     2949
Northern America                   3141
Latin America and the Caribbean    4955
dtype: int64

In [11]:
drop_regions = region_counts[region_counts < 500].index
df = df[~df['sub-region'].isin(drop_regions)]

In [12]:
# train test split
df_train, df_test = train_test_split(df, test_size=0.2, random_state=42)

In [27]:
import os
import uuid

import torch
import pandas as pd
from tqdm import tqdm
from torchvision import io, transforms as T

# If you have not defined these paths already, you need to define them.
# For example:
OUTPUT_PATH = "augmented_output/"
DATA_PATH = "Streetview_Image_Dataset/raw/"

# Detect if a GPU is available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Using device:", device)


def write_image(image_tensor, image_name, image_path=OUTPUT_PATH):
    """
    Writes a PyTorch tensor image to disk as PNG.
    Assumes `image_tensor` is in [C, H, W] format on the CPU.
    """
    # Make sure the output directory exists.
    if not os.path.exists(image_path):
        os.makedirs(image_path, exist_ok=True)

    # Use torchvision.io.write_png to save the tensor as a PNG file.
    io.write_png(image_tensor, os.path.join(image_path, image_name))


def augment_data(image_name, input_image_path=DATA_PATH, device=device):
    # 1. Read the image as a torch tensor [C, H, W] in [0, 255] (uint8).
    image_path = os.path.join(input_image_path, image_name)
    image = io.read_image(image_path)  # shape: [C, H, W], dtype=uint8, range=[0,255]

    # 2. Convert to float in [0,1] and move to GPU (if available).
    image = image.float() / 255.0  # now in range [0,1]
    image = image.to(device=device)

    # 3. Define your transformations (which assume [0,1] floating points).
    transform = T.Compose([
        T.RandomHorizontalFlip(p=0.5),
        T.RandomRotation(degrees=15),
        T.ColorJitter(brightness=(0.85, 1.15)),
    ])

    # 4. Apply transformations.
    augmented_image = transform(image)

    # 5. Scale back to [0,255], convert to uint8, move back to CPU.
    augmented_image = (augmented_image * 255.0).clamp(0, 255).to(torch.uint8).cpu()

    return augmented_image


def batch_augment_data(image_names, input_image_path=DATA_PATH, device=device):
    """
    Augments a batch of images, returning a list of augmented tensors.
    Includes a progress bar for the batch process.
    """
    augmented_images = []
    for name in tqdm(image_names, desc="Batch Augmenting"):
        augmented_images.append(augment_data(name, input_image_path, device=device))
    return augmented_images


def balance_classes(df):
    """
    Balances the classes by augmenting underrepresented samples (minority classes)
    or augmenting some subset of overrepresented classes (majority classes) for diversity.
    Includes a progress bar for class balancing.
    """
    class_counts = df['sub-region'].value_counts()
    max_count = class_counts.max()

    skip_classes = [
        'Western Asia', 'Southern Asia',  'Northern Europe','Eastern Europe','Northern America','Latin America and the Caribbean'
    ]

   

    # Show progress over the classes
    for cls, count in tqdm(class_counts.items(), desc="Balancing Classes"):
        if cls in skip_classes:
            continue

        print(f"\nHandling class: {cls} (current count: {count})")
        num_samples_needed = max_count - count
        class_df = df[df['sub-region'] == cls]

        if num_samples_needed > 0:
            # Augment minority class (underrepresented)
            print(f" -> Augmenting {num_samples_needed} images for class '{cls}'")
            resampled_df = class_df.sample(n=num_samples_needed, replace=True)

            # Perform the actual augmentations
            augmented_images = batch_augment_data(resampled_df['image_name'].tolist())

            # Give new names to the augmented images
            # Example: "augmented_af12b345original.png"
            new_names = [
                f"augmented_{uuid.uuid4().hex[:8]}_{img_name}"
                for img_name in resampled_df['image_name']
            ]
            resampled_df['image_name'] = new_names

            # Write the images to disk and add rows to df
            for (image_name, augmented_image) in zip(resampled_df['image_name'], augmented_images):
                write_image(augmented_image, image_name)

            # Concatenate the newly augmented rows to the original dataframe
            df = pd.concat([df, resampled_df], ignore_index=True)

        else:
            # Augment some samples from the majority class for diversity
            # For example, take 25% of them to re-augment
            sample_size = len(class_df) // 4
            print(f" -> Augmenting {sample_size} images for class '{cls}'")
            sampled_df = class_df.sample(n=sample_size)

            # Perform the actual augmentations
            augmented_images = batch_augment_data(sampled_df['image_name'].tolist())

            # Generate new names (or you can overwrite if you prefer)
            new_names = [
                f"augmented_{uuid.uuid4().hex[:8]}_{img_name}"
                for img_name in sampled_df['image_name']
            ]
            sampled_df['image_name'] = new_names

            # Write the images to disk and update the existing rows in df
            for index, augmented_image in zip(sampled_df.index, augmented_images):
                write_image(augmented_image, sampled_df.loc[index, 'image_name'])
            
            # Update the dataframe in-place with the new image names
            df.loc[sampled_df.index] = sampled_df

    print("\nBalancing done!")
    return df


Using device: cuda


In [28]:
df_train['is_augmented'] = False
df_train['aumentation_source_image_name'] = df_train['image_name']
df_train = balance_classes(df_train)
for image_name in df_train[~df_train['is_augmented']]['image_name']:
    image = io.read_image(DATA_PATH + image_name)
    write_image(image, OUTPUT_PATH + image_name)
df_train.head()

Balancing Classes: 0it [00:00, ?it/s]


Handling class: Western Europe (current count: 1604)
 -> Augmenting 2368 images for class 'Western Europe'


Batch Augmenting: 100%|██████████| 2368/2368 [00:19<00:00, 123.15it/s]
Balancing Classes: 7it [04:18, 36.91s/it]


Handling class: Sub-Saharan Africa (current count: 1539)
 -> Augmenting 2433 images for class 'Sub-Saharan Africa'


Batch Augmenting: 100%|██████████| 2433/2433 [00:16<00:00, 145.13it/s]
Balancing Classes: 8it [08:19, 71.54s/it]


Handling class: Australia and New Zealand (current count: 1330)
 -> Augmenting 2642 images for class 'Australia and New Zealand'


Batch Augmenting: 100%|██████████| 2642/2642 [00:20<00:00, 130.40it/s]
Balancing Classes: 9it [13:25, 117.27s/it]


Handling class: Southern Europe (current count: 1328)
 -> Augmenting 2644 images for class 'Southern Europe'


Batch Augmenting: 100%|██████████| 2644/2644 [00:22<00:00, 118.43it/s]
Balancing Classes: 10it [17:57, 150.88s/it]


Handling class: South-eastern Asia (current count: 1162)
 -> Augmenting 2810 images for class 'South-eastern Asia'


Batch Augmenting: 100%|██████████| 2810/2810 [00:22<00:00, 124.15it/s]
Balancing Classes: 11it [22:33, 180.67s/it]


Handling class: Eastern Asia (current count: 1056)
 -> Augmenting 2916 images for class 'Eastern Asia'


Batch Augmenting: 100%|██████████| 2916/2916 [00:23<00:00, 123.99it/s]
Balancing Classes: 12it [27:42, 138.55s/it]



Balancing done!


RuntimeError: Error opening output file

In [22]:
df_train

Unnamed: 0,latitude,longitude,image_name,country_code,country,continent,region,alpha-2,sub-region,is_augmented,aumentation_source_image_name
0,45.603220,15.538784,3388.png,HR,Croatia,Europe,Croatia,HR,Southern Europe,False,3388.png
1,7.382881,3.673380,2963.png,NG,Nigeria,Africa,Nigeria,NG,Sub-Saharan Africa,False,2963.png
2,1.400360,103.894100,14138.png,MY,Malaysia,Asia,Malaysia,MY,South-eastern Asia,False,14138.png
3,53.949390,-7.846810,23442.png,IE,Ireland,Europe,Ireland,IE,Northern Europe,False,23442.png
4,32.186400,34.864500,20450.png,IL,Israel,Asia,Israel,IL,Western Asia,False,20450.png
...,...,...,...,...,...,...,...,...,...,...,...
26558,24.664557,90.457702,augmented_f90d1b8c_6236.png,BD,Bangladesh,Asia,Bangladesh,BD,Southern Asia,False,6236.png
26559,6.904470,79.897070,augmented_eddb7422_23318.png,LK,Sri Lanka,Asia,Sri Lanka,LK,Southern Asia,False,23318.png
26560,23.528160,89.142910,augmented_f8809b08_11914.png,BD,Bangladesh,Asia,Bangladesh,BD,Southern Asia,False,11914.png
26561,24.726360,74.052621,augmented_c746e873_3163.png,IN,India,Asia,India,IN,Southern Asia,False,3163.png


In [29]:
df_train.to_csv('train.csv', index=False)
df_test.to_csv('test.csv', index=False)