In [16]:
import pandas as pd
from tqdm import tqdm
import reverse_geocoder as rg
import pycountry
import pycountry_convert as pc
from concurrent.futures import ThreadPoolExecutor
from sklearn.model_selection import train_test_split
import numpy as np
from collections import Counter
import random
from torchvision import io
from torchvision.transforms import v2
import os
from multiprocessing import Pool
from functools import partial
import uuid

In [2]:
# Dataset layout: every path below is relative to the dataset root folder.
SOURCE = 'Streetview_Image_Dataset/'

# Raw inputs: the original PNG images and the coordinates table.
DATA_PATH = f'{SOURCE}raw/'
CSV_NAME = 'coordinates.csv'

# Destination for the processed images and the train/test CSV files.
OUTPUT_PATH = f'{SOURCE}processed/'

In [3]:
# Load the coordinates table and derive each row's image file name from its
# positional index (row 0 -> "0.png").
df = pd.read_csv(DATA_PATH + CSV_NAME).assign(
    image_name=lambda frame: frame.index.astype('str') + '.png'
)
df.head()

Unnamed: 0,latitude,longitude,image_name
0,20.824885,-98.499517,0.png
1,-3.451752,-54.563937,1.png
2,-23.496464,-47.460542,2.png
3,-16.548678,-72.852778,3.png
4,-35.01087,140.064397,4.png


In [4]:
# ---------------------------
# 1. Helper functions for offline country/continent lookups
# ---------------------------
def latlon_to_country_code_batch(coords):
    """
    Reverse geocode a batch of coordinates in a single call.

    Args:
        coords: Sequence of coordinates handed to ``rg.search`` as one batch.

    Returns:
        One country code (the ``'cc'`` field of each search result) per
        coordinate. If the lookup fails for any reason, the error is printed
        and a list of ``None`` with the same length as ``coords`` is returned.
    """
    country_codes = []
    try:
        # A single batched search call covers every coordinate
        for match in rg.search(coords):
            country_codes.append(match['cc'])
    except Exception as err:
        print(f"Batch reverse geocode failed. Error: {err}")
        # Discard any partial result so the output stays aligned with the input
        country_codes = [None] * len(coords)
    return country_codes

def alpha2_to_country_name(alpha2):
    """
    Converts ISO alpha-2 code to the official country name.

    Args:
        alpha2: Two-letter country code. ``None`` and other non-string values
            are accepted because ``latlon_to_country_code_batch`` emits
            ``None`` for coordinates it could not geocode.

    Returns:
        The country name, or ``None`` when the code is missing, not a string,
        or unknown to pycountry.
    """
    # Guard first: a failed geocode must not abort the whole chunk
    if not isinstance(alpha2, str):
        return None
    try:
        country = pycountry.countries.get(alpha_2=alpha2)
    except LookupError:
        # Treat a lookup error the same as "not found"
        return None
    return country.name if country else None

# Two-letter continent codes mapped to the continent names used in the dataset.
_CONTINENT_NAMES = {
    "AF": "Africa",
    "NA": "North America",
    "SA": "South America",
    "OC": "Oceania",
    "AS": "Asia",
    "EU": "Europe",
    "AN": "Antarctica",
}


def alpha2_to_continent(alpha2):
    """
    Converts an ISO alpha-2 code to the name of the continent.

    Args:
        alpha2: Two-letter country code (may be ``None`` for failed geocodes).

    Returns:
        The continent name, or ``None`` when the lookup fails (for example an
        unknown or missing code) or the continent code has no mapped name.
    """
    try:
        continent_code = pc.country_alpha2_to_continent_code(alpha2)
    except Exception:
        # Best-effort lookup: any library error means "no continent".
        # Exception (not a bare except) lets KeyboardInterrupt/SystemExit through.
        return None
    return _CONTINENT_NAMES.get(continent_code)
    

# ---------------------------
# 2. Process entire dataset with parallel processing
# ---------------------------
def process_chunk(chunk):
    """
    Processes a chunk of the DataFrame, performing reverse geocoding
    and mapping country/continent information.

    Args:
        chunk: DataFrame with ``latitude`` and ``longitude`` columns.

    Returns:
        A new DataFrame with ``country_code``, ``country``, ``continent`` and
        ``region`` columns appended (``region`` duplicates ``country``). The
        input frame is left unmodified, so it is safe to pass a slice.
    """
    coords = list(zip(chunk["latitude"], chunk["longitude"]))
    # Perform batch reverse geocoding
    country_codes = latlon_to_country_code_batch(coords)
    # Convert country codes to country names once; reused for the region column
    countries = [alpha2_to_country_name(cc) for cc in country_codes]
    # assign() builds a new frame instead of writing into the caller's object
    return chunk.assign(
        country_code=country_codes,
        country=countries,
        continent=[alpha2_to_continent(cc) for cc in country_codes],
        region=countries,
    )

In [5]:
num_threads = 4
chunk_size = max(1, len(df) // num_threads)  # Ensure chunk_size is at least 1
# Each worker gets its own copy of a contiguous slice of rows
chunks = [df.iloc[i:i + chunk_size].copy() for i in range(0, len(df), chunk_size)]

# Process chunks in parallel
with ThreadPoolExecutor(max_workers=num_threads) as executor:
    results = list(executor.map(process_chunk, chunks))

df = pd.concat(results, ignore_index=True)

# Lookup table providing the 'alpha-2' -> 'sub-region' mapping used below.
# keep_default_na=False stops pandas from parsing Namibia's ISO code "NA" as a
# missing value (which would leave every Namibian row without a sub-region);
# na_values=[''] still turns genuinely empty cells into NaN.
all_df = pd.read_csv('all.csv', keep_default_na=False, na_values=[''])

# Left join keeps every image row; unmatched country codes get a NaN sub-region
df = df.merge(all_df[['alpha-2', 'sub-region']], left_on='country_code', right_on='alpha-2', how='left')

Loading formatted geocoded file...
Loading formatted geocoded file...
Loading formatted geocoded file...
Loading formatted geocoded file...


In [6]:
# Preview the table after geocoding and the sub-region merge.
df.head(n=5)

Unnamed: 0,latitude,longitude,image_name,country_code,country,continent,region,alpha-2,sub-region
0,20.824885,-98.499517,0.png,MX,Mexico,North America,Mexico,MX,Latin America and the Caribbean
1,-3.451752,-54.563937,1.png,BR,Brazil,South America,Brazil,BR,Latin America and the Caribbean
2,-23.496464,-47.460542,2.png,BR,Brazil,South America,Brazil,BR,Latin America and the Caribbean
3,-16.548678,-72.852778,3.png,PE,Peru,South America,Peru,PE,Latin America and the Caribbean
4,-35.01087,140.064397,4.png,AU,Australia,Oceania,Australia,AU,Australia and New Zealand


In [7]:
# Number of images per sub-region, listed from the rarest to the most common.
region_counts = (
    df.groupby('sub-region')
      .size()
      .sort_values()
)
region_counts

sub-region
Melanesia                             3
Central Asia                         41
Northern Africa                      44
Southern Asia                       886
Western Asia                       1004
Eastern Asia                       1290
South-eastern Asia                 1406
Australia and New Zealand          1658
Southern Europe                    1678
Sub-Saharan Africa                 1946
Western Europe                     2026
Northern Europe                    2181
Eastern Europe                     2949
Northern America                   3141
Latin America and the Caribbean    4955
dtype: int64

In [8]:
# Sub-regions with fewer images than this are too small to learn from.
MIN_IMAGES_PER_REGION = 500

drop_regions = region_counts[region_counts < MIN_IMAGES_PER_REGION].index
# Rows whose country code found no match in the lookup merge carry a NaN
# sub-region; isin() never matches NaN, so they must be excluded explicitly or
# they end up in the train/test sets without a label.
df = df[df['sub-region'].notna() & ~df['sub-region'].isin(drop_regions)]

In [9]:
# Reserve 20% of the rows as the test set; the fixed seed makes the split repeatable.
df_train, df_test = train_test_split(
    df,
    test_size=0.2,
    random_state=42,
)

In [20]:
def write_image(image, image_name, image_path=OUTPUT_PATH):
    """Save ``image`` as a PNG named ``image_name`` inside ``image_path``.

    ``image_path`` is joined to ``image_name`` by plain concatenation, so it
    must already end with a path separator.
    """
    destination = image_path + image_name
    io.write_png(image, destination)

def augment_data(image_name, input_image_path=DATA_PATH):
    """Load one image and return a randomly augmented version of it.

    The pipeline applies, in order: a horizontal flip with probability 0.5,
    a random rotation configured with ``degrees=15``, and a brightness jitter
    configured with the range ``(0.85, 1.15)``.

    Args:
        image_name: File name of the image, appended directly to ``input_image_path``.
        input_image_path: Directory prefix; must end with a path separator.

    Returns:
        The transformed image as returned by the torchvision pipeline.
    """
    augmentations = [
        v2.RandomHorizontalFlip(p=0.5),
        v2.RandomRotation(degrees=15),
        v2.ColorJitter(brightness=(0.85, 1.15)),
    ]
    pipeline = v2.Compose(augmentations)

    source_image = io.read_image(input_image_path + image_name)
    return pipeline(source_image)

def batch_augment_data(image_names, input_image_path=DATA_PATH):
    """Return one augmented image per name in ``image_names``, in the same order.

    Every image is loaded from ``input_image_path`` and passed through
    ``augment_data``; all results are held in memory at once.
    """
    augmented_images = []
    for name in image_names:
        augmented_images.append(augment_data(name, input_image_path))
    return augmented_images

def _write_augmented_copies(rows):
    """Augment the image behind every row and save it under a fresh, unique file name.

    Args:
        rows: DataFrame rows whose ``image_name`` column names the source images.

    Returns:
        A copy of ``rows`` in which ``image_name`` points at the newly written
        file and ``is_augmented`` is True.
    """
    rows = rows.copy()
    source_names = rows['image_name'].tolist()
    # One random prefix per row: sampling with replacement repeats source images,
    # and a prefix shared by the whole batch would make those repeats overwrite
    # each other on disk and leave duplicate rows pointing at a single file.
    new_names = ['augmented_' + uuid.uuid4().hex[:8] + name for name in source_names]
    # Augment and write one image at a time so only a single image is in memory
    for source_name, new_name in zip(source_names, new_names):
        write_image(augment_data(source_name), new_name)
    rows['image_name'] = new_names
    rows['is_augmented'] = True
    return rows


def balance_classes(df, skip_classes=None):
    """Balances the classes by augmenting underrepresented samples.

    Side effects: creates ``OUTPUT_PATH`` if needed, deletes every regular file
    already in it, and writes the augmented PNG images there.

    Args:
        df: DataFrame with ``sub-region`` (class label) and ``image_name``
            columns. It is not modified; a copy is balanced and returned.
        skip_classes: Class labels to leave untouched. Defaults to the list of
            larger sub-regions that was previously hard-coded here.

    Returns:
        A new DataFrame. For every processed class smaller than the largest
        class, rows for augmented copies are appended until it matches the
        largest class; for a processed class that is already the largest, a
        quarter of its rows are replaced by augmented versions. Synthetic rows
        have ``is_augmented`` set to True (the column is created as False if
        the input lacks it).
    """
    if skip_classes is None:
        skip_classes = ['Latin America and the Caribbean', 'Northern America', 'Eastern Europe', 'Northern Europe', 'Western Europe', 'Sub-Saharan Africa', 'Australia and New Zealand', 'Southern Europe', 'South-eastern Asia', 'Eastern Asia']

    # Work on a copy so the caller's frame is never written to
    df = df.copy()
    if 'is_augmented' not in df.columns:
        df['is_augmented'] = False

    class_counts = df['sub-region'].value_counts()
    max_count = class_counts.max()

    os.makedirs(OUTPUT_PATH, exist_ok=True)
    # Clear files left over from a previous run; sub-directories are left alone
    # because os.remove cannot delete them
    for filename in os.listdir(OUTPUT_PATH):
        stale_path = os.path.join(OUTPUT_PATH, filename)
        if os.path.isfile(stale_path):
            os.remove(stale_path)

    for cls, count in class_counts.items():
        if cls in skip_classes:
            continue
        print(f"Handling class {cls}")
        num_samples_needed = max_count - count
        class_df = df[df['sub-region'] == cls]

        if num_samples_needed > 0:  # Augment minority class
            print(f"Augmenting {num_samples_needed} images for class {cls}")
            extra_rows = _write_augmented_copies(
                class_df.sample(n=num_samples_needed, replace=True)
            )
            # Create new rows
            df = pd.concat([df, extra_rows], ignore_index=True)
        else:  # Augment majority class for diversity
            sample_size = len(class_df) // 4
            print(f"Augmenting {sample_size} images for class {cls}")
            replaced_rows = _write_augmented_copies(class_df.sample(n=sample_size))
            # Update rows in place: the augmented image takes over the original's row
            df.loc[replaced_rows.index, 'image_name'] = replaced_rows['image_name']
            df.loc[replaced_rows.index, 'is_augmented'] = True

    return df


In [None]:
# Tag every original row before balancing. assign() returns a fresh frame, which
# avoids chained-assignment warnings on the train/test split result. The column
# name 'aumentation_source_image_name' (sic) is kept as-is because it is written
# to train.csv.
df_train = df_train.assign(
    is_augmented=False,
    aumentation_source_image_name=df_train['image_name'],
)
df_train = balance_classes(df_train)

# A row is synthetic exactly when its file name no longer equals its source name;
# deriving the flag here keeps it correct regardless of what balance_classes set.
df_train['is_augmented'] = df_train['image_name'] != df_train['aumentation_source_image_name']

# Copy the untouched originals next to the augmented images. write_image already
# prefixes OUTPUT_PATH, so it must receive the bare file name.
for image_name in df_train.loc[~df_train['is_augmented'], 'image_name']:
    image = io.read_image(DATA_PATH + image_name)
    write_image(image, image_name)
df_train.head()

Handling class Western Asia
Augmenting 3162 images for class Western Asia


In [None]:
# Persist both splits as CSV files (without the index) alongside the processed images.
for split_name, split_df in (('train', df_train), ('test', df_test)):
    split_df.to_csv(OUTPUT_PATH + split_name + '.csv', index=False)