In [8]:
import pandas as pd
from tqdm import tqdm
import reverse_geocoder as rg
import pycountry
import pycountry_convert as pc
from concurrent.futures import ThreadPoolExecutor
from sklearn.model_selection import train_test_split
import numpy as np
from collections import Counter
import random
from torchvision import io
from torchvision.transforms import v2
import os

In [9]:
SOURCE = 'Streetview_Image_Dataset/'
DATA_PATH = SOURCE + 'raw/'

CSV_NAME = 'coordinates.csv'
OUTPUT_PATH = SOURCE + 'processed/'

In [10]:
# column_names = ["latitude", "longitude"]
df = pd.read_csv(DATA_PATH + CSV_NAME,)
df['image_name'] = df.index.astype('str') + '.png'
df.head()

Unnamed: 0,latitude,longitude,image_name
0,20.824885,-98.499517,0.png
1,-3.451752,-54.563937,1.png
2,-23.496464,-47.460542,2.png
3,-16.548678,-72.852778,3.png
4,-35.01087,140.064397,4.png


In [13]:
# ---------------------------
# 1. Helper functions for offline country/continent lookups
# ---------------------------
def latlon_to_country_code_batch(coords):
    """
    Batch process reverse geocoding.
    """
    try:
        results = rg.search(coords)  # Batch process
        return [res['cc'] for res in results]
    except Exception as e:
        print(f"Batch reverse geocode failed. Error: {e}")
        return [None] * len(coords)

def alpha2_to_country_name(alpha2):
    """
    Converts ISO alpha-2 code to the official country name.
    """
    country = pycountry.countries.get(alpha_2=alpha2)
    return country.name if country else None

def alpha2_to_continent(alpha2):
    """
    Converts an ISO alpha-2 code to the name of the continent.
    """
    try:
        continent_code = pc.country_alpha2_to_continent_code(alpha2)
        continent_map = {
            "AF": "Africa",
            "NA": "North America",
            "SA": "South America",
            "OC": "Oceania",
            "AS": "Asia",
            "EU": "Europe",
            "AN": "Antarctica"
        }
        return continent_map.get(continent_code, None)
    except:
        return None
    

# ---------------------------
# 2. Process entire dataset with parallel processing
# ---------------------------
def process_chunk(chunk):
    """
    Processes a chunk of the DataFrame, performing reverse geocoding
    and mapping country/continent information.
    """
    coords = list(zip(chunk["latitude"], chunk["longitude"]))
    # Perform batch reverse geocoding
    country_codes = latlon_to_country_code_batch(coords)
    chunk["country_code"] = country_codes
    # Convert country codes to country names
    chunk["country"] = [alpha2_to_country_name(cc) for cc in country_codes]
    # Convert country codes to continents
    chunk["continent"] = [alpha2_to_continent(cc) for cc in country_codes]
    chunk["region"] = chunk["country"]
    return chunk

In [14]:
num_threads = 4
chunk_size = max(1, len(df) // num_threads)  # Ensure chunk_size is at least 1
chunks = [df.iloc[i:i + chunk_size].copy() for i in range(0, len(df), chunk_size)]

# Process chunks in parallel
with ThreadPoolExecutor(max_workers=num_threads) as executor:
    results = list(executor.map(process_chunk, chunks))
    
df = pd.concat(results, ignore_index=True)

all_df = pd.read_csv('all.csv')

df = df.merge(all_df[['alpha-2', 'sub-region']], left_on='country_code', right_on='alpha-2', how='left')

Loading formatted geocoded file...Loading formatted geocoded file...
Loading formatted geocoded file...

Loading formatted geocoded file...


In [15]:
df.head()

Unnamed: 0,latitude,longitude,image_name,country_code,country,continent,region,alpha-2,sub-region
0,20.824885,-98.499517,0.png,MX,Mexico,North America,Mexico,MX,Latin America and the Caribbean
1,-3.451752,-54.563937,1.png,BR,Brazil,South America,Brazil,BR,Latin America and the Caribbean
2,-23.496464,-47.460542,2.png,BR,Brazil,South America,Brazil,BR,Latin America and the Caribbean
3,-16.548678,-72.852778,3.png,PE,Peru,South America,Peru,PE,Latin America and the Caribbean
4,-35.01087,140.064397,4.png,AU,Australia,Oceania,Australia,AU,Australia and New Zealand


In [16]:
region_counts = df.groupby('sub-region').size().sort_values()
region_counts

sub-region
Melanesia                             3
Central Asia                         41
Northern Africa                      44
Southern Asia                       886
Western Asia                       1004
Eastern Asia                       1290
South-eastern Asia                 1406
Australia and New Zealand          1658
Southern Europe                    1678
Sub-Saharan Africa                 1946
Western Europe                     2026
Northern Europe                    2181
Eastern Europe                     2949
Northern America                   3141
Latin America and the Caribbean    4955
dtype: int64

In [17]:
drop_regions = region_counts[region_counts < 500].index
df = df[~df['sub-region'].isin(drop_regions)]

In [18]:
# train test split
df_train, df_test = train_test_split(df, test_size=0.2, random_state=42)

In [30]:
# augment the train set to balance the classes
def write_image(image, image_name, image_path=OUTPUT_PATH):
    io.write_png(image, OUTPUT_PATH + image_name)

def augment_data(image_name, input_image_path=DATA_PATH, output_image_path=OUTPUT_PATH):
    image = io.read_image(input_image_path + image_name)
    
    transform = v2.Compose([
        v2.RandomHorizontalFlip(p=0.5),
        v2.RandomRotation(degrees=30),
        v2.ColorJitter(brightness=(0, 0.15), contrast=(0,0.15), saturation=(0, 0.15), hue=(0, 0.15)),
    ])
    
    # apply augmentation to the image
    augmented_image = transform(image)
    # Placeholder for augmentation logic
    return augmented_image


def balance_classes(df):
    class_counts = df['sub-region'].value_counts()
    
    # Count the number of instances for each class
    max_count = class_counts.max()
    # whole directory of images:
    
    if not os.path.exists(OUTPUT_PATH):
        os.makedirs(OUTPUT_PATH)
    else:    
        for filename in os.listdir(OUTPUT_PATH):
            os.remove(OUTPUT_PATH + filename)
    
    
    for cls, count in class_counts.items():
        print(f'handling class {cls}')
        if count < max_count:
            # Number of samples needed to balance the class
            num_samples_needed = max_count - count
            df_minority = df[df['sub-region'] == cls]
            
            # Resample with replacement
            df_resampled = df_minority.sample(n=num_samples_needed, replace=True)
            print(f'Augmenting {num_samples_needed} images for class {cls}')
            augmented_images = df_resampled['image_name'].apply(augment_data)
            df_resampled['is_augmented'] = True
            df_resampled['aumentation_source_image_name'] = df_resampled['image_name']
            df_resampled['image_name'] = [str(len(df) + i) + '.png' for i in range(1, num_samples_needed + 1)]
            
            df = pd.concat([df, df_resampled], ignore_index=True)
            
            # write the augmented images to disk
            print(f'Writing {num_samples_needed} augmented images to disk')
            for i, image_name in enumerate(df_resampled['image_name']):
                write_image(augmented_images[i], image_name)
            
        else:
            
            # Augment some images for classes that already have max_count
            df_majority = df[df['sub-region'] == cls]
            
            # sample 25% of the indices
            sample_indices = df_majority.sample(n=len(df_majority) // 4, replace=False).index
            df_majority.loc[sample_indices, 'is_augmented'] = True
            df_majority.loc[sample_indices, 'augmentation_source_image_name'] = df_majority.loc[sample_indices, 'image_name']
            print(f'Augmenting {len(sample_indices)} images for class {cls}')
            augmented_images = df_majority['image_name'].apply(augment_data)
            df.loc[sample_indices] = df_majority

            # Write the augmented images to disk
            print(f'Writing {len(sample_indices)} augmented images to disk')
            for i, image_name in enumerate(df_majority['image_name']):
                write_image(augmented_images[i], image_name)
    return df



In [31]:
df_train['is_augmented'] = False
df_train['aumentation_source_image_name'] = df_train['image_name']
df_train = balance_classes(df_train)
df_train.head()

handling class Latin America and the Caribbean
Augmenting 993 images for class Latin America and the Caribbean


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_majority.loc[sample_indices, 'augmentation_source_image_name'] = df_majority.loc[sample_indices, 'image_name']


KeyboardInterrupt: 

In [None]:
import pandas as pd

# Load the datasets
coords_processed_file = 'coords_processed_large_dataset.csv'
all_file = 'all.csv'

# coords_df = pd.read_csv(coords_processed_file)
all_df = pd.read_csv(all_file)

# Ensure country codes column is consistent in type
coords_df['country_code'] = df['country_code'].astype(str).str.strip()
all_df['alpha-2'] = all_df['alpha-2'].astype(str).str.strip()

# Merge the dataframes based on country codes
merged_df = coords_df.merge(all_df[['alpha-2', 'sub-region']], left_on='country_code', right_on='alpha-2', how='left')



FileNotFoundError: [Errno 2] No such file or directory: 'coords_processed_large_dataset.csv'