In [1]:
import pandas as pd
from tqdm import tqdm
import reverse_geocoder as rg
import pycountry
import pycountry_convert as pc
from concurrent.futures import ThreadPoolExecutor

In [6]:
# ---------------------------
# 1. Load the CSV and set up columns
# ---------------------------
csv_path = "Streetview_Image_Dataset/coordinates.csv"
column_names = ["latitude", "longitude"]
df = pd.read_csv(csv_path)

# Add 'image_name' column
df["image_name"] = (df.index + 1).astype(str) + ".png"

# ---------------------------
# 2. Helper functions for offline country/continent lookups
# ---------------------------
def latlon_to_country_code_batch(coords):
    """
    Batch process reverse geocoding.
    """
    try:
        results = rg.search(coords)  # Batch process
        return [res['cc'] for res in results]
    except Exception as e:
        print(f"Batch reverse geocode failed. Error: {e}")
        return [None] * len(coords)

def alpha2_to_country_name(alpha2):
    """
    Converts ISO alpha-2 code to the official country name.
    """
    country = pycountry.countries.get(alpha_2=alpha2)
    return country.name if country else None

def alpha2_to_continent(alpha2):
    """
    Converts an ISO alpha-2 code to the name of the continent.
    """
    try:
        continent_code = pc.country_alpha2_to_continent_code(alpha2)
        continent_map = {
            "AF": "Africa",
            "NA": "North America",
            "SA": "South America",
            "OC": "Oceania",
            "AS": "Asia",
            "EU": "Europe",
            "AN": "Antarctica"
        }
        return continent_map.get(continent_code, None)
    except:
        return None



In [None]:
# ---------------------------
# 3. Process entire dataset with parallel processing
# ---------------------------
def process_chunk(chunk):
    """
    Processes a chunk of the DataFrame, performing reverse geocoding
    and mapping country/continent information.
    """
    coords = list(zip(chunk["latitude"], chunk["longitude"]))
    chunk["country_code"] = latlon_to_country_code_batch(coords)
    chunk["country"] = chunk["country_code"].apply(alpha2_to_country_name)
    chunk["continent"] = chunk["country_code"].apply(alpha2_to_continent)
    return chunk

# Split DataFrame into manageable chunks for parallel processing
num_threads = 4
chunk_size = len(df) // num_threads
chunks = [df.iloc[i:i + chunk_size].copy() for i in range(0, len(df), chunk_size)]

# Use ThreadPoolExecutor to process chunks in parallel
with ThreadPoolExecutor(max_workers=num_threads) as executor:
    results = list(tqdm(executor.map(process_chunk, chunks), total=len(chunks)))

# Combine all chunks into a single DataFrame
df_final = pd.concat(results, ignore_index=True)

# ---------------------------
# 4. Save the final DataFrame to a CSV file
# ---------------------------
df_final.to_csv("coords_processed_large_dataset.csv", index=False)
print("Processing complete! Results saved to 'coords_processed2.csv'.")


100%|██████████| 5/5 [00:05<00:00,  1.13s/it]

Processing complete! Results saved to 'coords_processed.csv'.



