# In [None]:
import os
import pandas as pd
from tqdm import tqdm
import reverse_geocoder as rg  # Offline lat/lon -> nearest city + country code
import pycountry
import pycountry_convert as pc

# ---------------------------
# 1. Load the CSV and set up columns
# ---------------------------
csv_path = "archive/coords.csv"
column_names = ["latitude", "longitude"]
# header=None: the first line of the file is read as data, so the column
# names are supplied explicitly.
df = pd.read_csv(csv_path, names=column_names, header=None)

# The data is handled in chunks of 5,000 rows. For files longer than
# 10,000 rows, keep slicing the same way (df_part3, df_part4, ...).
#
# assign() returns a new DataFrame, so each chunk is independent of `df`.
# The image name is derived from the row label: label i -> "<i + 1>.png".
df_part1 = df.iloc[0:5000].assign(  # first chunk: rows 0-4999
    image_name=lambda chunk: (chunk.index + 1).astype(str) + ".png"
)
df_part2 = df.iloc[5000:10000].assign(  # second chunk: rows 5000-9999
    image_name=lambda chunk: (chunk.index + 1).astype(str) + ".png"
)

# ---------------------------
# 2. Helper functions for offline country/continent lookups
# ---------------------------
def latlon_to_country_code(lat, lon):
    """
    Uses reverse_geocoder (offline) to find the nearest known place to
    (lat, lon) and returns its ISO alpha-2 country code (e.g. 'US', 'FR').

    Returns None when either coordinate is missing (None / NaN), when the
    lookup yields no result, or when the lookup raises (the error is
    printed and swallowed so one bad row does not abort a long run).
    """
    # A missing coordinate (e.g. a blank CSV cell read as NaN) has no
    # nearest place; skip the lookup rather than feed NaN to the search.
    if pd.isna(lat) or pd.isna(lon):
        return None
    try:
        # NOTE(review): rg.search() appears to accept either a single
        # (lat, lon) tuple or a list of such tuples - confirm against the
        # reverse_geocoder README. It returns a list of dicts.
        results = rg.search((lat, lon))
        if results:
            # 'cc' is the 2-letter country code
            return results[0]['cc']
    except Exception as e:
        print(f"Offline reverse geocode failed for lat={lat}, lon={lon}. Error: {e}")
    return None

def alpha2_to_country_name(alpha2):
    """
    Looks up an ISO alpha-2 code in pycountry and returns the matching
    record's ``name``.

    Returns None for an empty/None code or when pycountry has no match.
    """
    if alpha2:
        match = pycountry.countries.get(alpha_2=alpha2)
        if match:
            return match.name
    return None

# Two-letter continent codes (as returned by pycountry_convert) -> display names.
_CONTINENT_NAMES = {
    "AF": "Africa",
    "NA": "North America",
    "SA": "South America",
    "OC": "Oceania",
    "AS": "Asia",
    "EU": "Europe",
    "AN": "Antarctica",
}


def alpha2_to_continent(alpha2):
    """
    Converts an ISO alpha-2 code to the name of the continent
    using pycountry_convert.

    Returns None for an empty/None code, for a code pycountry_convert
    cannot map, or for a continent code missing from the name table.
    """
    if not alpha2:
        return None
    try:
        continent_code = pc.country_alpha2_to_continent_code(alpha2)
    except Exception:
        # Best-effort lookup: an unmappable code yields None. Catching
        # Exception (not a bare except) lets KeyboardInterrupt/SystemExit
        # propagate, so Ctrl+C still stops a long-running loop.
        return None
    return _CONTINENT_NAMES.get(continent_code)

# ---------------------------
# 3. Process part1 (rows 0–4999) with progress
# ---------------------------
# print("\nProcessing PART 1 (0–4999)...")

# # We'll create new columns in df_part1:
# df_part1["country_code"] = None
# df_part1["country"] = None
# df_part1["continent"] = None

# for idx in tqdm(df_part1.index, total=len(df_part1), desc="Part1"):
#     lat = df_part1.at[idx, "latitude"]
#     lon = df_part1.at[idx, "longitude"]
    
#     # country_code
#     cc = latlon_to_country_code(lat, lon)
#     df_part1.at[idx, "country_code"] = cc
    
#     # country
#     df_part1.at[idx, "country"] = alpha2_to_country_name(cc)
    
#     # continent
#     df_part1.at[idx, "continent"] = alpha2_to_continent(cc)

# ---------------------------
# 4. Process part2 (rows 5000–9999) in one batched lookup
# ---------------------------
print("\nProcessing PART 2 (5000–9999)...")

# Geocode the whole chunk with a single rg.search() call instead of one
# call per row. NOTE(review): per the reverse_geocoder README, search()
# takes a list of (lat, lon) tuples and returns one result dict per input,
# in input order - confirm for the installed version.
part2_lats = df_part2["latitude"].tolist()
part2_lons = df_part2["longitude"].tolist()

# Positions whose coordinates are both present. Rows with a missing value
# are left out of the batch and keep None in every derived column.
part2_geocodable = [
    pos
    for pos, (lat, lon) in enumerate(zip(part2_lats, part2_lons))
    if pd.notna(lat) and pd.notna(lon)
]

part2_codes = [None] * len(df_part2)
if part2_geocodable:  # nothing to look up for an empty chunk
    part2_hits = rg.search(
        [(part2_lats[pos], part2_lons[pos]) for pos in part2_geocodable]
    )
    for pos, hit in zip(part2_geocodable, part2_hits):
        # 'cc' is the 2-letter country code
        part2_codes[pos] = hit["cc"]

# Build each column in one assignment rather than cell-by-cell with .at.
df_part2["country_code"] = part2_codes
df_part2["country"] = [alpha2_to_country_name(code) for code in part2_codes]
df_part2["continent"] = [alpha2_to_continent(code) for code in part2_codes]

# ---------------------------
# 5. Combine both parts
# ---------------------------
# df_final = pd.concat([df_part1, df_part2], ignore_index=True)

# If you have more than 10,000 rows, you can keep slicing and do df_part3, df_part4, etc.

# ---------------------------
# # 6. Count how many images are in each country and continent
# # ---------------------------
# print("\nCount by country:\n", df_final["country"].value_counts(dropna=False))
# print("\nCount by continent:\n", df_final["continent"].value_counts(dropna=False))

# # ---------------------------
# # 7. (Optional) Check if the images exist in /archive/dataset
# # ---------------------------
# dataset_folder = "/archive/dataset"
# all_images = os.listdir(dataset_folder)
# all_pngs = [f for f in all_images if f.endswith(".png")]

# df_final["exists_in_archive"] = df_final["image_name"].apply(lambda x: x in all_pngs)

# print("\nImages that exist in the archive:\n", df_final["exists_in_archive"].value_counts())

# If you want to save the final results to a new CSV:
#df_final.to_csv("/archive/coords_processed.csv", index=False)
# Save the part-2 results. index=True (the pandas default) writes the row
# labels as the first, unnamed column of the CSV.
df_part2.to_csv("DF1.csv", index=True)