In [5]:
import pandas as pd
import requests
from bs4 import BeautifulSoup
import plotly.express as px
import time

# --- STEP 1: LOAD AND PREPROCESS DATA ---

# The source file is pipe-separated and its column names sit on the second
# line (header=1), so the first line of the file is skipped.
data = pd.read_csv("city_data.csv", sep="|", header=1)

# Normalise the "City" column so every entry reads "<city>, <country>".
# Some rows use a stray separator instead of a comma, e.g.
# "Berlin. Germany" or "Lemesos;Cyprus".
city_column = data['City']
for stray_separator in ('.', ';'):
    city_column = city_column.str.replace(stray_separator, ',', regex=False)
data['City'] = city_column

# One entry has city and country in the wrong order.
is_swapped = data['City'] == "Greece, Athens"
data.loc[is_swapped, 'City'] = "Athens, Greece"

# Work on a copy with whitespace-free column names, then split the combined
# "City" value into its two parts (text before / after the first comma).
data_new = data.copy()
data_new.columns = data_new.columns.str.strip()
city_parts = data["City"].str.split(",")
data_new["City Only"] = city_parts.str[0].str.strip()
data_new["Country"] = city_parts.str[1].str.strip()

# Keep a single row per (city, country) pair so no city is scraped twice.
data_new = data_new.drop_duplicates(subset=['City Only', 'Country'])
data_new = data_new.reset_index(drop=True)

print(f"Data prepared. {len(data_new)} cities ready for scraping.")


# --- STEP 2: WEB SCRAPING COORDINATES (With Headers) ---
# Goal: Extract geographical coordinates from Wikipedia

def get_wikipedia_coordinates(city, country):
    """Look up a city's latitude and longitude on English Wikipedia.

    Candidate article URLs are tried in order: the more specific
    ``<city>,_<country>`` title first, then the bare ``<city>`` title. The
    first page that answers with HTTP 200 and contains a
    ``<span class="geo">`` element whose text has the form ``"lat; lon"``
    provides the result.

    Args:
        city: City name. If it is not a string (e.g. NaN from a missing CSV
            cell) or is blank, ``(None, None)`` is returned without making
            any network request.
        country: Country name. If it is not a string, ``(None, None)`` is
            returned. A blank string skips the ``<city>,_<country>`` URL and
            only the bare city title is tried.

    Returns:
        A ``(latitude, longitude)`` tuple of floats, or ``(None, None)`` when
        no candidate page yields parsable coordinates.

    Network failures (``requests.RequestException``) and unparsable
    coordinate text (``ValueError``) count as a miss for that URL; any other
    exception propagates so that genuine bugs are not hidden.
    """
    # Function-scope import keeps the block self-contained.
    from urllib.parse import quote

    # Handle missing values (pandas represents them as float NaN).
    if not isinstance(city, str) or not isinstance(country, str):
        return None, None

    city = city.strip()
    country = country.strip()
    if not city:
        return None, None

    # Wikipedia titles use underscores instead of spaces. Percent-encode the
    # rest so characters such as "?" or "#" cannot alter the URL structure.
    city_url = quote(city.replace(" ", "_"))
    country_url = quote(country.replace(" ", "_"))

    # "City, Country" is the most specific title, so it goes first.
    urls_to_try = []
    if country_url:
        urls_to_try.append(
            f"https://en.wikipedia.org/wiki/{city_url},_{country_url}"
        )
    urls_to_try.append(f"https://en.wikipedia.org/wiki/{city_url}")

    # IMPORTANT: Add a User-Agent header so Wikipedia doesn't block the request
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
    }

    for url in urls_to_try:
        try:
            response = requests.get(url, headers=headers, timeout=5)
        except requests.RequestException:
            # Timeout, DNS failure, connection reset, ...: try the next URL.
            continue

        if response.status_code != 200:
            continue

        soup = BeautifulSoup(response.content, 'html.parser')

        # Wikipedia usually stores coordinates in a span with class="geo"
        # whose text is "lat; lon".
        geo_tag = soup.find("span", {"class": "geo"})
        if geo_tag is None:
            continue

        lat_lon = geo_tag.text.split(";")
        if len(lat_lon) != 2:
            continue

        try:
            return float(lat_lon[0]), float(lat_lon[1])
        except ValueError:
            # The span did not contain two plain decimal numbers.
            continue

    return None, None

# Run the scraper
print("Starting Web Scraping (this may take a minute)...")

# One (latitude, longitude) pair per row, in dataframe order. A pair is
# (None, None) when the lookup failed. Requests are issued back-to-back;
# no politeness delay is applied between them.
coordinate_pairs = [
    get_wikipedia_coordinates(city_name, country_name)
    for city_name, country_name in zip(data_new['City Only'], data_new['Country'])
]
lats = [pair[0] for pair in coordinate_pairs]
lons = [pair[1] for pair in coordinate_pairs]

# Attach the scraped coordinates as new columns.
data_new['Latitude'] = lats
data_new['Longitude'] = lons

# Only rows with both coordinates present can be plotted.
data_map = data_new.dropna(subset=['Latitude', 'Longitude']).copy()
print(f"Scraping complete. Successfully mapped {len(data_map)} cities.")


# --- STEP 3: INTERACTIVE MAP ---
# Goal: Build an interactive map highlighting cities with specific info

# Fields shown in the hover tooltip. The raw coordinates are hidden because
# they are only needed to position the markers.
hover_fields = {
    "Latitude": False,
    "Longitude": False,
    "Country": True,
    "Population": True,
    "Average Monthly Salary": True,
    "Average Cost of Living": True,
}

# Initial viewport: centred on Europe.
europe_center = {"lat": 50.0, "lon": 10.0}

# plotly.express builds the interactive scatter map; markers are coloured by
# country and titled with the city name on hover.
fig = px.scatter_mapbox(
    data_map,
    lat="Latitude",
    lon="Longitude",
    hover_name="City Only",
    hover_data=hover_fields,
    color="Country",
    zoom=3,
    center=europe_center,
    height=700,
    title="Where Should I Live? - European City Comparison",
)

# OpenStreetMap tiles need no API key; the margins are trimmed so the map
# fills the figure while leaving room for the title at the top.
fig.update_layout(
    mapbox_style="open-street-map",
    margin={"r": 0, "t": 40, "l": 0, "b": 0},
)

fig.show()

Data prepared. 84 cities ready for scraping.
Starting Web Scraping (this may take a minute)...
Scraping complete. Successfully mapped 84 cities.
