Afonso Fonseca - 20241781
Martinho ...
...
...

In [None]:

import pandas as pd
import requests
from bs4 import BeautifulSoup
import plotly.express as px

# Load the dataset: pipe-delimited, with the header taken from row index 1
data = pd.read_csv("city_data.csv", sep="|", header=1)

# Normalise the "City" field so that "." and ";" both become ","
data["City"] = data["City"].str.replace(r"[.;]", ",", regex=True)

# One known row has city and country swapped; put them back in order
swapped_entry = data["City"] == "Greece, Athens"
data.loc[swapped_entry, "City"] = "Athens, Greece"

# Split "City, Country" once and derive both columns from the pieces
city_parts = data["City"].str.split(",")
data["City Only"] = city_parts.str[0].str.strip()
data["Country"] = city_parts.str[1].str.strip()

# Keep a single row per (city, country) pair so each is scraped only once
deduplicated = data.drop_duplicates(subset=["City Only", "Country"])
data_unique = deduplicated.reset_index(drop=True)

print(f"Data prepared. {len(data_unique)} cities ready for scraping.")



def _extract_geo_coordinates(soup):
    """Return ``(lat, lon)`` from the first ``span.geo`` in *soup*, else ``(None, None)``.

    Raises:
        ValueError: If the span text is not of the form ``"<float>; <float>"``.
    """
    geo_tag = soup.find("span", {"class": "geo"})
    if geo_tag is None:
        return None, None

    # The span text is expected to look like "lat; lon".
    lat_text, lon_text = geo_tag.text.split(";")
    return float(lat_text), float(lon_text)


def get_wikipedia_coordinates(city, country, timeout=10):
    """Look up a city's coordinates by searching English Wikipedia.

    The function requests the Wikipedia main page, runs the site search for
    ``"<city> <country>"``, opens the first search hit and reads the
    coordinates from the ``span.geo`` element of that page.

    Args:
        city: City name used in the search query.
        country: Country name appended to the search query.
        timeout: Seconds to wait for each HTTP request before giving up.

    Returns:
        A ``(latitude, longitude)`` tuple of floats, or ``(None, None)`` when
        the lookup fails (network/HTTP error, no search hit, no coordinates
        on the page, or coordinates that cannot be parsed).
    """
    headers = {"User-Agent": "Mozilla/5.0"}
    main_page_url = "https://en.wikipedia.org/wiki/Main_Page"
    search_url = "https://en.wikipedia.org/w/index.php"

    try:
        # Request main page
        requests.get(main_page_url, headers=headers, timeout=timeout)

        # Internal search; `params` URL-encodes the query so that names with
        # characters such as "&" or "#" cannot corrupt the URL.
        search_page = requests.get(
            search_url,
            params={"search": f"{city} {country}"},
            headers=headers,
            timeout=timeout,
        )
        search_page.raise_for_status()
        soup_search = BeautifulSoup(search_page.text, "html.parser")

        # First search result
        result = soup_search.select_one("ul.mw-search-results li a")
        if result is None:
            # On an exact title match Wikipedia redirects straight to the
            # article, so there is no result list; the page we already have
            # may itself carry the coordinates.
            return _extract_geo_coordinates(soup_search)

        city_page_url = "https://en.wikipedia.org" + result["href"]

        # Open the city page
        city_page = requests.get(city_page_url, headers=headers, timeout=timeout)
        city_page.raise_for_status()
        soup_city = BeautifulSoup(city_page.text, "html.parser")

        # Extract coordinates
        return _extract_geo_coordinates(soup_city)

    except (requests.RequestException, ValueError, KeyError):
        # Best-effort lookup: network/HTTP failures, a link without "href",
        # or unparsable coordinate text all mean "no coordinates found".
        return None, None

# Scrape all cities
print("Starting web scraping...")

# One (latitude, longitude) tuple per unique city, in row order
coordinates = [
    get_wikipedia_coordinates(city_name, country_name)
    for city_name, country_name in zip(
        data_unique["City Only"], data_unique["Country"]
    )
]

# Store coordinates
latitudes = [latitude for latitude, _ in coordinates]
longitudes = [longitude for _, longitude in coordinates]
data_unique["Latitude"] = latitudes
data_unique["Longitude"] = longitudes

# Keep only the cities for which both coordinates were found
has_coordinates = data_unique.dropna(subset=["Latitude", "Longitude"])
data_map = has_coordinates.copy()
print(f"Scraping complete. Successfully mapped {len(data_map)} cities.")

# Build the interactive map

# Columns shown in the hover box; the raw coordinates are hidden
hover_fields = {
    "Country": True,
    "Population": True,
    "Average Monthly Salary": True,
    "Average Cost of Living": True,
    "Latitude": False,
    "Longitude": False
}

# Initial viewport of the map
map_center = {"lat": 50.0, "lon": 10.0}

fig = px.scatter_mapbox(
    data_map,
    lat="Latitude",
    lon="Longitude",
    hover_name="City Only",
    hover_data=hover_fields,
    color="Country",
    zoom=3,
    center=map_center,
    height=700,
    title="Where Should I Live? - Interactive European City Map"
)

# Use OpenStreetMap tiles and leave room only for the title at the top
layout_margin = {"r": 0, "t": 40, "l": 0, "b": 0}
fig.update_layout(mapbox_style="open-street-map", margin=layout_margin)

fig.show()


Data prepared. 84 cities ready for scraping.
Starting web scraping...
Scraping complete. Successfully mapped 76 cities.
