In [62]:
import pandas as pd
import numpy as np
import folium
import geopy
from geopy.geocoders import Nominatim
from geopy.extra.rate_limiter import RateLimiter

In [27]:
web_link = 'https://en.wikipedia.org/wiki/List_of_most-visited_museums'
df = pd.read_html(web_link)[0]
df.head()

Unnamed: 0,Name,Visitors in 2023 or 2022,City,Country
0,Louvre,"8,700,000 (2024)[1]",Paris,France
1,Vatican Museums,"6,800,000 (2023)[2]","Vatican City, Rome",Vatican
2,National Museum of China,"6,765,000 (2023)[3]",Beijing,China
3,British Museum,"5,820,860 (2023)[4]",London,United Kingdom
4,Natural History Museum,"5,688,608 (2023)[5]",London,United Kingdom


In [28]:
df.rename(columns={
    'Name': 'name',
    'Visitors in 2023 or 2022': 'visitors_count',
    'City': 'city',
    'Country': 'country',
}, inplace=True)

df.head()

Unnamed: 0,name,visitors_count,city,country
0,Louvre,"8,700,000 (2024)[1]",Paris,France
1,Vatican Museums,"6,800,000 (2023)[2]","Vatican City, Rome",Vatican
2,National Museum of China,"6,765,000 (2023)[3]",Beijing,China
3,British Museum,"5,820,860 (2023)[4]",London,United Kingdom
4,Natural History Museum,"5,688,608 (2023)[5]",London,United Kingdom


In [29]:
# df['visitors_count'] =
df['total_visitors'] = df['visitors_count'].str.split(r'[\(\[]', n=1).str[0]
# df['total_visitors'] = df['total_visitors'].str.replace(',', '').astype(int)
# df.head()

In [32]:
def clean_visitors(value):
    # Remove extra spaces and handle None or NaN values
    if pd.isna(value):
        return np.nan
    value = value.strip()

    # Convert "3.8 million" or similar to integer
    if 'million' in value.lower():
        try:
            return int(float(value.lower().replace('million', '').strip()) * 1_000_000)
        except ValueError:
            return np.nan

    # Replace dots and commas appropriately and convert to integer
    try:
        value = value.replace('.', '').replace(',', '')
        return int(value)
    except ValueError:
        return np.nan

# Apply cleaning function to the column
df['total_visitors_cleaned'] = df['total_visitors'].apply(clean_visitors)

In [37]:
df.dropna(subset=['total_visitors_cleaned'], inplace=True)
df['total_visitors_cleaned'] = df['total_visitors_cleaned'].astype(int)

In [38]:
df.head()

Unnamed: 0,name,visitors_count,city,country,total_visitors,total_visitors_cleaned
0,Louvre,"8,700,000 (2024)[1]",Paris,France,8700000,8700000
1,Vatican Museums,"6,800,000 (2023)[2]","Vatican City, Rome",Vatican,6800000,6800000
2,National Museum of China,"6,765,000 (2023)[3]",Beijing,China,6765000,6765000
3,British Museum,"5,820,860 (2023)[4]",London,United Kingdom,5820860,5820860
4,Natural History Museum,"5,688,608 (2023)[5]",London,United Kingdom,5688608,5688608


In [48]:
# Initiate the nominatim geocoder
geolocator = Nominatim(user_agent = 'monument_geocoder')
# Create a rate limiter
geocode = RateLimiter(geolocator.geocode, min_delay_seconds = 2)

In [None]:
df['Full address'] = 0
df['Latitude'] = 0
df['Longitude'] = 0

for index, row in df.iterrows():
  # print(row['address_1'])
  addr = f"{row['name']}, {row['city']}, {row['country']}"
  location_info = geocode(addr)

  try:

    # saving the address, lat, lon information into the corresponding variables during each iteration
    full_addres = location_info.address
    lat = location_info.latitude
    lon = location_info.longitude

    df.loc[index, 'Full address' ] = full_addres
    df.loc[index, 'Latitude' ] = lat
    df.loc[index, 'Longitude' ] = lon

    print(f'We have gotten the data for {full_addres}')

  except AttributeError:
    df.loc[index, 'Full address' ] = 'No Info'
    df.loc[index, 'Latitude' ] = 'No Info'
    df.loc[index, 'Longitude' ] = 'No Info'

In [51]:
df[df['Full address'].str.contains('No Info', na = False)]

Unnamed: 0,name,visitors_count,city,country,total_visitors,total_visitors_cleaned,latitude,longitude,full_address,Full address,Latitude,Longitude
1,Vatican Museums,"6,800,000 (2023)[2]","Vatican City, Rome",Vatican,6800000,6800000,,,,No Info,No Info,No Info
16,Musée National d'Histoire Naturelle,3.8 million (2023)[17],Paris,France,3.8 million,3800000,,,,No Info,No Info,No Info
19,Palace and Garden of Sa'dabad,"3,121,000 (2022)[20]",Tehran,Iran,3121000,3121000,,,,No Info,No Info,No Info
21,National Air and Space Museum[a],"3,100,000 (2023)[22]","Washington, D.C.",United States,3100000,3100000,,,,No Info,No Info,No Info
24,National Museum of Anthropology,"2,636,352 (2023)",Mexico City,Mexico,2636352,2636352,,,,No Info,No Info,No Info
28,Smithsonian Museum of American History,"2,100,000 (2023)[27]","Washington, D.C.",United States,2100000,2100000,,,,No Info,No Info,No Info
32,State Russian Museum,"2,900,000 (2023)[31]",Saint Petersburg,Russia,2900000,2900000,,,,No Info,No Info,No Info
33,National Museum of Marine Science and Technology,"2,726,000 (2023)[32]",Keelung,Taiwan,2726000,2726000,,,,No Info,No Info,No Info
34,London Science Museum,"2,957,000 (2023) [33]",London,United Kingdom,2957000,2957000,,,,No Info,No Info,No Info
36,Chinese Aviation Museum,"2,300,000 (2022)[35]",Beijing,China,2300000,2300000,,,,No Info,No Info,No Info


In [52]:
df = df[~df['Full address'].str.contains('No Info', na = False)]
df

Unnamed: 0,name,visitors_count,city,country,total_visitors,total_visitors_cleaned,latitude,longitude,full_address,Full address,Latitude,Longitude
0,Louvre,"8,700,000 (2024)[1]",Paris,France,8700000,8700000,48.861147,2.338028,"Musée du Louvre, Cour Carrée, Quartier Saint-G...","Musée du Louvre, Cour Carrée, Quartier Saint-G...",48.861147,2.338028
2,National Museum of China,"6,765,000 (2023)[3]",Beijing,China,6765000,6765000,,,,"中国国家博物馆, 16, 东长安街, 东华门街道, 首都功能核心区, 东城区, 北京市, 1...",39.903746,116.39539
3,British Museum,"5,820,860 (2023)[4]",London,United Kingdom,5820860,5820860,,,,"British Museum, Great Russell Street, Saint Gi...",51.519294,-0.128018
4,Natural History Museum,"5,688,608 (2023)[5]",London,United Kingdom,5688608,5688608,,,,"Natural History Museum, Cromwell Road, Brompto...",51.496511,-0.176002
5,Metropolitan Museum of Art,"5,400,000 (2023)[6]",New York City,United States,5400000,5400000,,,,"The Metropolitan Museum of Art, 1000, 5th Aven...",40.77944,-73.963382
...,...,...,...,...,...,...,...,...,...,...,...,...
71,Museo Nazionale di Castel Sant'Angelo,"1,321,834[58]",Rome,Italy,1321834,1321834,,,,"Museo Nazionale di Castel Sant’Angelo, 50, Lun...",41.90308,12.466181
73,Palacio de Cristal del Retiro,"1,318,823[b][41]",Madrid,Spain,1318823,1318823,,,,Invernadero del Palacio de Cristal de Arganzue...,40.393491,-3.700241
75,Scottish National Gallery,"1,277,230[41]",Edinburgh,United Kingdom,1277230,1277230,,,,"National Gallery, The Mound, Old Town, City of...",55.950881,-3.195619
76,Museo Reina Sofía,"1,253,183[c][41]",Madrid,Spain,1253183,1253183,,,,"Museo Nacional Centro de Arte Reina Sofía, 52,...",40.40805,-3.694422


In [55]:
df.drop('full_address', axis=1, inplace=True)

In [54]:
import folium

In [58]:
# initiate a folium map
idx = 0
m = folium.Map(location = [df.loc[idx, 'Latitude'], df.loc[idx, 'Longitude']], zoom_start = 10)

In [60]:
for index, row in df.iterrows():

  # Info to display in popup
  popup_info = f"""
  Name: {row['name']}<br><br>
  Full Address: {row['Full address']}<br><br>
  Latitude: {row['Latitude']}<br><br>
  Longitude: {row['Longitude']}
  """

  # Creating a marker
  folium.Marker(
      location = [row['Latitude'], row['Longitude']],
      popup = folium.Popup(popup_info, max_width = 300)
  ).add_to(m)


In [61]:
m