In [1]:
from bs4 import BeautifulSoup
import requests
import pandas as pd
import re
import json

In [2]:
# Download the Wikipedia page that lists World Heritage Sites by country.
url='https://en.wikipedia.org/wiki/World_Heritage_Sites_by_country'
# A timeout keeps this cell from hanging indefinitely on a stalled connection.
page=requests.get(url, timeout=30)
# Stop here on an HTTP error (4xx/5xx) instead of parsing an error page as if
# it contained the heritage table.
page.raise_for_status()
# Force UTF-8 decoding of the response body before reading page.text.
page.encoding='utf-8'
# Parse the HTML so later cells can query the table rows.
soup=BeautifulSoup(page.text, 'html')

In [3]:
# Read the column headings from the first <tr> found on the page.
table = soup.find_all('tr')
header_cells = table[0].find_all('th')
world_titles = [cell.text.strip() for cell in header_cells]
# Remove any bracketed text (e.g. "[1]"-style markers) from each heading.
titles = [re.sub(r'\[.*?\]', '', heading) for heading in world_titles]
print(titles)

['Country', 'Cultural sites', 'Natural sites', 'Mixed sites', 'Total sites', 'Shared sites', 'UNESCO region']


In [4]:
# Create an empty frame whose column labels are the upper-cased headings.
columns = list(map(str.upper, titles))
df = pd.DataFrame(columns=columns)
df

Unnamed: 0,COUNTRY,CULTURAL SITES,NATURAL SITES,MIXED SITES,TOTAL SITES,SHARED SITES,UNESCO REGION


In [5]:
# Collect every data row of the heritage table into `df`.
column_data=soup.find_all('tr')
# The first <tr> on the page is the header row read earlier; its enclosing
# <table> is the one to scrape, so rows of any other table are ignored.
heritage_table = column_data[0].find_parent('table')

records = []
# Walk every row after the header instead of a hard-coded row count, so the
# cell keeps working when rows are added to or removed from the table.
for row_data in heritage_table.find_all('tr')[1:]:
    # Take only the row's own <th>/<td> cells; this replaces filtering out the
    # whitespace text nodes between cells by their position.
    cells = row_data.find_all(['th', 'td'], recursive=False)
    individual_data = [cell.text.strip() for cell in cells]
    # Remove any bracketed text (e.g. "[1]"-style markers) from each value.
    individual_row_data = [re.sub(r'\[.*?\]', '', value) for value in individual_data]
    # Rows whose cell count does not match the columns cannot be placed in the
    # frame, so they are skipped rather than raising.
    if len(individual_row_data) != len(df.columns):
        continue
    records.append(individual_row_data)

# Build the frame in one step: re-running this cell no longer appends
# duplicate rows, and it avoids growing the frame one row at a time.
df = pd.DataFrame(records, columns=df.columns)
df.head(5)

Unnamed: 0,COUNTRY,CULTURAL SITES,NATURAL SITES,MIXED SITES,TOTAL SITES,SHARED SITES,UNESCO REGION
0,Afghanistan,2,,,2,,Asia and the Pacific
1,Albania,2,1.0,1.0,4,2.0,Europe and North America
2,Algeria,6,,1.0,7,,Arab States
3,Andorra,1,,,1,,Europe and North America
4,Angola,1,,,1,,Africa


In [6]:
# Order the rows by UNESCO REGION and renumber the index from zero.
df = df.sort_values('UNESCO REGION', ignore_index=True)
df.head()

Unnamed: 0,COUNTRY,CULTURAL SITES,NATURAL SITES,MIXED SITES,TOTAL SITES,SHARED SITES,UNESCO REGION
0,Zimbabwe,3.0,2.0,,5,1.0,Africa
1,Central African Republic,,2.0,,2,1.0,Africa
2,Mozambique,1.0,,,1,,Africa
3,Seychelles,,2.0,,2,,Africa
4,Senegal,5.0,2.0,,7,1.0,Africa


In [7]:
# Write the sorted table to disk as CSV, leaving out the index column.
csv_path = 'C:/Users/PC/Desktop/Projects/Cultural Heritage/Cultural_Heritage_Count.csv'
df.to_csv(csv_path, index=False)

## Pick the UNESCO REGION, COUNTRY & TOTAL SITES columns 

In [8]:
# Keep only the region, country and total-count columns.
site_columns = ['UNESCO REGION', 'COUNTRY', 'TOTAL SITES']
SiteCount = df[site_columns]
SiteCount.head()

Unnamed: 0,UNESCO REGION,COUNTRY,TOTAL SITES
0,Africa,Zimbabwe,5
1,Africa,Central African Republic,2
2,Africa,Mozambique,1
3,Africa,Seychelles,2
4,Africa,Senegal,7


In [9]:
# List the distinct values in the UNESCO REGION column.
region_column = SiteCount['UNESCO REGION']
region_column.unique()

array(['Africa', 'Arab States', 'Asia and the Pacific',
       'Europe and North America', 'Latin America & the Caribbean'],
      dtype=object)

In [10]:
# Group the countries by UNESCO region and collect each region's rows as a
# list of {"COUNTRY": ..., "TOTAL SITES": ...} records.
# Iterating over the groups directly avoids `groupby(...).apply(...)`, which
# newer pandas releases warn about when the applied function is also handed
# the grouping column.
grouped_totalsites = pd.DataFrame(
    [
        {
            'UNESCO REGION': region,
            'children': members[['COUNTRY', 'TOTAL SITES']].to_dict(orient='records'),
        }
        for region, members in SiteCount.groupby('UNESCO REGION')
    ],
    columns=['UNESCO REGION', 'children'],
)

# Define colors for each region (you can customize these colors as needed).
# A region missing from this mapping raises KeyError when the JSON is built.
colors = {
    'Africa': '#f58321',
    'Arab States': '#ef1621',
    'Asia and the Pacific': '#77bc45',
    'Europe and North America': '#4aaaea',
    'Latin America & the Caribbean': '#00acad'
}

# Create the final JSON structure: world -> regions -> country records
json_data = {
    "name": "world",
    "children": [
        {
            "name": region,
            "color": colors[region],
            "children": children
        }
        for region, children in zip(
            grouped_totalsites['UNESCO REGION'], grouped_totalsites['children']
        )
    ]
}

# Define the file path where you want to save the JSON file
file_path = r'C:\Users\PC\Desktop\Projects\Cultural Heritage\json_data.json'

# Serialize once so the saved file and the printed text are guaranteed to match.
json_text = json.dumps(json_data, indent=4)

# Save the JSON text to a file; the encoding is explicit so the result does
# not depend on the platform's default text encoding.
with open(file_path, 'w', encoding='utf-8') as f:
    f.write(json_text)


# Print the JSON structure
print(json_text)


{
    "name": "world",
    "children": [
        {
            "name": "Africa",
            "color": "#f58321",
            "children": [
                {
                    "COUNTRY": "Zimbabwe",
                    "TOTAL SITES": "5"
                },
                {
                    "COUNTRY": "Central African Republic",
                    "TOTAL SITES": "2"
                },
                {
                    "COUNTRY": "Mozambique",
                    "TOTAL SITES": "1"
                },
                {
                    "COUNTRY": "Seychelles",
                    "TOTAL SITES": "2"
                },
                {
                    "COUNTRY": "Senegal",
                    "TOTAL SITES": "7"
                },
                {
                    "COUNTRY": "Rwanda",
                    "TOTAL SITES": "2"
                },
                {
                    "COUNTRY": "C\u00f4te d'Ivoire",
                    "TOTAL SITES": "5"
                },
 