Lets start with the dataset and geodata.

First loading the dataframe and have a look.

In [1]:
import pandas as pd

df = pd.read_csv("/kaggle/input/geodata-for-exercises/cities_world.tsv", sep='\t')
print(df.head())

             City  Latitude  Longitude      Country  Population  Land_area  \
0  Tokyo/Yokohama   35.6895   139.6917        Japan    33200000       6993   
1  New York Metro   40.7128   -74.0059          USA    17800000       8683   
2       Sao Paulo  -23.5505   -46.6333       Brazil    17700000       1968   
3   Seoul/Incheon   37.5665   126.9780  South Korea    17500000       1049   
4     Mexico City   23.6345  -102.5528       Mexico    17400000       2072   

   Density  Number  
0     4750       1  
1     2050       2  
2     9000       3  
3    16700       4  
4     8400       5  


I also like to add the continent as an information to the dataset.


In [2]:
pip install pycountry_convert

Collecting pycountry_convert
  Downloading pycountry_convert-0.7.2-py3-none-any.whl.metadata (7.2 kB)
Collecting pprintpp>=0.3.0 (from pycountry_convert)
  Downloading pprintpp-0.4.0-py2.py3-none-any.whl.metadata (7.9 kB)
Collecting pycountry>=16.11.27.1 (from pycountry_convert)
  Downloading pycountry-23.12.11-py3-none-any.whl.metadata (12 kB)
Collecting pytest-mock>=1.6.3 (from pycountry_convert)
  Downloading pytest_mock-3.14.0-py3-none-any.whl.metadata (3.8 kB)
Collecting pytest-cov>=2.5.1 (from pycountry_convert)
  Downloading pytest_cov-5.0.0-py3-none-any.whl.metadata (27 kB)
Collecting repoze.lru>=0.7 (from pycountry_convert)
  Downloading repoze.lru-0.7-py3-none-any.whl.metadata (1.1 kB)
Collecting coverage>=5.2.1 (from coverage[toml]>=5.2.1->pytest-cov>=2.5.1->pycountry_convert)
  Downloading coverage-7.5.1-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (8.2 kB)
Downloading pycountry_convert-0.7.2-py3-no

In [3]:
import pandas as pd
from tqdm import tqdm
import numpy as np  # Import numpy for handling NaN values
import pycountry_convert as pc

### Function to get continent name from country name
def get_continent(country_name: str) -> str:
    try:
        # Convert country name to ISO-2 country code
        country_code = pc.country_name_to_country_alpha2(country_name, cn_name_format="default")
        # Convert ISO-2 country code to continent code
        continent_code = pc.country_alpha2_to_continent_code(country_code)
        # Convert continent code to continent name
        continent_name = pc.convert_continent_code_to_continent_name(continent_code)
    except Exception as e:
        print(f"Error fetching continent for {country_name}: {e}")
        continent_name = 'Unknown'
    return continent_name

df = pd.read_csv('/kaggle/input/geodata-for-exercises/cities_world.tsv', sep='\t')

# Replace NaN values with 'Unknown' for City and Country
df['City'].fillna('Unknown', inplace=True)
df['Country'].fillna('Unknown', inplace=True)

# Add a new column for continent
df['Continent'] = 'Unknown'

# Initialize tqdm progress bar
with tqdm(total=len(df), desc="Processing rows") as pbar:
    # Process each row in the DataFrame
    for index, row in df.iterrows():
        country = row['Country']
        
        if country != 'Unknown':
            continent = get_continent(country)
            print(f"Country: {country}, Continent: {continent}")  # Debug statement
            # Add location information to the DataFrame
            df.at[index, 'Continent'] = continent
        
        # Update the tqdm progress bar
        pbar.update(1)

print(df.head())

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['City'].fillna('Unknown', inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['Country'].fillna('Unknown', inplace=True)
Processing rows: 100%|██████████| 125/125 [00:00<00:00, 1925.53it/s]

Country: Japan, Continent: Asia
Country: USA, Continent: North America
Country: Brazil, Continent: South America
Country: South Korea, Continent: Asia
Country: Mexico, Continent: North America
Country: Japan, Continent: Asia
Country: Philippines, Continent: Asia
Country: India, Continent: Asia
Country: India, Continent: Asia
Country: Indonesia, Continent: Asia
Country: Nigeria, Continent: Africa
Country: India, Continent: Asia
Country: Egypt, Continent: Africa
Country: USA, Continent: North America
Country: Argentina, Continent: South America
Country: Brazil, Continent: South America
Country: Russia, Continent: Europe
Country: China, Continent: Asia
Country: Pakistan, Continent: Asia
Country: France, Continent: Europe
Country: Turkey, Continent: Asia
Country: Japan, Continent: Asia
Country: China, Continent: Asia
Country: USA, Continent: North America
Error fetching continent for UK: argument of type 'functools._lru_cache_wrapper' is not iterable
Country: UK, Continent: Unknown
Country




df.to_csv('/kaggle/working/cities_world_with_continent.csv', index=False)

seems good.

Now, do some sunburst charts and a treemap.

In [4]:
gr_cat = df[["Continent",
             "Country"]].groupby(["Continent",
                                       "Country"], as_index=False).size()

In [5]:
import plotly.express as px

fig = px.sunburst(gr_cat, width=1280, height=800,
                  path=["Continent", "Country"], values="size",
                  color="Continent",
                  title="<span style='font-size:18px;'><b>Continents and Countries in the dataset</b></span><b></b>"
                  )
fig.update_layout(font_size=10, margin=dict(l=10, r=10, t=30, b=50))
fig.update_traces(textinfo="label+percent parent")
fig.show()

In [6]:
gr2_cat = df.groupby(['Continent', 'Country'], as_index=False).agg({'Population': 'sum'})

In [7]:
import plotly.express as px

# Ensure your DataFrame has the necessary columns
# gr_cat should be the DataFrame containing the columns "Continent", "Country", "City", and "Population"

fig = px.sunburst(gr2_cat, 
                  width=1280, 
                  height=800,
                  path=["Continent", "Country"],  # Include "City" in the path
                  values="Population",  # Use "Population" for the values
                  color="Continent",
                  title="<span style='font-size:18px;'><b>Continents and Countries by Population</b></span><b></b>"
                  )

fig.update_layout(font_size=10, margin=dict(l=10, r=10, t=30, b=50))
fig.update_traces(textinfo="label+percent parent")
fig.show()


In [8]:
import pandas as pd
import plotly.express as px


gr3_cat = df.groupby(["Country", "City", "Continent"], as_index=False).agg({'Population': 'sum'})

fig = px.sunburst(gr3_cat, 
                  width=1280, 
                  height=800,
                  path=["Country", "City"],  # Include "Country" and "City" in the path
                  values="Population",  # Use "Population" for the values
                  color="Continent",  # Use "Continent" for color
                  title="<span style='font-size:18px;'><b>Continents, Countries, and Cities by Population</b></span><b></b>"
                  )

fig.update_layout(font_size=10, margin=dict(l=10, r=10, t=30, b=50))
fig.update_traces(textinfo="label+percent parent")
fig.show()


Hard to read

Let's do the continents by population (countries and cities) individually.

Starting with Europe

In [9]:
import pandas as pd
import plotly.express as px


europe_df = df[df['Continent'] == 'Europe']

# Group by Country and City, summing the Population
gr3_cat = europe_df.groupby(["Country", "City"], as_index=False).agg({'Population': 'sum'})

fig = px.sunburst(gr3_cat, 
                  width=1280, 
                  height=800,
                  path=["Country", "City"],  # Include "Country" and "City" in the path
                  values="Population",  # Use "Population" for the values
                  color="Country",  # Use "Country" for color
                  title="<span style='font-size:18px;'><b>Countries and Cities in Europe by Population</b></span><b></b>"
                  )

fig.update_layout(font_size=10, margin=dict(l=10, r=10, t=30, b=50))
fig.update_traces(textinfo="label+percent parent")
fig.show()


In [10]:
import pandas as pd
import plotly.express as px


north_america_df = df[df['Continent'] == 'North America']

# Group by Country and City, summing the Population
gr4_cat = north_america_df.groupby(["Country", "City"], as_index=False).agg({'Population': 'sum'})

fig = px.sunburst(gr4_cat, 
                  width=1280, 
                  height=800,
                  path=["Country", "City"],  
                  values="Population",  
                  color="Country",  
                  title="<span style='font-size:18px;'><b>Countries and Cities in North America by Population</b></span><b></b>"
                  )

fig.update_layout(font_size=10, margin=dict(l=10, r=10, t=30, b=50))
fig.update_traces(textinfo="label+percent parent")
fig.show()


In [11]:
fig = px.treemap(gr_cat, width=1280, height=800,
                 path=['Continent', 'Country'], values='size',
                 color='Continent')
fig.update_traces(textinfo="label+percent parent")
fig.show()