<a href="https://colab.research.google.com/github/Reben80/Data110-22016/blob/main/Week12.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import geopandas as gpd
import matplotlib.pyplot as plt
import plotly.express as px


In [None]:
# Pull the world-countries table directly from the course's GitHub repository.
world_countries_data = pd.read_csv(
    filepath_or_buffer='https://raw.githubusercontent.com/Reben80/Data110-22016/refs/heads/main/dataset/world_countries.csv',
)

# Preview the leading rows to get a feel for the table's layout.
world_countries_data.head()


In [None]:

# Lower-case the three columns used for mapping; all other columns are kept as-is.
mapping_data = world_countries_data.rename(
    columns={"Country": "country", "GDP": "gdp", "Population": "population"}
)

# Draw one world choropleth per metric. The title is built from the metric's
# display name so it cannot drift from the column that is actually coloured.
for metric_column, metric_title in (("gdp", "GDP"), ("population", "Population")):
    fig = px.choropleth(
        mapping_data,
        locations="country",
        locationmode="country names",  # rows are matched to map shapes by country name
        color=metric_column,
        hover_name="country",
        title=f"Choropleth Map of {metric_title} by Country",
        color_continuous_scale=px.colors.sequential.Viridis,
    )

    # Show the interactive map
    fig.show()


In [None]:


# Alias for the world-countries table; later cells read it through `data`.
data = world_countries_data

# Total GDP per region, with 'Region' and 'GDP' as ordinary columns for plotting.
gdp_by_region = data.groupby('Region')['GDP'].sum().reset_index()

# Horizontal bar chart: regions run along the y-axis, GDP totals along the x-axis,
# so the axis labels follow that orientation.
plt.figure(figsize=(12, 6))
plt.barh(gdp_by_region['Region'], gdp_by_region['GDP'], color='skyblue')
plt.xticks(rotation=45, ha='right')
plt.xlabel('Total GDP')
plt.ylabel('Region')
plt.title('Total GDP by Region')
plt.tight_layout()
plt.show()

In [None]:
# Simple (unweighted) mean of 'Pop. Density' over the rows of each region,
# flattened so that 'Region' is a regular column.
density_by_region = (
    data
    .groupby('Region')['Pop. Density']
    .mean()
    .reset_index()
)

# Vertical bar chart of the regional averages.
plt.figure(figsize=(12, 6))
plt.bar(
    x=density_by_region['Region'],
    height=density_by_region['Pop. Density'],
    color='coral',
)
plt.title('Average Population Density by Region')
plt.xlabel('Region')
plt.ylabel('Average Population Density')
plt.xticks(rotation=45, ha='right')  # slant the region names
plt.tight_layout()
plt.show()


In [None]:
# Sum the population of every region into a flat 'Region' / 'Population' frame.
population_by_region = (
    data
    .groupby('Region')['Population']
    .sum()
    .reset_index()
)

# Pie chart of each region's share of the total.
plt.figure(figsize=(8, 8))
plt.pie(
    population_by_region['Population'],
    labels=population_by_region['Region'],
    autopct='%1.1f%%',  # print each share with one decimal place
    startangle=140,
)
plt.axis('equal')  # equal axis scaling
plt.title('Population Distribution by Region')
plt.show()


In [None]:
# Regional population totals, with 'Region' restored as a regular column.
regional_totals = world_countries_data.groupby('Region')['Population'].sum()
population_by_region = regional_totals.reset_index()

# Vertical bar chart of the totals.
plt.figure(figsize=(12, 6))
plt.bar(
    x=population_by_region['Region'],
    height=population_by_region['Population'],
    color='skyblue',
)
plt.xlabel('Region', fontsize=12)
plt.ylabel('Total Population', fontsize=12)
plt.title('Population Distribution by Region', fontsize=16)
plt.xticks(rotation=45, fontsize=10)
plt.tight_layout()
plt.show()


In [None]:
# Order the regions by total population. Ascending order is intentional: barh
# draws the first row at the bottom, so the largest region ends up at the top
# and the chart reads in descending order from top to bottom.
# Relies on `population_by_region` built in the preceding cell.
population_by_region = population_by_region.sort_values(by='Population', ascending=True)

# Horizontal bars: population totals on the x-axis, regions on the y-axis,
# so the axis labels follow that orientation.
plt.figure(figsize=(12, 6))
plt.barh(population_by_region['Region'], population_by_region['Population'], color='skyblue')
plt.title('Population Distribution by Region (Descending)', fontsize=16)
plt.xlabel('Total Population', fontsize=12)
plt.ylabel('Region', fontsize=12)
plt.xticks(rotation=45, fontsize=10)
plt.tight_layout()
plt.show()


In [None]:
!pip install squarify

In [None]:
import squarify
import matplotlib.pyplot as plt

# Number of countries shown in the treemap; the chart title is derived from it
# so the two cannot disagree.
treemap_top_n = 15

# Keep rows that have both a country name and a population, express the
# population in millions, then keep only the most populous countries.
treemap_data = world_countries_data[['Country', 'Population']].dropna()
treemap_data['Population'] = treemap_data['Population'] / 1e6  # Convert to millions
treemap_data = treemap_data.nlargest(treemap_top_n, 'Population')

# Fill colours for the rectangles: 15 entries, the last five repeating the first five.
colors = ['#1f77b4', '#ff7f0e', '#2ca02c', '#d62728', '#9467bd',
          '#8c564b', '#e377c2', '#7f7f7f', '#bcbd22', '#17becf','#1f77b4', '#ff7f0e', '#2ca02c', '#d62728', '#9467bd']

# Create the plot
plt.figure(figsize=(12, 8))
squarify.plot(
    sizes=treemap_data['Population'],
    # Two-line label: country name, then population in millions with thousands separators.
    label=[f"{country}\n{pop:,.1f}M" for country, pop in
           zip(treemap_data['Country'], treemap_data['Population'])],
    color=colors,
    alpha=0.9,
    pad=False  # padding between rectangles is disabled
)

plt.title(f'Top {treemap_top_n} Countries by Population (Millions)', fontsize=14, pad=20)
plt.axis('off')
plt.tight_layout()
plt.show()

In [None]:
import squarify


# Regional population totals, largest region first.
population_by_region = (
    world_countries_data
    .groupby('Region')['Population']
    .sum()
    .reset_index()
)
population_by_region = population_by_region.sort_values('Population', ascending=False)

# Fill colours handed to squarify for the rectangles.
colors = ['#2ecc71', '#3498db', '#9b59b6', '#e74c3c', '#f1c40f', '#1abc9c']

# Two-line labels: region name, then its population in billions (one decimal).
region_labels = [
    f"{region}\n{pop:.1f}B"
    for region, pop in zip(population_by_region['Region'], population_by_region['Population'] / 1e9)
]

# Draw the treemap; rectangle areas are the populations in billions.
plt.figure(figsize=(12, 8))
squarify.plot(
    sizes=population_by_region['Population'] / 1e9,
    label=region_labels,
    color=colors,
    alpha=0.9,
    pad=False,  # padding between rectangles is disabled
)

plt.axis('off')
plt.title('Population Distribution by Region (Billions)', fontsize=14, pad=20)
plt.tight_layout()
plt.show()

In [None]:
import plotly.graph_objects as go
import pandas as pd

# Number of countries to show per continent (the most populous ones).
# The diagram title below is derived from this value.
top_n = 5

# Work on a copy so the source table is left untouched.
df = world_countries_data.copy()
total_population = df['Population'].sum()  # not referenced by the diagram below

# Sankey inputs: three parallel lists describing the links, plus the node labels.
source = []  # index of the node each link starts from
target = []  # index of the node each link ends at
values = []  # population carried by each link
labels = ['World']  # node 0 is the root

# Continent nodes take indices 1..len(continents), in the order unique() returns them.
continents = df['Region'].unique()
continent_dict = {continent: i + 1 for i, continent in enumerate(continents)}
labels.extend(continents)

# World -> continent links, weighted by the continent's total population.
for continent in continents:
    source.append(0)  # World index
    target.append(continent_dict[continent])
    values.append(df[df['Region'] == continent]['Population'].sum())

# Continent -> country links for the top_n most populous countries of each
# continent. Country nodes are numbered consecutively after the continent nodes.
# Only the top_n countries get an outgoing link, so a continent's outflow can be
# smaller than its inflow from 'World'.
current_idx = len(continents) + 1
for continent in continents:
    top_countries = df[df['Region'] == continent].nlargest(top_n, 'Population')

    for country_name, country_population in zip(top_countries['Country'], top_countries['Population']):
        source.append(continent_dict[continent])
        target.append(current_idx)
        values.append(country_population)
        labels.append(f"{country_name}\n{country_population/1e6:.1f}M")
        current_idx += 1

# Node colours by level: one for World, one shade for continents, one for countries.
country_node_count = len(labels) - len(continents) - 1
node_colors = ['#2ecc71'] + \
              ['#3498db'] * len(continents) + \
              ['#e74c3c'] * country_node_count

# Create the Sankey diagram
fig = go.Figure(data=[go.Sankey(
    node = dict(
        pad = 15,
        thickness = 20,
        line = dict(color = "black", width = 0.5),
        label = labels,
        color = node_colors
    ),
    link = dict(
        source = source,
        target = target,
        value = values,
        color = 'rgba(135, 206, 235, 0.4)'  # Light blue with transparency
    )
)])

# Update layout
fig.update_layout(
    title=dict(
        text=f"World Population Flow: World → Continents → Top {top_n} Countries per Continent",
        x=0.5,
        y=0.95
    ),
    font_size=12,
    height=800,
    width=1200
)

fig.show()

Alternatively, you can build a Sankey diagram interactively with SankeyMATIC: https://sankeymatic.com/build/

In [None]:


# Hand-entered example data for ten U.S. states.
# 'Population' holds raw head-counts (not millions); 'GDP (Billion $)' is in billions of dollars.
state_data = {
    'State': ['California', 'Texas', 'Florida', 'New York', 'Pennsylvania', 'Illinois', 'Ohio', 'Georgia', 'North Carolina', 'Michigan'],
    'Population': [39538223, 29145505, 21538187, 20201249, 13002700, 12812508, 11799448, 10711908, 10439388, 10077331],
    'GDP (Billion $)': [3310, 1970, 1171, 1680, 846, 890, 683, 683, 666, 608]
}

# Convert the data to a DataFrame
state_df = pd.DataFrame(state_data)

# Bar chart of state populations. The axis label does not claim a unit of
# millions because the plotted values are raw head-counts.
fig_population = px.bar(
    state_df,
    x='State',
    y='Population',
    title='Population of Top 10 U.S. States',
    labels={'Population': 'Population', 'State': 'U.S. State'},
    text_auto=True,
    color='Population',
    color_continuous_scale='Blues'
)
fig_population.show()

# Bubble chart of state GDP: GDP sets both the vertical position and the bubble size.
fig_gdp = px.scatter(
    state_df,
    x='State',
    y='GDP (Billion $)',
    size='GDP (Billion $)',
    color='State',
    title='GDP of Top 10 U.S. States',
    labels={'GDP (Billion $)': 'GDP (in Billion $)', 'State': 'U.S. State'},
    size_max=60
)
fig_gdp.show()


In [None]:
import pandas as pd
import plotly.express as px

# Read the U.S. states table from the course's GitHub repository.
us_states_population = pd.read_csv(
    filepath_or_buffer='https://raw.githubusercontent.com/Reben80/Data110-22016/refs/heads/main/dataset/US_State_abbrev.csv',
)

# Preview the leading rows.
us_states_population.head()


In [None]:
# Print a summary of the frame: columns, non-null counts, dtypes and memory usage.
us_states_population.info()

In [None]:


# State-level population choropleth, restricted to the U.S. map.
fig = px.choropleth(
    data_frame=us_states_population,
    # Rows are matched to states through their abbreviations.
    locations='State Abbreviation',
    locationmode="USA-states",
    scope="usa",
    # Shade each state by its 'Population 2024' value on the Viridis scale.
    color='Population 2024',
    color_continuous_scale="Viridis",
    # Shorter display name for the coloured column.
    labels={'Population 2024': 'Population'},
    title="US States Population in 2024",
)

# Render the figure.
fig.show()



In [None]:
import json
from urllib.request import urlopen

import pandas as pd
import plotly.express as px

# Download the county GeoJSON from the plotly datasets repository.
with urlopen('https://raw.githubusercontent.com/plotly/datasets/master/geojson-counties-fips.json') as response:
    counties = json.load(response)

# Unemployment table; 'fips' is read as text rather than as a number
# (presumably to preserve leading zeros — TODO confirm against the data).
df = pd.read_csv(
    "https://raw.githubusercontent.com/plotly/datasets/master/fips-unemp-16.csv",
    dtype={"fips": str},
)

# Choropleth keyed on 'fips', coloured by 'unemp' with the colour range fixed to 0-12.
fig = px.choropleth(
    data_frame=df,
    geojson=counties,
    locations='fips',
    color='unemp',
    color_continuous_scale="Viridis",
    range_color=(0, 12),
    scope="usa",
    labels={'unemp':'unemployment rate'},
)

# Strip all four margins so the map fills the figure.
fig.update_layout(margin=dict(r=0, t=0, l=0, b=0))
fig.show()

In [None]:


# Keep only the rows whose FIPS code starts with '24' (Maryland's state FIPS code).
# Relies on `df` and `counties` loaded in the preceding cell.
df_maryland = df[df['fips'].str.startswith('24')]

# Create the choropleth map for Maryland
fig = px.choropleth(
    df_maryland,
    geojson=counties,
    locations='fips',
    color='unemp',
    color_continuous_scale="Viridis",
    range_color=(0, 12),
    scope="usa",
    labels={'unemp': 'Unemployment Rate'}
)

# Zoom the view to the plotted locations (the Maryland rows) and hide the base map layers.
fig.update_geos(fitbounds="locations", visible=False)

# Trim the side and bottom margins but leave room at the top for the title;
# a top margin of 0 leaves no space for it.
fig.update_layout(
    margin={"r": 0, "t": 40, "l": 0, "b": 0},
    title="Unemployment Rate in Maryland Counties"
)
fig.show()


In [None]:


# Minimal choropleth built from plain lists instead of a DataFrame:
# three state abbreviations, each paired with its own colour value.
fig = px.choropleth(
    locations=["MD", "CA", "FL"],
    color=[1,2,3],
    locationmode="USA-states",
    scope="usa",
)
fig.show()