## Reading and exploring data

In [1]:
import numpy as np
import pandas as pd

# Specifying columns to read
columns_to_read = ['Country', 'Co2-Emissions', 'Life expectancy', 'Population', 'GDP', 'Physicians per thousand', 'CPI', 'Gasoline Price', 'Abbreviation', 'Unemployment rate','Latitude', 'Longitude']
continents_to_read = ['name', 'alpha-2', 'region']

# Read the CSV file with selected columns
countries = pd.read_csv('./world-data-2023.csv', usecols = columns_to_read)
continents = pd.read_csv('./continents2.csv', usecols = continents_to_read)

# Merge two tables on 'Abbreviation' and 'alpha-2'
data = pd.merge(countries, continents, left_on=['Abbreviation'], right_on=['alpha-2'], how='left')

# Drop redundant columns
data.drop(['name', 'alpha-2'], axis=1, inplace=True)
data.rename(columns={'region': 'Region'}, inplace=True)

data = data.dropna()

# Convert 'Population' to numeric (remove commas)
data['Population'] = data['Population'].str.replace(',', '').astype(int)
# Convert 'GDP' to numeric (remove $ and commas)
# data['GDP'] = data['GDP'].str.replace('$', '').str.replace(',', '').astype(int)
data['GDP'] = data['GDP'].str.replace('$', '').str.replace(',', '').astype('Int64')
data['CPI'] = data['CPI'].str.replace(',', '').astype('float64')

# Convert 'Co2-Emissions' to numeric (remove commas)
data['Co2-Emissions'] = data['Co2-Emissions'].str.replace(',', '').astype(int)
# Convert 'Gasoline Price' to numeric (remove $ and commas)
data['Gasoline Price'] = data['Gasoline Price'].str.replace('$', '').str.replace(',', '').astype(float)

# Calculate CO2 emissions per million people
data['Co2-Emissions per million people'] = (data['Co2-Emissions'] / data['Population']) * 1000000
# Calculate GDP per capita
data['GDP per capita'] = data['GDP'] / data['Population']


# Convert 'Unemployment rate' to numeric (remove % and convert to float)
data['Unemployment rate'] = data['Unemployment rate'].str.replace('%', '').astype(float)

# Viewing sample data
data.head()

Unnamed: 0,Country,Abbreviation,Co2-Emissions,CPI,Gasoline Price,GDP,Life expectancy,Physicians per thousand,Population,Unemployment rate,Latitude,Longitude,Region,Co2-Emissions per million people,GDP per capita
0,Afghanistan,AF,8672,149.9,0.7,19101353833,64.5,0.28,38041754,11.12,33.93911,67.709953,Asia,227.960046,502.115487
1,Albania,AL,4536,119.05,1.36,15278077447,78.5,1.2,2854191,12.33,41.153332,20.168331,Europe,1589.241925,5352.857411
2,Algeria,DZ,150006,151.36,0.28,169988236398,76.7,1.72,43053054,11.7,28.033886,1.659626,Africa,3484.212758,3948.343279
4,Angola,AO,34693,261.73,0.97,94635415870,60.8,0.21,31825295,6.89,-11.202692,17.873887,Africa,1090.107727,2973.59116
6,Argentina,AR,201348,232.75,1.1,449663446954,76.5,3.96,44938712,9.79,-38.416097,-63.616672,Americas,4480.502245,10006.148974


## Altair

### Main System Visualisations

In [2]:
attributes = ['Population', 'GDP', 'GDP per capita', 'Co2-Emissions', 'Gasoline Price', 'Life expectancy', 'Unemployment rate', 'Physicians per thousand', 'CPI']
tooltip=['Region', 'Population', 'Life expectancy', 'Unemployment rate, GDP per capita', 'Gasoline Price', 'CPI']
countries_data = data.copy()

In [3]:
import altair as alt
import pandas as pd

# Load dataset
df = countries_data

# Dropdown for selecting the attribute
attribute_dropdown = alt.binding_select(options=attributes, name="Select Attribute: ")
attribute_selection = alt.param(name="attribute", bind=attribute_dropdown, value="Population")

# Radio button for sorting order
sort_radio = alt.binding_radio(options=["Ascending", "Descending"], name="Sort Order: ")
sort_selection = alt.param(name="sort_order", bind=sort_radio, value="Descending")

# Slider for the number of displayed countries
num_countries_slider = alt.binding_range(min=1, max=len(df), step=1, name="Max Display: ")
num_countries_param = alt.param(name="num_countries", value=55, bind=num_countries_slider)

# Slider for filtering by life expectancy
life_expectancy_slider = alt.binding_range(min=df['Life expectancy'].min(), max=df['Life expectancy'].max(), step=1, name="Life Expectancy: ")
life_expectancy_param = alt.param(name="life_expectancy", value=df['Life expectancy'].min(), bind=life_expectancy_slider)

# Slider for filtering by unemployment rate
unemployment_rate_slider = alt.binding_range(min=df['Unemployment rate'].min(), max=df['Unemployment rate'].max(), step=1, name="Unemployment Rate: ")
unemployment_rate_param = alt.param(name="unemployment_rate", value=df['Unemployment rate'].max(), bind=unemployment_rate_slider)

# Slider for filtering by GDP per capita
gdp_per_capita_slider = alt.binding_range(min=df['GDP per capita'].min(), max=df['GDP per capita'].max(), step=1, name="GDP per Capita: ")
gdp_per_capita_param = alt.param(name="gdp_per_capita", value=df['GDP per capita'].min(), bind=gdp_per_capita_slider)

# Multi-select for region filtering via the legend
region_selection = alt.selection_multi(fields=['Region'], bind='legend')

# Selection for countries (linked to both charts)
country_selection = alt.selection_multi(fields=['Country'], empty="all")

# Create an equal-height stack by assigning each country the same fraction
df["EqualHeight"] = 1  # Assign a fixed height to each country

# Stacked bar chart with equal segment sizes
stacked_chart = alt.Chart(df).transform_filter(
    (alt.datum['Life expectancy'] >= life_expectancy_param) & 
    (alt.datum['Unemployment rate'] <= unemployment_rate_param) & 
    (alt.datum['GDP per capita'] >= gdp_per_capita_param)  # Filter by life expectancy, unemployment rate, and GDP per capita
).mark_bar().encode(
    x=alt.X("Region:N", title="Region", axis=alt.Axis(labelAngle=-45)),  # Rotate region labels
    y=alt.Y("EqualHeight:Q", stack="normalize", title="Countries (Equal Sized Segments)"),
    color=alt.condition(country_selection, alt.value("darkred"), alt.Color("Country:N", legend=None)),  # Highlight selected country
    stroke=alt.condition(country_selection, alt.value("black"), alt.value(None)),  # Add a stroke to selected countries
    opacity=alt.condition(region_selection & country_selection, alt.value(1), alt.value(0.3)),  # Lower opacity for unselected
    strokeWidth=alt.condition(country_selection, alt.value(2), alt.value(0)),  # Make stroke noticeable only when selected
    tooltip=["Country", "Region"],
).properties(
    width=1000,  # Ensure enough width
    height=600
).add_selection(
    country_selection
)

# Text labels centered in the segment
text_labels = alt.Chart(df).transform_filter(
    (alt.datum['Life expectancy'] >= life_expectancy_param) & 
    (alt.datum['Unemployment rate'] <= unemployment_rate_param) & 
    (alt.datum['GDP per capita'] >= gdp_per_capita_param)  # Filter by life expectancy, unemployment rate, and GDP per capita
).mark_text(size=11, fontWeight="bold").encode(
    x=alt.X("Region:N"),
    y=alt.Y("EqualHeight:Q", stack="normalize"),
    text="Country:N",
    color=alt.condition(country_selection, alt.value("white"), alt.value("black")),  # Highlight selected labels
    tooltip=["Country", "Region"]
).properties(
    width=1000  # Ensure matching width
).mark_text(
    align="center",  
    baseline="middle",  # Ensure text is vertically centered
    dy=8  # No vertical offset
)

# Main bar chart (filtered by selections)
bar_chart = alt.Chart(df).transform_filter(
    region_selection
).transform_filter(
    country_selection  # Apply country selection filter
).transform_filter(
    (alt.datum['Life expectancy'] >= life_expectancy_param) & 
    (alt.datum['Unemployment rate'] <= unemployment_rate_param) & 
    (alt.datum['GDP per capita'] >= gdp_per_capita_param)  # Filter by life expectancy, unemployment rate, and GDP per capita
).transform_calculate(
    SelectedAttribute="datum[attribute]"
).transform_window(
    asc_rank='rank()', sort=[{"field": "SelectedAttribute", "order": "ascending"}]
).transform_window(
    desc_rank='rank()', sort=[{"field": "SelectedAttribute", "order": "descending"}]
).transform_calculate(
    effective_rank="sort_order === 'Ascending' ? datum.asc_rank : datum.desc_rank"
).transform_filter(
    "datum.effective_rank <= num_countries"
).mark_bar().encode(
    x=alt.X('Country:N', sort=alt.EncodingSortField(field="effective_rank", order="ascending")),
    y=alt.Y('SelectedAttribute:Q', title="Value"),
    color=alt.Color('Region:N', legend=alt.Legend(title="Click to filter")),
    tooltip=['Country', 'Region', alt.Tooltip('SelectedAttribute:Q', title="Value")]
).add_params(
    attribute_selection, sort_selection, num_countries_param, life_expectancy_param, unemployment_rate_param, gdp_per_capita_param
).add_selection(
    region_selection
).properties(
    width=1000,  # Match the width of the stacked bar chart
    height=400
)

# Arrange stacked chart above the bar chart
chart = alt.vconcat(stacked_chart + text_labels, bar_chart).resolve_scale(
    color='independent'  # Ensure colors don’t interfere between charts
).configure_axis(grid=False)
chart

Deprecated since `altair=5.0.0`. Use selection_point instead.
  region_selection = alt.selection_multi(fields=['Region'], bind='legend')
Deprecated since `altair=5.0.0`. Use selection_point instead.
  country_selection = alt.selection_multi(fields=['Country'], empty="all")
Deprecated since `altair=5.0.0`. Use add_params instead.
  ).add_selection(
Deprecated since `altair=5.0.0`. Use add_params instead.
  ).add_selection(


In [4]:
import altair as alt
import pandas as pd

# Disable row limit in Altair
alt.data_transformers.disable_max_rows()

# Sample dataset
# Assuming 'data' is your DataFrame

# Step 1: Convert to long format (MELT)
data_long = data.melt(id_vars=['Country', 'Region', 'Population'], var_name='Variable', value_name='Value')

# Step 2: Merge to create (Variable, Variable2) pairs
data_long = data_long.merge(data_long, on=['Country', 'Region', 'Population'])

# Step 3: Rename columns for clarity
data_long.rename(columns={'Variable_x': 'X_Variable', 'Value_x': 'X_Value',
                          'Variable_y': 'Y_Variable', 'Value_y': 'Y_Value'}, inplace=True)

# Step 4: Create dropdown menus
x_dropdown = alt.binding_select(options=data_long['X_Variable'].unique().tolist(), name='X Axis')
y_dropdown = alt.binding_select(options=data_long['Y_Variable'].unique().tolist(), name='Y Axis')

x_param = alt.param(name='x_axis', bind=x_dropdown, value='Life expectancy')
y_param = alt.param(name='y_axis', bind=y_dropdown, value='Physicians per thousand')

# Step 5: Create a selection for the region
region_selection = alt.selection_multi(fields=['Region'], bind='legend')

# Step 6: Create the scatter plot with dynamic axes and region selection
scatter_plot = alt.Chart(data_long).transform_filter(
    (alt.datum.X_Variable == x_param) & (alt.datum.Y_Variable == y_param)
).mark_circle().encode(
    x=alt.X('X_Value:Q', scale=alt.Scale(type='linear'), title='X-Axis'),
    y=alt.Y('Y_Value:Q', scale=alt.Scale(type='linear'), title='Y-Axis'),
    color=alt.Color('Region:N', title='Region'),
    size=alt.Size('Population:Q', title='Population', scale=alt.Scale(range=[10, 1000])),
    tooltip=['Country:N', 'Region:N', 'X_Value:Q', 'Y_Value:Q'],
    opacity=alt.condition(region_selection, alt.value(1), alt.value(0.2))
).add_params(x_param, y_param).add_selection(
    region_selection
).properties(
    title='Interactive Scatter Plot',
    width=800, height=500
).interactive()

scatter_plot

Deprecated since `altair=5.0.0`. Use selection_point instead.
  region_selection = alt.selection_multi(fields=['Region'], bind='legend')
Deprecated since `altair=5.0.0`. Use add_params instead.
  ).add_params(x_param, y_param).add_selection(


In [5]:
import altair as alt
import pandas as pd

# attributes2 = attributes
# attributes2 = ['GDP per capita', 'Life expectancy', 'Physicians per thousand']
attributes2 = ['GDP per capita', 'Life expectancy', 'Unemployment rate', 'CPI']

# Sample data
df = countries_data

# Create individual charts
charts = []
is_ascending = False
sort_y = '-x'
for attribute in attributes2:
    if attribute == 'Unemployment rate' or attribute == 'Gasoline Price' or attribute == 'CPI':
        is_ascending = True
        sort_y = 'x'
    top_10_data = data.sort_values(attribute, ascending=is_ascending).head(10)
    bar_chart = alt.Chart(top_10_data).mark_bar().encode(
        x=alt.X(f'{attribute}:Q', title=f'{attribute}'),
        y=alt.Y('Country:N', sort=sort_y, title='Country'),
        color='Region:N',
        opacity=alt.condition(alt.selection_interval(name='brush'), alt.value(1), alt.value(0.2)),
        tooltip=['Country:N', f'{attribute}:Q', 'Region:N']
    ).properties(
        title=f'Top 10 Countries by {attribute}',
        width=900,
        height=100
    )
    charts.append(bar_chart)

# Add a region selection filter
region_selection = alt.selection_single(
    fields=['Region'],
    bind='legend',
    name='Region Selection'
)

# Combine charts into a single chart with cross-filtering and region filtering
brush = alt.selection_interval(name='brush', encodings=['y'], resolve='global')

combined_chart = alt.vconcat(
    *[chart.add_selection(brush).encode(
        color=alt.condition(region_selection, 'Region:N', alt.value('lightgrey')),
        opacity=alt.condition(brush, alt.value(1), alt.value(0.2))
      ).add_selection(region_selection)
      .transform_filter(region_selection) for chart in charts]
).configure_view(
    stroke='transparent'
)

combined_chart

Deprecated since `altair=5.0.0`. Use selection_point instead.
  region_selection = alt.selection_single(
Deprecated since `altair=5.0.0`. Use add_params instead.
  *[chart.add_selection(brush).encode(
Deprecated since `altair=5.0.0`. Use add_params instead.
  ).add_selection(region_selection)


In [None]:
import altair as alt

# Assuming you have your charts defined as chart1, chart2, and combined_chart

# Save each chart as an HTML file
chart.save('chart1.html')
scatter_plot.save('chart2.html')
combined_chart.save('chart3.html')

In [None]:
chart1_json = chart.to_json()
chart2_json = scatter_plot.to_json()
combined_chart_json = combined_chart.to_json()

with open('chart1.json', 'w') as f:
    f.write(chart1_json)
with open('chart2.json', 'w') as f:
    f.write(chart2_json)
with open('chart3.json', 'w') as f:
    f.write(combined_chart_json)