# Chapter 7: Data Visualization with Altair

**Note:** Not all of the graphs render on GitHub's web viewer; this is a GitHub-related issue.

# Introduction

## Starting with Exploratory Data Analysis (EDA)

In [62]:
import altair as alt
import polars as pl
from vega_datasets import data

In [63]:
# Fetch the names of the bundled example datasets once, print how many
# there are, and leave the list as the cell's displayed value.
dataset_names = data.list_datasets()
print(len(dataset_names))
dataset_names

70


['7zip',
 'airports',
 'annual-precip',
 'anscombe',
 'barley',
 'birdstrikes',
 'budget',
 'budgets',
 'burtin',
 'cars',
 'climate',
 'co2-concentration',
 'countries',
 'crimea',
 'disasters',
 'driving',
 'earthquakes',
 'ffox',
 'flare',
 'flare-dependencies',
 'flights-10k',
 'flights-200k',
 'flights-20k',
 'flights-2k',
 'flights-3m',
 'flights-5k',
 'flights-airport',
 'gapminder',
 'gapminder-health-income',
 'gimp',
 'github',
 'graticule',
 'income',
 'iowa-electricity',
 'iris',
 'jobs',
 'la-riots',
 'londonBoroughs',
 'londonCentroids',
 'londonTubeLines',
 'lookup_groups',
 'lookup_people',
 'miserables',
 'monarchs',
 'movies',
 'normal-2d',
 'obesity',
 'ohlc',
 'points',
 'population',
 'population_engineers_hurricanes',
 'seattle-temps',
 'seattle-weather',
 'sf-temps',
 'sp500',
 'stocks',
 'udistrict',
 'unemployment',
 'unemployment-across-industries',
 'uniform-2d',
 'us-10m',
 'us-employment',
 'us-state-capitals',
 'volcano',
 'weather',
 'weball26',
 'wheat',

### Figure 7.1

In [64]:
# vega_datasets hands back a pandas frame; convert it to Polars.
seattle_weather_pd = data.seattle_weather()
raw_df = pl.from_pandas(seattle_weather_pd)

# plot_df is the frame the chart cells below read from.
# Add additional transforms here as necessary.
plot_df = raw_df
plot_df.head()

date,precipitation,temp_max,temp_min,wind,weather
datetime[ns],f64,f64,f64,f64,str
2012-01-01 00:00:00,0.0,12.8,5.0,4.7,"""drizzle"""
2012-01-02 00:00:00,10.9,10.6,2.8,4.5,"""rain"""
2012-01-03 00:00:00,0.8,11.7,7.2,2.3,"""rain"""
2012-01-04 00:00:00,20.3,12.2,5.6,4.7,"""rain"""
2012-01-05 00:00:00,1.3,8.9,2.8,6.1,"""rain"""


## Built-in Line chart: `.plot.line()`
### Figure 7.2

In [65]:
# Polars' built-in plotting accessor: daily max temperature over time.
plot_df.plot.line(x="date", y="temp_max")

### Figure 7.3

In [66]:
# Reshape wide -> long: temp_max and temp_min become rows labelled by a
# "metric" column, with the reading itself in "value".
long_temps_df = plot_df.select("date", "temp_max", "temp_min").melt(
    id_vars=["date"],
    variable_name="metric",
    value_name="value",
)

# One line per metric.
long_temps_df.plot.line(x="date", y="value", by="metric")

In [67]:
import altair as alt
import polars as pl
from vega_datasets import data

# Turn off Altair's max-rows check on chart data for the rest of the session.
# The value returned by this call is what the cell displays as its output.
alt.data_transformers.disable_max_rows()

DataTransformerRegistry.enable('default')

## Built-in Bar chart: `.plot.bar()`

### Figure 7.4

In [68]:
# Group rows by the week number of their date (all years pooled together)
# and average the daily maximum temperature within each week.
week_number = pl.col("date").dt.week().alias("week")
weekly_mean_df = (
    plot_df
    .group_by(week_number)
    .agg(pl.col("temp_max").mean().alias("mean_weekly_temp_max"))
    .sort("week")
)

weekly_mean_df.plot.bar(x="week", y="mean_weekly_temp_max")

# Creating basic visualizations: line, bar, scatter charts

## Line Chart: Create your first Altair plot

### Figure 7.6

In [69]:
import altair as alt
import polars as pl
from vega_datasets import data

# Reload the Seattle weather sample as a Polars frame.
plot_df = pl.from_pandas(data.seattle_weather())

# Chart -> mark -> encode: a line mark with date on x and temp_max on y.
fig = (
    alt.Chart(plot_df)
    .mark_line()
    .encode(x="date", y="temp_max")
)
fig

In [70]:
# Display the Polars frame loaded earlier in the notebook (raw_df is assigned
# in the Figure 7.1 cell, so that cell must have run first).
raw_df

date,precipitation,temp_max,temp_min,wind,weather
datetime[ns],f64,f64,f64,f64,str
2012-01-01 00:00:00,0.0,12.8,5.0,4.7,"""drizzle"""
2012-01-02 00:00:00,10.9,10.6,2.8,4.5,"""rain"""
2012-01-03 00:00:00,0.8,11.7,7.2,2.3,"""rain"""
2012-01-04 00:00:00,20.3,12.2,5.6,4.7,"""rain"""
2012-01-05 00:00:00,1.3,8.9,2.8,6.1,"""rain"""
2012-01-06 00:00:00,2.5,4.4,2.2,2.2,"""rain"""
2012-01-07 00:00:00,0.0,7.2,2.8,2.3,"""rain"""
2012-01-08 00:00:00,0.0,10.0,2.8,2.0,"""sun"""
2012-01-09 00:00:00,4.3,9.4,5.0,3.4,"""rain"""
2012-01-10 00:00:00,1.0,6.1,0.6,3.4,"""rain"""


In [71]:
# Point plot_df back at the frame loaded earlier, then draw the line chart.
plot_df = raw_df

(
    alt.Chart(plot_df)
    .mark_line()
    .encode(x="date", y="temp_max")
)

In [72]:
# Print the number of bundled example datasets, then display their names.
available_datasets = data.list_datasets()
print(len(available_datasets))
available_datasets

70


['7zip',
 'airports',
 'annual-precip',
 'anscombe',
 'barley',
 'birdstrikes',
 'budget',
 'budgets',
 'burtin',
 'cars',
 'climate',
 'co2-concentration',
 'countries',
 'crimea',
 'disasters',
 'driving',
 'earthquakes',
 'ffox',
 'flare',
 'flare-dependencies',
 'flights-10k',
 'flights-200k',
 'flights-20k',
 'flights-2k',
 'flights-3m',
 'flights-5k',
 'flights-airport',
 'gapminder',
 'gapminder-health-income',
 'gimp',
 'github',
 'graticule',
 'income',
 'iowa-electricity',
 'iris',
 'jobs',
 'la-riots',
 'londonBoroughs',
 'londonCentroids',
 'londonTubeLines',
 'lookup_groups',
 'lookup_people',
 'miserables',
 'monarchs',
 'movies',
 'normal-2d',
 'obesity',
 'ohlc',
 'points',
 'population',
 'population_engineers_hurricanes',
 'seattle-temps',
 'seattle-weather',
 'sf-temps',
 'sp500',
 'stocks',
 'udistrict',
 'unemployment',
 'unemployment-across-industries',
 'uniform-2d',
 'us-10m',
 'us-employment',
 'us-state-capitals',
 'volcano',
 'weather',
 'weball26',
 'wheat',

### Figure 7.7

In [73]:
import altair as alt
import polars as pl
from vega_datasets import data

plot_df = pl.from_pandas(data.seattle_weather())

# Axis channels with human-readable titles.
date_axis = alt.X("date").title("Date")
temp_axis = alt.Y("temp_max").title("Max Temperature (deg Celsius)")

fig = alt.Chart(plot_df).mark_line().encode(x=date_axis, y=temp_axis)

# .properties() returns a new chart; `fig` itself keeps the default
# title and size, and only the displayed copy is customised.
line_title = alt.Title(
    "Daily Max Temperatures in Seattle (2012-2015)",
    fontSize=20,
)
fig.properties(title=line_title, width=800, height=200)

## Scatter Plot

### Figure 7.8

In [74]:
# Scatter plot: one circle per row, date on x and max temperature on y.
scatter_title = alt.Title(
    "Daily Max Temperatures in Seattle (2012-2015)",
    fontSize=20,
)
fig = alt.Chart(plot_df).mark_circle().encode(
    x=alt.X("date").title("Date"),
    y=alt.Y("temp_max").title("Max Temperature (deg Celsius)"),
).properties(title=scatter_title, width=800, height=200)
fig

### Figure 7.9

In [75]:
# Wind speed drives both the colour and the size of each circle.
wind_color = alt.Color("wind").scale(scheme="purpleorange")
wind_size = alt.Size("wind").scale(domain=[0, 30])

scatter_title = alt.Title(
    "Daily Max Temperatures in Seattle (2012-2015)",
    fontSize=20,
)
fig = alt.Chart(plot_df).mark_circle().encode(
    x=alt.X("date").title("Date"),
    y=alt.Y("temp_max").title("Max Temperature (deg Celsius)"),
    color=wind_color,
    size=wind_size,
).properties(title=scatter_title, width=800, height=300)
fig

### Figure 7.10 - Adding tooltips

In [76]:
# Fields shown in the hover tooltip of each circle.
hover_fields = alt.Tooltip(["date", "wind", "temp_max"])

wind_color = alt.Color("wind").scale(scheme="purpleorange")
wind_size = alt.Size("wind").scale(domain=[0, 30])
scatter_title = alt.Title(
    "Daily Max Temperatures in Seattle (2012-2015)",
    fontSize=20,
)

fig = alt.Chart(plot_df).mark_circle().encode(
    x=alt.X("date").title("Date"),
    y=alt.Y("temp_max").title("Max Temperature (deg Celsius)"),
    color=wind_color,
    size=wind_size,
    tooltip=hover_fields,
).properties(title=scatter_title, width=800, height=300)
fig

## Bar Chart

### Figure 7.11 - Precalculated monthly mean wind speed

In [77]:
# Pre-compute the mean wind speed per calendar month (all years pooled).
month_key = pl.col("date").dt.month().alias("month_of_year")
monthly_mean_wind_df = (
    plot_df
    .group_by(month_key)
    .agg(pl.col("wind").mean().alias("mean_wind"))
    .sort("month_of_year")
)
monthly_mean_wind_df

month_of_year,mean_wind
i8,f64
1,3.13871
2,3.786726
3,3.579839
4,3.524167
5,3.120161
6,3.130833
7,2.91129
8,2.750806
9,2.963333
10,2.939516


### Figure 7.12 - Monthly Mean Wind Speed (over periods 2012-2015): Bar Chart

In [78]:
# Bar chart of the pre-aggregated frame; the ":O" suffix on month_of_year
# is Altair shorthand for the ordinal encoding type.
monthly_wind_fig = alt.Chart(monthly_mean_wind_df).mark_bar().encode(
    x="month_of_year:O",
    y="mean_wind",
)
monthly_wind_fig.properties(height=200)

### Figure 7.13 - Monthly Mean Wind Speed: Bar Plot (No Pre-computation)

In [79]:
# Same bar chart without pre-computation: the aggregation is written into
# the encoding shorthand (month(date) on x, mean(wind) on y).
bar_fig = alt.Chart(plot_df).mark_bar().encode(
    x="month(date)",
    y="mean(wind)",
)
bar_fig

### Figure 7.14

In [80]:
# Monthly mean wind bars, coloured by the mean max temperature of the month.
bar_fig = alt.Chart(plot_df).mark_bar().encode(
    x="month(date)",
    y="mean(wind)",
    color="mean(temp_max)",
)
bar_fig

### Figure 7.15 - Exploring transformed data with `transformed_data()` (requires the VegaFusion add-on)

In [81]:
# Show the data bar_fig actually plots after its encoding-level transforms
# (month(date), mean(wind), mean(temp_max)) have been applied.
# NOTE(review): per the section heading this relies on the VegaFusion
# add-on being installed — confirm in the environment setup.
bar_fig.transformed_data()

month_date,month_date_end,mean_wind,mean_temp_max
"datetime[ms, America/New_York]","datetime[ms, America/New_York]",f64,f64
2012-01-01 00:00:00 EST,2012-02-01 00:00:00 EST,3.13871,8.229032
2012-02-01 00:00:00 EST,2012-03-01 00:00:00 EST,3.786726,9.860177
2012-03-01 00:00:00 EST,2012-04-01 00:00:00 EDT,3.579839,12.387097
2012-04-01 00:00:00 EDT,2012-05-01 00:00:00 EDT,3.524167,15.02
2012-05-01 00:00:00 EDT,2012-06-01 00:00:00 EDT,3.120161,19.295968
2012-06-01 00:00:00 EDT,2012-07-01 00:00:00 EDT,3.130833,22.4
2012-07-01 00:00:00 EDT,2012-08-01 00:00:00 EDT,2.91129,25.998387
2012-08-01 00:00:00 EDT,2012-09-01 00:00:00 EDT,2.750806,26.112097
2012-09-01 00:00:00 EDT,2012-10-01 00:00:00 EDT,2.963333,21.924167
2012-10-01 00:00:00 EDT,2012-11-01 00:00:00 EDT,2.939516,16.389516


## Additional Notes: Altair encoding types

### Figure 7.16 – Example of wind vs. temp_max scatter plot colored by quarter of the year with default encoding type, preventing proper analysis

In [82]:
# Colour by quarter(date) with no explicit encoding type, so Altair picks
# the default type for the colour channel.
default_type_scatter = alt.Chart(plot_df).mark_circle().encode(
    x="temp_max",
    y="wind",
    color="quarter(date)",
    opacity=alt.value(0.5),
)
default_type_scatter

### Figure 7.17 - Wind vs. Max Temperatures, Colored by Quarter of the Year

In [83]:
# Same scatter, but ":N" declares the quarter as a nominal (categorical)
# field for the colour channel.
nominal_quarter_scatter = alt.Chart(plot_df).mark_circle().encode(
    x="temp_max",
    y="wind",
    color="quarter(date):N",
    opacity=alt.value(0.5),
)
nominal_quarter_scatter

# Complex Visualizations

### Figure 7.18

In [84]:
# Place the scatter plot and the bar chart side by side; each keeps its
# own colour scale (and therefore its own legend).
side_by_side = fig | bar_fig
side_by_side.resolve_scale(color="independent")

### Figure 7.19 - Concatenated scatter plot and bar plot with interactivity

In [85]:
base = alt.Chart(plot_df)

# Point selection projected onto the x encoding of the chart it is added to.
selection = alt.selection_point(encodings=["x"])

# Selected marks keep their data-driven colour; everything else is grey.
bar_color = alt.condition(
    selection,
    alt.Color("mean(temp_max):Q"),
    alt.value("lightgray"),
)
scatter_color = alt.condition(
    selection,
    alt.Color("quarter(date):N"),
    alt.value("lightgray"),
)

# The bar chart owns the selection parameter (add_params).
bar_fig = base.mark_bar().encode(
    x="month(date)",
    y="mean(wind)",
    color=bar_color,
).add_params(selection)

# The scatter plot only reads the selection through its colour condition.
quarter_fig = base.mark_circle().encode(
    x="temp_max",
    y="wind",
    color=scatter_color,
    opacity=alt.value(0.5),
)

linked_view = quarter_fig | bar_fig
linked_view.resolve_scale(color="independent")


### Figure 7.20

In [86]:
# Tooltip rows shown when hovering a heatmap cell.
heatmap_tooltip = [
    alt.Tooltip("date", title="Date"),
    alt.Tooltip("temp_min", title="Min Temperature (°C)", format=".1f"),
    alt.Tooltip("temp_max", title="Max Temperature (°C)", format=".1f"),
    alt.Tooltip("week:Q", title="Week"),
]

# Reversed red-yellow-blue scheme, legend hidden.
temp_color = (
    alt.Color("temp_max:Q")
    .legend(None)
    .scale(scheme="redyellowblue", reverse=True)
)

heatmap_title = alt.Title(
    "Daily Max Temperatures in Seattle (2015)",
    subtitle=["Source: vega-datasets, seattle-weather", "Year: 2015"],
    subtitleColor="darkgray",
    subtitleFontSize=15,
    fontSize=20,
    color="#4e4e4e",
)

# Calendar-style heatmap: week on x, weekday on y, one row facet per year.
# The filter keeps only 2015, so a single facet is drawn.
seattle_heatmap = (
    alt.Chart(plot_df)
    .mark_rect(stroke="#f4f4f4", cornerRadius=3, strokeWidth=3)
    .encode(
        x=alt.X("week:O").title("Week"),
        y=alt.Y("weekday:O").title("Day of the Week"),
        color=temp_color,
        row=alt.Row("year:N").title(None, fontSize=20),
        tooltip=heatmap_tooltip,
    )
    # Derive the calendar fields inside the chart spec (Vega expressions).
    .transform_calculate(
        weekday="day(datum.date)",
        week="week(datum.date)",
        year="year(datum.date)",
    )
    .transform_filter(alt.datum.year == 2015)
    .resolve_scale(x="independent", y="independent")
    .properties(title=heatmap_title)
)
seattle_heatmap

### Figure 7.21 - Faceted Heatmap

In [87]:
# Faceted calendar heatmap: week on x, weekday on y, one row facet per year.
# Unlike the Figure 7.20 cell there is no year filter here, so every year
# in plot_df gets its own facet. The title and subtitle therefore describe
# the whole 2012-2015 period rather than 2015 only.
seattle_heatmap = (
    alt.Chart(plot_df)
    .mark_rect(
        stroke="#f4f4f4",
        cornerRadius=3,
        strokeWidth=3,
    )
    .encode(
        y=alt.Y("weekday:O").title("Day of the Week"),
        x=alt.X("week:O").title("Week"),
        color=alt.Color("temp_max:Q").legend(None).scale(scheme="redyellowblue", reverse=True),
        row=alt.Row("year:N").title(None, fontSize=20),
        tooltip=[
            alt.Tooltip("date", title="Date"),
            alt.Tooltip("temp_min", title="Min Temperature (°C)", format=".1f"),
            alt.Tooltip("temp_max", title="Max Temperature (°C)", format=".1f"),
            alt.Tooltip("week:Q", title="Week")
        ],
    )
    # Derive the calendar fields inside the chart spec (Vega expressions).
    .transform_calculate(
        weekday = "day(datum.date)",
        week = "week(datum.date)",
        year = "year(datum.date)",
    )
    # Each facet gets its own x and y scales.
    .resolve_scale(x="independent", y="independent")
    .properties(
        title=alt.Title(
            "Daily Max Temperatures in Seattle (2012-2015)",
            subtitle=["Source: vega-datasets, seattle-weather", "Years: 2012-2015"],
            subtitleColor="darkgray",
            subtitleFontSize=15,
            fontSize=20,
            color="#4e4e4e",
        )
    )
)
seattle_heatmap

# Showcase

## Geomapping

In [88]:
import altair as alt
from vega_datasets import data

# TopoJSON features for US states and counties (fetched by URL at render time).
states = alt.topo_feature(data.us_10m.url, "states")
counties = alt.topo_feature(data.us_10m.url, "counties")

# Point selections keyed on the STATE field. `selection_point` replaces the
# deprecated `selection_multi` and matches the API used earlier in this
# notebook.
selected_state = alt.selection_point(fields=["STATE"])
state_hover = alt.selection_point(fields=["STATE"], on="mouseover")

# Download from https://www.atsdr.cdc.gov/placeandhealth/svi/data_documentation_download.html
# Year: "2020", Geography: "United States", Geography Type: "Counties", File Type: "CSV File"
#
# Reshape the county table to long form (one row per county per variable),
# then keep only the MP_POV150 variable. FIPS is renamed to "id" so it can be
# matched against the "id" key of the county shapes in the lookup below.
# NOTE(review): in the CDC SVI data dictionary the "M"/"MP_" prefix appears to
# denote a margin of error, with "EP_POV150" being the percentage estimate —
# confirm which variable (and which title wording) is intended.
plot_df = (
    pl.read_csv("../datasets/SVI_2020_US_county.csv")
    .drop("AREA_SQMI", "LOCATION", "STCNTY")
    .melt(id_vars=["ST", "STATE", "ST_ABBR", "COUNTY", "FIPS"])
    .with_columns(
        pl.col("ST").cast(pl.UInt8),
        pl.col("value").cast(pl.Float64),
        pl.col("COUNTY").cast(pl.Categorical),
        pl.col("STATE").cast(pl.Categorical),
        (pl.col("COUNTY") + pl.lit(", ") + pl.col("ST_ABBR")).alias("county_label")
    )
    .rename({"FIPS": 'id'}).filter(pl.col("variable")=="MP_POV150")
)

# US all-counties choropleth map
us_map = (
    alt.Chart(plot_df)
    .mark_geoshape(strokeOpacity=0.5, strokeWidth=0.5)
    .encode(
        shape="geo:G",
        color=alt.Color("value:Q").legend(orient="top", labelFontSize=10, titleAlign='left').title("Overall US Counties"),
        stroke=alt.Stroke("STATE:N").scale(None),
        # Counties of the hovered state are fully opaque; the rest are dimmed.
        opacity=alt.condition(
            state_hover,
            alt.value(1.0),
            alt.value(0.2)
        ),
        tooltip=["value", 'STATE', 'variable', 'ST_ABBR', 'COUNTY'],
    )
    # Attach each county's geometry (as field "geo") by matching on "id".
    .transform_lookup(
        lookup="id", from_=alt.LookupData(data=counties, key="id"), as_="geo"
    )
    .transform_filter(alt.datum.value!=-999)    # -999 means the county did not report
    .add_params(state_hover)
    .project(type="albersUsa")
    .properties(
        width=500, 
        title=alt.Title(
            ["Social Vulnerability Index (SVI):", "Percent of Residents Below 150% of Federal Poverty Level (FPL)"],
            subtitle=["Metric Name: Below 150% poverty estimate MOE (Mixture of estimates)", 
                      "Metric ID: MP_POV150",
                      "Source: 2020 CDC/ATSDR SVI Data",
            ],
            subtitleColor="darkgray",
            subtitleLineHeight=15,
            fontSize=20,
            color="black",
            anchor="start"
        )
    )
)

us_map



## Interactive dashboard of U.S. States.

Shift-click on the left map to select multiple regions.
The code cell below is lengthy.

In [89]:
import altair as alt
from vega_datasets import data

# TopoJSON features for US states and counties (fetched by URL at render time).
states = alt.topo_feature(data.us_10m.url, "states")
counties = alt.topo_feature(data.us_10m.url, "counties")

# Point selections keyed on the STATE field. `selection_point` replaces the
# deprecated `selection_multi` and matches the API used earlier in this
# notebook. `selected_state` is the click selection that drives the right-hand
# panel; `state_hover` highlights the state under the mouse on the US map.
selected_state = alt.selection_point(fields=["STATE"])
state_hover = alt.selection_point(fields=["STATE"], on="mouseover")

# Download from https://www.atsdr.cdc.gov/placeandhealth/svi/data_documentation_download.html
# Year: "2020", Geography: "United States", Geography Type: "Counties", File Type: "CSV File"
#
# Reshape the county table to long form (one row per county per variable),
# then keep only the MP_POV150 variable. FIPS is renamed to "id" so it can be
# matched against the "id" key of the county shapes in the lookups below.
# NOTE(review): in the CDC SVI data dictionary the "M"/"MP_" prefix appears to
# denote a margin of error, with "EP_POV150" being the percentage estimate —
# confirm which variable (and which title wording) is intended.
plot_df = (
    pl.read_csv("../datasets/SVI_2020_US_county.csv")
    .drop("AREA_SQMI", "LOCATION", "STCNTY")
    .melt(id_vars=["ST", "STATE", "ST_ABBR", "COUNTY", "FIPS"])
    .with_columns(
        pl.col("ST").cast(pl.UInt8),
        pl.col("value").cast(pl.Float64),
        pl.col("COUNTY").cast(pl.Categorical),
        pl.col("STATE").cast(pl.Categorical),
        (pl.col("COUNTY") + pl.lit(", ") + pl.col("ST_ABBR")).alias("county_label")
    )
    .rename({"FIPS": 'id'}).filter(pl.col("variable")=="MP_POV150")
)

# US all-counties choropleth map
us_map = (
    alt.Chart(plot_df)
    .mark_geoshape(strokeOpacity=0.4, strokeWidth=0.5)
    .encode(
        shape="geo:G",
        color=alt.Color("value:Q").legend(orient="top", labelFontSize=10, titleAlign='left').title("Overall US Counties"),
        stroke=alt.Stroke("STATE:N").scale(None),
        # Counties of the hovered state are fully opaque; the rest are dimmed.
        opacity=alt.condition(
            state_hover,
            alt.value(1.0),
            alt.value(0.4)
        ),
        tooltip=["value", 'STATE', 'variable', 'ST_ABBR', 'COUNTY'],
    )
    # Attach each county's geometry (as field "geo") by matching on "id".
    .transform_lookup(
        lookup="id", from_=alt.LookupData(data=counties, key="id"), as_="geo"
    )
    .transform_filter(alt.datum.value!=-999)    # drop rows carrying the -999 sentinel
    .add_params(state_hover)
    .project(type="albersUsa")
    .properties(
        width=600, 
        title=alt.Title(
            ["Social Vulnerability Index (SVI):", "Below 150% of Federal Poverty Level (FPL)"],
            subtitle=["Metric Name: Below 150% poverty estimate MOE (Mixture of estimates)", 
                      "Source: 2020 CDC/ATSDR SVI Data",
                      "Metric: MP_POV150", 
                      "",
                      "Click on Region to Load Inset", "Shift+Click For Multiple States"],
            subtitleColor="darkgray",
            subtitleLineHeight=15,
            fontSize=24,
            color="black",
            anchor="start"
        )
    )
)

# State-level choropleth map: only counties of the selected state(s).
# The stroke colour was "#green", which is not a valid CSS colour; the named
# colour "green" is used instead.
state_zoom = (
    alt.Chart(plot_df)
    .mark_geoshape(strokeWidth=0.5, stroke="green")
    .encode(
        shape="geo:G",
        color=alt.Color("value:Q").legend(orient="top", labelFontSize=10, titleAlign='left').title("State-level"),
        tooltip=["value", 'STATE', 'variable', 'ST_ABBR', 'COUNTY'],
    )
    .transform_lookup(
        lookup="id", from_=alt.LookupData(data=counties, key="id"), as_="geo"
    )
    .transform_filter(alt.datum.value!=-999)
    .transform_filter(selected_state)
    .project(type="albersUsa")
    .properties(height=180)
)

# Highest-valued counties within the selected state(s): counties are ranked
# by descending value and only ranks below 25 (i.e. the top 24) are kept.
state_bar = (
    alt.Chart(plot_df)
    .mark_bar(strokeWidth=0, stroke="green")
    .encode(
        x=alt.X("value:Q").title(["Percent of residents below", "150% of Federal Poverty Level"]),
        y=alt.Y("county_label:N").title("County").sort("-x"),
        color=alt.Color("value:Q").legend(orient="top", labelFontSize=10, titleAlign='left').title("State-level"),
        tooltip=["value", 'STATE', 'variable', 'ST_ABBR', 'COUNTY'],
    )
    .transform_filter(alt.datum.value!=-999)
    .transform_filter(selected_state)
    # Rank is computed after the state filter, so it is relative to the
    # current selection rather than to the whole country.
    .transform_window(
        rank='rank(value)',
        sort=[alt.SortField('value', order='descending')]
    )
    .transform_filter(
        (alt.datum.rank < 25)
    )
    .add_params(selected_state)
    .properties(height=300)
)

# Left: the national map, which also carries the click selection.
LEFT_PANEL = (
    us_map
    .add_params(selected_state)
)

# Right: state map stacked above the bar chart, under a shared title.
RIGHT_PANEL = (
    (state_zoom & state_bar)
    .add_params(selected_state)
    .properties(
        title=alt.Title(
            ["Selected State(s)-Level View"],
            subtitle=["Color scale is re-adjusted to State-level"],
            subtitleColor="darkgray",
            subtitleLineHeight=15,
            fontSize=20,
            color="#888888",
            anchor="start"
        )
    )
)

# Independent colour scales let the right panel rescale to the selected states.
(LEFT_PANEL | RIGHT_PANEL).resolve_scale(color="independent")