### What is Altair

Altair is a declarative statistical visualization library for Python, based on Vega and Vega-Lite, which are both visualization grammars that allow you to describe the visual appearance and interactive behavior of a visualization in a JSON format.


In [9]:
import numpy as np
import pandas as pd
import altair as alt
from pandas_datareader import data

In [18]:
# Load the 2020 world-population table; the path is resolved relative to the
# directory the notebook runs in. The trailing expression displays the column
# labels so the field names used by the charts below can be checked.
df = pd.read_csv(filepath_or_buffer='world_population 2020.csv')
df.columns

Index(['Country', 'Population', 'Yearly_Change', 'Net_Change',
       'Density(P/Km²)', 'Land_Area(Km²)', 'Migrants(net)', 'Fertility_Rate',
       'Median_Age', 'Urban _Pop %', 'World_Share', 'Continent'],
      dtype='object')

In [36]:
# Bar chart of the population table: Population on x, Continent on y, with the
# bars also coloured by Continent. Chart size (height=300, width=400) is passed
# straight to the Chart constructor.
alt.Chart(df, height=300, width=400).mark_bar().encode(
    alt.X('Population'),
    alt.Y('Continent'),
    alt.Color('Continent'),
)

# Equivalent shorthand: keyword encodings, with the size set through .properties()
# alt.Chart(df).mark_bar().encode(x='Population', y='Continent', color='Continent').properties(height=300, width=400)

In [46]:
# The vega_datasets package provides built-in example datasets.
# NOTE: this import rebinds the name `data`, replacing the `data` object that
# was imported from pandas_datareader in the first cell.
from vega_datasets import data

cars = data.cars()

# Scatter plot of the cars table: Horsepower against Miles_per_Gallon,
# with points coloured by Origin.
alt.Chart(cars).mark_point().encode(
    alt.X('Horsepower'),
    alt.Y('Miles_per_Gallon'),
    alt.Color('Origin'),
)

#### Long-form vs Wide-form Data

There are two common conventions for storing data in a dataframe, sometimes called *long-form* and *wide-form*. Both are sensible patterns for storing data in a tabular format; briefly, the difference is this:

* wide-form data has one row per independent variable, with metadata recorded in the row and column labels.

* long-form data has one row per observation, with metadata recorded within the table as values.

Altair’s grammar works best with long-form data, in which each row corresponds to a single observation along with its metadata.

In [47]:
# Wide-form example: one row per date, with each company's price held in its
# own column (the company name lives in the column label, not in the table).
wide_form = pd.DataFrame(
    [
        ['2007-10-01', 189.95, 89.15, 707.00],
        ['2007-11-01', 182.22, 90.56, 693.00],
        ['2007-12-01', 198.08, 92.64, 691.48],
    ],
    columns=['Date', 'AAPL', 'AMZN', 'GOOG'],
)
print(wide_form)

         Date    AAPL   AMZN    GOOG
0  2007-10-01  189.95  89.15  707.00
1  2007-11-01  182.22  90.56  693.00
2  2007-12-01  198.08  92.64  691.48


In [48]:
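# In the wide-form table above, Date is the independent variable: there is one row per date, and each company's price sits in its own column.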

In [49]:
# Long-form example: one row per observation (a single price), with the date
# and company that describe that observation stored as ordinary column values.
long_form = pd.DataFrame(
    [
        ('2007-10-01', 'AAPL', 189.95),
        ('2007-11-01', 'AAPL', 182.22),
        ('2007-12-01', 'AAPL', 198.08),
        ('2007-10-01', 'AMZN', 89.15),
        ('2007-11-01', 'AMZN', 90.56),
        ('2007-12-01', 'AMZN', 92.64),
        ('2007-10-01', 'GOOG', 707.00),
        ('2007-11-01', 'GOOG', 693.00),
        ('2007-12-01', 'GOOG', 691.48),
    ],
    columns=['Date', 'company', 'price'],
)
print(long_form)

         Date company   price
0  2007-10-01    AAPL  189.95
1  2007-11-01    AAPL  182.22
2  2007-12-01    AAPL  198.08
3  2007-10-01    AMZN   89.15
4  2007-11-01    AMZN   90.56
5  2007-12-01    AMZN   92.64
6  2007-10-01    GOOG  707.00
7  2007-11-01    GOOG  693.00
8  2007-12-01    GOOG  691.48


Notice here that each row contains a single observation (i.e. price), along with the metadata for this observation (the date and company name). Importantly, the column and index labels no longer contain any useful metadata.

As mentioned above, Altair works best with this long-form data, because relevant data and metadata are stored within the table itself, rather than within the labels of rows and columns.

In [50]:
# Line chart straight from the long-form table: Date parsed as temporal (:T),
# price as quantitative (:Q), and one coloured line per company (:N, nominal).
alt.Chart(long_form).mark_line().encode(
    alt.X('Date:T'),
    alt.Y('price:Q'),
    alt.Color('company:N'),
)

In [51]:
# Wide-form -> long-form with pandas: keep Date as the identifier column and
# unpivot every other column into (company, price) pairs.
pd.melt(
    wide_form,
    id_vars='Date',
    var_name='company',
    value_name='price',
)

Unnamed: 0,Date,company,price
0,2007-10-01,AAPL,189.95
1,2007-11-01,AAPL,182.22
2,2007-12-01,AAPL,198.08
3,2007-10-01,AMZN,89.15
4,2007-11-01,AMZN,90.56
5,2007-12-01,AMZN,92.64
6,2007-10-01,GOOG,707.0
7,2007-11-01,GOOG,693.0
8,2007-12-01,GOOG,691.48


In [52]:
# Long-form -> wide-form with pandas: Date becomes the row index, each distinct
# company becomes a column, and the cells are filled from the price column.
pd.pivot(
    long_form,
    index='Date',
    columns='company',
    values='price',
)

company,AAPL,AMZN,GOOG
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2007-10-01,189.95,89.15,707.0
2007-11-01,182.22,90.56,693.0
2007-12-01,198.08,92.64,691.48


In [53]:
# Converting between long-form and wide-form inside the chart: the fold
# transform turns the AAPL/AMZN/GOOG columns of the wide table into
# (company, price) pairs, so the wide table can be plotted with the same
# encodings as the long-form chart.
alt.Chart(wide_form).transform_fold(
    fold=['AAPL', 'AMZN', 'GOOG'],
    as_=['company', 'price'],
).mark_line().encode(
    alt.X('Date:T'),
    alt.Y('price:Q'),
    alt.Color('company:N'),
)
