In [2]:
import altair as alt

In [3]:
from vega_datasets import data

# Look the "cars" example dataset up by name through the vega_datasets loader.
cars = data("cars")

In [4]:
# Scatter plot of the cars table: Horsepower on x, Miles_per_Gallon on y,
# points colored by Origin; the chart is made interactive at the end.
(
    alt.Chart(cars)
    .mark_point()
    .encode(
        x=alt.X('Horsepower'),
        y=alt.Y('Miles_per_Gallon'),
        color=alt.Color('Origin'),
    )
    .interactive()
)

In [5]:
import pandas as pd

In [6]:
# Small demo table: five category labels in 'x' and one integer per label in 'y'.
data = pd.DataFrame(
    {
        'x': list('ABCDE'),
        'y': [5, 3, 6, 7, 2],
    }
)
data

Unnamed: 0,x,y
0,A,5
1,B,3
2,C,6
3,D,7
4,E,2


In [7]:
# Bar chart of the demo DataFrame; the field names carry no type suffix here.
(
    alt.Chart(data)
    .mark_bar()
    .encode(x=alt.X('x'), y=alt.Y('y'))
)

In [8]:
# The same five records, passed inline through alt.Data instead of a DataFrame.
# They are bound to their own name so that `data` keeps referring to a
# DataFrame: rebinding it to an alt.Data object would make the earlier
# DataFrame-based chart cell fail if it were re-run after this one.
inline_data = alt.Data(values=[{'x': 'A', 'y': 5},
                               {'x': 'B', 'y': 3},
                               {'x': 'C', 'y': 6},
                               {'x': 'D', 'y': 7},
                               {'x': 'E', 'y': 2}])

# The field types are spelled out explicitly in the encoding shorthand.
alt.Chart(inline_data).mark_bar().encode(
    x='x:N',  # specify nominal data with :N
    y='y:Q',  # specify quantitative data with :Q
)

In [9]:
import numpy as np

# Seeded generator so the random walk is the same on every run.
rand = np.random.RandomState(0)

# 100 consecutive daily timestamps starting in 2018, used as the row index.
days = pd.date_range('2018', freq='D', periods=100)

# Cumulative sum of 100 standard-normal draws gives a random-walk series.
data = pd.DataFrame({'value': rand.randn(100).cumsum()}, index=days)
data.head()

Unnamed: 0,value
2018-01-01,1.764052
2018-01-02,2.16421
2018-01-03,3.142948
2018-01-04,5.383841
2018-01-05,7.251399


In [10]:
# reset_index() moves the unnamed date index into a regular column called
# 'index', which the x encoding then reads as temporal (:T).
(
    alt.Chart(data.reset_index())
    .mark_line()
    .encode(x=alt.X('index:T'), y=alt.Y('value:Q'))
)

- **Wide-form** data has one row per **independent variable**, with metadata recorded in the *row* and *column* labels.
- **Long-form** data has one row per **observation**, with metadata recorded within the table as *values*.

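Altair works best with long-form data. Why? Each encoding channel maps to a single column, so when every row is one observation a field such as `company` can be assigned directly to a channel like `color`; wide-form data has to be reshaped first.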

In [11]:
# Wide-form table: one row per date, one price column per company ticker.
wide_form = pd.DataFrame(
    {
        'Date': ['2007-10-01', '2007-11-01', '2007-12-01'],
        'AAPL': [189.95, 182.22, 198.08],
        'AMZN': [89.15, 90.56, 92.64],
        'GOOG': [707.00, 693.00, 691.48],
    }
)
# End on the bare expression rather than print() so the notebook renders the
# frame with its rich table display, like the other DataFrame cells.
wide_form

         Date    AAPL   AMZN    GOOG
0  2007-10-01  189.95  89.15  707.00
1  2007-11-01  182.22  90.56  693.00
2  2007-12-01  198.08  92.64  691.48


In [13]:
# Long-form table: one row per (date, company) observation, with the ticker
# stored as a value in the 'company' column instead of as a column label.
long_form = pd.DataFrame(
    {
        # The same three dates repeat once for each of the three companies.
        'Date': ['2007-10-01', '2007-11-01', '2007-12-01'] * 3,
        'company': ['AAPL'] * 3 + ['AMZN'] * 3 + ['GOOG'] * 3,
        'price': [
            189.95, 182.22, 198.08,  # AAPL
            89.15, 90.56, 92.64,     # AMZN
            707.00, 693.00, 691.48,  # GOOG
        ],
    }
)
# End on the bare expression rather than print() so the notebook renders the
# frame with its rich table display, like the other DataFrame cells.
long_form

         Date company   price
0  2007-10-01    AAPL  189.95
1  2007-11-01    AAPL  182.22
2  2007-12-01    AAPL  198.08
3  2007-10-01    AMZN   89.15
4  2007-11-01    AMZN   90.56
5  2007-12-01    AMZN   92.64
6  2007-10-01    GOOG  707.00
7  2007-11-01    GOOG  693.00
8  2007-12-01    GOOG  691.48


In [14]:
# One line per company: the 'company' column is mapped straight to color.
(
    alt.Chart(long_form)
    .mark_line()
    .encode(
        x=alt.X('Date:T'),
        y=alt.Y('price:Q'),
        color=alt.Color('company:N'),
    )
)

Use the `melt` function to convert a wide-form dataframe to a long-form dataframe.

In [15]:
# Reshape wide -> long: 'Date' stays as the identifier column, the remaining
# column labels go into 'company' and their values into 'price'.
wide_form.melt(
    id_vars='Date',
    var_name='company',
    value_name='price',
)

Unnamed: 0,Date,company,price
0,2007-10-01,AAPL,189.95
1,2007-11-01,AAPL,182.22
2,2007-12-01,AAPL,198.08
3,2007-10-01,AMZN,89.15
4,2007-11-01,AMZN,90.56
5,2007-12-01,AMZN,92.64
6,2007-10-01,GOOG,707.0
7,2007-11-01,GOOG,693.0
8,2007-12-01,GOOG,691.48


You can convert long-form data back to wide-form with the pandas `pivot` method.

In [16]:
# Reshape long -> wide: one row per 'Date', one column per 'company' holding
# its 'price'; reset_index() then turns 'Date' back into a regular column.
(
    long_form
    .pivot(index='Date', columns='company', values='price')
    .reset_index()
)

company,Date,AAPL,AMZN,GOOG
0,2007-10-01,189.95,89.15,707.0
1,2007-11-01,182.22,90.56,693.0
2,2007-12-01,198.08,92.64,691.48


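Alternatively, Altair's `transform_fold` method converts wide-form data to long-form inside the chart specification, without reshaping the DataFrame first.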

In [17]:
# Chart the wide-form table directly: transform_fold is given the three ticker
# columns and the output field names 'company' and 'price', which the
# encodings below then reference.
(
    alt.Chart(wide_form)
    .transform_fold(
        ['AAPL', 'AMZN', 'GOOG'],
        as_=['company', 'price'],
    )
    .mark_line()
    .encode(
        x=alt.X('Date:T'),
        y=alt.Y('price:Q'),
        color=alt.Color('company:N'),
    )
)