# Migrate Vega datasets to Ibis.

In [1]:
import vega_datasets

In [2]:
import ibis.mapd
from ibis.pandas.client import  infer_pandas_schema
from pandas import Series, Index, DataFrame

In [3]:
def try_to_load(dataset):
    """Call ``dataset`` and return its result, or an empty frame on failure.

    Some datasets will not load; any ordinary error raised by the loader is
    treated as "no data" and replaced by an empty ``DataFrame``.

    Parameters
    ----------
    dataset : callable
        Zero-argument loader.

    Returns
    -------
    Whatever ``dataset()`` returns, or ``DataFrame()`` if it raised.
    """
    try:
        return dataset()
    except Exception:
        # Deliberately best-effort, but only for ordinary errors: catching
        # ``Exception`` (not a bare ``except``) lets KeyboardInterrupt and
        # SystemExit propagate so a long migration can still be interrupted.
        return DataFrame()

In [4]:
def load_vega_datasets(limit=0):
    """Load every attribute listed by ``dir(vega_datasets.data)``.

    Each listed name is fetched with ``getattr`` and invoked through
    ``try_to_load``, so loaders that fail contribute an empty frame.

    Parameters
    ----------
    limit : int, optional
        When truthy, only names whose position in the ``dir`` listing is
        below ``limit`` are loaded. The default ``0`` loads everything.

    Returns
    -------
    dict
        Maps each selected name to the value produced by ``try_to_load``.
    """
    loaded = {}
    for position, name in enumerate(dir(vega_datasets.data)):
        if limit and position >= limit:
            # Past the requested cut-off; leave this one out.
            continue
        loaded[name] = try_to_load(getattr(vega_datasets.data, name))
    return loaded

In [5]:
# Column renames applied by ``preprocess_df``: each listed name is replaced
# by a lower-case, underscore-suffixed version.
# NOTE(review): presumably these collide with reserved words in the target
# database -- confirm.
MAPPING = {
    'year': 'year_',
    'Year': 'year_',
    'value': 'value_',
    'DATE': 'date_',
    'Date': 'date_',
    'date': 'date_',
    'time': 'time_',
    'count': 'count_',
    "group": "group_",
    'end': 'end_',
    "start": "start_",
    "key": "key_",
    "id": "id_",
    'month': 'month_',
}

In [22]:
from importnb import Notebook

# Import ``truncate_tweets`` inside importnb's ``Notebook`` context.
# NOTE(review): presumably ``truncate_tweets`` is a sibling notebook that the
# context manager makes importable -- confirm.
with Notebook():
    try:
        # Preferred form: package-relative import.
        from . import truncate_tweets
    except:
        # Fallback: plain top-level import when the relative one fails.
        import truncate_tweets

In [18]:
def preprocess_df(df):
    """Return a copy of ``df`` with column names cleaned up for loading.

    Steps, in order:

    1. Column names are converted to ``str``; spaces and hyphens become
       underscores and double quotes are removed.
    2. Names listed in ``MAPPING`` are replaced by their mapped value.
    3. A ``Title`` column, if present, is cast to ``str`` (in the movies
       dataset the integers mess things up).
    4. Columns whose name does not start with a letter are dropped; this
       includes names that ended up empty after cleaning.

    The input frame is left untouched: the work is done on a copy, so the
    caller's column labels are not rewritten as a side effect.

    Parameters
    ----------
    df : pandas.DataFrame

    Returns
    -------
    pandas.DataFrame
    """
    cleaned = [
        str(label).replace(' ', '_').replace('-', '_').replace('"', '')
        for label in df.columns
    ]
    names = [MAPPING.get(name, name) for name in cleaned]

    # Work on a copy so assigning new labels does not mutate the caller's frame.
    df = df.copy()
    df.columns = names

    if 'Title' in names:
        # In movies the integers mess things up.
        df['Title'] = df['Title'].astype(str)

    # Keep only columns whose name starts with a letter. Slicing with ``[:1]``
    # makes an empty name evaluate to False instead of raising IndexError.
    keep = [name[:1].isalpha() for name in names]
    return df.loc[:, keep]
    

In [19]:
def load_data(client, table_name, df):
    """Create ``table_name`` on ``client`` and fill it from ``df``.

    The frame is first passed through ``preprocess_df``. The table is then
    created from the schema inferred by ``infer_pandas_schema``, and the rows
    are loaded with ``method='columnar'`` and ``create=False`` (the table
    already exists at that point).

    Parameters
    ----------
    client
        Object exposing ``create_table`` and ``load_data``.
    table_name : str
        Name of the table to create and populate.
    df : pandas.DataFrame
        Data to load.
    """
    prepared = preprocess_df(df)
    schema = infer_pandas_schema(prepared)
    client.create_table(table_name, schema=schema)
    client.load_data(table_name, prepared, method='columnar', create=False)

In [20]:
def migrate(client, **data):
    """Load each named dataset in ``data`` into ``client``.

    For every ``name=value`` pair:

    * values with ``len(value) == 0`` are skipped outright;
    * otherwise any existing table called ``name`` is dropped (best-effort;
      ordinary errors from ``drop_table`` are ignored);
    * values that are not a ``DataFrame`` or have no columns are skipped;
    * the datasets ``unemployment``, ``weather`` and ``movies`` are
      explicitly ignored;
    * everything else is handed to ``load_data``.

    Note that the drop happens before the skip checks, so a skipped dataset
    still has its table dropped.

    Parameters
    ----------
    client
        Object exposing ``drop_table`` plus whatever ``load_data`` requires.
    **data
        Mapping of table name to the frame to load.

    Returns
    -------
    The same ``client`` that was passed in.
    """
    for key, value in data.items():
        if not len(value):
            continue

        try:
            client.drop_table(key)
        except Exception:
            # Best-effort drop, e.g. when the table is not there. Catching
            # ``Exception`` rather than using a bare ``except`` keeps
            # KeyboardInterrupt / SystemExit working.
            pass

        if not isinstance(value, DataFrame) or not len(value.columns):
            # Skip empty frames.
            continue

        if key in ('unemployment', 'weather', 'movies'):
            # Explicitly ignore these datasets.
            continue

        load_data(client, key, value)
    return client

# Other VegaLite datasets

In [21]:
from altair.vegalite.v2.examples import simple_bar_chart

# Additional datasets to migrate, keyed by the table name they are loaded
# under; currently only the data of Altair's ``simple_bar_chart`` example.
extras = dict(simple_bar_chart=simple_bar_chart.data)

# MapD

In [15]:
# Connection settings used by ``mapd()`` when the caller passes none.
# NOTE(review): host/port look like a local container address and the
# credentials like stock defaults -- confirm before using anywhere shared.
default = {
    'host': '192.168.99.100',
    'user': 'mapd',
    'password': 'HyperInteractive',
    'port': 32769,
    'database': 'mapd',
}

def mapd(**host):
    """Open a connection via ``ibis.mapd.connect``.

    Parameters
    ----------
    **host
        Keyword arguments forwarded to ``ibis.mapd.connect``. When none are
        given, the module-level ``default`` settings are used instead; the
        two are never merged.

    Returns
    -------
    Whatever ``ibis.mapd.connect`` returns.
    """
    settings = host if host else default
    return ibis.mapd.connect(**settings)

def build_mapd(**host):
    """Migrate the extras, a tweet sample and the Vega datasets via ``mapd()``.

    Parameters
    ----------
    **host
        Optional connection keyword arguments passed on to ``mapd()``; with
        none given, ``mapd()`` falls back to its default settings.

    Returns
    -------
    dict
        The ``host`` keyword arguments that were supplied (empty when the
        defaults were used).
    """
    # The connection is bound to ``client`` rather than ``mapd``: reusing the
    # name ``mapd`` as the ``with`` target would make it a local variable and
    # the call to the module-level ``mapd()`` would raise UnboundLocalError.
    with mapd(**host) as client:
        # ``load_data()`` calls ``client.load_data(...)``; point that at the
        # underlying connection's ``load_table``.
        client.load_data = client.con.load_table
        migrate(client, **extras)
        migrate(client, tweets=truncate_tweets.sample(1000))
        migrate(client, **load_vega_datasets())
    return host

# Pandas

In [27]:
def build_pandas():
    """Migrate the extras, the Vega datasets and a tweet sample via ``ibis.pandas``.

    Starts from ``ibis.pandas.connect({})`` and runs ``migrate`` twice: once
    with ``extras`` merged with ``load_vega_datasets()``, then once with the
    tweets.

    Returns
    -------
    The client returned by the final ``migrate`` call.
    """
    client = ibis.pandas.connect({})
    client = migrate(client, **extras, **load_vega_datasets())
    # NOTE(review): ``build_mapd`` passes ``truncate_tweets.sample(1000)``
    # without the trailing ``[-1]`` -- confirm which form is intended.
    tweets = truncate_tweets.sample(1000)[-1]
    return migrate(client, tweets=tweets)