In [None]:
from dask.distributed import Client, progress
# Worker layout handed to the dask distributed client.
N_WORKERS = 3
THREADS_PER_WORKER = 4
client = Client(n_workers=N_WORKERS, threads_per_worker=THREADS_PER_WORKER)

In [None]:
import pandas as pd
from dask import delayed, compute, visualize
import dask.bag as db
import dask.dataframe as dd

In [None]:
# Registry of the input tables: each entry pairs the short name used for the
# table in this notebook with the CSV file it is read from.
datasets = [
    {'name': table_name, 'path': csv_path}
    for table_name, csv_path in (
        ('df_sl', '../data/input_data/sales_train.csv'),
        ('df_it', '../data/input_data/items.csv'),
        ('df_ic', '../data/input_data/item_categories.csv'),
    )
]

In [None]:
@delayed
def import_data(dataset):
    """
    Reads one csv file as a dask dataframe, keyed by its dataset name.
    Delayed.

    Parameters
    ----------
    dataset : dict
        Must provide 'name' (key used in the result) and 'path'
        (location handed to `dd.read_csv`).

    Returns
    -------
    dict
        A single-entry dict mapping the dataset name to the dask dataframe.
    """
    # NOTE(review): dask's best-practice guide advises against creating dask
    # collections inside delayed functions. Kept as-is because `merge_data`
    # calls `dd.to_datetime` on the frames it receives from here.
    label, csv_path = dataset['name'], dataset['path']
    return {label: dd.read_csv(csv_path)}

@delayed
def merge_data(df_list):
    """
    Merges the sales, items and item-category tables into one dataframe.
    Delayed.

    Parameters
    ----------
    df_list : list of dict
        The outputs of `import_data`. Between them the dicts must provide
        the keys 'df_sl', 'df_it' and 'df_ic'; their order does not matter.

    Returns
    -------
    The sales rows left-joined with the item columns (on 'item_id') and the
    item-category columns (on 'item_category_id'), with 'date' converted to
    datetimes.

    Raises
    ------
    KeyError
        If one of the three expected keys is missing from `df_list`.
    """
    # Flatten the single-entry dicts so each table is looked up by name
    # instead of by its position in `df_list`.
    frames = {}
    for entry in df_list:
        frames.update(entry)

    df_sl = frames['df_sl']
    df_it = frames['df_it']
    df_ic = frames['df_ic']

    # Items enriched with their category columns.
    df_items = df_it.merge(
        right=df_ic,
        on='item_category_id',
        how='left'
    )

    # Every sales row enriched with item + category columns.
    df = df_sl.merge(
        right=df_items,
        on='item_id',
        how='left'
    )

    # The notebook's data description gives the dates as day-first
    # (dd/mm/yyyy). Without dayfirst=True an ambiguous value such as
    # 02/01/2013 is read month-first.
    df['date'] = dd.to_datetime(df['date'], dayfirst=True)

    return df

In [None]:
# One lazy import task per registered dataset, then a single lazy merge task
# stacked on top of them.
df_list = [import_data(dataset) for dataset in datasets]
df_out = merge_data(df_list)

# Draw the task graph of the pipeline built above.
visualize(df_out)

In [None]:
%%time
df = compute(df_out)[0].compute()

## Data Descriptions

* ID - an Id that represents a (Shop, Item) tuple within the test set
* shop_id - unique identifier of a shop
* item_id - unique identifier of a product
* item_category_id - unique identifier of item category
* item_cnt_day - number of products sold. You are predicting a monthly amount of this measure
* item_price - current price of an item
* date - date in day-first format dd/mm/yyyy (parse with `dayfirst=True`)
* date_block_num - a consecutive month number, used for convenience. January 2013 is 0, February 2013 is 1, ..., October 2015 is 33
* item_name - name of item
* shop_name - name of shop
* item_category_name - name of item category

#### Datasets

In [None]:
# `df_sl` is never assigned at notebook level (it only exists as a local
# inside `merge_data`), so on a fresh kernel this cell raised NameError.
# Load the sales table from the path registered in `datasets`.
df_sl = pd.read_csv(next(d['path'] for d in datasets if d['name'] == 'df_sl'))
df_sl.head()

In [None]:
# `df_it` is never assigned at notebook level (it only exists as a local
# inside `merge_data`), so on a fresh kernel this cell raised NameError.
# Load the items table from the path registered in `datasets`.
df_it = pd.read_csv(next(d['path'] for d in datasets if d['name'] == 'df_it'))
df_it.head()

In [None]:
# `df_ic` is never assigned at notebook level (it only exists as a local
# inside `merge_data`), so on a fresh kernel this cell raised NameError.
# Load the item-category table from the path registered in `datasets`.
df_ic = pd.read_csv(next(d['path'] for d in datasets if d['name'] == 'df_ic'))
df_ic.head()

#### Merge the Data

In [None]:
# Left-join the category columns onto the items, then the enriched items onto
# every sales row, in a single expression so `df` is assigned exactly once.
df = (
    df_sl
    .merge(
        df_it.merge(df_ic, on='item_category_id', how='left'),
        on='item_id',
        how='left'
    )
    # Dates are day-first (dd/mm/yyyy, see the data description above).
    # Without dayfirst=True an ambiguous value such as 02/01/2013 is read
    # month-first.
    .assign(date=lambda merged: pd.to_datetime(merged['date'], dayfirst=True))
)

In [None]:
# Order the rows by item, then category, then date, and move the date into
# the index.
# NOTE(review): this overwrites `df` with its own transform; re-running the
# cell on its output raises KeyError in `set_index` because 'date' is no
# longer a column.
df = df.sort_values(['item_id', 'item_category_id', 'date']).set_index(keys='date')

In [None]:
# Preview the first five rows of the sorted, date-indexed frame.
df.head(n=5)