In [3]:
import pandas as pd
import numpy as np

import datetime 
import time

%matplotlib inline
import matplotlib.pyplot as plt

In [4]:
events_df = pd.read_csv('../data/events.csv')
category_tree_df = pd.read_csv('../data/category_tree.csv')
item_properties_1_df = pd.read_csv('../data/item_properties_part1.csv')
item_properties_2_df = pd.read_csv('../data/item_properties_part2.csv')

In [5]:
events_df.head()

Unnamed: 0,timestamp,visitorid,event,itemid,transactionid
0,1433221332117,257597,view,355908,
1,1433224214164,992329,view,248676,
2,1433221999827,111016,view,318965,
3,1433221955914,483717,view,253185,
4,1433221337106,951259,view,367447,


In [6]:
events_df.shape[0], events_df.itemid.nunique()

(2756101, 235061)

Our first problem is that we have a large amount of unique visitors and unique items. This means that the search space the agent needs to traverse becomes massive. Basically `items x visitors`.

One tactic for dealing with this, is to used unsupervised clustering to cluster together items and users in to a manageable number of groups and categories. 

We need two clustering mechanisms: One that deals with events and another that deals with items.

In [32]:
user_df = retail_df.drop(['itemid'], axis=1)

In [33]:
user_df.head(n=3)

Unnamed: 0,timestamp,visitorid,event
0,1433221332117,257597,view
1,1433224214164,992329,view
2,1433221999827,111016,view


In [None]:
def days_hours_minutes(td):
    return td.days, td.seconds//3600, (td.seconds//60)%60

In [53]:
def gen_user_and_item_dataframes(events_df):
    
    retail_df = events_df[['timestamp',
                      'visitorid',
                      'event',
                      'itemid']]
    
    user_df = retail_df.drop(['itemid'], axis=1)
    item_df = retail_df.drop(['visitorid'], axis=1)
    
    dfs = {'user_df': user_df, 
           'item_df': item_df}

    for df in dfs.keys():

        if df == 'user_df':
            _id = 'visitorid'
            
        if df == 'item_df':
            _id = 'itemid'
        
        dfs[df] =  pd.get_dummies(dfs[df], columns=['event'])
        dfs[df]['no_visits'] = dfs[df].groupby(_id).cumcount() + 1
        dfs[df]['no_views'] = dfs[df].groupby([_id, 'event_view'])['event_view'].cumsum()
        dfs[df]['no_addtocart'] = dfs[df].groupby([_id, 'event_addtocart'])['event_addtocart'].cumsum()
        dfs[df]['no_transactions'] = dfs[df].groupby([_id, 'event_transaction'])['event_transaction'].cumsum()
        
    frequent_visitors = dfs['user_df'].groupby(['visitorid']).no_visits.count()[lambda x: x > 5]
    freq_user_df = dfs['user_df'].loc[dfs['user_df']['visitorid'].isin(frequent_visitors.index)]
    
    frequent_items = dfs['item_df'].groupby(['itemid']).no_visits.count()[lambda x: x > 5]
    freq_item_df = dfs['item_df'].loc[dfs['item_df']['itemid'].isin(frequent_visitors.index)]
    
    d0 = datetime.datetime(2015, 1, 1, 0, 0, 0, 0)
    
    freq_user_df['days'] = [days_hours_minutes(d0 - datetime.datetime.fromtimestamp(x / 1e3))[0] for x in list(freq_user_df.timestamp)]
    freq_user_df['hours'] = [days_hours_minutes(d0 - datetime.datetime.fromtimestamp(x / 1e3))[1] for x in list(freq_user_df.timestamp)]
    freq_user_df['mins'] = [days_hours_minutes(d0 - datetime.datetime.fromtimestamp(x / 1e3))[2] for x in list(freq_user_df.timestamp)]
    freq_user_df = freq_user_df.drop(['timestamp'], axis=1)
    
    freq_item_df['days'] = [days_hours_minutes(d0 - datetime.datetime.fromtimestamp(x / 1e3))[0] for x in list(freq_item_df.timestamp)]
    freq_item_df['hours'] = [days_hours_minutes(d0 - datetime.datetime.fromtimestamp(x / 1e3))[1] for x in list(freq_item_df.timestamp)]
    freq_item_df['mins'] = [days_hours_minutes(d0 - datetime.datetime.fromtimestamp(x / 1e3))[2] for x in list(freq_item_df.timestamp)]
    freq_item_df = freq_item_df.drop(['timestamp'], axis=1)

        
    return dfs, freq_user_df, freq_item_df

In [54]:
dfs, freq_user_df, freq_item_df = gen_user_and_item_dataframes(events_df)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_gui

In [62]:
if freq_user_df.itemid.any():
    print("Yes")

AttributeError: 'DataFrame' object has no attribute 'itemid'

In [57]:
len(freq_user_df), freq_user_df.visitorid.nunique()

(833702, 58653)

In [59]:
len(freq_item_df), freq_item_df.itemid.nunique()

(120148, 9834)