In [75]:
import pandas as pd
import numpy as np
import datetime
import time

In [6]:
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

# knowing the data

In [13]:
events_df = pd.read_csv('events.csv')
events_df

Unnamed: 0,timestamp,visitorid,event,itemid,transactionid
0,1433221332117,257597,view,355908,
1,1433224214164,992329,view,248676,
2,1433221999827,111016,view,318965,
3,1433221955914,483717,view,253185,
4,1433221337106,951259,view,367447,
...,...,...,...,...,...
2756096,1438398785939,591435,view,261427,
2756097,1438399813142,762376,view,115946,
2756098,1438397820527,1251746,view,78144,
2756099,1438398530703,1184451,view,283392,


In [33]:
events_df.keys()

Index(['timestamp', 'visitorid', 'event', 'itemid', 'transactionid'], dtype='object')

In [12]:
events_df['event'].unique()

array(['view', 'addtocart', 'transaction'], dtype=object)

In [34]:
events_df[events_df.transactionid.notnull()].visitorid.unique()    

array([ 599528,  121688,  552148, ..., 1155978, 1050575,  855941])

visitorid of thoes visitors who made transactions 

In [29]:
events_df[events_df.transactionid.notnull()].event.unique()

array(['transaction'], dtype=object)

'transaction' is recorded in event, whenever transactionid(purchase) is created

In [37]:
category_df = pd.read_csv('category_tree.csv')

In [38]:
category_df

Unnamed: 0,categoryid,parentid
0,1016,213.0
1,809,169.0
2,570,9.0
3,1691,885.0
4,536,1691.0
...,...,...
1664,49,1125.0
1665,1112,630.0
1666,1336,745.0
1667,689,207.0


Categoryid explain the relationship of different products with each other, like categoryid 1016 is a child of parentid 213.

In [40]:
item_properties_1_df = pd.read_csv('item_properties_part1.csv')

In [41]:
item_properties_1_df

Unnamed: 0,timestamp,itemid,property,value
0,1435460400000,460429,categoryid,1338
1,1441508400000,206783,888,1116713 960601 n277.200
2,1439089200000,395014,400,n552.000 639502 n720.000 424566
3,1431226800000,59481,790,n15360.000
4,1431831600000,156781,917,828513
...,...,...,...,...
10999994,1439694000000,86599,categoryid,618
10999995,1435460400000,153032,1066,n1020.000 424566
10999996,1440298800000,421788,888,35975 856003 37346
10999997,1437879600000,159792,400,n552.000 639502 n720.000 424566


timestamp is still the same Unix format.

itemid is the unique item identifier.

Property is the Item's attributes such as category id and availability while the rest are hashed for confidentiality purposes.

Value is the item's property value like availability(one of the properties) is 1 if there is stock and 0 otherwise.

In [43]:
item_properties_1_df.loc[(item_properties_1_df.property == 'categoryid') & (item_properties_1_df.value == '618')].sort_values('timestamp').head()

Unnamed: 0,timestamp,itemid,property,value
8582420,1431226800000,123973,categoryid,618
4818247,1431226800000,413487,categoryid,618
2842000,1431226800000,268139,categoryid,618
9943275,1431226800000,146905,categoryid,618
5769012,1431226800000,451095,categoryid,618


above is the number of items under category id 618.

# Customer Behaviour Exploration
Its good to categorise coustomers in (a) how made transactions, (b) thoes who do not made transaxtion.

In [44]:
# all customers who made transaction.
customer_purchased = events_df[events_df.transactionid.notnull()].visitorid.unique()

In [59]:
len(customer_purchased)

11719

these many visitors made purchases.

and their unique 'visitorid' is stored in customer_purchased

In [48]:
all_customers = events_df.visitorid.unique()

In [49]:
len(all_customers)
#all the visitors of the item.

1407580

In [51]:
customer_browsed = [x for x in all_customers if x not in customer_purchased]  # this code takes time

In [62]:
type(customer_browsed),len(customer_browsed)

(list, 1395861)

In [69]:
customer_browsed = np.isin(all_customers,customer_purchased)  

# another way of doing this.
#customer_browsed = np.array(list(set(all_customers)- set(customer_purchased)))  


# both these codes save time, 

In [70]:
len(customer_browsed)

1407580

these many customers have visited the item, but did not purchaced it.

REMEMBER: CUSTOMER_PURCHASED, CUSTOMER_BROWSED, ALL_CUSTOMERS.  ALL CONTAIN UNIQUE 'visitorid', in numpy array

# Below is a snapshot of visitor id 599528 and their buying journey from viewing to transaction (purchase)

In [71]:
events_df[events_df.visitorid == 599528].sort_values('timestamp')

Unnamed: 0,timestamp,visitorid,event,itemid,transactionid
16729,1433221896102,599528,view,356475,
112,1433221941632,599528,addtocart,356475,
130,1433222276276,599528,transaction,356475,4000.0
29505,1433297545235,599528,view,356475,
114163,1433625010602,599528,view,356475,
145498,1433731497480,599528,view,356475,
187497,1433896449376,599528,view,356475,
171995,1433896477794,599528,view,356475,
197272,1433960259198,599528,view,356475,
192471,1433960567121,599528,view,356475,



Now, that we know about customer_purchased lets find out which items they purchased.


In [109]:
purchased_items =[]
for customer in customer_purchased:
    purchased_items.append(list(events_df.loc[(events_df.visitorid == customer) & (events_df.transactionid.notnull())].itemid.unique()))

purchased_items is a list which contain itemid of purchased items.

In [110]:
len(purchased_items)

11719

In [111]:
purchased_items

[[356475],
 [15335,
  380775,
  237753,
  317178,
  12836,
  400969,
  105792,
  25353,
  200793,
  80582,
  302422],
 [81345],
 [150318, 49521],
 [310791, 299044],
 [54058,
  284871,
  251130,
  268335,
  183049,
  261940,
  369093,
  370745,
  192990,
  277119,
  241716,
  283766,
  16417,
  217068,
  36372,
  68923,
  428015,
  69533,
  13520,
  385638,
  442871,
  136526,
  247862,
  93828,
  230911,
  382595,
  34853,
  216260,
  154812,
  445241,
  57702,
  347850,
  151855,
  226327,
  288525,
  51354,
  345994,
  170438,
  254301,
  266439,
  193718,
  388558,
  26745,
  184086,
  79956,
  252040,
  82232,
  309821,
  394518,
  462070,
  331980,
  353111,
  200527,
  235933,
  68532,
  358882,
  60012,
  29741,
  270487,
  163689,
  6913,
  156457,
  341578,
  163352,
  234493,
  135174,
  452481,
  241755,
  56323,
  210137,
  184397,
  285202,
  198690,
  195958,
  239210,
  71640,
  189108,
  369112,
  346186,
  211207,
  134330,
  257070,
  302239,
  459480,
  57577,
  1952

above data is given in the form of list of lists.

The inner lists are collection of itemid purchased by single costumer(possibly on different dates).

This inner list of items can be used to suggest visitors("thoes who buy this also buy following"). Provided the visitor buy any one item from this inner list.

In [112]:
# As purchase is made 'itemid' is pass through this function along with purchased_items(which is calculated above)
def recommender_bought_bought(item_id, purchased_items):
    recommender_list = []
    for x in purchased_items:        # x(inner_list) is a purchased item
        if item_id in x:             # purchased item is in x(inner_list)
            recommender_list += x
    recommender_list = list(set(recommender_list) - set([item_id]))
    
    return recommender_list

In [113]:
# Check: recommender_bought_bought()
recommender_bought_bought(302422, purchased_items)

[105792, 200793, 12836, 80582, 380775, 15335, 400969, 25353, 237753, 317178]

So now we can present to the visitor a list of the other items a customer previously bought along with what item the current visitor is viewing e.g. item number 302422