### Aim

Create a user session from user history

**Assumptions** - Let one session be defined as one day of user events

In [1]:
import pandas as pd

In [2]:
# Relative path to the dataset directory; the trailing slash lets the file
# names below be appended to it directly.
datapath = '../../datasets/user-items-recsys/'

# CSV files that make up the dataset.
events_file, category_tree = 'events.csv', 'category_tree.csv'
item_props_1, item_props_2 = 'item_properties_part1.csv', 'item_properties_part2.csv'

In [3]:
# Read the raw event log into a DataFrame, then show its (rows, columns) size.
df_events = pd.read_csv(f"{datapath}{events_file}")
df_events.shape

(2756101, 5)

In [4]:
# Peek at the first rows to check the column layout of the event log.
df_events.head()

Unnamed: 0,timestamp,visitorid,event,itemid,transactionid
0,1433221332117,257597,view,355908,
1,1433224214164,992329,view,248676,
2,1433221999827,111016,view,318965,
3,1433221955914,483717,view,253185,
4,1433221337106,951259,view,367447,


In [5]:
## Convert timestamp to date
# The timestamps are epoch milliseconds. Parse them once, then take the
# calendar date from the parsed column instead of converting the raw column a
# second time (the date is presumably the per-day session key described in the
# assumptions at the top of the notebook).
df_events['datetime'] = pd.to_datetime(df_events['timestamp'], unit='ms')
df_events['date'] = df_events['datetime'].dt.date

### User Story

Let us try to create a user story based on history

In [35]:
# Visitor whose event history is examined in the cells that follow.
user_id = 992329

In [36]:
# Keep only the rows of the event log that belong to the chosen visitor.
df_user = df_events.loc[df_events['visitorid'] == user_id]

In [37]:
# Report how many events this visitor has, then preview the first few rows.
print(df_user.shape)
df_user.head()

(30, 7)


Unnamed: 0,timestamp,visitorid,event,itemid,transactionid,datetime,date
1,1433224214164,992329,view,248676,,2015-06-02 05:50:14.164,2015-06-02
20559,1433224672007,992329,view,193150,,2015-06-02 05:57:52.007,2015-06-02
44215,1433225555976,992329,view,246453,,2015-06-02 06:12:35.976,2015-06-02
50030,1433395158782,992329,view,8775,,2015-06-04 05:19:18.782,2015-06-04
64989,1433395205712,992329,view,8775,,2015-06-04 05:20:05.712,2015-06-04


In [9]:
# Distinct item ids taken from the visitor's first five events shown above
# (item 8775 appears twice there, hence four ids).
item1, item2, item3, item4 = 248676, 193150, 246453, 8775

The first five events shown above (out of this user's 30) are all view events, spread over a span of 3 days. Let us explore the item IDs and see whether the user was looking at similar items.

In [10]:
# Load item properties (first part only; item_props_2 is not read in this cell)
df_items1 = pd.read_csv(f"{datapath}{item_props_1}")

In [11]:
## Convert timestamp to date
# Parse the epoch-millisecond timestamps once, then take the calendar date
# from the parsed column rather than running the conversion a second time.
df_items1['datetime'] = pd.to_datetime(df_items1['timestamp'], unit='ms')
df_items1['date'] = df_items1['datetime'].dt.date
df_items1.head()

Unnamed: 0,timestamp,itemid,property,value,datetime,date
0,1435460400000,460429,categoryid,1338,2015-06-28 03:00:00,2015-06-28
1,1441508400000,206783,888,1116713 960601 n277.200,2015-09-06 03:00:00,2015-09-06
2,1439089200000,395014,400,n552.000 639502 n720.000 424566,2015-08-09 03:00:00,2015-08-09
3,1431226800000,59481,790,n15360.000,2015-05-10 03:00:00,2015-05-10
4,1431831600000,156781,917,828513,2015-05-17 03:00:00,2015-05-17


In [12]:
# Slice the property table once per item of interest; the generator keeps the
# four frames in the same order as the item ids.
df_item1, df_item2, df_item3, df_item4 = (
    df_items1.loc[df_items1['itemid'] == item_id]
    for item_id in (item1, item2, item3, item4)
)
# Row counts show how many property records exist for each item.
print(
    "Shapes of item props:",
    df_item1.shape,
    df_item2.shape,
    df_item3.shape,
    df_item4.shape,
)

Shapes of item props: (30, 6) (17, 6) (38, 6) (46, 6)


In [13]:
# For each item, pull the 'value' of its categoryid property rows. The result
# is a Series, which is empty when the item has no categoryid record.
cat1, cat2, cat3, cat4 = (
    item_props.loc[item_props['property'] == 'categoryid', 'value']
    for item_props in (df_item1, df_item2, df_item3, df_item4)
)

In [14]:
# Print the categoryid Series found for each item; an empty Series means no
# categoryid row was present for that item in the loaded properties.
print("Category1:",cat1)
print("Category2:",cat2)
print("Category3:",cat3)
print("Category4:",cat4)

Category1: Series([], Name: value, dtype: object)
Category2: 3232518    1662
Name: value, dtype: object
Category3: 6618483    1173
Name: value, dtype: object
Category4: 4204611    1258
Name: value, dtype: object


*We do not have a categoryid for item1 in the item properties loaded so far (only `item_properties_part1.csv` has been read). Let us explore the other 3 items*

#### First step is to load category hierarchy data into a dataframe

In [15]:
# Load category tree (one row per category with the id of its parent)
df_category_tree = pd.read_csv(f"{datapath}{category_tree}")
df_category_tree.head()

Unnamed: 0,categoryid,parentid
0,1016,213.0
1,809,169.0
2,570,9.0
3,1691,885.0
4,536,1691.0


#### Create Tree Data structure from category tree

In [23]:
# Local module that provides the CategoryTree class used below.
import CategoryTree
import importlib
# Re-import the module so that edits made to CategoryTree.py are picked up
# without restarting the kernel.
importlib.reload(CategoryTree)

<module 'CategoryTree' from 'C:\\Users\\Samarth\\Documents\\ml_projects\\rec-items\\RetailRocket\\CategoryTree.py'>

In [24]:
from CategoryTree import CategoryTree

In [25]:
# Hand the category/parent table to the project's CategoryTree helper.
categoryTree = CategoryTree(df_category_tree)

In [26]:
# Build the tree structures from the table supplied at construction.
# NOTE(review): presumably this populates categoryTree.trees, which the
# following cells read — confirm in CategoryTree.py.
categoryTree.build_trees()

In [27]:
# The size of categoryTree.trees is reported as the number of independent trees.
print("Number of independent trees:",len(categoryTree.trees))

Number of independent trees: 25


#### Let us now search for the root nodes of categories 1662, 1173 and 1258

In [28]:
# Resolve the root category for items 2-4, using the first categoryid value of
# each. Item 1 is left out because its categoryid Series is empty, so
# .iloc[0] would fail on it.
rootCat2, rootCat3, rootCat4 = (
    categoryTree.get_root_category(category.iloc[0])
    for category in (cat2, cat3, cat4)
)

In [29]:
# Report each item's category id next to the root category found for it.
print("Root Category for {}: {}".format(cat2.iloc[0],rootCat2))
print("Root Category for {}: {}".format(cat3.iloc[0],rootCat3))
print("Root Category for {}: {}".format(cat4.iloc[0],rootCat4))

Root Category for 1662: 679
Root Category for 1173: 140
Root Category for 1258: 140


*We see that Item 3 and Item 4 belong to the same Root Category*

In [31]:
# Print the tree whose root is 679, the root reported above for category 1662.
# NOTE(review): the key 679 is hard-coded rather than taken from rootCat2;
# confirm the key type used by categoryTree.trees before replacing it.
# NOTE(review): the second argument (0) is presumably the starting indent
# depth — confirm in CategoryTree.print_tree.
print("Printing Tree with Root Node 679")
categoryTree.print_tree(categoryTree.trees[679],0)

Printing Tree with Root Node 679
Root: 679
	Root: 1424
		Root: 365
		Root: 421
		Root: 1143
		Root: 553
		Root: 1008
			Root: 992
			Root: 202
		Root: 245
			Root: 1520
		Root: 230
		Root: 1105
		Root: 1215
			Root: 1587
			Root: 91
			Root: 1199
			Root: 1461
			Root: 1204
			Root: 1359
			Root: 417
		Root: 281
	Root: 1139
		Root: 310
		Root: 258
		Root: 550
		Root: 449
	Root: 630
		Root: 1217
		Root: 1662
		Root: 752
		Root: 233
		Root: 103
		Root: 1112
	Root: 901
		Root: 1231
		Root: 368
		Root: 833
	Root: 313
		Root: 1436
		Root: 560
		Root: 559
	Root: 1544
	Root: 491
	Root: 869
