In [1]:
import pandas as pd
import numpy as np
import random
from datetime import datetime, date

In [3]:
# Load the purchase events. The file has no header row (header=None), so the
# columns get integer labels here and are renamed in the following cell.
buys = pd.read_csv('../yoochoose-data/yoochoose-buys.dat', header=None)

In [4]:
# Give the five positional columns of the purchase file readable names.
buys.columns = ['session', 'timestamp', 'item', 'price', 'quantity']

In [6]:
# Preview the first ten purchase rows to sanity-check the column naming.
buys.head(10)

Unnamed: 0,session,timestamp,item,price,quantity
0,420374,2014-04-06T18:44:58.314Z,214537888,12462,1
1,420374,2014-04-06T18:44:58.325Z,214537850,10471,1
2,281626,2014-04-06T09:40:13.032Z,214535653,1883,1
3,420368,2014-04-04T06:13:28.848Z,214530572,6073,1
4,420368,2014-04-04T06:13:28.858Z,214835025,2617,1
5,140806,2014-04-07T09:22:28.132Z,214668193,523,1
6,140806,2014-04-07T09:22:28.176Z,214587399,1046,1
7,140806,2014-04-07T09:22:28.219Z,214586690,837,1
8,140806,2014-04-07T09:22:28.268Z,214774667,1151,1
9,140806,2014-04-07T09:22:28.280Z,214578823,1046,1


In [8]:
# Click events, read with explicit dtypes so ids stay integers and the
# timestamp/category columns stay as text.
# NOTE(review): presumably this file holds only clicks from sessions that
# ended in a purchase -- confirm against whatever produced bought.csv.
click_dtypes = {
    'session': int,
    'timestamp': str,
    'item': int,
    'category': str,
}
click_bought = pd.read_csv('../bought.csv', dtype=click_dtypes)

In [10]:
# Preview the first ten click rows.
click_bought.head(10)

Unnamed: 0,session,timestamp,item,category
0,11,2014-04-03T10:44:35.672Z,214821275,0
1,11,2014-04-03T10:45:01.674Z,214821275,0
2,11,2014-04-03T10:45:29.873Z,214821371,0
3,11,2014-04-03T10:46:12.162Z,214821371,0
4,11,2014-04-03T10:46:57.355Z,214821371,0
5,11,2014-04-03T10:53:22.572Z,214717089,0
6,11,2014-04-03T10:53:49.875Z,214563337,0
7,11,2014-04-03T10:55:19.267Z,214706462,0
8,11,2014-04-03T10:55:47.327Z,214717436,0
9,11,2014-04-03T10:56:30.520Z,214743335,0


In [5]:
# strptime pattern for a clock time without date or fractional seconds;
# time_diff_secs parses both of its arguments with it.
time_format = '%H:%M:%S'

In [6]:
def read_time_stamp(time):
    """Split a timestamp string into its date and clock components.

    Parameters
    ----------
    time : str
        Timestamp shaped like ``'2014-04-06T18:44:58.314Z'``. The trailing
        ``'Z'`` and the fractional seconds are both optional.

    Returns
    -------
    dict
        All values are strings: ``'dd'``, ``'mm'``, ``'yy'`` (the full year
        text, e.g. ``'2014'``), ``'h'``, ``'m'``, ``'s'`` (whole seconds),
        ``'date'`` (``'YYYY-MM-DD'``) and ``'time'`` (``'HH:MM:SS'``).

    Raises
    ------
    ValueError
        If the string does not split into one date and one clock part, or
        either part does not have exactly three fields.
    """
    # Local names avoid shadowing the `date` class imported from datetime.
    date_part, clock_part = time.split('T')
    yy, mm, dd = date_part.split('-')

    # Remove the zone designator only when it is present; dropping the last
    # character unconditionally would eat a digit of a timestamp without 'Z'.
    if clock_part.endswith('Z'):
        clock_part = clock_part[:-1]

    # Discard fractional seconds, leaving plain HH:MM:SS.
    clock_part = clock_part.split('.')[0]
    h, m, s = clock_part.split(':')

    return {'dd': dd,
            'mm': mm,
            'yy': yy,
            'h': h,
            'm': m,
            's': s,
            'date': date_part,
            'time': clock_part
           }

In [7]:
def time_diff_secs(start, end):
    """Return the seconds from clock time `start` to clock time `end`.

    Both arguments are strings parsed with the module-level ``time_format``.
    The result is the ``seconds`` attribute of the resulting timedelta, which
    is always in ``[0, 86400)``: when `end` is earlier on the clock than
    `start`, the value wraps around midnight instead of going negative.
    """
    began = datetime.strptime(start, time_format)
    finished = datetime.strptime(end, time_format)
    elapsed = finished - began
    return elapsed.seconds

In [8]:
# Column views used by the counting cells below.
items = buys['item']                 # one entry per purchase row (items repeat)
click_items = click_bought['item']   # one entry per click row

# Row-wise arrays for the loops that need two columns at once.
q_vals = buys[['item', 'quantity']].values
s_items = buys[['session', 'item']].values

# Every session id that has at least one purchase, in ascending order.
b_sessions = sorted(buys['session'].unique())

In [9]:
# Per-item and per-session lookups, filled in by the cells that follow.
clicks = dict()        # item -> number of click rows for that item
quantity = dict()      # item -> summed purchase quantity
buy_count = dict()     # item -> number of purchase rows
popularity = dict()    # item -> buy_count / clicks, kept to two decimals
session_item = dict()  # session -> list of items bought in that session

In [10]:
# Give every purchasing session its own empty list of bought items.
session_item.update((session_id, []) for session_id in b_sessions)

In [11]:
# Record each purchased item under the session it was bought in.
for session_id, item_id in s_items:
    session_item[session_id].append(item_id)

In [12]:
# Start all three per-item counters at zero for every purchased item.
zero_per_item = dict.fromkeys(items, 0)
clicks.update(zero_per_item)
quantity.update(zero_per_item)
buy_count.update(zero_per_item)

In [13]:
# Tally clicks per item. `clicks` only has keys for items that were purchased
# at least once, so clicks on never-purchased items are deliberately skipped.
# An explicit membership test replaces the former bare `except: pass`, which
# would also have hidden unrelated errors (and KeyboardInterrupt).
for item in click_items:
    if item in clicks:
        clicks[item] += 1

In [14]:
# Accumulate, per item, the total quantity bought and the number of purchase rows.
for item_id, bought_qty in q_vals:
    quantity[item_id] += bought_qty
    buy_count[item_id] += 1

In [15]:
# Popularity = purchase rows / click rows for the item, kept to two decimals.
# Iterating the counter dict visits each item once instead of once per
# purchase row; the resulting mapping is the same.
for item, n_buys in buy_count.items():
    n_item_clicks = clicks[item]
    if n_item_clicks:
        popularity[item] = float("{0:.2f}".format(n_buys / n_item_clicks))
    else:
        # No recorded clicks for this item: define popularity as zero rather
        # than relying on a swallowed ZeroDivisionError.
        popularity[item] = 0.00

In [16]:
# Row buffer for the per-item feature table; filled by the next cell.
item_data = []

In [17]:
# Append [item, clicks, quantity sold, buys, popularity] for every entry of
# `items`.
# NOTE(review): `items` is the raw purchase column, so an item bought k times
# contributes k identical rows -- confirm whether one row per item was intended.
item_data.extend(
    [item_id, clicks[item_id], quantity[item_id], buy_count[item_id], popularity[item_id]]
    for item_id in items
)

In [18]:
# Materialise the collected rows as a DataFrame (columns are named in the next cell).
item_df = pd.DataFrame(item_data)

In [19]:
# Label the columns in the same order the row values were appended.
item_df.columns = ['item', 'total_clicks', 'quantity_sold', 'buys', 'popularity']

### Session features for click sessions in which items were bought

In [20]:
# Build one feature row per (session, clicked item) for every session that
# contains a purchase. `purchased` marks whether that item was bought in the
# session.
#
# Reads: buys, click_bought, b_sessions, popularity, read_time_stamp,
# time_diff_secs. Produces: session_data (list of rows) and session_df.
#
# Rows of click_bought are consumed in file order within each session.
# NOTE(review): the gap computations assume that order is chronological -- confirm.

time_bias = 2     # seconds credited to a session's first click, which has no preceding gap
min_duration = 2  # stand-in when an item's first and last click fall on the same second

session_columns = ['session', 'item', 'total_time', 'avg.time_clicks', 'max_time',
                   'n_clicks', 'avg_pop_score', 'no_of_categories',
                   'item_time', 'dow_first', 'dow_last', 'item_clicks', 'duration_f_l',
                   'f_click', 'l_click', 'item_pop',
                   'purchased']

# Group once up front. Filtering both full frames with a boolean mask for
# every session costs (number of sessions) x (number of rows).
click_rows_by_session = click_bought.groupby('session').indices  # session -> row positions
all_click_items = click_bought['item'].values
all_click_stamps = click_bought['timestamp'].values
all_click_categories = click_bought['category'].values

bought_items_by_session = {}
for bought_session, bought_item in zip(buys['session'].tolist(), buys['item'].tolist()):
    bought_items_by_session.setdefault(bought_session, set()).add(bought_item)

session_data = []
for bb in b_sessions:
    rows = click_rows_by_session.get(bb)
    if rows is None:
        # A purchasing session without any click rows has nothing to describe
        # (and would otherwise fail on the first-click lookup).
        continue

    session_items = all_click_items[rows].tolist()
    session_stamps = all_click_stamps[rows].tolist()
    its = set(session_items)                          # items clicked in this session
    b_its = bought_items_by_session.get(bb, set())    # items bought in this session

    # ========== overall session feature extraction ==========
    n_clicks = len(session_items)

    # Time per item: each gap between consecutive clicks is credited to the
    # item of the click that ends the gap; the first click gets time_bias.
    it_time = dict.fromkeys(its, 0)
    it_time[session_items[0]] = time_bias

    first_seen = {}        # item -> parsed timestamp of its first click
    last_seen = {}         # item -> parsed timestamp of its last click
    item_click_count = {}  # item -> number of clicks in this session
    td = 0                 # total seconds between consecutive clicks
    max_time = 0           # longest single gap in seconds
    prev_clock = None
    for clicked_item, stamp in zip(session_items, session_stamps):
        parsed = read_time_stamp(stamp)
        if prev_clock is not None:
            tdd = time_diff_secs(prev_clock, parsed['time'])
            it_time[clicked_item] += tdd
            td += tdd
            if tdd > max_time:
                max_time = tdd
        prev_clock = parsed['time']

        first_seen.setdefault(clicked_item, parsed)
        last_seen[clicked_item] = parsed
        item_click_count[clicked_item] = item_click_count.get(clicked_item, 0) + 1

    # Total gap time divided by the number of clicks, kept to two decimals.
    avg = float("{0:.2f}".format(td / n_clicks))

    # Number of distinct categories seen in the session.
    noc = len(pd.unique(all_click_categories[rows]))

    # Mean popularity over the distinct clicked items; items without a
    # popularity entry (never purchased) count as 0.
    sum_pop = 0
    for it in its:
        sum_pop += popularity.get(it, 0)
    avg_pop_score = sum_pop / len(its)

    first_item = session_items[0]
    last_item = session_items[-1]

    # ========== session-item feature extraction ==========
    for it in its:
        first_click = first_seen[it]
        last_click = last_seen[it]

        # Day of week (Monday == 0) of the item's first and last click.
        f_dw = date(int(first_click['yy']), int(first_click['mm']),
                    int(first_click['dd'])).weekday()
        l_dw = date(int(last_click['yy']), int(last_click['mm']),
                    int(last_click['dd'])).weekday()

        n_clicks_item = item_click_count[it]

        # Seconds between the item's first and last click, floored at min_duration.
        dur_f_l = time_diff_secs(first_click['time'], last_click['time'])
        if dur_f_l == 0:
            dur_f_l = min_duration

        it_pop = popularity.get(it, 0)
        f_click = 1 if it == first_item else 0   # item opened the session
        l_click = 1 if it == last_item else 0    # item closed the session
        purchased = 1 if it in b_its else 0

        session_data.append([bb, it, td, avg, max_time, n_clicks, avg_pop_score, noc,
                             it_time[it], f_dw, l_dw, n_clicks_item, dur_f_l,
                             f_click, l_click, it_pop,
                             purchased])

# Passing the names to the constructor also works when no rows were produced.
session_df = pd.DataFrame(session_data, columns=session_columns)

In [21]:
# Inspect the first twenty (session, item) feature rows.
session_df.head(20)

Unnamed: 0,session,item,total_time,avg.time_clicks,max_time,n_clicks,avg_pop_score,no_of_categories,item_time,dow_first,dow_last,item_clicks,duration_f_l,f_click,l_click,item_pop,purchased
0,11,214717089,784,65.33,385,12,0.157778,1,385,3,3,1,2,0,0,0.27,0
1,11,214743335,784,65.33,385,12,0.157778,1,43,3,3,1,2,0,0,0.07,0
2,11,214563337,784,65.33,385,12,0.157778,1,27,3,3,1,2,0,0,0.02,0
3,11,214819762,784,65.33,385,12,0.157778,1,20,3,3,1,2,0,1,0.13,0
4,11,214826837,784,65.33,385,12,0.157778,1,49,3,3,1,2,0,0,0.26,0
5,11,214821371,784,65.33,385,12,0.157778,1,116,3,3,3,88,0,0,0.24,1
6,11,214821275,784,65.33,385,12,0.157778,1,28,3,3,2,26,1,0,0.17,0
7,11,214717436,784,65.33,385,12,0.157778,1,28,3,3,1,2,0,0,0.19,0
8,11,214706462,784,65.33,385,12,0.157778,1,90,3,3,1,2,0,0,0.07,0
9,12,214717867,179,89.5,179,2,0.22,1,181,2,2,2,179,1,1,0.22,1


In [22]:
# Persist the feature table to the working directory, without the index column.
session_df.to_csv('final-bought-dataset.csv', index=False)

In [23]:
# Number of (session, item) rows written out.
len(session_df)

2314687