In [29]:
import pandas as pd
import numpy as np
import random
from datetime import datetime, date

In [2]:
# buys_data = pd.read_csv('yoochoose-data/yoochoose-buys.dat', header=None)
# clicks_data = pd.read_csv('yoochoose-data/yoochoose-clicks.dat', header=None)

In [3]:
# buys_data.columns = ['session', 'timestamp', 'item', 'price', 'quantity']
# clicks_data.columns = ['session', 'timestamp', 'item', 'category']

#### CLICKS DATASET FILE DESCRIPTION

The file yoochoose-clicks.dat contains the users' clicks on items.
Each record/line in the file has the following fields/format: Session ID, Timestamp, Item ID, Category
- Session ID – the id of the session. In one session there are one or many clicks. Could be represented as an integer number.
- Timestamp – the time when the click occurred. Format of YYYY-MM-DDThh:mm:ss.SSSZ
- Item ID – the unique identifier of the item that has been clicked. Could be represented as an integer number.
- Category – the context of the click. The value "S" indicates a special offer, "0" indicates a missing value, a number from 1 to 12 indicates a real category identifier, and any other number indicates a brand.
 
 E.g. if an item has been clicked in the context of a promotion or special offer then the value will be "S"; if the context was a brand, e.g. BOSCH,
 then the value will be an 8-10 digit number. If the item has been clicked under a regular category, e.g. sport, then the value will be a number from 1 to 12.
 
#### BUYS DATASET FILE DESCRIPTION

The file yoochoose-buys.dat contains the users' buy events for items.
Each record/line in the file has the following fields: Session ID, Timestamp, Item ID, Price, Quantity

- Session ID - the id of the session. In one session there are one or many buying events. Could be represented as an integer number.
- Timestamp - the time when the buy occurred. Format of YYYY-MM-DDThh:mm:ss.SSSZ
- Item ID – the unique identifier of item that has been bought. Could be represented as an integer number.
- Price – the price of the item. Could be represented as an integer number.
- Quantity – the number of units bought in this buy event. Could be represented as an integer number.

In [4]:
# diff = buys_data['session'].unique()

In [5]:
# new_clicks = clicks_data.loc[clicks_data['session'].isin(diff)]

In [6]:
# new_clicks.to_pickle('bought.pickle')

In [7]:
# not_bought = clicks_data.loc[clicks_data['session'].isin(diff) == False]

In [8]:
# not_bought.to_pickle('not_bought.pickle')

In [9]:
# not_bought = pd.read_pickle('not_bought.pickle')

### -------------------------- *Sampling* --------------------------

In [10]:
# rdiff = random.sample(list(diff), 100000)

In [11]:
# buys_sample = buys_data[buys_data['session'].isin(rdiff)]
# click_bought_sample = new_clicks[new_clicks['session'].isin(rdiff)]
# click_not_bought_sample = not_bought[:500000]

In [12]:
# buys_sample.to_pickle('sample-buys.pickle')
# click_bought_sample.to_pickle('sample-bought.pickle')
# click_not_bought_sample.to_pickle('sample-not-bought.pickle')

In [13]:
# Load the sampled frames written by the (now commented-out) sampling cells above:
#   sample-buys.pickle       - buy events of the randomly sampled bought sessions
#   sample-bought.pickle     - clicks belonging to those same sessions
#   sample-not-bought.pickle - first 500000 click rows from sessions without a buy
# NOTE(review): unpickling can execute arbitrary code; only load pickles produced locally.
buys = pd.read_pickle('sample-buys.pickle')
click_bought = pd.read_pickle('sample-bought.pickle')
click_not_bought = pd.read_pickle('sample-not-bought.pickle')

In [14]:
# strptime pattern for the 'hh:mm:ss' time-of-day strings parsed in time_diff_secs.
time_format = '%H:%M:%S'

In [15]:
def read_time_stamp(time):
    """Break a timestamp string into its textual components.

    The input is expected to look like ``YYYY-MM-DDThh:mm:ss.SSSZ``: a date and
    a time separated by ``'T'``, with one trailing character after the time.

    Returns a dict of strings with the keys ``dd``, ``mm``, ``yy``, ``h``,
    ``m``, ``s`` (fractional part removed), ``date`` (``YYYY-MM-DD``) and
    ``time`` (``hh:mm:ss``, fractional part removed).

    Raises ValueError when the string does not split into the expected number
    of parts.
    """
    day_part, clock_part = time.split('T')
    year, month, day = day_part.split('-')
    # The final character is dropped unconditionally (the 'Z' suffix in the
    # documented format).
    clock_part = clock_part[:-1]
    hours, minutes, seconds = clock_part.split(':')

    parts = {}
    parts['dd'] = day
    parts['mm'] = month
    parts['yy'] = year
    parts['h'] = hours
    parts['m'] = minutes
    parts['s'] = seconds.split('.')[0]
    parts['date'] = day_part
    parts['time'] = clock_part.split('.')[0]
    return parts

In [16]:
def time_diff_secs(start, end):
    """Return the number of seconds from ``start`` to ``end``.

    Both arguments are strings parsed with the module-level ``time_format``.
    The value returned is the ``seconds`` attribute of the resulting
    ``timedelta``, so it is never negative: when ``end`` is earlier than
    ``start`` the result wraps around a 24-hour day.
    """
    finished = datetime.strptime(end, time_format)
    began = datetime.strptime(start, time_format)
    elapsed = finished - began
    return elapsed.seconds

In [17]:
# Ascending list of the distinct session ids that appear in the buys sample.
b_sessions = sorted(buys['session'].unique())

In [51]:
# Accumulator for per-session feature rows; the loop in the next cell appends to it.
session_data = []

In [52]:
# Build one feature row per bought session (only the first 100 sessions).
# Each row: [session, total_time, avg time per click, max gap between clicks,
#            n_clicks, day_of_week, month, number of distinct categories].
# NOTE: rows are appended to the existing `session_data` list, so re-running
# this cell without re-creating that list first appends duplicate rows.
for bb in b_sessions[:100]:
    cbs = click_bought[click_bought['session'] == bb]

    # Parse every click into a full datetime (date + time, whole seconds).
    # Keeping the date is what makes ordering and durations correct for
    # sessions that run past midnight; comparing 'hh:mm:ss' alone does not.
    clicks = []
    for tt in cbs['timestamp']:
        dct = read_time_stamp(tt)
        clicks.append(
            datetime.strptime(dct['date'] + ' ' + dct['time'], '%Y-%m-%d %H:%M:%S')
        )

    if not clicks:
        # No click rows for this session: no features can be derived, skip it.
        continue

    clicks.sort()
    n_clicks = len(clicks)

    # Total session length in seconds, first click to last click.
    td = int((clicks[-1] - clicks[0]).total_seconds())
    # Average time per click, rounded to two decimals.
    avg = round(td / n_clicks, 2)
    # Longest gap between consecutive clicks; 0 for a single-click session.
    max_time = max(
        (int((later - earlier).total_seconds())
         for earlier, later in zip(clicks, clicks[1:])),
        default=0,
    )

    # Calendar features are taken from the latest click of the session.
    dow = clicks[-1].weekday()
    mm = clicks[-1].month

    noc = len(cbs['category'].unique())
    session_data.append([bb, td, avg, max_time, n_clicks, dow, mm, noc])

In [53]:
# One row per session, built in a single call from the accumulated feature rows.
df = pd.DataFrame(session_data)

In [54]:
# Column names, in the same order as the values appended to session_data.
df.columns = ['session', 'total_time', 'avg.time_clicks', 'max_time',
              'n_clicks', 'day_of_week', 'month', 'no_of_categories']

In [55]:
# Display the per-session feature table.
df

Unnamed: 0,session,total_time,avg.time_clicks,max_time,n_clicks,day_of_week,month,no_of_categories
0,216,47,23.5,47,2,2,4,1
1,293,295,32.78,145,9,5,4,1
2,324,1251,96.23,288,13,4,4,1
3,408,7,3.5,7,2,2,4,1
4,449,373,31.08,81,12,1,4,1
5,636,9,4.5,9,2,0,4,1
6,651,2733,97.61,428,28,1,4,1
7,664,802,100.25,483,8,4,4,1
8,692,50,25.0,50,2,3,4,1
9,728,10,5.0,10,2,0,4,1
