In [1]:
import datetime
import matplotlib
import numpy as np
import pandas as pd
import pyreadr

import os

### raw data preprocessing

##### dataset 1 only

In [2]:
# Load dataset 1 from its .Rds file; the data frame is taken from the `None` key of the object returned by read_r.
data_raw = pyreadr.read_r('../../data/sequences_all_anon.Rds')[None]

In [35]:
# Switch to the English column names used in the rest of the notebook.
data_raw = data_raw.rename(columns={'datum': 'date', 'value': 'category', 'anon_apps.name': 'app_name'})

# Numeric copy of the event time, obtained by calling .timestamp() on every date value.
data_raw['timestamp'] = data_raw['date'].apply(lambda event_time: event_time.timestamp())

# Rows without an app name fall back to their category value.
data_raw.loc[data_raw['app_name'].isna(), 'app_name'] = data_raw['category']

# A row that directly follows an OFF_LOCKED / OFF_UNLOCKED row opens a new session.
# The running count of those rows (+1, so that IDs begin at 1) is the sessionID;
# it is like sequence_number but does NOT start anew for each user.
data_raw['sessionID'] = 1 + data_raw['app_name'].shift(1).isin(['OFF_LOCKED', 'OFF_UNLOCKED']).cumsum()

In [36]:
# Persist the preprocessed raw events as CSV (index column omitted); the session-aware section reads this file.
data_raw.to_csv('../../data/data_raw.csv', index=False)

##### datasets 1, 2 and 3 combined

In [178]:
# data_raw_1 = pyreadr.read_r('../../data/sequences_all_anon.Rds')[None]
# data_raw_2 = pyreadr.read_r('../../data/sequences_all_anon_1.Rds')[None]
# data_raw_3 = pyreadr.read_r('../../data/sequences_all_anon_wild.Rds')[None]

In [179]:
# print(len(data_raw_1))
# print(len(data_raw_2))
# print(len(data_raw_3))

In [180]:
# print(len(data_raw_1)/(len(data_raw_1)+len(data_raw_2)+len(data_raw_3)))

In [181]:
# data_raw = pd.concat([data_raw_1, data_raw_2, data_raw_3], ignore_index=True)

In [182]:
# print(data_raw_1.datum.min())
# print(data_raw_1.datum.max())
# print(data_raw_2.datum.min())
# print(data_raw_2.datum.max())
# print(data_raw_3.datum.min())
# print(data_raw_3.datum.max())

### session-aware data preprocessing

In [34]:
# Start the session-aware preprocessing from the CSV of raw events.
data_sa = pd.read_csv('../../data/data_raw.csv')

In [35]:
# Map every distinct app name / user id to an integer index.
# Indices follow the sorted order of the distinct values and start at 1.
app_mapping = {app: idx for idx, app in enumerate(sorted(set(data_sa['app_name'])), start=1)}
user_mapping = {user: idx for idx, user in enumerate(sorted(set(data_sa['userId'])), start=1)}

# Translate both columns row by row.
app_indexes = [app_mapping[app] for app in data_sa['app_name']]
user_indexes = [user_mapping[user] for user in data_sa['userId']]
# print(len(set(app_indexes)) == data_sa['app_name'].nunique()) # check
# print(len(set(user_indexes)) == data_sa['userId'].nunique()) # check

# appID is appended as the last column, userID is inserted as the first column.
data_sa['appID'] = app_indexes
data_sa.insert(0, 'userID', user_indexes)

In [None]:
# Inverse lookups: integer index -> original app name / original user id.
app_mapping_reverse = {app_id: app_name for app_name, app_id in app_mapping.items()}
user_mapping_reverse = {user_id: user_name for user_name, user_id in user_mapping.items()}

In [36]:
# Sanity check of the mappings: resolve index 1194 back to its app name in two ways,
# first by searching the values of the forward mapping, then via the reverse mapping.
# Both lines should print the same name.
print(list(app_mapping.keys())[list(app_mapping.values()).index(1194)])
print(app_mapping_reverse[1194])

Messaging_1


In [39]:
# Remove the original identifier / descriptive columns; the integer userID and appID columns replace userId and app_name.
data_sa = data_sa.drop(columns=['userId', 'date', 'activity', 'category', 'sequence_number', 'app_name'])

In [44]:
# Persist the session-aware dataset as CSV (index column omitted); the train-validation-test split reads this file.
data_sa.to_csv('../../data/data_sa.csv', index=False)

### Train-validation-test split

##### setup

In [2]:
# Load the session-aware dataset that is split into train / validation / test sets below.
data_sa = pd.read_csv('../../data/data_sa.csv')

In [3]:
# Column names of the session-aware dataset.
# preprocess, split_last_session and filter_new_items read these module-level constants.
USER_KEY = 'userID' # integer user index
ITEM_KEY = 'appID' # integer app index
TIME_KEY = 'timestamp' # numeric event time
SESSION_KEY = 'sessionID' # running session number

In [4]:
def preprocess(df, min_item_support=5, min_session_length=2, min_user_sessions=3,
               drop_on=False, drop_off=False, drop_first=False,
               on_items=(1392, 1393), off_items=(1389, 1390)):
    '''
    Preprocesses the dataframe by filtering out infrequent items, short sessions, and users with few sessions

    The filters are applied once, in the order listed below; they are not iterated, so a later
    filter can push an item or session back below an earlier threshold.
    -----
        df: Pandas dataframe
            Must contain the following columns: USER_KEY; ITEM_KEY; TIME_KEY; SESSION_KEY
        min_item_support: integer
            minimum number of occurrences of an item (app) across all users and sessions for an item to be included
        min_session_length: integer
            minimum length (number of items) of a session for a session to be included
        min_user_sessions: integer
            minimum number of sessions per user for a user to be included
        drop_on: boolean
            whether all rows whose item is listed in on_items should be dropped
        drop_off: boolean
            whether all rows whose item is listed in off_items should be dropped
        drop_first: boolean
            whether the first item of each session (first row of the session in row order) should be dropped
        on_items: sequence of item ids
            item ids removed by drop_on; presumably the screen-on events -- TODO confirm against app_mapping
        off_items: sequence of item ids
            item ids removed by drop_off; 1389="OFF_LOCKED", 1390="OFF_UNLOCKED"
    -----
        Returns the filtered dataframe (row order and index labels of the surviving rows are kept)
    '''
    if drop_first:
        # cumcount numbers the rows of every session 0, 1, 2, ... in row order, so 0 marks the
        # first item of each session -- including the very first session of the frame, which
        # has no preceding OFF row to detect it by
        is_first_item = df.groupby(SESSION_KEY).cumcount() == 0
        df = df[~is_first_item]
    if drop_on:
        df = df[~df[ITEM_KEY].isin(list(on_items))]
    if drop_off:
        df = df[~df[ITEM_KEY].isin(list(off_items))]
    # min_item_support: keep rows whose item occurs often enough
    item_support = df.groupby(ITEM_KEY)[ITEM_KEY].transform('size')
    df = df[item_support >= min_item_support]
    # min_session_length: keep rows whose session is still long enough
    session_length = df.groupby(SESSION_KEY)[SESSION_KEY].transform('size')
    df = df[session_length >= min_session_length]
    # min_user_sessions: keep rows whose user still has enough distinct sessions
    sessions_per_user = df.groupby(USER_KEY)[SESSION_KEY].transform('nunique')
    df = df[sessions_per_user >= min_user_sessions]

    return df

In [5]:
def split_last_session(df):
    '''
    Separates, for every user, the rows of that user's last session from all remaining rows.
    "Last" refers to row order: it is the session found in the final row of each user.
    -----
        df: Pandas dataframe
            Must contain the following columns: USER_KEY; ITEM_KEY; TIME_KEY; SESSION_KEY
    -----
        Returns a tuple (train, test): train holds all rows outside the last sessions,
        test holds the rows of the last sessions
    '''
    # broadcast each user's final session id to all rows of that user
    final_session_of_user = df[SESSION_KEY].groupby(df[USER_KEY]).transform('last')
    in_final_session = df[SESSION_KEY] == final_session_of_user

    held_out = df[in_final_session]
    remainder = df[~in_final_session]
    return (remainder, held_out)

In [6]:
def filter_new_items(train, test):
    '''
    Restricts a test set to rows whose item also occurs in the corresponding training set
    -----
        train: Pandas dataframe
            Training set; must contain the following columns: USER_KEY; ITEM_KEY; TIME_KEY; SESSION_KEY
        test: Pandas dataframe
            Test set; must contain the following columns: USER_KEY; ITEM_KEY; TIME_KEY; SESSION_KEY
    -----
        Returns the filtered test set
    '''
    known_items = train[ITEM_KEY].unique()
    is_known = test[ITEM_KEY].isin(known_items)
    return test[is_known]

In [7]:
def split_data(df,
               min_item_support, min_session_length, min_user_sessions,
               USER_KEY, ITEM_KEY, TIME_KEY, SESSION_KEY,
               drop_on=False, drop_off=False, drop_first=False):
    '''
    Preprocesses a dataframe and splits it into training, validation and test sets
    -----
        df: Pandas dataframe
            Must contain the columns expected by preprocess
        min_item_support, min_session_length, min_user_sessions: integer
            filtering thresholds, forwarded to preprocess
        USER_KEY, ITEM_KEY, TIME_KEY, SESSION_KEY: string
            accepted for interface compatibility but not used by this function; the helpers
            called here read the module-level constants of the same names
        drop_on, drop_off, drop_first: boolean
            row-dropping flags, forwarded to preprocess
    -----
        Returns a tuple (train, valid_train, valid_test, test):
            test is the last session of every user, train is everything before it;
            valid_test is the last session of every user within train, valid_train the rest of train.
            test and valid_test only keep items that occur in train and valid_train, respectively.
    '''
    # forward the caller's thresholds instead of fixed literals
    df_preprocessed = preprocess(df,
                                 min_item_support=min_item_support,
                                 min_session_length=min_session_length,
                                 min_user_sessions=min_user_sessions,
                                 drop_on=drop_on, drop_off=drop_off, drop_first=drop_first)
    train, test = split_last_session(df_preprocessed)
    valid_train, valid_test = split_last_session(train)
    test = filter_new_items(train, test)
    valid_test = filter_new_items(valid_train, valid_test)
    return (train, valid_train, valid_test, test)

##### helper function for multiple windows

In [8]:
# assign a single item to a window (from 1,...,win) based on timestamp of first item of current session
def assign_window(timestamp, cutoff_list):
    '''
    Returns the 1-based position of the first cutoff that is >= timestamp
    -----
        timestamp: number
            timestamp to be assigned to a window
        cutoff_list: sequence of numbers
            upper bound (inclusive) of each window, in window order
    -----
        A timestamp larger than every cutoff is assigned to the last window. This happens when
        the last cutoff is computed as ts_min + win * ((ts_max - ts_min) / win) and floating-point
        rounding leaves it marginally below ts_max.
        Raises ValueError if cutoff_list is empty or timestamp is NaN.
    '''
    num_windows = len(cutoff_list)
    if num_windows == 0:
        raise ValueError('cutoff_list must contain at least one cutoff')
    if timestamp != timestamp: # NaN is the only value that is not equal to itself
        raise ValueError('timestamp must not be NaN')
    for window, cutoff in enumerate(cutoff_list, start=1):
        if timestamp <= cutoff:
            return window
    # beyond the last cutoff: clamp to the last window instead of failing on an unbound variable
    return num_windows

##### apply preprocessing and splitting

In [20]:
# filtering thresholds passed to split_data
min_item_support = 5 # minimum number of occurrences of an item (app)
min_session_length = 2 # minimum number of items per session
min_user_sessions = 3 # minimum number of sessions per user

# flags controlling which rows preprocess removes
drop_on = True # drop rows whose appID is 1392 or 1393
drop_off = True # drop rows whose appID is 1389 or 1390 ("OFF_LOCKED" / "OFF_UNLOCKED")
drop_first = False # should always be set to False if drop_on=True

multiple_windows = True # flag for multiple windows
win = 5 # number of windows; only needed if multiple_windows=True

path = '../../data/preprocessed/' # output directory for the generated .hdf files

In [21]:
def _output_filename(directory, dataset_name, drop_on, drop_off, drop_first):
    '''
    Builds the path of the .hdf output file: directory + dataset name, followed by one suffix
    per active drop_* flag (always in the order drop_on, drop_off, drop_first)
    '''
    suffixes = [label for label, active in (('-drop_on', drop_on),
                                            ('-drop_off', drop_off),
                                            ('-drop_first', drop_first)) if active]
    return directory + str(dataset_name) + ''.join(suffixes) + '.hdf'


def _save_splits(filename, train, valid_train, valid_test, test):
    '''
    Writes the four splits into one .hdf file under the keys train, valid_train, valid_test, test
    '''
    # the first write uses mode='w' to create a new file (to avoid adding to an existing file)
    train.to_hdf(filename, key='train', mode='w')
    valid_train.to_hdf(filename, key='valid_train', mode='a')
    valid_test.to_hdf(filename, key='valid_test', mode='a')
    test.to_hdf(filename, key='test', mode='a')


# make sure the output directory exists before anything is written
if path:
    os.makedirs(path, exist_ok=True)

if multiple_windows:

    # split the overall time span into win windows of equal length
    ts_min = data_sa.timestamp.min()
    ts_max = data_sa.timestamp.max()
    win_timespan = (ts_max-ts_min)/win
    win_cutoffs = [ts_min+(i+1)*win_timespan for i in range(win)]

    # create new column containing timestamp from first item of each session for each item of the session
    data_sa['window'] = data_sa['timestamp'].groupby(data_sa['sessionID']).transform('first')

    # based on timestamp from first item, assign the entire session to one of the win windows
    # to do so, apply assign_window to entire column "window"
    # this way, we never split up sessions
    data_sa['window'] = data_sa['window'].apply(lambda x: assign_window(x, win_cutoffs))

    for i in range(win):
        name = 'events' + '-' + str(i+1) # set up dataset name, e.g., events-1 corresponding to window 1
        df = data_sa[data_sa.window==i+1].drop('window',axis=1) # choose one single window only
        train, valid_train, valid_test, test = split_data(df,
                                                         min_item_support, min_session_length, min_user_sessions,
                                                         USER_KEY, ITEM_KEY, TIME_KEY, SESSION_KEY,
                                                         drop_on=drop_on, drop_off=drop_off, drop_first=drop_first)
        # save output to hdf files
        filename = _output_filename(path, name, drop_on, drop_off, drop_first)
        _save_splits(filename, train, valid_train, valid_test, test)

else:
    # a 'window' column left over from an earlier multiple-window run must not end up in the output
    df = data_sa.drop(columns='window', errors='ignore')
    train, valid_train, valid_test, test = split_data(df,
                                                      min_item_support, min_session_length, min_user_sessions,
                                                      USER_KEY, ITEM_KEY, TIME_KEY, SESSION_KEY,
                                                      drop_on=drop_on, drop_off=drop_off, drop_first=drop_first)
    # save output to hdf file
    filename = _output_filename(path, 'events', drop_on, drop_off, drop_first)
    _save_splits(filename, train, valid_train, valid_test, test)

### change column names to align with retailrocket

In [26]:
filename = 'events-1-drop_on-drop_off' # name of the preprocessed file to convert, without directory and '.hdf' extension
path = '../../data/preprocessed/' # directory containing the preprocessed .hdf files

In [27]:
# Build the full input path only once: re-running this cell must not prepend the directory
# and append the extension a second time.
if not filename.endswith('.hdf'):
    filename = path + filename + '.hdf'

# Copy every split into the retailrocket file, renaming the columns to the names used there.
for split in ['test', 'train', 'valid_test', 'valid_train']:
    dataset = pd.read_hdf(filename, split)
    dataset = dataset.rename(columns={"userID":"visitorid", "appID":"itemid", "sessionID":"session_id"})
    # 'test' is the first split written, so it creates a new file (to avoid adding to existing file);
    # all other splits are appended
    mode = 'w' if split == 'test' else 'a'
    dataset.to_hdf('../../session-aware_RC_2020/code/data/retailrocket/prepared/events.hdf', key=split, mode=mode)