In [1]:
## General libraries
import numpy as np
import tensorflow as tf
import pandas as pd
import collections
import math
import pprint
import pickle

## My libraries 
import comms

In [2]:
## Seed NumPy's global random generator so the np.random.choice draw used for
## the train/test split further down gives the same split on every full run.
np.random.seed(0)

Generate data 
-- 
- The data is a list of user sessions. We define a session as a series of consecutive views in which each view is separated from the previous one by no more than half an hour.

- You will need a lot of RAM for this.


In [3]:
## Load data, split into users 
## NOTE(review): the helpers below live in the project-local `comms` module; their
## exact return types are not visible here -- confirm against comms before relying on them.
data = comms.load_jsons("products.json") ## list of jsons; to inspect one record: pprint.pprint(data[0], depth=1)

## Presumably a table of (user, item, time) view records built from the raw jsons -- TODO confirm.
user_item_time_df = comms.user_item_time(data)

## Three parallel per-user lists, indexed the same way: word_bags[i] holds the items,
## date_bags[i] the matching view times and context_ids[i] the user id (this is how
## make_cutted consumes them). NOTE(review): remove_treshold = 1 presumably drops
## users with too few views -- verify in comms.
word_bags, date_bags, context_ids = comms.create_timed_contexts(user_item_time_df, remove_treshold = 1)

In [4]:
def make_cutted(bag_ind, treshold = 0): 
    """
    Split one user's view history into chronologically ordered visits.

    A visit is a run of consecutive views in which every view follows the
    previous one by at most half an hour. The function reads the module-level
    lists date_bags, word_bags and context_ids (built in the data-loading cell).

    Arguments:
    bag_ind -- integer index of the user, from 0 to len(word_bags) - 1
    treshold -- visits with `treshold` views or fewer are dropped
                (default 0 drops only empty visits)

    Return:
    db_cut -- list of lists of dates; each inner list is one visit of the user
              at bag_ind, sorted chronologically
    wb_cut -- list of lists of items, aligned element-for-element with db_cut
    cont -- list with the user's context id repeated len(wb_cut) times

    Note: the bags take a round trip through NumPy arrays, so returned
    elements may be NumPy scalar types rather than plain Python ones.
    """
    datebag = date_bags[bag_ind]
    wordbag = word_bags[bag_ind]

    # Order both bags by view time so that neighbours are consecutive views.
    order = np.argsort(datebag)
    datebag_sorted = np.array([datebag[i] for i in order])
    wordbag_sorted = np.array([wordbag[i] for i in order])

    half_hour = 30*60

    # total_seconds() is the full length of the gap. The `.seconds` attribute
    # is only the 0..86399 component and ignores whole days, so a gap of
    # "1 day 10 minutes" would read as 600 s and would not split the visit.
    is_break = np.array(
        [(later - earlier).total_seconds() > half_hour
         for earlier, later in zip(datebag_sorted[:-1], datebag_sorted[1:])],
        dtype=bool,
    )

    # A break between positions k and k+1 means a new visit starts at k+1.
    # Add 0 and len(datebag) so consecutive pairs delimit every visit.
    visit_starts = np.flatnonzero(is_break) + 1
    bounds = np.concatenate(([0], visit_starts, [len(datebag)]))

    db_cut = []
    wb_cut = []
    for lo, hi in zip(bounds[:-1], bounds[1:]):
        # Keep only visits strictly longer than the threshold.
        if hi - lo > treshold:
            db_cut.append(list(datebag_sorted[lo:hi]))
            wb_cut.append(list(wordbag_sorted[lo:hi]))

    cont = [context_ids[bag_ind] for _ in range(len(db_cut))]
    return db_cut, wb_cut, cont

In [5]:
## Split users history into visits
## Run make_cutted once per user, then unzip the (dates, items, ids) triples
## into three parallel lists indexed by user position.
sessions_per_user = [make_cutted(bag_ind) for bag_ind in range(len(word_bags))]

date_bags_sess = [user_dates for user_dates, _, _ in sessions_per_user]
word_bags_sess = [user_items for _, user_items, _ in sessions_per_user]
context_ids_sess = [user_ids for _, _, user_ids in sessions_per_user]

In [6]:
## Number of users: date_bags_sess holds one entry (a list of visits) per user.
num_client = len(date_bags_sess)

In [7]:
## Number of users held out for the test set; all remaining users form the train set.
test_size = 500

In [8]:
## Split data to train and test set
## Draw test_size distinct user positions for the test set. With replace=False,
## np.random.choice raises ValueError if test_size exceeds num_client.
indices_test = np.random.choice(num_client, test_size, replace=False)

## Every other user goes to the train set. Membership is checked against a set:
## `i in <ndarray>` scans the whole array on each lookup, which made this step
## O(num_client * test_size).
test_lookup = set(indices_test.tolist())
indices_train = [i for i in range(num_client) if i not in test_lookup]

In [9]:
## Pick the per-user session lists that belong to each split. The three source
## lists are parallel, so the same index list is applied to each of them.
date_bags_train, word_bags_train, context_ids_train = (
    [per_user[idx] for idx in indices_train]
    for per_user in (date_bags_sess, word_bags_sess, context_ids_sess)
)

date_bags_test, word_bags_test, context_ids_test = (
    [per_user[idx] for idx in indices_test]
    for per_user in (date_bags_sess, word_bags_sess, context_ids_sess)
)

In [10]:
## Bundle each split as a (date_bags, word_bags, context_ids) tuple.
train = (date_bags_train, word_bags_train, context_ids_train)
test = (date_bags_test, word_bags_test, context_ids_test)

In [11]:
## Persist both splits; each is a (date_bags, word_bags, context_ids) tuple.
## The files are opened in `with` blocks so they are flushed and closed
## deterministically instead of being left for the garbage collector.
with open("train_sessions.pkl", "wb") as train_file:
    pickle.dump(train, train_file)

with open("test_sessions.pkl", "wb") as test_file:
    pickle.dump(test, test_file)