In [3]:
import pandas as pd
from datetime import datetime
import numpy as np
import csv

def get_query_info(query_path):
    """Load the query file and build per-query lookup tables.

    The file is read as a semicolon-separated CSV and has to provide the
    columns ``queryId``, ``eventdate`` and ``userId``.

    Returns a pair of dicts, both keyed by ``queryId``: the first one maps to
    the ``eventdate`` of the query, the second one to its ``userId``. When a
    ``queryId`` appears on several rows, the last row wins.
    """
    queries = pd.read_csv(query_path, sep=';')
    query_ids = queries["queryId"]

    def lookup(column):
        # Pair every query id with the value of `column` on the same row.
        return dict(zip(query_ids, queries[column]))

    return lookup("eventdate"), lookup("userId")

In [4]:
def time_and_rating(df, rating):
    """Attach a recency column and a constant rating to one kind of interaction.

    Parameters
    ----------
    df : pandas.DataFrame
        Needs the columns ``itemId``, ``eventdate`` (ISO-format date strings,
        parsed with ``datetime.fromisoformat``) and ``userId``. Any other
        column is dropped.
    rating : scalar
        Value written to the ``rating`` column of every returned row.

    Returns
    -------
    pandas.DataFrame
        A new frame (the input is not modified) with the columns ``itemId``,
        ``eventdate`` (parsed), ``userId``, ``days since`` and ``rating``,
        sorted by ``eventdate`` with the most recent row first.
        ``days since`` is the whole number of days between a row's date and
        the most recent date in the frame.
    """
    events = df.loc[:, ["itemId", "eventdate", "userId"]]
    # NaN >= 0 is False, so rows without a user are dropped here (as are
    # negative ids). .copy() gives an independent frame, so the column
    # assignments below do not write into a slice of the filtered frame.
    events = events[events["userId"] >= 0].copy()
    # Parse with fromisoformat as before; to_datetime only guarantees a
    # datetime64 column, even when there are no rows left.
    events["eventdate"] = pd.to_datetime(events["eventdate"].map(datetime.fromisoformat))
    events = events.sort_values(by="eventdate", ascending=False)
    # max() instead of "first element after sorting": same value, but it does
    # not raise on an empty frame.
    most_recent = events["eventdate"].max()
    # Read the day count from the timedelta itself rather than parsing its
    # string representation.
    events["days since"] = (most_recent - events["eventdate"]).dt.days
    events["rating"] = rating
    return events

def init_arrays(records, user_i, item_i):
    """Build dense user x item matrices of the best rating and its recency.

    Parameters
    ----------
    records : iterable
        Rows whose first four entries are ``(user, item, time, rating)``,
        where ``time`` is a non-negative number of days and ``rating`` fits
        in a uint8.
    user_i, item_i : dict
        Map every user / item id that occurs in ``records`` to its row /
        column index.

    Returns
    -------
    (C, T) : tuple of numpy.ndarray, both uint8 with shape
        ``(len(user_i), len(item_i))``.
        ``C[i, j]`` is the highest rating recorded for the pair (0 if none).
        ``T[i, j]`` is the smallest ``time`` among the records carrying that
        highest rating (255 if none).

    A higher rating always replaces a lower one; ``time`` only breaks ties
    between equal ratings. This makes the result independent of the order of
    ``records``. The previous condition ``rating >= best and time < best_time``
    dropped an older purchase whenever a more recent click or view had been
    processed first.
    """
    no_interaction = 255  # largest uint8 value, used as the "nothing recorded" marker in T
    num_users = len(user_i)
    num_items = len(item_i)
    # use uint8 to save memory: both matrices are dense
    T = np.full((num_users, num_items), no_interaction, dtype="uint8")
    C = np.zeros((num_users, num_items), dtype="uint8")
    for row in records:
        user, item, time, rating = row[0], row[1], row[2], row[3]
        i = user_i[user]
        j = item_i[item]
        if time >= no_interaction:
            # Cannot be stored in a uint8 next to the marker value. Such
            # records were never stored by the earlier implementation either
            # (they could not pass `time < best_time`), so skip them.
            continue
        best_rating = int(C[i, j])
        best_time = int(T[i, j])
        if rating > best_rating or (rating == best_rating and time < best_time):
            C[i, j] = rating
            T[i, j] = time
    return C, T

def build_arrays(combined):
    """Turn the combined interaction table into the C and T matrices.

    ``combined`` must provide the columns ``itemId``, ``userId``,
    ``days since`` and ``rating``. User ids are converted to ``int``.

    Returns ``(C, T, user_i, item_i)``: the two matrices produced by
    ``init_arrays`` plus the dicts that map each user id / item id to its
    row / column index. Indices follow the iteration order of a ``set`` of
    the ids.
    """
    def positions(values):
        # One index per distinct value, numbered in set iteration order.
        return {value: pos for pos, value in enumerate(set(values))}

    items = list(combined["itemId"])
    users = list(map(int, combined["userId"]))
    item_i = positions(items)
    user_i = positions(users)

    records = zip(users, items, list(combined["days since"]), list(combined["rating"]))
    C, T = init_arrays(records, user_i, item_i)
    return C, T, user_i, item_i

"""This function builds the arrays C and T from 4 raw data files. Although train-queries is not used directly,
it has the information linking each query to its date and user, which lets us assign dates and users to clicks"""
def main_extraction(view_path='../data/train-item-views.csv', purchase_path='../data/train-purchases.csv',
                    click_path='../data/train-clicks.csv', query_path="../data/train-queries.csv"):
    """Build the matrices C and T from the four raw, semicolon-separated CSV files.

    Views and purchases are used as they are. Clicks are first given an
    ``eventdate`` and a ``userId`` by looking up their ``queryId`` in the
    query file. A click whose ``queryId`` is missing there raises ``KeyError``.

    Each kind of interaction gets a fixed rating (purchase 3, view 2,
    click 1). The three tables are concatenated in the order clicks, views,
    purchases and handed to ``build_arrays``.

    Returns ``(C, T, users, items)`` exactly as ``build_arrays`` returns them.
    """
    date_of_query, user_of_query = get_query_info(query_path)

    def read_raw(path):
        return pd.read_csv(path, sep=';')

    view_raw = read_raw(view_path)
    purchase_raw = read_raw(purchase_path)
    click_raw = read_raw(click_path)

    # Clicks only carry a queryId: borrow the date and the user of their query.
    click_query_ids = click_raw["queryId"]
    click_raw.loc[:, "eventdate"] = click_query_ids.apply(lambda query_id: date_of_query[query_id])
    click_raw.loc[:, "userId"] = click_query_ids.apply(lambda query_id: user_of_query[query_id])

    # Purchases > views > clicks > nothing
    rated = {}
    for label, frame, score in (("purchases", purchase_raw, 3),
                                ("views", view_raw, 2),
                                ("clicks", click_raw, 1)):
        rated[label] = time_and_rating(frame, score)

    combined = pd.concat([rated["clicks"], rated["views"], rated["purchases"]])
    C, T, users, items = build_arrays(combined)
    return C, T, users, items






In [5]:
# Run the full extraction with the default data paths.
# C: user x item rating matrix (3 = purchase, 2 = view, 1 = click, 0 = no interaction).
# T: user x item matrix of "days since" values (255 where nothing was recorded).
# users / items: dictionaries converting user / item IDs to row / column indices of C and T.
C,T, users, items = main_extraction()

  exec(code_obj, self.user_global_ns, self.user_ns)


In [8]:
# Optional: uncomment to persist the two matrices and the ID -> index mappings.
#np.save("rating_matrix.npy", C)
#np.save("time_matrix.npy", T)
#with open('user_index.csv','w', newline='') as f:
#    w = csv.writer(f)
#    w.writerows(users.items())
#with open('item_index.csv','w', newline='') as f:
#    w = csv.writer(f)
#    w.writerows(items.items())