In [1]:
import pandas as pd
from datetime import datetime
import numpy as np
import csv
from collections import defaultdict

def get_query_info(query_path):
    """Load the query table and index its dates and users by query id.

    Parameters
    ----------
    query_path : str or path-like
        Location of a ';'-separated CSV with at least the columns
        "queryId", "eventdate" and "userId".

    Returns
    -------
    tuple of (dict, dict)
        First dict maps queryId -> eventdate, second maps queryId -> userId.
        If a queryId occurs more than once, the last row wins.
    """
    table = pd.read_csv(query_path, sep=';')
    query_ids = table["queryId"]
    dates_by_query = {qid: when for qid, when in zip(query_ids, table["eventdate"])}
    users_by_query = {qid: who for qid, who in zip(query_ids, table["userId"])}
    return dates_by_query, users_by_query

In [7]:
def time_and_rating(df, rating):
    """Reduce an event table to its interaction columns and add recency and rating.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns "itemId", "eventdate" (ISO-format date
        strings) and "userId". Other columns are ignored.
    rating : scalar
        Value written to every row of the new "rating" column.

    Returns
    -------
    pandas.DataFrame
        A new frame (the input is not modified) with the columns
        "itemId", "eventdate", "userId", "days since" and "rating",
        sorted from most recent to oldest event. "days since" is the whole
        number of days between each event and the latest event in *this*
        frame.
    """
    # Rows with a missing userId compare False against 0 and are dropped here.
    # .copy() makes the column assignments below act on an independent frame
    # instead of a slice of the caller's data.
    known_user = df["userId"] >= 0
    df = df.loc[known_user, ["itemId", "eventdate", "userId"]].copy()

    # Parse with the same strict ISO parser as before, then make sure the
    # column really is datetime64 so the .dt accessor is available.
    df["eventdate"] = pd.to_datetime(df["eventdate"].map(datetime.fromisoformat))
    df = df.sort_values(by="eventdate", ascending=False)

    # max() gives the latest date without indexing row 0 of the sorted frame,
    # which raised IndexError when no rows survived the filter.
    most_recent = df["eventdate"].max()
    # Whole days as integers, taken from the timedelta itself rather than by
    # parsing its string representation.
    df["days since"] = (most_recent - df["eventdate"]).dt.days
    df["rating"] = rating
    return df

def init_arrays(records, user_i, item_i):
    """Build the dense rating matrix C and the matching time matrix T.

    For every (user, item) pair the strongest interaction is kept: a higher
    rating always replaces a lower one, and among interactions with the same
    rating the one with the smallest time value wins. The result therefore
    does not depend on the order of ``records``.

    Parameters
    ----------
    records : iterable
        Rows indexable as (user, item, time, rating).
    user_i : dict
        Maps user id -> row index. Users not in this dict are skipped.
    item_i : dict
        Maps item id -> column index. Items not in this dict are skipped.

    Returns
    -------
    (C, T) : tuple of numpy.ndarray
        Both of shape (len(user_i), len(item_i)) and dtype uint8.
        C holds the best rating (0 where there was no interaction); T holds
        the time of that interaction (255 where there was none).
    """
    num_users = len(user_i)
    num_items = len(item_i)
    # 255 is the largest uint8 value and doubles as the "no interaction" marker.
    never = 255
    # use uint8 to save memory
    T = np.full((num_users, num_items), never, dtype="uint8")
    C = np.zeros((num_users, num_items), dtype="uint8")
    for row in records:
        user, item, time, rating = row[0], row[1], row[2], row[3]
        # only most popular users and items
        if user not in user_i or item not in item_i:
            continue
        # A time at or beyond the sentinel cannot be told apart from "never"
        # in uint8 storage; such records were never stored and still are not.
        if time >= never:
            continue
        i = user_i[user]
        j = item_i[item]
        best_rating = C[i, j]
        best_time = T[i, j]
        # Rating takes precedence; time only breaks ties between equal
        # ratings. Requiring both "rating >= best" and "time < best" let an
        # earlier-processed weaker event with the same or a smaller time
        # block a stronger one.
        if rating > best_rating or (rating == best_rating and time < best_time):
            C[i, j] = rating
            T[i, j] = time
    return C, T

def get_most_relevant(dct, count):
    """Return the keys of ``dct`` with the highest values, best first.

    Parameters
    ----------
    dct : dict
        Maps an element to its score.
    count : int
        Maximum number of keys to return.

    Returns
    -------
    list
        Up to ``count`` keys ordered by descending score; keys with equal
        scores keep their order of appearance in ``dct``.

    Side effect: prints the ten best (key, score) pairs.
    """
    ranked = list(dct.items())
    ranked.sort(key=lambda pair: pair[1], reverse=True)
    print(ranked[:10])
    return [key for key, _score in ranked[:count]]

def reduce_dimensionality(records, unique_users, unique_items, n_users=1000, n_items=3000):
    """Pick the most active users and the items most popular among them.

    Parameters
    ----------
    records : sequence
        Rows indexable as (user, item, time, rating). The sequence is
        traversed twice, so it must be re-iterable (e.g. a list).
    unique_users : iterable
        Every user id that occurs in ``records``.
    unique_items : iterable
        Every item id that occurs in ``records``.
    n_users : int, optional
        Number of users to keep (default 1000).
    n_items : int, optional
        Number of items to keep (default 3000).

    Returns
    -------
    (active_users, most_popular_items) : tuple of list
        Users ranked by the sum of their ratings, and items ranked by the sum
        of the ratings they received from those users.
    """
    user_scores = {user: 0 for user in unique_users}
    for row in records:
        user, rating = row[0], row[3]
        user_scores[user] += rating
    active_users = get_most_relevant(user_scores, n_users)

    # Set lookup keeps the per-record membership test constant-time; testing
    # against the list scanned up to n_users entries for every record.
    active_lookup = set(active_users)
    item_scores = {item: 0 for item in unique_items}
    for row in records:
        user, item, rating = row[0], row[1], row[3]
        if user in active_lookup:
            item_scores[item] += rating

    most_popular_items = get_most_relevant(item_scores, n_items)
    return active_users, most_popular_items

def build_arrays(combined):
    """Turn the combined event table into the C and T matrices plus index maps.

    Parameters
    ----------
    combined : pandas.DataFrame
        Needs the columns "itemId", "userId", "days since" and "rating".

    Returns
    -------
    (C, T, user_i, item_i)
        C and T as produced by ``init_arrays``; ``user_i`` / ``item_i`` map
        the retained user / item ids to their row / column index.
    """
    item_column = list(combined["itemId"])
    distinct_items = list(set(item_column))
    # user ids are cast to int before they are used as keys
    user_column = [int(raw_id) for raw_id in list(combined["userId"])]
    distinct_users = list(set(user_column))
    records = list(
        zip(
            user_column,
            item_column,
            list(combined["days since"]),
            list(combined["rating"]),
        )
    )

    active_users, popular_items = reduce_dimensionality(records, distinct_users, distinct_items)

    # position in the ranked list becomes the matrix index
    item_i = dict(zip(popular_items, range(len(popular_items))))
    user_i = dict(zip(active_users, range(len(active_users))))

    C, T = init_arrays(records, user_i, item_i)
    return C, T, user_i, item_i

"""This function builds the arrays C and T from 4 raw data files. Although train-queries is not used directly,
it holds the information linking each query to its date and user, which lets us assign dates and users to clicks"""
def main_extraction(view_path='../data/train-item-views.csv', purchase_path='../data/train-purchases.csv',
                    click_path='../data/train-clicks.csv', query_path="../data/train-queries.csv"):
    """Build the rating matrix C and time matrix T from the raw CSV files.

    Parameters
    ----------
    view_path, purchase_path, click_path : str
        ';'-separated CSV files with the item views, purchases and clicks.
    query_path : str
        ';'-separated CSV with the queries. It is only used to look up the
        date and user of the query behind each click.

    Returns
    -------
    (C, T, users, items)
        The two matrices and the dictionaries mapping user / item ids to
        matrix indices, exactly as returned by ``build_arrays``.
    """
    query_dates, query_users = get_query_info(query_path)
    view = pd.read_csv(view_path, sep=';')
    purchase = pd.read_csv(purchase_path, sep=';')
    click = pd.read_csv(click_path, sep=';')

    # Clicks carry only a queryId; take date and user from the query table.
    # Series.map does the dictionary lookup without a per-row lambda. A
    # queryId missing from the table yields NaN, and time_and_rating drops
    # rows whose userId is NaN.
    click = click.assign(
        eventdate=click["queryId"].map(query_dates),
        userId=click["queryId"].map(query_users),
    )

    # Purchases > views > clicks > nothing
    # NOTE(review): time_and_rating measures "days since" against the latest
    # date of the frame it is given, so the three event types may use
    # different reference dates - confirm this is intended.
    purchases = time_and_rating(purchase, 3)
    views = time_and_rating(view, 2)
    clicks = time_and_rating(click, 1)

    combined = pd.concat([clicks, views, purchases])
    C, T, users, items = build_arrays(combined)
    return C, T, users, items






In [8]:
# Run the extraction with the default data paths.
# C and T are the two matrices returned by main_extraction; users and items are
# the dictionaries converting user/item IDs to matrix indices.
C,T, users, items = main_extraction()

  exec(code_obj, self.user_global_ns, self.user_ns)


[(17732, 655), (4094, 597), (20346, 515), (13255, 435), (24034, 428), (25480, 428), (22489, 410), (19341, 374), (21156, 367), (21634, 359)]
<class 'int'>
[(40156, 233), (15632, 117), (36956, 90), (34157, 84), (4953, 67), (14614, 66), (32394, 64), (32769, 64), (31013, 59), (14889, 56)]


In [14]:
# Persist the two matrices and the id -> index dictionaries next to the notebook.
np.save("rating_matrix.npy", C)
np.save("time_matrix.npy", T)
# The csv module asks for files to be opened with newline='' so that the
# writer controls line endings itself (otherwise extra blank rows can appear
# on platforms that translate '\n').
with open('user_index.csv', 'w', newline='') as f:
    w = csv.writer(f)
    w.writerows(users.items())  # one "id,index" row per retained user
with open('item_index.csv', 'w', newline='') as f:
    w = csv.writer(f)
    w.writerows(items.items())  # one "id,index" row per retained item

(1000, 3000) (1000, 3000)
[array([0, 0, 1, ..., 0, 1, 0], dtype=uint8)]
[array([246, 246,  12, ..., 246,  63, 246], dtype=uint8)]
