In [33]:
import numpy as np
import csv
import utils
import pandas as pd
import scipy.sparse as sparse
import scipy.sparse.linalg as linalg
import itertools
import time
from random import randint

In [7]:
def make_valid_set(savepath, df_orders, df_order_product_train):
    # select eval set == train
    df_order_user_curr = df_orders.loc[df_orders.eval_set == "train"].reset_index()
    # only select order_id and user_id
    df_order_user_curr = df_order_user_curr[["order_id", "user_id"]]
    df_order_product_valid = df_order_product_train[["order_id", "product_id"]]
    df_order_product_valid = df_order_product_valid.groupby("order_id")["product_id"].apply(list).reset_index().rename(
        columns={"product_id": "products"})

    # Merge on order id
    df_user_product_valid = pd.merge(df_order_user_curr, df_order_product_valid, on="order_id")
    df_user_product_valid = df_user_product_valid[["user_id", "products"]]

    df_user_product_valid.to_csv(savepath, index_label=False)
    return df_user_product_valid

In [8]:
def make_user_product_prior(savepath, df_orders, df_order_products_prior):
    # select eval set == prior
    df_order_user_prior = df_orders.loc[df_orders.eval_set == "prior"]
    # only select order_id and user_id
    df_order_user_prior = df_order_user_prior[["order_id", "user_id"]]
    # merge order_user with order_products
    df_merged = pd.merge(df_order_user_prior, df_order_products_prior[["order_id", "product_id"]], on="order_id")
    # select user_id and product_id only
    df_user_product_prior = df_merged[["user_id", "product_id"]]
    # group the same user_id and product_id
    df_user_product_prior = df_user_product_prior.groupby(["user_id", "product_id"]).size().reset_index().rename(
        columns={0: "quantity"})
    df_user_product_prior.to_csv(savepath, index_label=False)
    return df_user_product_prior

In [9]:
def get_user_product_matrix(savepath, df_user_product_prior):
    user_product_matrix = sparse.coo_matrix((df_user_product_prior["quantity"],
                                             (df_user_product_prior["product_id"].cat.codes.copy(),
                                              df_user_product_prior["user_id"].cat.codes.copy())))
    sparse.save_npz(savepath, user_product_matrix)
    return user_product_matrix

In [10]:
def spars(matrix):
    total_size = matrix.shape[0] * matrix.shape[1]
    actual_size = matrix.size
    sparsity = (1 - (actual_size / total_size)) * 100
    return sparsity

In [11]:
# Order datasets
df_order_product_prior = pd.read_csv("order_products__prior.csv")
df_order_product_train = pd.read_csv("order_products__train.csv")
df_orders = pd.read_csv("orders.csv")
print('orders shape', df_orders.shape)

# Products
df_products = pd.read_csv("products.csv")

# Merge prior orders and products
df_merged_order_products_prior = pd.merge(df_order_product_prior, df_products, on="product_id", how="left")

orders shape (3421083, 7)


In [13]:
# make user_product validation set with number of reorder matrix
valid_exist = False
if valid_exist:
    df_user_product_valid = pd.read_csv("data1/user_product_valid.csv")
else:
    df_user_product_valid = make_valid_set("data1/user_product_valid.csv",
                                           df_orders, df_order_product_train)
print('user_product_valid is done')
print('valid shape', df_user_product_valid.shape)

# make user_product train set with number of reorder matrix
train_exist = False
if train_exist:
    df_user_product_train = pd.read_csv("data1/user_product_prior.csv").astype('category')
else:
    df_user_product_train = make_user_product_prior('data1/user_product_prior.csv',
                                                    df_orders, df_order_product_prior).astype("category")
print('user_product_train is done')
print('train shape', df_user_product_train.shape)

# make utility matrix
matrix_exist = False
if matrix_exist:
    user_product_matrix = sparse.load_npz("data1/product_user_matrix.npz").tocsr().astype(np.float32)
else:
    user_product_matrix = get_user_product_matrix("data1/product_user_matrix.npz", df_user_product_train)
print('utility matrix is done')
print('sparsity of the utility matrix is', spars(user_product_matrix))



user_product_valid is done
valid shape (131209, 2)
user_product_train is done
train shape (13307953, 3)
utility matrix is done
sparsity of the utility matrix is 99.8700882953749


In [14]:
class TopNRecommendation(object):
    def __init__(self, product_factors, user_factors, product_user_matrix):
        self.product_factors = product_factors
        self.user_factors = user_factors
        self.product_user_matrix = product_user_matrix

    def recommend(self, user_id, N=10):
        """
        Finds top N Recommendations
        """
        scores = self.user_factors[user_id] @ self.product_factors.T
        best = np.argpartition(scores, -N)[-N:]
        return sorted(zip(best, scores[best]), key=lambda x: -x[1])

    def recommend_new(self, user_id, N=10):
        """
        Finds Top N new Recommendations
        """
        scores = self.user_factors[user_id] @ self.product_factors.T
        bought_indices = self.product_user_matrix.T[user_id].nonzero()[1]
        count = N + len(bought_indices)
        ids = np.argpartition(scores, -count)[-count:]
        best = sorted(zip(ids, scores[ids]), key=lambda x: -x[1])
        return list(itertools.islice((rec for rec in best if rec[0] not in bought_indices), N))


In [15]:
def map_matrix_id(df_user_product_prior):
    u_dict = {uid: i for i, uid in enumerate(df_user_product_prior["user_id"].cat.categories)}
    p_dict = dict(enumerate(df_user_product_prior["product_id"].cat.categories))
    return u_dict, p_dict

In [16]:
def actual_products(df_user_product_valid, df_products, user_id):
    # Actual
    row = df_user_product_valid.loc[df_user_product_valid.user_id == user_id]
    actual = list(row["products"])
    actual = actual[0][1:-1]
    actual = list(np.array([p.strip() for p in actual.strip().split(",")]).astype(np.int64))
    act_products = []
    for pid in actual:
        act_products.extend(df_products.loc[df_products.product_id == pid].product_name.tolist())
    print("Actual products bought by user {}\n{}\n\n".format(user_id, act_products))


In [17]:
def rec_products(recommendations, df_products, p_dict, user_id):
    # All Products Recommended
    rec_products = []
    for rec in recommendations:
        print(rec)
        rec_products.extend(df_products.loc[p_dict[rec[0]] == df_products.product_id].product_name.tolist())
    print("All products recommended to user {}\n{}\n\n".format(user_id, rec_products))


In [18]:
def get_k_popular(k, df_order_product_prior):
    popular_products = list(df_order_product_prior["product_id"].value_counts().head(k).index)
    return popular_products

In [19]:
def print_goods(popular_products):
    print('10 most popular products on the platform is,')
    popular_goods = []
    print(popular_products)
    for rec in popular_products:
        print(rec)
        popular_goods.extend(df_products.loc[p_dict[rec]+1 == df_products.product_id].product_name.tolist())
    print(popular_goods)


In [20]:
def recall(bought, pred):
    if len(bought) == 0:
        return 0
    bought, pred = set(bought), set(pred)
    return len(bought.intersection(pred)) / len(bought)

In [21]:
def precision(bought, pred):
    if len(pred) == 0:
        return 0
    bought, pred = set(bought), set(pred)
    return len(bought.intersection(pred))/len(pred)

In [22]:
def f1(bought, pred):
    a = precision(bought, pred)
    b = recall(bought, pred)
    if a+b == 0:
        return 0
    else:
        return 2 * (a * b)/(a + b)

In [23]:
def new_purchase_row(row):
    """
    Given a row in the validation set
    Returns the list of new products purchased
    """
    actual = row["products"][1:-1]  # Products purchased currently
    actual = set([int(p.strip()) for p in actual.strip().split(",")])
    liked = set([p_dict[i] for i in user_product_matrix[u_dict[row["user_id"]]].indices])  # User's purchase history
    new_purchase = actual - liked
    return new_purchase

In [24]:
def popular_recommend(row):
    """
    Given a row in the test dataset
    Returns the recall score when popular products are recommended
    """
    actual = new_purchase_row(row)
    return f1(actual, popular_products)

In [25]:
def svd_recommend_new(row):
    """
    Given a row in the test dataset
    Returns the recall score when our model recommends new products
    """
    actual = new_purchase_row(row)
    recommended = svd_rec.recommend_new(u_dict[row["user_id"]], N=10)
    recommended = [p_dict[r[0]] for r in recommended]
    return f1(actual, recommended)

In [26]:
def build_eval_df(user_product_validation, n1, n2):
    start = time.time()
    print("Making prediction on validation data ...")
    df_eval = user_product_validation[n1:n2].copy()
    df_eval["popular_score"] = df_eval.apply(popular_recommend, axis=1)
    df_eval["svd_new_score"] = df_eval.apply(svd_recommend_new, axis=1)
    print("Completed in {:.2f}s".format(time.time() - start))
    return df_eval

In [30]:
# Order datasets
df_order_products_prior = pd.read_csv("order_products__prior.csv")
df_order_products_train = pd.read_csv("order_products__train.csv")
df_orders = pd.read_csv("orders.csv")

# Products
df_products = pd.read_csv("products.csv")

In [28]:
# user_product sets
df_user_product_valid = pd.read_csv("data1/user_product_valid.csv").astype('category')
df_user_product_train = pd.read_csv("data1/user_product_prior.csv").astype('category')
product_user_matrix = sparse.load_npz("data1/product_user_matrix.npz").tocsr().astype(np.float32)
user_product_matrix = product_user_matrix.T.tocsr()
u_dict, p_dict = map_matrix_id(df_user_product_train)

In [31]:
# print out the 10 most popular products on the platform
top_k = 10
popular_products = get_k_popular(top_k, df_order_products_prior)
print_goods(popular_products)

10 most popular products on the platform is,
[24852, 13176, 21137, 21903, 47209, 47766, 47626, 16797, 26209, 27845]
24852
13176
21137
21903
47209
47766
47626
16797
26209
27845
['Raspberry Filmjolk Non-Fat Drinkable Yogurt', 'Country Loaf', 'Brown Fig', 'Chile Ancho', 'Premium Roast Decaf K Cup', 'Sahara Pita Pockets 100% Whole Wheat', 'Stainless Steel Sink Strainer', 'Grands! Homestyle Original', 'Quick Oats Hot Cereal', 'Diet Soda Cans']


In [34]:
# tune factor number
lambdas = [3, 5, 10, 25]
validation_mean = []
baseline_mean = []
for lam in lambdas:
    product_factors, sigma, user_factors = linalg.svds(product_user_matrix, lam)
    user_factors = user_factors.T * sigma
    svd_rec = TopNRecommendation(product_factors, user_factors, product_user_matrix)
    v_mean_per_lam = []
    b_mean_per_lam = []
    for i in range(1):
        k = randint(1, 2000)
        validation_score1 = build_eval_df(df_user_product_valid, 20001, 40000)
        validation_mean_svd_1 = np.mean(validation_score1["svd_new_score"])
        baseline_mean_1 = np.mean(validation_score1["popular_score"])
        v_mean_per_lam.append(validation_mean_svd_1)
        b_mean_per_lam.append(baseline_mean_1)
    v_mean = sum(v_mean_per_lam)/len(v_mean_per_lam)
    b_mean = sum(b_mean_per_lam)/len(b_mean_per_lam)
    validation_mean.append(v_mean)
    baseline_mean.append(b_mean)
print("svd f1 score", validation_mean)
print("Baseline f1 score", baseline_mean)

Making prediction on validation data ...
Completed in 500.04s
Making prediction on validation data ...
Completed in 488.16s
Making prediction on validation data ...
Completed in 483.12s
Making prediction on validation data ...
Completed in 495.62s
svd f1 score [0.020046823925704536, 0.01996845858230009, 0.01888991423579873, 0.016593414733182616]
Baseline f1 score [0.015445666149210972, 0.015445666149210972, 0.015445666149210972, 0.015445666149210972]


|     Method                | Validation Set 1 | Validation Set 2 |
|---------------------------|------------------|------------------|
|     Baseline              | 1.54%            | 1.53%            |
|     TF-IDF                | 15.86%           | 15.57%           |
|     SVD with 3 factors    | 2.00%            | 2.00%            |
|     SVD with 5 factors    | 1.99%            | 1.98%            |