# Building up sequence utils

### Load data and build matrices

In [1]:
cd ../../../../

/Users/archnnj/Development/recsys/recsys_polimi_challenge_2018/repo


In [37]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as pyplot
%matplotlib inline
import scipy.sparse as sps
from scipy.stats import iqr
import seaborn as sns
sns.set(style="white", color_codes=True)
sns.set_context(rc={"font.family":'sans',"font.size":12,"axes.titlesize":12,"axes.labelsize":12})

import src.utils.build_icm as build_icm
from src.utils.data_splitter import train_test_holdout, train_test_user_holdout, train_test_row_holdout

In [38]:
import sys
sys.path.append("src/libs/RecSys_Course_2018/") # go to parent dir

In [39]:
from SequenceAware.sars_tutorial_master.util.data_utils import create_seq_db_filter_top_k, sequences_to_spfm_format

#### Global vars

In [40]:
# Path-layout switch read by the data-loading cell: True selects the
# "../../../data/..." CSV paths, False selects the "data/..." CSV paths.
JUPYTER = False

#### Load data

In [41]:
# Resolve the CSV locations for the working directory in use.
if JUPYTER:
    # Jupyter
    tracks_csv_file = "../../../data/tracks.csv"
    interactions_csv_file = "../../../data/train.csv"
    playlist_id_csv_file = "../../../data/target_playlists.csv"
    sequential_csv_file = "../../../data/train_sequential.csv"
else:
    # PyCharm
    tracks_csv_file = "data/tracks.csv"
    interactions_csv_file = "data/train.csv"
    playlist_id_csv_file = "data/target_playlists.csv"
    sequential_csv_file = "data/train_sequential.csv"

# Raw tables.
tracks_df = pd.read_csv(tracks_csv_file)
interactions_df = pd.read_csv(interactions_csv_file)
playlist_id_df = pd.read_csv(playlist_id_csv_file)
train_sequential_df = pd.read_csv(sequential_csv_file)

# One implicit rating (1.0) per row of the interactions table.
userList = interactions_df["playlist_id"]
itemList = interactions_df["track_id"]
ratingList = np.ones(interactions_df.shape[0])

# Target playlists, split positionally at row 5000.
# NOTE(review): presumably the first 5000 targets are the playlists whose track
# order is known (see train_sequential.csv) - confirm against the dataset docs.
targetsList = playlist_id_df["playlist_id"]
targetsListOrdered = targetsList[:5000].tolist()
targetsListCasual = targetsList[5000:].tolist()

# Basic dataset sizes.
userList_unique = pd.unique(userList)
itemList_unique = tracks_df["track_id"]
numUsers = len(userList_unique)
numItems = len(itemList_unique)
# DataFrame.size is rows * columns, which over-counts the interactions by the
# number of columns; the number of interactions is the number of rows.
numberInteractions = len(interactions_df)

In [42]:
# User-rating matrix: rows are playlist ids, columns are track ids, and every
# listed interaction carries the implicit rating stored in ratingList.
URM_all = sps.coo_matrix((ratingList, (userList, itemList)))
# CSR view of the same matrix for row (playlist) slicing.
URM_all_csr = sps.csr_matrix(URM_all)

In [43]:
# Popularity of a track = number of playlists with a positive entry in its column.
# Kept in column (track id) order here ...
itemPopularity_unsorted = np.array((URM_all > 0).sum(axis=0)).squeeze()
# ... and as an ascending-sorted copy (np.sort does not touch its input).
itemPopularity = np.sort(itemPopularity_unsorted)

#### Prepare ICM and URM with splits

In [44]:

# Build ICM
ICM_all = build_icm.build_icm(tracks_df, split_duration_lenght=800, feature_weights={'albums': 1, 'artists': 0.5, 'durations': 0.1})

IDF_ENABLED = True

if IDF_ENABLED:
    num_tot_items = ICM_all.shape[0]
    # count how many items carry each feature (one count per ICM column)
    items_per_feature = (ICM_all > 0).sum(axis=0)
    IDF = np.array(np.log(num_tot_items / items_per_feature))[0]
    # The per-entry weights below are laid out column by column, so the matrix
    # being scaled MUST store its data in CSC order. Scaling a plain copy of
    # ICM_all is only correct if build_icm happens to return CSC; work on an
    # explicit CSC copy instead.
    ICM_idf = sps.csc_matrix(ICM_all, copy=True)
    # number of stored entries in each column
    col_nnz = np.diff(ICM_idf.indptr)
    # scale every stored entry by the IDF of its column
    ICM_idf.data *= np.repeat(IDF, col_nnz)
    # use IDF features, handed back in the sparse format ICM_all had before
    ICM_all = ICM_idf.asformat(ICM_all.format)

# #### Build URM

URM_all = sps.coo_matrix((ratingList, (userList, itemList)))
URM_all_csr = URM_all.tocsr()

URM_IDF_ENABLED = False

if URM_IDF_ENABLED:
    # URM rows are playlists: this is the "document" count of the IDF formula
    num_tot_items = URM_all.shape[0]
    # count how many playlists contain each track (one count per URM column)
    items_per_feature = (URM_all > 0).sum(axis=0)
    IDF = np.array(np.log(num_tot_items / items_per_feature))[0]
    # URM_all is a COO matrix, whose data is NOT in column order: scale an
    # explicit CSC copy so that weights and entries line up.
    URM_idf = sps.csc_matrix(URM_all, copy=True)
    # number of stored entries in each column
    col_nnz = np.diff(URM_idf.indptr)
    # scale every stored entry by the IDF of its column
    URM_idf.data *= np.repeat(IDF, col_nnz)
    # use IDF-weighted ratings, handed back in the format URM_all had before
    URM_all = URM_idf.asformat(URM_all.format)

# #### Train/test split: ratings and user holdout

seed = 0
# ratings holdout
# URM_train, URM_test_pred = train_test_holdout(URM_all, train_perc=0.8, seed=seed)
# URM_valid=URM_test_pred
# URM_test_known = None

# user holdout
# URM_train, URM_test_known, URM_test_pred = train_test_user_holdout(URM_all, user_perc=0.8, train_perc=0.8, seed=seed)

# row holdout
# URM_train, URM_test_pred = train_test_row_holdout(URM_all, userList_unique, train_sequential_df, train_perc=0.8, seed=seed, targetsListOrdered=targetsListOrdered, nnz_threshold=10)
# URM_test_known = None

# row holdout - validation
# URM_train_val, URM_test_pred = train_test_row_holdout(URM_all, userList_unique, train_sequential_df, train_perc=0.8,
#                                                       seed=seed, targetsListOrdered=targetsListOrdered,
#                                                       nnz_threshold=10)
# URM_train, URM_valid = train_test_holdout(URM_train_val, train_perc=0.7, seed=seed)
# URM_test_known = None

# Active strategy: two chained row holdouts.
# First call: train_perc=0.6 separates URM_train from a combined validation+test part.
URM_train, URM_valid_test_pred = train_test_row_holdout(URM_all, userList_unique, train_sequential_df,
                                                        train_perc=0.6,
                                                        seed=seed, targetsListOrdered=targetsListOrdered,
                                                        nnz_threshold=2)
# Second call: train_perc=0.5 divides that combined part into validation and test.
URM_valid, URM_test_pred = train_test_row_holdout(URM_valid_test_pred, userList_unique, train_sequential_df,
                                                  train_perc=0.5,
                                                  seed=seed, targetsListOrdered=targetsListOrdered,
                                                  nnz_threshold=1)


# Aliases under the names URM_validation / URM_test.
URM_validation = URM_valid
URM_test = URM_test_pred

### Utils

In [45]:
def get_urm_tuples_dict_uip_generator(URM, ICM=None):
    """Lazily yield one dict per stored (user, item) entry of ``URM``.

    Parameters
    ----------
    URM : scipy.sparse matrix
        User-rating matrix (rows are users, columns are items). It is converted
        to CSR internally.
    ICM : scipy.sparse matrix, optional
        Item-content matrix whose rows are items. When given, each dict also
        carries the ``'album'``, ``'artist'`` and ``'duration'`` feature columns
        of the interaction's item, read as the first three stored column
        indices of that item's ICM row.
        NOTE(review): this assumes every item row stores at least three
        features and that they come in album, artist, duration order - confirm
        against ``build_icm``.

    Returns
    -------
    generator of dict
        ``{'user', 'item'}`` dicts, plus the three feature keys when ``ICM`` is
        supplied, in CSR order (user by user).
    """
    URM = URM.tocsr()
    if ICM is None:
        return ({'user': u, 'item': URM.indices[ind]}
                for u in range(URM.shape[0])
                for ind in range(URM.indptr[u], URM.indptr[u + 1]))

    # Row-wise access to the item features needs CSR, same as for the URM.
    ICM = ICM.tocsr()

    def _tuples_with_features():
        for u in range(URM.shape[0]):
            for ind in range(URM.indptr[u], URM.indptr[u + 1]):
                item = URM.indices[ind]
                # ICM rows are items: the features must be looked up with the
                # item index of the interaction, not with the user index.
                feat_start = ICM.indptr[item]
                yield {'user': u,
                       'item': item,
                       'album': ICM.indices[feat_start],
                       'artist': ICM.indices[feat_start + 1],
                       'duration': ICM.indices[feat_start + 2]}

    return _tuples_with_features()

In [46]:
def get_all_user_tuples_dict_uip_generator(u_arr, URM, ICM=None):
    """Lazily yield a ``{'user', 'item'}`` dict for every user/item combination.

    Each user in ``u_arr`` is paired with every column index of ``URM``
    (``0 .. URM.shape[1] - 1``), whether or not the pair is stored in the
    matrix. ``ICM`` is accepted but not used.
    """
    n_items = URM.tocsr().shape[1]
    users = iter(u_arr)

    def _all_pairs():
        for user in users:
            for item in range(n_items):
                yield {'user': user, 'item': item}

    return _all_pairs()


In [88]:
def get_sequences_df(URM, train_sequential_df, user_arr):
    """Build, for each requested playlist, its ordered list of tracks present in ``URM``.

    Parameters
    ----------
    URM : scipy.sparse matrix
        User-rating matrix (rows are playlists, columns are tracks). It is
        converted to CSR internally.
    train_sequential_df : pandas.DataFrame
        Table with ``'playlist_id'`` and ``'track_id'`` columns; the row order
        within a playlist is taken as the track order.
    user_arr : iterable
        Playlist ids (row indices of ``URM``) to build sequences for.

    Returns
    -------
    pandas.DataFrame
        Columns ``['user_id', 'sequence']``, one row per entry of ``user_arr``
        in the given order. ``sequence`` lists the playlist's tracks in table
        order, keeping only tracks stored in that playlist's URM row; it is
        empty for playlists missing from ``train_sequential_df``. An empty
        ``user_arr`` yields an empty frame with the same two columns.
    """
    URM = URM.tocsr()
    user_arr = list(user_arr)

    # Group the ordered tracks once instead of rescanning the table per user.
    wanted_rows = train_sequential_df[train_sequential_df['playlist_id'].isin(user_arr)]
    tracks_by_playlist = {
        playlist_id: np.asarray(tracks)
        for playlist_id, tracks in wanted_rows.groupby('playlist_id', sort=False)['track_id']
    }

    sequences_arr = []
    for user_id in user_arr:
        row_items = URM.indices[URM.indptr[user_id]:URM.indptr[user_id + 1]]
        original_order_items = tracks_by_playlist.get(user_id)
        if original_order_items is None:
            sequence = []
        else:
            # keep only the tracks stored in this URM row, preserving table order
            in_urm = np.isin(original_order_items, row_items)
            sequence = original_order_items[in_urm].tolist()
        sequences_arr.append({'user_id': user_id, 'sequence': sequence})

    # Passing the columns explicitly keeps the schema stable when no users are given.
    return pd.DataFrame(sequences_arr, columns=['user_id', 'sequence'])

#### Create Sequence

In [91]:
# One row per playlist in targetsListOrdered: its tracks in train_sequential_df
# order, restricted to the tracks stored in that playlist's row of URM_train.
sequences = get_sequences_df(URM_train, train_sequential_df, targetsListOrdered)

In [92]:
sequences.head()

Unnamed: 0,user_id,sequence
0,7,"[12493, 17495, 13424, 7109, 14714, 15650, 1574..."
1,25,"[13805, 7035, 2336, 16663, 4720, 19967, 12384,..."
2,29,"[17056, 16912, 14907, 4148, 15430, 4543, 16387..."
3,34,"[16759, 9266, 13302, 18536, 8248, 523, 11656, ..."
4,50,"[14417, 19813, 5281, 6245, 2388, 13452, 153]"
