### Load data and build matrices

In [1]:
cd ../../../../

/Users/archnnj/Development/recsys/recsys_polimi_challenge_2018/repo


In [2]:
%load_ext autoreload
%autoreload 2

In [3]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as pyplot
%matplotlib inline
import scipy.sparse as sps
from scipy.stats import iqr

import src.utils.build_icm as build_icm
from src.utils.data_splitter import train_test_holdout, train_test_user_holdout, train_test_row_holdout

In [4]:
import sys
# Make the vendored RecSys_Course_2018 package importable. The path is relative,
# so it resolves against the current working directory.
sys.path.append("src/libs/RecSys_Course_2018/") # add the library root to the module search path

In [5]:
from SequenceAware.sars_tutorial_master.util.data_utils import create_seq_db_filter_top_k, sequences_to_spfm_format
from SequenceAware.sars_tutorial_master.util.split import last_session_out_split
from SequenceAware.sars_tutorial_master.util.metrics import precision, recall, mrr
from SequenceAware.sars_tutorial_master.util import evaluation
from SequenceAware.sars_tutorial_master.recommenders.FSMRecommender import FSMRecommender

#### Global vars

In [6]:
# Selects which relative data paths the loading cell uses:
# True -> "../../../data/...", False -> "data/...".
JUPYTER = False

#### Load data

In [7]:
# Pick the directory that holds the CSV files; only the prefix differs between
# the two launch environments.
if JUPYTER:
    # Jupyter
    data_dir = "../../../data"
else:
    # PyCharm
    data_dir = "data"

tracks_csv_file = data_dir + "/tracks.csv"
interactions_csv_file = data_dir + "/train.csv"
playlist_id_csv_file = data_dir + "/target_playlists.csv"
sequential_csv_file = data_dir + "/train_sequential.csv"

# Raw tables.
tracks_df = pd.read_csv(tracks_csv_file)
interactions_df = pd.read_csv(interactions_csv_file)
playlist_id_df = pd.read_csv(playlist_id_csv_file)
train_sequential_df = pd.read_csv(sequential_csv_file)

# Interaction triples: one implicit rating of 1 per (playlist, track) row.
userList = interactions_df["playlist_id"]
itemList = interactions_df["track_id"]
ratingList = np.ones(interactions_df.shape[0])

# Target playlists, split at position 5000 into two groups.
# NOTE(review): presumably the first 5000 targets are the sequentially ordered
# playlists and the remainder are unordered -- confirm against the challenge data.
targetsList = playlist_id_df["playlist_id"]
targetsListOrdered = targetsList[:5000].tolist()
targetsListCasual = targetsList[5000:].tolist()

# Dataset dimensions.
userList_unique = pd.unique(userList)
itemList_unique = tracks_df["track_id"]
numUsers = len(userList_unique)
numItems = len(itemList_unique)
# One interaction per row. DataFrame.size would return rows * columns and
# therefore over-count, so the row count is used instead.
numberInteractions = interactions_df.shape[0]

In [8]:
# User-rating matrix: rows are indexed by playlist_id, columns by track_id,
# values are the implicit ratings. Built in COO form, then converted to CSR.
URM_all = sps.coo_matrix(
    (ratingList, (userList, itemList))
)
URM_all_csr = sps.csr_matrix(URM_all)

In [9]:
# Popularity of each column of the URM: the number of rows holding a positive
# entry in that column. Kept both in column order and sorted ascending.
itemPopularity_unsorted = np.array((URM_all > 0).sum(axis=0)).squeeze()
itemPopularity = np.sort(itemPopularity_unsorted)

#### Prepare ICM and URM with splits

In [10]:
import pickle

# Restore the train/test URM splits from pickle dumps located in the current
# working directory.
# NOTE(review): pickle.load can execute arbitrary code while deserializing;
# only load dump files that were produced locally by a trusted run.
with open('dump_URM_train', 'rb') as dump_file:
    URM_train = pickle.load(dump_file)
with open('dump_URM_test', 'rb') as dump_file:
    URM_test = pickle.load(dump_file)

# 3. Fitting the recommender

Here we fit the recommendation algorithm over the sessions in the training set.

This algorithm extracts Frequent Sequential Patterns from all the training sequences. Patterns having support lower than `minsup` are discarded (support = # occurrences of a pattern in the training data).  
Recommendations are then generated by looking for patterns having a _prefix_ corresponding to the last `[max_context, min_context]` elements in the user profile, taken in order. Matches are then sorted by decreasing _confidence_ score (ratio between the support of the matched rule and the support of the context). Matches having confidence below `minconf` are discarded.

The class `FSMRecommender` has the following initialization hyper-parameters:
* `minsup`: the minimum support threshold. It is interpreted as relative count if in \[0-1\], otherwise as an absolute count. NOTE: Relative count required for training with SPMF (faster).
* `minconf`: the minimum confidence threshold. Used to filter irrelevant recommendations.
* `max_context`: the maximum number of items in the user profile (starting from the last) that will be used for lookup in the database of frequent sequences.
* `min_context`: the minimum number of items in the user profile (starting from the last) that will be used for lookup in the database of frequent sequences.
* `spmf_path`: path to SPMF jar file. If provided, the SPMF library will be used for pattern extraction (algorithm: Prefix Span). Otherwise, use pymining, which can be significantly slower depending on the sequence database size.
* `db_path`: path to the sequence database file


In [11]:
# Filesystem locations handed to the recommender: the SPMF jar and the
# directory used for its temporary files.
spmf_path = 'src/libs/RecSys_Course_2018/SequenceAware/sars_tutorial_master/spmf/spmf.jar'
tmp_path = 'src/libs/RecSys_Course_2018/SequenceAware/tmp/'

# Hyper-parameters for the frequent-sequence recommender; alternative values
# are kept in the trailing comments.
fsm_params = dict(
    minsup=0.0005,  # 0.002
    minconf=1e-5,  # 0.1
    min_context=1,
    max_context=10,
    spmf_path=spmf_path,
    tmp_path=tmp_path,
)

# Instantiate the recommender; fitting is done in a separate cell.
recommender = FSMRecommender(URM_train, train_sequential_df, targetsListOrdered, **fsm_params)

In [12]:
# calling fit() without arguments uses SPMF and the sequences stored in db_path
recommender.fit()
# calling fit() with an argument will use pymining.seqmining for pattern extraction (slower, MANDATORY ON BINDER)
#recommender.fit(spmf=False)

2019-01-03 02:49:25,399 - INFO - Using SPFM (Java) for Frequent Sequence Mining


java -jar src/libs/RecSys_Course_2018/SequenceAware/sars_tutorial_master/spmf/spmf.jar run PrefixSpan src/libs/RecSys_Course_2018/SequenceAware/tmp/sequences.txt src/libs/RecSys_Course_2018/SequenceAware/tmp/tmp_output.txt 0.05%
std err:  None


2019-01-03 02:49:27,188 - INFO - 99335 frequent sequences found
2019-01-03 02:49:27,190 - INFO - Building the prefix tree
2019-01-03 02:55:45,431 - INFO - Training completed


In [18]:
#recommender.show_tree()

In [22]:
# NOTE(debug): the prefix tree looks correct, so the empty result is suspected
# to originate inside recommend() -- TODO investigate.
recommendations = recommender.recommend(targetsListOrdered) # returns nothing useful so far

[12493, 17495, 13424, 7109, 14714, 15650, 15740, 159, 19243, 17920, 6620, 1063, 13808, 20023, 5893, 3483, 1775, 7660, 7478, 5649, 1461, 18190, 2494, 16890, 2204, 12393, 18418, 13640]
12493
[]


In [23]:
# Quick sanity check: np.any is True only if at least one element of
# `recommendations` is truthy.
np.any(recommendations)

False

I didn't get any recommendation for any user, even after increasing the parameters. The cause is still unknown; something is missing. **TODO**: investigate.

In [47]:
# Debugging aid: fetch the stored training sequence of the first target
# playlist and work out how many trailing items the lookup may use.
train_sequences = recommender.train_data
first_target_rows = train_sequences[train_sequences.user_id == targetsListOrdered[0]]
user_profile = first_target_rows['sequence'].values[0]

n = len(user_profile)
# Context length is capped by the recommender's max_context.
c = n if n < recommender.max_context else recommender.max_context

In [48]:
# The last `c` items of the profile, i.e. at most max_context trailing items.
user_profile[n - c:n]

[7478, 5649, 1461, 18190, 2494, 16890, 2204, 12393, 18418, 13640]

In [58]:
# Show the recommender's root_node attribute (displayed as a UUID).
recommender.root_node

UUID('e2e7a3e9-e11d-44f6-b673-75f3f45402b0')

In [62]:
# List the nodes of the prefix tree. The original line ended in a dangling
# attribute access (`recommender.tree.`), which is a syntax error.
# NOTE(review): all_nodes() is inferred from the captured output format
# (a list of Node(tag=..., identifier=..., data=...)) -- confirm that
# recommender.tree is a treelib Tree. The full list is large; consider slicing it.
recommender.tree.all_nodes()

[Node(tag=root, identifier=e2e7a3e9-e11d-44f6-b673-75f3f45402b0, data=None),
 Node(tag=1, identifier=726c2477-6e46-425d-90dd-8eaa95770177, data={'support': 6}),
 Node(tag=6, identifier=38fdc77d-6b66-40c8-9ffb-e09980232c98, data={'support': 24}),
 Node(tag=12295, identifier=ed0be943-133e-410a-b7a1-b270a4e24680, data={'support': 6}),
 Node(tag=3212, identifier=f15405e3-267c-4bb9-95b9-9f0d9c4834d3, data={'support': 6}),
 Node(tag=7323, identifier=72826026-9ca9-4b89-ba25-56323ca43ab0, data={'support': 7}),
 Node(tag=13100, identifier=3c86b624-20d2-45b1-b920-ead8d9fa556b, data={'support': 5}),
 Node(tag=10694, identifier=9e5b03ef-496c-4b15-84fb-fc21a1ef743f, data={'support': 5}),
 Node(tag=15312, identifier=c721334c-996e-4948-b8ea-566daf2460f3, data={'support': 5}),
 Node(tag=14679, identifier=6e8a00f5-bf02-40f0-8b19-9095634bb1ae, data={'support': 5}),
 Node(tag=9060, identifier=bb6aec1a-a903-42fd-b7f8-351b5a74349f, data={'support': 8}),
 Node(tag=9961, identifier=c4b0e063-f61b-4f3d-9b29-d2