# Collaborative Filtering - Implicit model (NJ)

Main paper: http://yifanhu.net/PUB/cf.pdf

https://jessesw.com/Rec-System/

https://nbviewer.jupyter.org/github/jmsteinw/Notebooks/blob/master/RecEngine_NB.ipynb

https://towardsdatascience.com/building-a-collaborative-filtering-recommender-system-with-clickstream-data-dffc86c8c65

https://www.ethanrosenthal.com/2016/10/19/implicit-mf-part-1/



In [None]:
# Mount Google Drive inside the Colab VM so the project data files can be read.
from google.colab import drive
drive.mount('/content/drive')
# Nikhil's working path (relative, so it is resolved against the kernel's
# current directory) -- other collaborators must point this at their own copy.
%cd drive/My\ Drive/'Captstone(297)'/data

Mounted at /content/drive
/content/drive/.shortcut-targets-by-id/1auWpfpINZvmJbl0I50kYmpS29bbnesqp/Captstone(297)/data


In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
pd.set_option('display.max_rows', None)
import scipy.sparse as sparse

In [None]:
! pip install implicit
import implicit

Collecting implicit
[?25l  Downloading https://files.pythonhosted.org/packages/bc/07/c0121884722d16e2c5beeb815f6b84b41cbf22e738e4075f1475be2791bc/implicit-0.4.4.tar.gz (1.1MB)
[K     |▎                               | 10kB 27.4MB/s eta 0:00:01[K     |▋                               | 20kB 19.4MB/s eta 0:00:01[K     |▉                               | 30kB 16.3MB/s eta 0:00:01[K     |█▏                              | 40kB 13.0MB/s eta 0:00:01[K     |█▌                              | 51kB 10.1MB/s eta 0:00:01[K     |█▊                              | 61kB 10.9MB/s eta 0:00:01[K     |██                              | 71kB 10.6MB/s eta 0:00:01[K     |██▍                             | 81kB 10.7MB/s eta 0:00:01[K     |██▋                             | 92kB 9.7MB/s eta 0:00:01[K     |███                             | 102kB 9.8MB/s eta 0:00:01[K     |███▎                            | 112kB 9.8MB/s eta 0:00:01[K     |███▌                            | 122kB 9.8MB/s eta 0:0

In [None]:
# low_memory=False makes pandas infer each column's dtype from the whole file
# rather than chunk by chunk, which avoids columns ending up with mixed types
# (the warning this cell previously emitted looks like pandas' DtypeWarning).
df = pd.read_csv("merged_user_data_rm_suspicious.csv", low_memory=False)

  interactivity=interactivity, compiler=compiler, result=result)


## Subsetting to Counties in NJ with most data

In [None]:
# NJ counties kept for modelling.
nj_n_counties = [
    'Hudson', 'Bergen', 'Passaic', 'Essex', 'Union', 'Middlesex', 'Monmouth',
    'Mercer', 'Hunterdon', 'Warren', 'Sussex', 'Morris', 'Somerset',
]

# Keep rows whose state is NJ and whose county is in the list above; the
# explicit copy leaves `df` untouched by later edits to `nj_df`.
in_selected_county = df['county'].isin(nj_n_counties)
in_nj = df['state'] == 'NJ'
nj_df = df[in_selected_county & in_nj].copy(deep=True)

In [None]:
# (rows, columns) of the NJ subset.
nj_df.shape

(108847, 63)

## Bringing in Score Logic from `AllData` Notebook

In [None]:
# Action names grouped into four engagement levels (level 0 = lowest intent,
# level 3 = purchase). The groupings feed the per-level counts and weights
# computed in later cells.
special_actions_level0 = [
    'scroll',
    'pageview',
    'ViewContent',
    'view_item',
    'drag',
    'click',
]

# open-form (questions), (showing, showing-agent) = enter-flow = phone-click
# nav-tab (schools-and-comps), success(favorite), change(payment-calculator)
special_actions_level1 = [
    'open-form',
    'enter-flow',
    'nav-tab',
    'change',
    'success',
    'Search',
    'view_search_results',
    'phone-click',
]

# 'nav' (insidere-details) = lead, submit = submit-success  (showing, showing-agent, buyer-lead-form)
special_actions_level2 = [
    'nav',
    'lead',
    'submit',
    'submit-success',
    'add_to_cart',
    'InitiateCheckout',
]

special_actions_level3 = [
    'Purchase',
    'purchase',
]

In [None]:
# Share (in percent) of all logged actions that falls into each action level.
# level_weights[i] holds the share for special_actions_level{i}; the printed
# label is 1-based.
action_ct = nj_df['action'].value_counts()
total_ct = action_ct.sum()

action_levels = [
    special_actions_level0,
    special_actions_level1,
    special_actions_level2,
    special_actions_level3,
]

level_weights = []
for level_no, level_actions in enumerate(action_levels, start=1):
  # futureproofing: mask the index instead of looking labels up directly, so
  # actions that never occur in the data simply contribute nothing
  in_level = action_ct.index.isin(level_actions)
  level_share = 100 * action_ct[in_level].sum() / total_ct
  level_weights.append(level_share)
  print("Level {0:d}: {1:.4f} %".format(level_no, level_share))

Level 1: 99.3707 %
Level 2: 0.5448 %
Level 3: 0.0772 %
Level 4: 0.0073 %


In [None]:
# One row per (ip, rexUrl) pair: how often each level-0 action occurred, plus
# the total number of actions belonging to each action level.
group_df_pre = nj_df.groupby(['ip', 'rexUrl']).agg(
    scroll_ct=('action', lambda acts: (acts == "scroll").sum()),
    pageview_ct=('action', lambda acts: (acts == "pageview").sum()),
    viewcontent_ct=('action', lambda acts: (acts == 'ViewContent').sum()),
    view_item_ct=('action', lambda acts: (acts == 'view_item').sum()),
    drag_ct=('action', lambda acts: (acts == 'drag').sum()),
    click_ct=('action', lambda acts: (acts == 'click').sum()),
    level0_ct=('action', lambda acts: np.isin(acts, special_actions_level0).sum()),
    level1_ct=('action', lambda acts: np.isin(acts, special_actions_level1).sum()),
    level2_ct=('action', lambda acts: np.isin(acts, special_actions_level2).sum()),
    level3_ct=('action', lambda acts: np.isin(acts, special_actions_level3).sum()),
)

In [None]:
def calculate_score(row, weights=None, max_weight=6):
  """Weighted engagement score for one aggregated (ip, rexUrl) row.

  Every action level ``i`` contributes ``row['level{i}_ct'] * w_i`` where
  ``w_i`` is the inverse of that level's share of all actions, clipped to
  ``[0, max_weight]`` so that very rare levels cannot dominate the score.

  Parameters
  ----------
  row : mapping or pandas.Series
      Must provide ``level{i}_ct`` for every index ``i`` of ``weights``.
  weights : sequence of float, optional
      Per-level shares (in percent). Defaults to the notebook-level
      ``level_weights`` computed in an earlier cell.
  max_weight : float, optional
      Upper bound applied to each inverse share (default 6).

  Returns
  -------
  float
      Sum of the weighted level counts.
  """
  if weights is None:
    weights = level_weights

  curr_score = 0
  for i, level_share in enumerate(weights):
    # A level with no observed actions has share 0. Its inverse is unbounded,
    # so use the cap directly instead of dividing by zero.
    inverse_share = 1 / level_share if level_share else max_weight
    curr_weight = np.clip(inverse_share, 0, max_weight)
    curr_score += row[f'level{i}_ct'] * curr_weight
  return curr_score

# Score every (ip, rexUrl) row with the level-weighted rule (Selina's score rule)
group_df_pre['score'] = group_df_pre.apply(calculate_score, axis=1)

## input for CF model
cf_input_cols = ['ip', 'rexUrl', 'score']
df_grp = group_df_pre.reset_index()[cf_input_cols]
print("Input for CF model: ", df_grp.shape)
df_grp.head()

Input for CF model:  (4959, 3)


Unnamed: 0,ip,rexUrl,score
0,10.70.33.53,57-miry-brook-rd,20.288381
1,100.1.107.229,106-108-parkview-ave,0.03019
2,100.1.107.229,265-custer-ave-apt-116,0.06038
3,100.1.107.229,433-w-1st-ave,0.171077
4,100.1.107.229,911-kennedy-blvd-913,0.201267


In [None]:
# View frequency approach: interaction strength is the number of non-null
# event_stamp values logged per (ip, rexUrl) pair. This reassigns df_grp.
df_grp = (
    nj_df
    .groupby(['ip', 'rexUrl'])['event_stamp']
    .count()
    .reset_index()
)
df_grp.head()

Unnamed: 0,ip,rexUrl,event_stamp
0,10.70.33.53,57-miry-brook-rd,49
1,100.1.107.229,106-108-parkview-ave,3
2,100.1.107.229,265-custer-ave-apt-116,6
3,100.1.107.229,433-w-1st-ave,17
4,100.1.107.229,911-kennedy-blvd-913,20


## Implicit ALS

In [None]:
 # Encode ips and listing urls as integer category codes, and keep
 # code -> original value lookups for decoding model output later.
 ip_cats = df_grp['ip'].astype('category')
 url_cats = df_grp['rexUrl'].astype('category')

 ip_id = dict(enumerate(ip_cats.cat.categories))
 url_id = dict(enumerate(url_cats.cat.categories))

 df_grp['user_id'] = ip_cats.cat.codes
 df_grp['house_id'] = url_cats.cat.codes

In [None]:
# Size of the item and user sets behind the interaction matrix.
n_unique_homes = df_grp['rexUrl'].nunique()
n_unique_users = df_grp['ip'].nunique()
print('Num Unique Homes', n_unique_homes)
print('Num Unique Users', n_unique_users)

Num Unique Homes 107
Num Unique Users 3518


In [None]:
# Function from implicit package that hasn't been pushed to the package's master yet, but exists on a branch
import sklearn
def train_test_split_imp(ratings, train_percentage=0.8, random_state=None):
    """Randomly assign every stored entry of a sparse matrix to train or test.

    Parameters
    ----------
    ratings : sparse matrix
        Matrix to split; it is converted to COO format internally.
    train_percentage : float
        Fraction of stored entries expected to land in the training matrix.
    random_state : int, None or RandomState
        Passed to ``sklearn.utils.check_random_state`` to obtain the random
        generator used for the assignment.

    Returns
    -------
    (train, test) : csr_matrix, csr_matrix
        Two matrices with the shape and dtype of ``ratings``. Negative values
        in ``test`` are set to zero and removed from its stored entries.
    """
    coo = ratings.tocoo()
    rng = sklearn.utils.check_random_state(random_state)

    # One uniform draw per stored entry decides which side the entry goes to.
    draws = rng.random_sample(len(coo.data))
    goes_to_train = draws < train_percentage
    goes_to_test = draws >= train_percentage

    def _subset(mask):
        # Rebuild a CSR matrix of the original shape from the selected entries.
        selected = (coo.data[mask], (coo.row[mask], coo.col[mask]))
        return sparse.csr_matrix(selected, shape=coo.shape, dtype=coo.dtype)

    train = _subset(goes_to_train)
    test = _subset(goes_to_test)

    # Zero out negative test values, then drop them from the sparse structure.
    test.data[test.data < 0] = 0
    test.eliminate_zeros()

    return train, test

In [None]:
# Build the interaction counts in both orientations (house x user and
# user x house) and split each with the same fraction and seed. Both matrices
# are built from df_grp in the same entry order, so the same seed is expected
# to send matching entries to the same side of the split.
split_fraction, split_seed = 0.8, 297
view_counts = df_grp.event_stamp

house_user_counts = sparse.coo_matrix((view_counts, (df_grp.house_id, df_grp.user_id)))
user_house_counts = sparse.coo_matrix((view_counts, (df_grp.user_id, df_grp.house_id)))

house_user_sparse_train, house_user_sparse_test = train_test_split_imp(
    house_user_counts, split_fraction, split_seed)

user_house_sparse_train, user_house_sparse_test = train_test_split_imp(
    user_house_counts, split_fraction, split_seed)

In [None]:
# NOTE: this cell used to call `model.fit(house_user_sparse_train*alpha, ...)`,
# but neither `model` nor `alpha` exists yet at this point of the notebook, so
# it raised NameError on a fresh kernel (Restart & Run All). Models are
# constructed and fitted in the grid-search and final-model cells below, so no
# fit is performed here.

## Manual Attempt at GridSearch to Find Optimal hyperparameters

In [None]:
# `evaluation` was previously only imported further down the notebook, so this
# cell failed with NameError on a fresh kernel. Import it where it is first used.
from implicit import evaluation

# Exhaustive sweep over (factors, regularization, alpha). Every entry of
# results6 is a tuple (factors, regularization, alpha, MAP@10).
# NOTE(review): MAP@10 is computed on the same test split that is later used
# to report the final model, so that reported figure is optimistic -- consider
# a separate validation split.
results6 = []
for f in [6,7,8,9]:
  for reg in [0.4,0.41,0.42,0.43,0.45,0.46]:
    for alpha in [0.6,0.7,0.8,0.9,1]:
      model = implicit.als.AlternatingLeastSquares(factors=f, regularization=reg, iterations=20,use_gpu=False)
      # alpha scales the raw counts before they are handed to ALS
      model.fit(house_user_sparse_train*alpha, show_progress=False)

      mapk = evaluation.mean_average_precision_at_k(model,user_house_sparse_train, user_house_sparse_test,10, show_progress=False)
      results6.append((f,reg,alpha,mapk))
      print((f,reg,alpha,mapk))

(6, 0.4, 0.6, 0.08442246365061351)
(6, 0.4, 0.7, 0.08363489360935444)
(6, 0.4, 0.8, 0.06803663765922571)
(6, 0.4, 0.9, 0.07143352611570555)
(6, 0.4, 1, 0.08356770504897638)
(6, 0.41, 0.6, 0.07372642019350316)
(6, 0.41, 0.7, 0.06890078013801065)
(6, 0.41, 0.8, 0.0883889159144551)
(6, 0.41, 0.9, 0.07686318757544645)
(6, 0.41, 1, 0.0684731005531233)
(6, 0.42, 0.6, 0.07766111741707664)
(6, 0.42, 0.7, 0.07747516650751618)
(6, 0.42, 0.8, 0.07782327081419027)
(6, 0.42, 0.9, 0.08058550889141135)
(6, 0.42, 1, 0.08720733563951502)
(6, 0.43, 0.6, 0.09058102715168552)
(6, 0.43, 0.7, 0.08407225735545829)
(6, 0.43, 0.8, 0.07614340666270296)
(6, 0.43, 0.9, 0.07638337116912607)
(6, 0.43, 1, 0.08050623389726681)
(6, 0.45, 0.6, 0.07393035061167871)
(6, 0.45, 0.7, 0.08046817289155545)
(6, 0.45, 0.8, 0.06747202853899796)
(6, 0.45, 0.9, 0.0747169071942058)
(6, 0.45, 1, 0.0894661099400033)
(6, 0.46, 0.6, 0.08639195178638995)
(6, 0.46, 0.7, 0.07968916094625522)
(6, 0.46, 0.8, 0.08102580040718527)
(6, 0.46, 0

In [None]:
# Tabulate the sweep and show the five configurations with the highest MAP@10.
grid_columns = ['factors', 'regularization', 'alpha', 'MAP@10']
results_df6 = pd.DataFrame(results6, columns=grid_columns)
results_df6.sort_values(by='MAP@10', ascending=False).head(5)

Unnamed: 0,factors,regularization,alpha,MAP@10
96,9,0.41,0.7,0.094116
71,8,0.42,0.7,0.09386
45,7,0.43,0.6,0.092595
97,9,0.41,0.8,0.091375
52,7,0.45,0.8,0.090857


In [None]:
# FINAL chosen hyperparameters from GridSearch above (top row of results_df6)
final_als_params = dict(factors=9, regularization=0.41, iterations=20, use_gpu=False)
final_alpha = 0.7

model = implicit.als.AlternatingLeastSquares(**final_als_params)
model.fit(house_user_sparse_train * final_alpha, show_progress=True)

HBox(children=(FloatProgress(value=0.0, max=20.0), HTML(value='')))




### Evaluation

https://stats.stackexchange.com/questions/226825/what-metric-should-i-use-for-assessing-implicit-matrix-factorization-recommender

In [None]:
from implicit import evaluation

In [None]:
# AUC@10 of the final model, given the user x house train and test matrices.
evaluation.AUC_at_k(model,user_house_sparse_train, user_house_sparse_test,10)

HBox(children=(FloatProgress(value=0.0, max=3518.0), HTML(value='')))




0.5619146736654896

In [None]:
# MAP@10 of the final model on the same train/test matrices (the metric the
# grid search ranked configurations by).
evaluation.mean_average_precision_at_k(model,user_house_sparse_train, user_house_sparse_test,10)

HBox(children=(FloatProgress(value=0.0, max=3518.0), HTML(value='')))




0.08379006540187017

## Predictions (Implicit ALS)

In [None]:
def make_recs(user_id):
  """Return the fitted model's recommendations for one user.

  Parameters
  ----------
  user_id : int
      Row index of the user in ``user_house_sparse_train`` (the integer code
      stored in ``df_grp['user_id']``).

  Returns
  -------
  pandas.DataFrame
      Columns ``score`` and ``rexUrl``, one row per recommended house, in the
      order returned by ``model.recommend``. Houses the user already has in
      the training matrix are filtered out by the model.
  """
  recs = model.recommend(user_id, user_house_sparse_train, filter_already_liked_items=True)

  recs_df = pd.DataFrame(recs, columns=['house_id', 'score'])
  # Translate the integer house codes back to listing urls.
  recs_df['rexUrl'] = recs_df['house_id'].map(url_id)

  return recs_df.drop(columns='house_id')

In [None]:
# Example: recommendations for the user with code 1.
make_recs(1)

Unnamed: 0,score,rexUrl
0,0.467996,151-e-7th-ave
1,0.466748,31-grace-ter
2,0.450717,10-stoneleigh-ter
3,0.439218,46-revere-blvd
4,0.401975,73-deerfield-rd
5,0.367727,917-kennedy-blvd
6,0.366701,800-jackson-st-apt-807
7,0.352515,759-avenue-a-apt-16
8,0.336547,232-terrace-avenue
9,0.335989,33-lorrigan-pl


In [None]:
# Houses user 1 interacted with, most events first -- context for reading the
# recommendations above.
df_grp[df_grp['user_id']==1].sort_values('event_stamp',ascending=False)

Unnamed: 0,ip,rexUrl,event_stamp,user_id,house_id
4,100.1.107.229,911-kennedy-blvd-913,20,1,104
3,100.1.107.229,433-w-1st-ave,17,1,63
2,100.1.107.229,265-custer-ave-apt-116,6,1,44
1,100.1.107.229,106-108-parkview-ave,3,1,4


In [None]:
# https://medium.com/radon-dev/als-implicit-collaborative-filtering-5ed653ba39fe

## House Similarity

https://github.com/benfred/implicit/blob/19322232149224c43f239c7996738bab86c0566a/implicit/recommender_base.pyx

In [None]:
def similar_house(house_id):
  """Return the houses the implicit CF model deems similar to ``house_id``.

  Parameters
  ----------
  house_id : int
      Integer house code (as stored in ``df_grp['house_id']``).

  Returns
  -------
  pandas.DataFrame
      Columns ``score`` and ``rexUrl``, in the order returned by
      ``model.similar_items``.
  """
  neighbours = model.similar_items(house_id)

  neighbours_df = pd.DataFrame(neighbours, columns=['house_id', 'score'])
  # Translate the integer house codes back to listing urls.
  neighbours_df['rexUrl'] = neighbours_df['house_id'].map(url_id)

  return neighbours_df.drop(columns='house_id')

In [None]:
# Example: houses similar to house code 104.
similar_house(104)

Unnamed: 0,score,rexUrl
0,1.0,911-kennedy-blvd-913
1,0.184822,917-kennedy-blvd
2,0.168811,520-avenue-e
3,0.167197,762-howard-ct-e
4,0.165468,22b-james-buchanan-dr
5,0.154268,232-terrace-avenue
6,0.148936,265-custer-ave-apt-116
7,0.146364,2104-vroom-dr
8,0.141838,555-colonial-blvd
9,0.141616,759-avenue-a-apt-16


## Explanations for why the item is liked by the user

https://github.com/benfred/implicit/blob/19322232149224c43f239c7996738bab86c0566a/implicit/cpu/als.py

In [None]:
# "Provides explanations for why the item is liked by the user."
def explainrec(user_id,itemid):
  """Explain the model's score for one user/house pair.

  Builds a sparse matrix containing only this user's rows of ``df_grp``
  (event counts per house) and hands it to ``model.explain``.

  Parameters
  ----------
  user_id : int
      Integer user code (``df_grp['user_id']``).
  itemid : int
      Integer house code whose score should be explained.

  Returns
  -------
  Whatever ``model.explain`` returns; the calling cell unpacks it as
  ``(total_score, top_contrib, user_weights)``.
  """
  user_rows = df_grp.loc[df_grp['user_id'] == user_id]

  counts = user_rows.event_stamp
  coords = (user_rows.user_id, user_rows.house_id)
  user_matrix = sparse.csr_matrix((counts, coords))

  return model.explain(user_id, user_matrix, itemid)

In [None]:
# explainrec's second parameter is named `itemid`; calling it with `item_id=`
# raises TypeError (unexpected keyword argument).
total_score, top_contrib, user_weights = explainrec(user_id=1, itemid=4)
top_contrib

[(4, 0.4867304402978907),
 (44, 0.2158971615958246),
 (63, 0.132639312946422),
 (104, -0.3194425779174637)]