# Collaborative Filtering V1: NJ 
### Continuation of the CF `AllData` notebook

In [1]:
# Mount Google Drive into the Colab filesystem at /content/drive.
from google.colab import drive
drive.mount('/content/drive')
# Nikhil's working path: change into the project data folder so that the
# relative file names used later in the notebook resolve against it.
%cd drive/My\ Drive/'Captstone(297)'/data

Mounted at /content/drive
/content/drive/.shortcut-targets-by-id/1auWpfpINZvmJbl0I50kYmpS29bbnesqp/Captstone(297)/data


In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
# Remove the row limit on DataFrame display (None = show every row);
# use .head()/.sample() when displaying large frames to keep outputs small.
pd.set_option('display.max_rows', None)

In [3]:
# Load the merged user-event export; the relative file name resolves against
# the current working directory.
# NOTE(review): if pandas reports mixed-type columns while reading this file,
# consider passing explicit `dtype=` for those columns — confirm.
events_csv = "merged_user_data_rm_suspicious.csv"
df = pd.read_csv(events_csv)

  interactivity=interactivity, compiler=compiler, result=result)


## Filter to Top Counties in NJ where there are many listings

In [4]:
# Counties to keep; values must match the `county` column exactly.
nj_n_counties = ['Hudson','Bergen','Passaic','Essex','Union','Middlesex','Monmouth','Mercer','Hunterdon','Warren','Sussex','Morris','Somerset']

# Build the row mask on the original frame and copy only the matching rows.
# This yields the same independent `nj_df` as before without first
# deep-copying every row of `df`.
in_selected_county = df['county'].isin(nj_n_counties)
in_nj = df['state'] == 'NJ'
nj_df = df.loc[in_selected_county & in_nj].copy()

In [5]:
# Sanity check: (rows, columns) left after the county/state filter.
nj_df.shape

(119905, 63)

### Bring in Score Generation logic from `AllData` notebook

In [6]:
# Action names grouped into four levels (0-3); the per-level counts and the
# score computed later in this notebook are keyed on these lists.

# Level 0
special_actions_level0 = [
    'scroll',
    'pageview',
    'ViewContent',
    'view_item',
    'drag',
    'click',
]

# Level 1
# Author's mapping notes:
#   open-form (questions), (showing, showing-agent) = enter-flow = phone-click
#   nav-tab (schools-and-comps), success (favorite), change (payment-calculator)
special_actions_level1 = [
    'open-form',
    'enter-flow',
    'nav-tab',
    'change',
    'success',
    'Search',
    'view_search_results',
    'phone-click',
]

# Level 2
# Author's mapping notes:
#   'nav' (insidere-details) = lead, submit = submit-success
#   (showing, showing-agent, buyer-lead-form)
special_actions_level2 = [
    'nav',
    'lead',
    'submit',
    'submit-success',
    'add_to_cart',
    'InitiateCheckout',
]

# Level 3
special_actions_level3 = [
    'Purchase',
    'purchase',
]

In [7]:
# Percentage of all NJ events that falls into each action level.
# `level_weights[k]` holds the share (in percent) of level k's actions.
action_ct = nj_df['action'].value_counts()
total_ct = action_ct.sum()
level_weights = []
action_levels = (special_actions_level0, special_actions_level1,
                 special_actions_level2, special_actions_level3)
for level_no, level_actions in enumerate(action_levels, start=1):
  # Mask-based lookup: actions that never occur in the data simply contribute
  # nothing, instead of raising a KeyError on a missing index label.
  in_level = action_ct.index.isin(level_actions)
  level_pct = 100*action_ct[in_level].sum()/total_ct
  level_weights.append(level_pct)
  # Printed labels are 1-based (Level 1 corresponds to special_actions_level0).
  print("Level {0:d}: {1:.4f} %".format(level_no, level_pct))

Level 1: 99.3945 %
Level 2: 0.5246 %
Level 3: 0.0742 %
Level 4: 0.0067 %


In [8]:
# One row per (ip, rexUrl) pair with event counts: one column per individual
# level-0 action, plus one total per action level.
events_by_pair = nj_df.groupby(by=['ip', 'rexUrl'])
group_df_pre = events_by_pair.agg(
    scroll_ct=('action', lambda acts: (acts == "scroll").sum()),
    pageview_ct=('action', lambda acts: (acts == "pageview").sum()),
    viewcontent_ct=('action', lambda acts: (acts == 'ViewContent').sum()),
    view_item_ct=('action', lambda acts: (acts == 'view_item').sum()),
    drag_ct=('action', lambda acts: (acts == 'drag').sum()),
    click_ct=('action', lambda acts: (acts == 'click').sum()),
    level0_ct=('action', lambda acts: acts.isin(special_actions_level0).sum()),
    level1_ct=('action', lambda acts: acts.isin(special_actions_level1).sum()),
    level2_ct=('action', lambda acts: acts.isin(special_actions_level2).sum()),
    level3_ct=('action', lambda acts: acts.isin(special_actions_level3).sum()),
)

In [9]:
def calculate_score(row, weights=None):
  """Turn per-level event counts into a single interest score capped at 6.

  Each level's count is multiplied by the inverse of that level's share
  (in percent) of all events, so rarer action levels count for more. Every
  per-event multiplier is clipped to [0, 6], and the summed score is capped
  at 6 as well.

  Args:
    row: mapping-like object (e.g. a DataFrame row) providing the keys
      'level0_ct', 'level1_ct', 'level2_ct' and 'level3_ct'.
    weights: optional sequence of four per-level shares in percent.
      Defaults to the notebook-level `level_weights`.

  Returns:
    The score, a number no greater than 6.
  """
  if weights is None:
    weights = level_weights
  total = 0
  for i in range(4):
    share = weights[i]
    # A level that never occurs has a 0 % share. Its inverse is unbounded, so
    # use the cap directly instead of dividing by zero (which raises for plain
    # floats and emits a RuntimeWarning for numpy floats).
    multiplier = 6 if share == 0 else np.clip(1/share, 0, 6)
    total += row[f'level{i}_ct']*multiplier
  return min(6, total)

# Based on Selina score rule: score every (ip, rexUrl) row from its level counts.
group_df_pre['score'] = group_df_pre.apply(calculate_score, axis=1)

## input for CF model: flatten the (ip, rexUrl) index back into columns and
## keep only the user / item / score triple.
cf_input_cols = ['ip', 'rexUrl', 'score']
df_grp = group_df_pre.reset_index()[cf_input_cols]
print("Input for CF model: ", df_grp.shape)
df_grp.head()

Input for CF model:  (5467, 3)


Unnamed: 0,ip,rexUrl,score
0,10.70.33.53,57-miry-brook-rd,6.0
1,100.1.107.229,106-108-parkview-ave,0.030183
2,100.1.107.229,265-custer-ave-apt-116,0.060366
3,100.1.107.229,433-w-1st-ave,0.171036
4,100.1.107.229,911-kennedy-blvd-913,0.201218


## Modeling

In [None]:
# ! pip install surprise

In [None]:
import surprise
from surprise import Reader, Dataset
from surprise import SVD
from surprise import NMF
from surprise import Dataset
from surprise import accuracy


from surprise.model_selection import train_test_split
from surprise.model_selection import KFold

In [None]:
# Surprise builds a Dataset from a pandas frame via `Dataset.load_from_df`;
# the frame is passed with its columns ordered user id, item id, rating.
ratings_df = df_grp.rename(columns={'ip': 'userID',
                                    'rexUrl': 'itemID',
                                    'score': 'rating'})

# Ratings lie in [0, 6]: calculate_score caps every score at 6.
reader = Reader(rating_scale=(0, 6.0))

cf_df = Dataset.load_from_df(ratings_df[['userID', 'itemID', 'rating']], reader)

### GridSearch NMF

In [None]:
# NMF hyper-parameter grid: 4 x 3 x 5 x 5 = 300 combinations,
# each scored by RMSE with 3-fold cross-validation on all available cores.
param_grid = dict(
    n_factors=[5, 10, 15, 20],
    n_epochs=[50, 60, 70],
    reg_pu=[0.04, 0.05, 0.06, 0.07, 0.08],
    reg_qi=[0.04, 0.05, 0.06, 0.07, 0.08],
)

gs = surprise.model_selection.GridSearchCV(
    algo_class=NMF,
    param_grid=param_grid,
    measures=['rmse'],
    cv=3,
    n_jobs=-1,
)

gs.fit(cf_df)

In [None]:
# Tabulate the NMF grid-search results and show the three best-ranked
# parameter sets by RMSE.
results_df = pd.DataFrame(gs.cv_results)
results_df.sort_values(by='rank_test_rmse').head(3)

Unnamed: 0,split0_test_rmse,split1_test_rmse,split2_test_rmse,mean_test_rmse,std_test_rmse,rank_test_rmse,mean_fit_time,std_fit_time,mean_test_time,std_test_time,params,param_n_factors,param_n_epochs,param_reg_pu,param_reg_qi
58,0.793032,0.866845,0.944353,0.868076,0.061783,1,0.245246,0.002198,0.009293,0.003376,"{'n_factors': 5, 'n_epochs': 70, 'reg_pu': 0.0...",5,70,0.05,0.07
114,0.798124,0.865474,0.943437,0.869011,0.059377,2,0.277011,0.005318,0.008157,9.4e-05,"{'n_factors': 10, 'n_epochs': 60, 'reg_pu': 0....",10,60,0.06,0.08
216,0.797023,0.86882,0.941545,0.869129,0.059001,3,0.408522,0.012841,0.007494,0.000443,"{'n_factors': 15, 'n_epochs': 70, 'reg_pu': 0....",15,70,0.07,0.05


### GridSearch SVD

In [None]:
# SVD hyper-parameter grid: 5 x 5 x 4 x 5 = 500 combinations,
# each scored by RMSE with 3-fold cross-validation on all available cores.
param_grid2 = dict(
    n_factors=[80, 90, 100, 110, 120],
    n_epochs=[5, 10, 20, 30, 40],
    lr_all=[0.004, 0.005, 0.006, 0.007],
    reg_all=[0.01, 0.02, 0.03, 0.04, 0.05],
)

gs2 = surprise.model_selection.GridSearchCV(
    algo_class=SVD,
    param_grid=param_grid2,
    measures=['rmse'],
    cv=3,
    n_jobs=-1,
)

gs2.fit(cf_df)

In [None]:
# Tabulate the SVD grid-search results and show the three best-ranked
# parameter sets by RMSE.
results_df2 = pd.DataFrame(gs2.cv_results)
results_df2.sort_values(by='rank_test_rmse').head(3)

Unnamed: 0,split0_test_rmse,split1_test_rmse,split2_test_rmse,mean_test_rmse,std_test_rmse,rank_test_rmse,mean_fit_time,std_fit_time,mean_test_time,std_test_time,params,param_n_factors,param_n_epochs,param_lr_all,param_reg_all
210,0.812508,0.907239,0.817044,0.845597,0.043627,1,0.096518,0.001865,0.013679,0.000796,"{'n_factors': 100, 'n_epochs': 5, 'lr_all': 0....",100,5,0.006,0.01
113,0.813051,0.908523,0.815401,0.845658,0.044462,2,0.088104,0.000321,0.013431,0.000168,"{'n_factors': 90, 'n_epochs': 5, 'lr_all': 0.0...",90,5,0.006,0.04
18,0.812619,0.906577,0.817939,0.845712,0.043093,3,0.080954,0.000494,0.013174,0.001202,"{'n_factors': 80, 'n_epochs': 5, 'lr_all': 0.0...",80,5,0.007,0.04


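## Testing the tuned NMF model with the params found by the GridSearch above (note: in the recorded grid-search outputs, SVD reached a lower mean RMSE than NMF, 0.8456 vs 0.8681)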

In [None]:
# define a cross-validation iterator
kf = KFold(n_splits=5)

algo_nj = NMF(n_factors=5,n_epochs=70,reg_pu=0.05,reg_qi=0.07)
# algo_nj = SVD(n_factors=100, n_epochs=5,lr_all=0.006,reg_all=0.01)

for trainset, testset in kf.split(cf_df):

    # train and test algorithm.
    algo_nj.fit(trainset)
    predictions = algo_nj.test(testset)

    # Compute and print Root Mean Squared Error
    accuracy.rmse(predictions, verbose=True)

RMSE: 0.8026
RMSE: 0.8464
RMSE: 0.9685
RMSE: 0.8993
RMSE: 0.8798
