## Data preparation

In [None]:
import pandas as pd
import numpy as np
# Show (effectively) every column when a wide frame is displayed.
pd.set_option("display.max_columns", 9999)

In [None]:
# Load the first 1,000,000 records of the line-delimited review dump.
review = pd.read_json(
    '/Users/junheyang/Documents/uw2019-2020/CSE583/Project/yelpifydata/yelp_dataset/yelp_academic_dataset_review.json',
    nrows=1000000,
    lines=True,
)

In [None]:
# Drop reviews with any missing field. Rebinding (instead of inplace=True) keeps
# the cell chainable and avoids mutating a frame another name may still reference.
review = review.dropna()
# review = review.sample(frac=0.1)

### Remove "new" user/item pairs (for pure collaborative filtering)

In [None]:
# Iterative filtering of dense interactions.
# Keep only users and businesses with more than LENGTH reviews. Removing sparse
# businesses can push a user back under the threshold (and vice versa), so the
# two filters are repeated until a pass removes nothing or MAX_ITER is reached.
LENGTH = 5
MAX_ITER = 100
for i in range(MAX_ITER):
    print('iter: ', i)
    n_pre = len(review)
    print('n_pre: ', n_pre)
    # transform('size') labels every row with its group size, so the filter is a
    # vectorised boolean mask rather than one Python lambda call per group.
    review = review[review.groupby('user_id')['user_id'].transform('size') > LENGTH]
    review = review[review.groupby('business_id')['business_id'].transform('size') > LENGTH]
    n_post = len(review)
    print('n_post: ', n_post)
    if n_post == n_pre:
        # Fixed point: every remaining user and business satisfies the threshold.
        break

In [None]:
# Cache the filtered reviews so the expensive steps above need not be repeated.
review.to_csv(
    '/Users/junheyang/Documents/uw2019-2020/CSE583/Project/yelpifydata/yelp_dataset/yelp_academic_dataset_review_sample.csv',
    index=False,
)

### Join dataset

In [None]:
# Reload the filtered review sample written by the previous section.
review = pd.read_csv(
    '/Users/junheyang/Documents/uw2019-2020/CSE583/Project/yelpifydata/yelp_dataset/yelp_academic_dataset_review_sample.csv'
)

In [None]:
# Load the user table (line-delimited JSON, one user per line).
user = pd.read_json(
    '/Users/junheyang/Documents/uw2019-2020/CSE583/Project/yelpifydata/yelp_dataset/yelp_academic_dataset_user.json',
    lines=True,
)

In [None]:
# Attach user attributes to every review; clashing user columns get a "_user" suffix.
review_user = review.merge(
    user,
    how="left",
    on="user_id",
    suffixes=("", "_user"),
)

In [None]:
# Row count after the join (should match the review count if user_id is unique in `user`).
review_user.shape[0]

In [None]:
# Load the business table (line-delimited JSON, one business per line).
business = pd.read_json(
    '/Users/junheyang/Documents/uw2019-2020/CSE583/Project/yelpifydata/yelp_dataset/yelp_academic_dataset_business.json',
    lines=True,
)

In [None]:
# Attach business attributes; clashing business columns get a "_business" suffix.
review_user_business = review_user.merge(
    business,
    how="left",
    on="business_id",
    suffixes=("", "_business"),
)

In [None]:
# Row count after the second join.
review_user_business.shape[0]

In [None]:
# Preview the first five joined rows.
review_user_business.head(n=5)

In [None]:
# List the joined column names (including the "_user" / "_business" suffixed ones)
# to choose which to keep in the export below.
review_user_business.columns

In [None]:
# Persist only the columns used by the recommendation section.
review_user_business[
    [
        'user_id', 'business_id', 'stars', 'text',
        'name', 'average_stars',
        'name_business', 'stars_business', 'categories', 'state', 'city',
    ]
].to_parquet(
    '/Users/junheyang/Documents/uw2019-2020/CSE583/Project/yelpifydata/yelp_dataset/data_clean.parquet',
    index=False,
)

## Recommendation model 

Different approaches: https://en.wikipedia.org/wiki/Recommender_system

In [None]:
import pandas as pd
import numpy as np
from scipy import sparse
from lightfm import LightFM
from sklearn.metrics.pairwise import cosine_similarity
from lightfm.evaluation import *
from sklearn.model_selection import train_test_split
from lightfm.data import Dataset
import swifter
# Show (effectively) every column when a wide frame is displayed.
pd.set_option("display.max_columns", 9999)

In [None]:
# Load the cleaned review/user/business table produced by the data-preparation section.
df = pd.read_parquet(
    '/Users/junheyang/Documents/uw2019-2020/CSE583/Project/yelpifydata/yelp_dataset/data_clean.parquet'
)

In [None]:
# Missing-value count per column.
df.isna().sum()

### Feature engineering

In [None]:
def round_of_rating(number):
    """Snap a rating to the nearest multiple of 0.5.

    The value is doubled, rounded to an integer with the built-in ``round``
    and halved again. Exact ties follow ``round``'s round-half-to-even rule.

    >>> round_of_rating(1.3)
    1.5
    >>> round_of_rating(2.6)
    2.5
    >>> round_of_rating(3.0)
    3.0
    >>> round_of_rating(4.1)
    4.0
    """
    doubled = number * 2
    nearest_whole = round(doubled)
    return nearest_whole / 2
# Bucketize the two star averages into half-star steps so they yield a small
# set of distinct feature values.
df['average_stars'] = df['average_stars'].apply(round_of_rating)
df['stars_business'] = df['stars_business'].apply(round_of_rating)

In [None]:
# Split the ", "-separated category string and one-hot encode each tag.
df_categories = df['categories'].str.get_dummies(", ")

In [None]:
# Retain only the category tags that are set on more than 1% of the rows.
df_categories = df_categories.loc[:, df_categories.sum() > len(df) * 0.01]

In [None]:
# (rows, retained category columns) after the frequency filter.
df_categories.shape

In [None]:
# Replace the raw category string with its one-hot indicator columns.
# `columns=` is used because passing the axis positionally (df.drop('categories', 1))
# is rejected by pandas 2.x.
df = pd.concat([df.drop(columns='categories'), df_categories], axis=1)

In [None]:
# Preview the first five rows of the feature-engineered frame.
df.head(n=5)

In [None]:
# Hold out 20% of the review rows for evaluation. A fixed random_state makes the
# split, and therefore every metric reported below, the same on each run.
train, test = train_test_split(df, test_size=0.2, random_state=42)

### Collaborative filtering

In [None]:
# LightFM Dataset helper; the following cells fit it on the user/business ids and
# use it to build the interaction matrices for the collaborative-filtering model.
ds = Dataset()

In [None]:
# Register every user id and business id with the dataset (no side features here).
ds.fit(
    users=df['user_id'].unique(),
    items=df['business_id'].unique(),
)

In [None]:
# Build the training interaction and weight matrices from (user, business, stars) triples.
# Columns are selected by name so the result does not depend on column order, and only
# the three needed columns are iterated instead of converting the whole frame to an array.
(train_interactions, train_weights) = ds.build_interactions(
    train[['user_id', 'business_id', 'stars']].itertuples(index=False, name=None)
)

In [None]:
# Same construction for the held-out rows: (user, business, stars) triples selected
# by column name rather than by position in `test.values`.
(test_interactions, test_weights) = ds.build_interactions(
    test[['user_id', 'business_id', 'stars']].itertuples(index=False, name=None)
)

In [None]:
model = LightFM(no_components=50, loss='warp')
%time model.fit(train_interactions, sample_weight=train_weights, epochs=10, num_threads=10)

In [None]:
# Evaluation: mean per-user AUC on the training and held-out interactions.
train_auc = auc_score(model, train_interactions, num_threads=20).mean()
test_auc = auc_score(model, test_interactions, num_threads=20).mean()
print('Training set AUC: %s' % train_auc)
print('Testing set AUC: %s' % test_auc)

In [None]:
# Mean precision@1 on the training and held-out interactions.
print(
    "Train precision: %.2f"
    % precision_at_k(model, train_interactions, k=1, num_threads=20).mean()
)
print(
    "Test precision: %.2f"
    % precision_at_k(model, test_interactions, k=1, num_threads=20).mean()
)

In [None]:
# Mean recall@10 on the training and held-out interactions.
print(
    "Train recall: %.2f"
    % recall_at_k(model, train_interactions, k=10, num_threads=20).mean()
)
print(
    "Test recall: %.2f"
    % recall_at_k(model, test_interactions, k=10, num_threads=20).mean()
)

### Make prediction for known users

In [None]:
# Retrain on the full dataset (no train/test split) for serving recommendations.
ds_full = Dataset()
# Register every user id and business id with the dataset.
ds_full.fit(
        df['user_id'].unique(), # all the users
        df['business_id'].unique(), # all the items
)
# (user, business, stars) triples are selected by column name so the result does not
# depend on column order and the wide one-hot frame is not converted to an array.
(interactions, weights) = ds_full.build_interactions(
    df[['user_id', 'business_id', 'stars']].itertuples(index=False, name=None)
)

In [None]:
# same model using the full dataset
model_full = LightFM(no_components=50, loss='warp')
%time model_full.fit(interactions, sample_weight=weights, epochs=10, num_threads=10)

In [None]:
# Dataset.mapping() returns four dicts, unpacked here as
# (user id map, user feature map, item id map, item feature map).
user_id_map, user_feature_map, business_id_map, business_feature_map = ds_full.mapping()

In [None]:
# Peek at a few business_id -> internal index entries instead of dumping the
# whole mapping into the cell output.
dict(list(business_id_map.items())[:5])

In [None]:
def recommend_known_user(model, interactions, user_id, user_dict, 
                               item_dict,threshold = 0,nrec_items = 10, show = True):
    '''
    Function to produce user recommendations
    Required Input - 
        - model = Trained matrix factorization model (must expose predict(user, item_indices))
        - interactions = DataFrame of ratings, indexed by user_id with one column per item_id
        - user_id = user ID for which we need to generate recommendation
        - user_dict = Dictionary type input containing user_id as key and interaction_index as value
        - item_dict = Dictionary type input containing item_id as key and item_name as value
        - threshold = value above which the rating is favorable in new interaction matrix
        - nrec_items = Number of output recommendation needed
        - show = when truthy, print the known and recommended item names
    Expected Output - 
        - Prints list of items the given user has already visited
        - Prints list of N recommended items  which user hopefully will be interested in
        - Returns the list of recommended item IDs (best first)
    '''
    n_items = interactions.shape[1]
    user_x = user_dict[user_id]

    # Score every item for this user and rank the item ids from best to worst.
    scores = pd.Series(model.predict(user_x, np.arange(n_items)),
                       index=interactions.columns)
    ranked_items = list(scores.sort_values(ascending=False).index)

    # Items the user already rated above the threshold, in descending id order.
    user_row = interactions.loc[user_id, :]
    known_items = sorted(user_row[user_row > threshold].index, reverse=True)

    # Set membership keeps the exclusion linear in the number of items.
    known_lookup = set(known_items)
    return_score_list = [x for x in ranked_items if x not in known_lookup][:nrec_items]

    if show:
        # f-strings tolerate item names that are not str (e.g. a missing name).
        print("Known Likes:")
        for counter, item_id in enumerate(known_items, start=1):
            print(f"{counter}- {item_dict[item_id]}")

        print("\n Recommended Items:")
        for counter, item_id in enumerate(return_score_list, start=1):
            print(f"{counter}- {item_dict[item_id]}")
    return return_score_list

In [None]:
# Dense users x businesses frame of the interaction weights.
df_interactions = pd.DataFrame(weights.toarray())

In [None]:
# Label rows with user ids and columns with business ids, taken from the id maps
# in their iteration order (presumably matching the matrix indices - verify).
df_interactions.index = list(user_id_map)
df_interactions.columns = list(business_id_map)

In [None]:
# Alias: the recommendation helpers take the user_id -> matrix row index map as `user_dict`.
user_dict = user_id_map

In [None]:
# business_id -> business name lookup (for repeated ids the last row's name is kept).
item_dict = dict(zip(df['business_id'], df['name_business']))

In [None]:
# Top-10 recommendations for one example user; ratings above 3 count as known likes.
rec_list_user = recommend_known_user(
    model=model_full,
    interactions=df_interactions,
    user_id='qRWzBX1q07ZuPgaTXB_4JA',
    user_dict=user_dict,
    item_dict=item_dict,
    threshold=3,
    nrec_items=10,
    show=True,
)

### Make prediction for known businesses

In [None]:
# Inspect the first 200 users of the dense interaction frame.
df_interactions.iloc[:200]

In [None]:
def recommend_known_item(model,interactions,item_id,user_dict,item_dict,number_of_user, show=True):
    '''
    Function to produce a list of top N interested users for a given item
    Required Input -
        - model = Trained matrix factorization model (must expose predict(user_indices, item_indices))
        - interactions = DataFrame of ratings, indexed by user_id with one column per item_id
        - item_id = item ID for which we need to generate recommended users
        - user_dict =  Dictionary type input containing user_id as key and interaction_index as value (not used here)
        - item_dict = Dictionary type input containing item_id as key and item_name as value (not used here)
        - number_of_user = Number of users needed as an output
        - show = when truthy, print the recommended user ids
    Expected Output -
        - user_list = List of recommended users 
    Raises -
        - KeyError if item_id is not one of the interaction columns
    '''
    n_users = interactions.shape[0]
    # Exact label lookup of the item's column position. A binary search such as
    # searchsorted is only valid on sorted labels, and the columns are not sorted.
    # Assumes column labels are unique, so get_loc returns a single integer.
    item_x = interactions.columns.get_loc(item_id)
    scores = pd.Series(model.predict(np.arange(n_users), np.repeat(item_x, n_users)))
    # Positions of the highest-scoring users, mapped back to their user ids.
    top_positions = scores.sort_values(ascending=False).head(number_of_user).index
    user_list = list(interactions.index[top_positions])
    if show:
        print("\n Recommended Users:")
        for counter, user in enumerate(user_list, start=1):
            print(f"{counter}- {user}")
    return user_list 

In [None]:
# Top-15 users predicted to be most interested in one example business.
rec_list_item = recommend_known_item(
    model=model_full,
    interactions=df_interactions,
    item_id="VMPSdoBgJuyS9t_x_caTig",
    user_dict=user_dict,
    item_dict=item_dict,
    number_of_user=15,
)

### Hybrid recommendation model (collaborative filtering + content-based)

In [None]:
# Fresh Dataset for the hybrid model; note this rebinds `ds` used by the
# collaborative-filtering section above.
ds = Dataset()
# fit is called further below, once the user/item feature labels have been collected

In [None]:
# First entry of each list is the id column; the remaining entries are side features.
user_cols = ['user_id', 'average_stars']
item_cols = ['business_id', 'stars_business', 'state']
# One-hot category columns are picked out by their capitalised names.
# NOTE(review): `categories` is not referenced by the cells visible here - confirm intent.
categories = [col for col in df.columns if col[0].isupper()]

In [None]:
def get_features_meta(df, cols):
    """List every "column:value" feature label occurring in the given columns.

    For each column in ``cols`` (in order), one label is produced per distinct
    value of ``df[column]``, in order of first appearance.
    """
    return [
        col + ':' + str(value)
        for col in cols
        for value in df[col].unique()
    ]

In [None]:
# All distinct user feature labels (every user column except the id).
user_features = get_features_meta(df, cols=user_cols[1:])

In [None]:
# All distinct item feature labels (every item column except the id).
item_features = get_features_meta(df, cols=item_cols[1:])

In [None]:
# Register ids plus the full vocabulary of user and item feature labels.
ds.fit(
    users=df['user_id'].unique(),
    items=df['business_id'].unique(),
    user_features=user_features,
    item_features=item_features,
)

In [None]:
# Helper that turns one row into the (id, ["feature:value", ...]) form
def get_features_tuple(row, id_col):
    """
    Pair a row's id with its other fields rendered as "name:value" strings.
    For example: if row = {'uid': 'id', 'f1': 1, 'f2': 1, 'f3': 0, 'loc': 'del'}
    and id_col = 'uid', the result is ('id', ['f1:1', 'f2:1', 'f3:0', 'loc:del']).

    """
    entity_id = row[id_col]
    labels = []
    for key, value in row.items():
        if key == id_col:
            continue
        labels.append(key + ':' + str(value))
    return (entity_id, labels)

In [None]:
# train/test hold one row per review, but the features are per-user attributes.
# Keep a single row per user before building: repeated (user, feature) entries are
# summed when the sparse matrix is assembled, so with normalize=False a user's
# feature weight would otherwise grow with their number of reviews.
train_user_features = train[user_cols].drop_duplicates(subset='user_id').apply(lambda x: get_features_tuple(x, 'user_id'), axis=1).tolist()
train_user_features = ds.build_user_features(train_user_features, normalize= False)

test_user_features = test[user_cols].drop_duplicates(subset='user_id').apply(lambda x: get_features_tuple(x, 'user_id'), axis=1).tolist()
test_user_features = ds.build_user_features(test_user_features, normalize= False)

In [None]:
# train/test hold one row per review, but the features are per-business attributes.
# Keep a single row per business before building: repeated (item, feature) entries are
# summed when the sparse matrix is assembled, so with normalize=False a business's
# feature weight would otherwise grow with its number of reviews.
train_item_features = train[item_cols].drop_duplicates(subset='business_id').apply(lambda x: get_features_tuple(x, 'business_id'), axis=1).tolist()
train_item_features = ds.build_item_features(train_item_features, normalize= False)

test_item_features = test[item_cols].drop_duplicates(subset='business_id').apply(lambda x: get_features_tuple(x, 'business_id'), axis=1).tolist()
test_item_features = ds.build_item_features(test_item_features, normalize= False)

In [None]:
model = LightFM(no_components=50, loss='warp')
%time model.fit(train_interactions, user_features=train_user_features, item_features=train_item_features, epochs=10, num_threads=10)

In [None]:
# Mean per-user AUC of the hybrid model, each split scored with its own feature matrices.
train_auc = auc_score(
    model,
    train_interactions,
    user_features=train_user_features,
    item_features=train_item_features,
    num_threads=20,
).mean()
test_auc = auc_score(
    model,
    test_interactions,
    user_features=test_user_features,
    item_features=test_item_features,
    num_threads=20,
).mean()
print('Training set AUC: %s' % train_auc)
print('Testing set AUC: %s' % test_auc)

In [None]:
# Mean precision@5 of the hybrid model on the training and held-out interactions.
print(
    "Train precision: %.2f"
    % precision_at_k(
        model,
        train_interactions,
        user_features=train_user_features,
        item_features=train_item_features,
        k=5,
        num_threads=20,
    ).mean()
)
print(
    "Test precision: %.2f"
    % precision_at_k(
        model,
        test_interactions,
        user_features=test_user_features,
        item_features=test_item_features,
        k=5,
        num_threads=20,
    ).mean()
)