In [None]:
import ast
import os
import pickle
import re
import string

import nltk
import numpy as np
import pandas as pd
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer
from nltk.tokenize import WordPunctTokenizer
from scipy.spatial.distance import cdist
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import pairwise_distances
from textblob import TextBlob

## Generate Recommendation List From CB Filtering

In [None]:
# Load the source table; rows dated before the cut-off form the training
# portion used for content-based filtering.
# NOTE(review): `date` is compared against a string literal, which presumes
# ISO-formatted date strings in the CSV — confirm.
df = pd.read_csv('/content/drive/MyDrive/yelp/filted_nv.csv')
is_train_row = df.date < '2019-06-30'
train_index = df.index[is_train_row]
content_based_df = df.loc[train_index]

In [None]:
def clean_text(text):
    """Normalise one raw review string for bag-of-words vectorisation.

    Steps: lower-case and split on whitespace, drop English stop words and
    tokens shorter than three characters, then apply a fixed sequence of regex
    substitutions (contraction expansion, punctuation spacing, a few
    abbreviations) and collapse repeated whitespace.

    The earlier ``text.translate(string.punctuation)`` call was removed: a
    plain string used as a translation table is indexed by code point, so it
    removed no punctuation at all and instead rewrote control characters
    (``"\\n"`` became ``"+"``, ``"\\t"`` became ``"*"``), gluing the words on
    either side of a line break into one token. Punctuation is deliberately
    NOT stripped up front because the contraction rules below rely on the
    apostrophes still being present.

    Args:
        text: the raw review text (``str``).

    Returns:
        The cleaned text as a single space-separated string.
    """
    ## Convert words to lower case and split them on any whitespace run
    ## (this also handles tabs and newlines).
    words = text.lower().split()

    ## Remove stop words and tokens shorter than three characters
    stops = set(stopwords.words("english"))
    words = [w for w in words if w not in stops and len(w) >= 3]

    text = " ".join(words)

    # Clean the text
    # NOTE(review): inside this character class "+-=" is a *range* (code
    # points 43..61), so ':', ';' and '<' are also kept — confirm intended.
    text = re.sub(r"[^A-Za-z0-9^,!.\/'+-=]", " ", text)
    text = re.sub(r"what's", "what is ", text)
    text = re.sub(r"\'s", " ", text)
    text = re.sub(r"\'ve", " have ", text)
    text = re.sub(r"n't", " not ", text)
    text = re.sub(r"i'm", "i am ", text)
    text = re.sub(r"\'re", " are ", text)
    text = re.sub(r"\'d", " would ", text)
    text = re.sub(r"\'ll", " will ", text)
    text = re.sub(r",", " ", text)
    text = re.sub(r"\.", " ", text)
    text = re.sub(r"!", " ! ", text)
    text = re.sub(r"\/", " ", text)
    text = re.sub(r"\^", " ^ ", text)
    text = re.sub(r"\+", " + ", text)
    text = re.sub(r"\-", " - ", text)
    text = re.sub(r"\=", " = ", text)
    text = re.sub(r"'", " ", text)
    text = re.sub(r"(\d+)(k)", r"\g<1>000", text)
    text = re.sub(r":", " : ", text)
    text = re.sub(r" e g ", " eg ", text)
    text = re.sub(r" b g ", " bg ", text)
    text = re.sub(r" u s ", " american ", text)
    # NOTE(review): r"\0s" is a NUL character followed by "s", not the
    # literal "0s" — this rule probably never fires; confirm the intent.
    text = re.sub(r"\0s", "0", text)
    text = re.sub(r" 9 11 ", "911", text)
    text = re.sub(r"e - mail", "email", text)
    text = re.sub(r"j k", "jk", text)
    text = re.sub(r"\s{2,}", " ", text)
    return text

def get_positive_reviews(x):
  """Join the cleaned review texts of the rows a user rated favourably.

  `x` is a group of rows exposing `stars_r` and `clean_text`. Rows with
  `stars_r >= 3` are used; when there are none, the three highest-rated rows
  are used instead. The selected texts are joined with single spaces.
  """
  liked = x[x.stars_r >= 3].clean_text
  if len(liked) == 0:
    # No favourable review: fall back to the best three available.
    liked = x.sort_values(by='stars_r',ascending=False).clean_text.head(3)
  return ' '.join(liked.to_list())

def get_positive_categories(x):
  """Join the category strings of the rows a user rated favourably.

  `x` is a group of rows exposing `stars_r` and `categories`. Rows with
  `stars_r >= 3` are used; when there are none, the three highest-rated rows
  are used instead. The selected strings are joined with ', '.
  """
  liked = x[x.stars_r >= 3].categories
  if len(liked) == 0:
    # No favourable review: fall back to the best three available.
    liked = x.sort_values(by='stars_r',ascending=False).categories.head(3)
  return ', '.join(liked.to_list())

def get_index_of_busid(x,business_id_list):
  """Return the position in `business_id_list` of every `business_id` in `x`.

  Positions are looked up with `list.index`, so an id absent from the list
  raises ValueError and a repeated id resolves to its first position.
  """
  return [business_id_list.index(current_id) for current_id in x.business_id.to_list()]

In [None]:
# Fetch the NLTK stop-word corpus read by clean_text(), then store a
# normalised version of every training review in a new column.
nltk.download('stopwords')
content_based_df['clean_text'] = content_based_df['text'].map(clean_text)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [None]:
# Count vocabulary for review text, fitted on the training reviews only.
# Float min_df / max_df are document-frequency proportions (1% / 99%).
review_tokenizer = WordPunctTokenizer()
vectorizer_reviews = CountVectorizer(
    tokenizer = review_tokenizer.tokenize,
    min_df = .01,
    max_df = .99,
)
vectorizer_reviews.fit(content_based_df['clean_text'])

In [None]:
def _split_categories(category_string):
  """Split a ', '-separated category string into its individual labels."""
  return category_string.split(', ')

# Count vocabulary over category labels; every label is kept (min_df=1, max_df=1.).
vectorizer_categories = CountVectorizer(min_df = 1, max_df = 1., tokenizer = _split_categories)
vectorizer_categories.fit(content_based_df['categories'])

In [None]:
# Hold-out interactions: rows on/after the cut-off date, restricted to users
# and businesses that also occur in the training portion so that every test
# user has a profile and every test business has review text.
# The training portion is `content_based_df`; the previous code referenced
# `train_df`, which is not defined until the evaluation cell much further
# down and therefore raised NameError on a fresh kernel.
# NOTE(review): if a different training table (e.g. reviews_train.csv) was
# intended here, load it explicitly above this cell instead.
test_index = df[df.date >= '2019-06-30'].index
holdout_df = df.loc[test_index]
known_users = content_based_df.user_id.unique()
known_businesses = content_based_df.business_id.unique()
test_df = holdout_df[holdout_df.user_id.isin(known_users) & holdout_df.business_id.isin(known_businesses)]
test_user = test_df.user_id.unique()

In [None]:
def _join_clean_text(group):
  """Concatenate all cleaned reviews of one group with single spaces."""
  return ' '.join(group.clean_text.tolist())

# One text document per business (all of its training reviews) and one per
# test user (their favourably rated training reviews).
business_review = content_based_df.groupby('business_id').apply(_join_clean_text)
test_user_rows = content_based_df[content_based_df['user_id'].isin(test_user)]
test_user_review = test_user_rows.groupby('user_id').apply(get_positive_reviews)

In [None]:
# Category string per business, ordered like business_review, and the joined
# favourable categories per test user, ordered like test_user_review.
category_by_business = (
    content_based_df[['business_id','categories']]
    .drop_duplicates()
    .set_index('business_id')['categories']
)
business_category = category_by_business[business_review.index]
test_user_train_rows = content_based_df[content_based_df['user_id'].isin(test_user)]
test_user_category = test_user_train_rows.groupby('user_id').apply(get_positive_categories)[test_user_review.index]

In [None]:
# For every test user, the column positions (within business_review) of the
# businesses they already reviewed in the training data.
business_id_list = list(business_review.index)

def _visited_positions(user_rows):
  """Positions in business_id_list of the businesses in `user_rows`."""
  return get_index_of_busid(user_rows,business_id_list)

visited_by_user = content_based_df[content_based_df['user_id'].isin(test_user)].groupby('user_id').apply(_visited_positions)
places_to_exclude = visited_by_user[test_user_review.index]

In [None]:
# Row position of each user in places_to_exclude.
user_id_index = {item: index for index, item in enumerate(places_to_exclude.index)}

In [None]:
# Flatten the per-user position lists into one (user row, business column)
# pair per line, then keep the two coordinate lists side by side.
exclusion_pairs = places_to_exclude.rename('bus_idx').reset_index()
exclusion_pairs = exclusion_pairs.explode('bus_idx')
exclusion_pairs['user_idx'] = [user_id_index[uid] for uid in exclusion_pairs['user_id']]
places_to_exclude = exclusion_pairs
places_to_exclude_list = [places_to_exclude.user_idx.tolist(),places_to_exclude.bus_idx.tolist()]

In [None]:
# Correlation distance between each test user's review document and each
# business's review document (rows: users, columns: businesses).
user_review_counts = vectorizer_reviews.transform(test_user_review).todense()
business_review_counts = vectorizer_reviews.transform(business_review).todense()
dists_review = cdist(user_review_counts, business_review_counts, metric='correlation')

In [None]:
# Correlation distance between each test user's category profile and each
# business's categories (rows: users, columns: businesses).
user_category_counts = vectorizer_categories.transform(test_user_category).todense()
business_category_counts = vectorizer_categories.transform(business_category).todense()
dists_category = cdist(user_category_counts, business_category_counts, metric='correlation')

In [None]:
# Equal-weight blend of the category and review distances.
dists_together = 0.5 * (dists_category + dists_review)

In [None]:
# Push every (user, business) pair the user already reviewed in the training
# data to the end of that user's ranking, then keep the 90 closest businesses.
#
# Fixes relative to the previous version of this cell:
#   * the mask is applied to `dists_together`, the matrix that is actually
#     ranked (it used to be written into `dists_review` after the blend had
#     already been computed, so it had no effect on the ranking);
#   * the coordinates are passed as two separate index arrays — indexing with
#     a list of lists is no longer read as one index per axis by NumPy;
#   * the sentinel is +inf, because correlation distances lie in [0, 2] and a
#     value of 1 does not guarantee exclusion.
exclude_rows = np.asarray(places_to_exclude_list[0], dtype=int)
exclude_cols = np.asarray(places_to_exclude_list[1], dtype=int)
dists_together[exclude_rows, exclude_cols] = np.inf
sorted_index = np.argsort(dists_together, axis=1)
selected_index = sorted_index[:,:90]

  """Entry point for launching an IPython kernel.


In [None]:
# Translate the selected column positions back into business ids, keyed by user id.
business_id = business_review.index
test_user_id = test_user_review.index
test_user_bus_recommend = {
    test_user_id[row]: list(business_id[selected_index[row]])
    for row in range(test_user_id.shape[0])
}

## Predict Rating with K-means Cluster

In [None]:
def generate_business_info(users_items_df,helpful_df):
  """Collect, per column of `users_items_df`, the rows holding a rating >= 1.

  Args:
    users_items_df: DataFrame of ratings (rows: users, columns: items).
    helpful_df: DataFrame with the same columns holding a helpfulness value
      for each cell. Rows are paired with `users_items_df` by POSITION.
      NOTE(review): this assumes both frames list the same users in the same
      order — confirm for frames built by separate pivots.

  Returns:
    dict mapping each column label to a tuple
    `(row_positions, ratings, helpful_values)` of equally long NumPy arrays,
    where `row_positions` are 0-based row positions in `users_items_df`.
  """
  business_id_info = {}
  for bus_id in users_items_df.columns:
    # Work on plain arrays so the positions from np.where are always applied
    # positionally; indexing the Series directly with them is label-based on
    # integer indexes and a deprecated fallback on other index types.
    ratings = users_items_df[bus_id].to_numpy()
    helpful = helpful_df[bus_id].to_numpy()
    index = np.where(ratings >= 1)[0]
    business_id_info[bus_id] = (index, ratings[index], helpful[index])
  return business_id_info

def predict(model,users_items_df):
  """Fill the zero cells of `users_items_df` with `model.predict` output.

  Cells equal to 0 receive the model's prediction for that cell; every other
  cell keeps its original value. The result has the same index and columns as
  the input.
  """
  observed = users_items_df.values
  is_missing = (observed == 0)
  # Zero out predictions wherever an observed value exists, so the sum below
  # only changes the missing cells.
  imputed = pd.DataFrame(
      model.predict(observed) * is_missing,
      index = users_items_df.index,
      columns = users_items_df.columns,
  )
  return imputed + users_items_df

def generate_user_id_index(test_user_matrix):
  """Map every index label of `test_user_matrix` to its 0-based row position."""
  return {label: position for position, label in enumerate(test_user_matrix.index)}

def predict_rating_helpful_cf(test_df,business_id_info,user_similarity,test_user_id_index,helpful_threhold,num_sim):
  """Predict a rating per row of `test_df` as a similarity-weighted average.

  Args:
    test_df: DataFrame whose first two columns are the user id and the item
      id; any further columns are ignored.
    business_id_info: dict mapping an item id to
      `(rater_positions, ratings, helpful_values)` as produced by
      `generate_business_info`.
    user_similarity: 2-D array of DISTANCES (similarity is taken as
      `1 - distance`); rows are addressed through `test_user_id_index`,
      columns through `rater_positions`.
    test_user_id_index: dict mapping a user id to its row in `user_similarity`.
    helpful_threhold: raters whose helpful value is below this are dropped,
      unless that would drop every rater (then all raters are kept).
    num_sim: number of most similar raters to average over.

  Returns:
    list with one predicted rating per row; `nan` when the similarity weights
    sum to zero (including the case of an item without any rater).
  """
  test_ratings = []
  for row in test_df.to_dict('split')['data']:
    user_index = test_user_id_index[row[0]]
    rater_index, ratings, helpful = business_id_info[row[1]]
    sims = 1 - user_similarity[user_index][rater_index]

    helpful_index = np.where(helpful >= helpful_threhold)[0]
    if helpful_index.shape[0] > 0:
      # Filter similarities AND ratings together so they stay aligned (the
      # ratings used to be left unfiltered, pairing weights with the wrong
      # ratings).
      sims = sims[helpful_index]
      ratings = ratings[helpful_index]

    # Keep the num_sim raters with the highest similarity.
    top_index = np.argsort(sims)[-num_sim:]
    sims = sims[top_index]
    ratings = ratings[top_index]

    weight_total = sims.sum()
    if weight_total == 0:
      test_ratings.append(np.nan)
    else:
      test_ratings.append((sims * ratings).sum() / weight_total)
  return test_ratings

In [None]:
# One row per (user, recommended business) candidate pair.
# Note: this rebinds `test_df`, which earlier held the hold-out interactions.
test_df = pd.DataFrame({
    'user_id': list(test_user_bus_recommend.keys()),
    'business_id': list(test_user_bus_recommend.values()),
}).explode('business_id')

In [None]:
# Training reviews (selected columns only) joined with each business's cluster assignment.
df_bus_cluster = pd.read_csv('/content/drive/MyDrive/yelp/bus_cluster_km&hrc.csv')
review_columns = ['user_id','business_id','stars_r','opinion_rating','overal_rating','useful_pred']
df_reviews_train = (
    pd.read_csv('/content/drive/MyDrive/yelp/reviews_train.csv')[review_columns]
    .merge(df_bus_cluster,on='business_id')
)

In [None]:
def _user_cluster_pivot(value_column):
  """Mean of `value_column` per (user_id, cluster_kmeans); empty cells become 0."""
  return df_reviews_train.pivot_table(index = 'user_id', columns = 'cluster_kmeans', values = value_column, aggfunc='mean').fillna(0)

users_items_star_df = _user_cluster_pivot('stars_r')
users_items_opinion_df = _user_cluster_pivot('opinion_rating')
users_items_helpful_df = _user_cluster_pivot('useful_pred')

In [None]:
# Per-column rater positions, ratings and helpfulness values for the star and
# the opinion matrices. The second name was previously misspelled
# (`business_id_info_opinio`), which made the later opinion-rating cell fail
# with NameError on `business_id_info_opinion`.
business_id_info_star = generate_business_info(users_items_star_df,users_items_helpful_df)
business_id_info_opinion = generate_business_info(users_items_opinion_df,users_items_helpful_df)

In [None]:
# TensorFlow is only needed from this point on, to restore the two saved
# Keras models (one for star ratings, one for opinion ratings).
import tensorflow as tf
# load the model trained in "recommend_collaborative_filtering.ipynb"
model_star = tf.keras.models.load_model('/content/drive/MyDrive/yelp/kmeans/auto_encode_km_star')
model_opinion = tf.keras.models.load_model('/content/drive/MyDrive/yelp/kmeans/auto_encode_km_opinion')

In [None]:
# Fill the zero cells of both user x cluster matrices with model predictions.
# NOTE: both names are rebound to the filled matrices, so this cell is not
# idempotent — re-running it feeds the already-filled matrices to predict().
users_items_star_df = predict(model_star, users_items_star_df)
users_items_opinion_df = predict(model_opinion, users_items_opinion_df)

In [None]:
# Cosine DISTANCE (not similarity, despite the name) between every test user
# and every user of the star matrix; rows follow test_user_star_matrix.
test_user_ids_star = list(test_df.user_id.unique())
test_user_star_matrix = users_items_star_df.loc[test_user_ids_star]
user_similarity_star = pairwise_distances(test_user_star_matrix, users_items_star_df, metric='cosine')

In [None]:
# Cosine DISTANCE (not similarity, despite the name) between every test user
# and every user of the opinion matrix; rows follow test_user_opinion_matrix.
test_user_ids_opinion = list(test_df.user_id.unique())
test_user_opinion_matrix = users_items_opinion_df.loc[test_user_ids_opinion]
user_similarity_opinion = pairwise_distances(test_user_opinion_matrix, users_items_opinion_df, metric='cosine')

In [None]:
# Predict a star rating for every candidate (user, business) pair.
# 0.01 is passed as helpful_threhold and 50 as num_sim.
# NOTE(review): business_id_info_star is keyed by the columns of
# users_items_star_df (cluster_kmeans labels), while the second column of
# test_df holds business ids — confirm that a business -> cluster mapping is
# applied before this lookup.
test_user_id_index_star = generate_user_id_index(test_user_star_matrix)
star_ratings = predict_rating_helpful_cf(test_df,business_id_info_star,user_similarity_star,test_user_id_index_star,0.01,50)
test_df['predict_rating_star'] = star_ratings

In [None]:
# Predict an opinion rating for every candidate (user, business) pair.
# 0.01 is passed as helpful_threhold and 50 as num_sim.
# NOTE(review): business_id_info_opinion is built from the columns of
# users_items_opinion_df (cluster_kmeans labels), while the second column of
# test_df holds business ids — confirm that a business -> cluster mapping is
# applied before this lookup.
test_user_id_index_opinion = generate_user_id_index(test_user_opinion_matrix)
opinion_ratings = predict_rating_helpful_cf(test_df,business_id_info_opinion,user_similarity_opinion,test_user_id_index_opinion,0.01,50)
test_df['predict_rating_opinion'] = opinion_ratings

In [None]:
# Overall score: equal-weight average of the star and opinion predictions.
test_df['predict_rating_overal'] = 0.5 * (test_df['predict_rating_star'] + test_df['predict_rating_opinion'])

In [None]:
def _top_rated_businesses(user_rows, k):
  """Business ids of the `k` rows with the highest overall predicted rating."""
  ranked = user_rows.sort_values('predict_rating_overal',ascending=False)
  return ranked.head(k)['business_id'].to_list()

# Write one recommendation file per list length (top 5 and top 10).
for top_n in [5,10]:
  recommended_df = (
      test_df.groupby('user_id')
      .apply(lambda user_rows: _top_rated_businesses(user_rows, top_n))
      .rename('business_id')
      .reset_index()
  )
  recommended_df.to_csv('/content/drive/MyDrive/yelp/pipeline/final_recommendation_km_{}_v3.csv'.format(top_n),index=None)

## Evaluate Recommendation System

In [None]:
# Evaluate the saved recommendation lists. For every candidate-pool version
# (1, 2, 3 -> keyed as version*30) and list length (5, 10) five metrics are
# stored, in this order: diversity, personalization, number_in_actual,
# satisfaction, coverage.
# NOTE(review): this notebook only writes the *_v3.csv files; the v1 / v2
# files must already exist on disk.
actual_df = pd.read_csv('/content/drive/MyDrive/yelp/test_rating.csv')
business_feature = pd.read_csv('/content/drive/MyDrive/yelp/business_train_features.csv')

# Loop-invariant inputs: read / built once instead of once per configuration.
train_df = pd.read_csv('/content/drive/MyDrive/yelp/reviews_train.csv')
num_total_bus = train_df.business_id.unique().shape[0]
business_feature_list = business_feature.set_index('business_id').to_dict('split')['data']
business_id_list = business_feature.business_id.tolist()
business_feature_dict = dict(zip(business_id_list, business_feature_list))


def _mean_upper_triangle(square_matrix):
  """Mean of the entries strictly above the diagonal; NaN when there are none."""
  upper = square_matrix[np.triu_indices(square_matrix.shape[0], k=1)]
  return upper.mean() if upper.size else np.nan


metric_dict = {}
for version in [1,2,3]:
  for num in [5,10]:
    recommend_df = pd.read_csv('/content/drive/MyDrive/yelp/pipeline/final_recommendation_km_{}_v{}.csv'.format(num,version))
    # The lists were written as their Python repr; literal_eval parses them
    # back without executing arbitrary code (unlike eval).
    recommend_df['business_id'] = recommend_df['business_id'].apply(ast.literal_eval)

    # Diversity: mean pairwise cosine distance between the feature vectors of
    # the businesses in one list, averaged over users. Lists with fewer than
    # two businesses have no pair and are skipped by nanmean.
    diversity_list = []
    for business_ids in recommend_df['business_id'].values:
      feature_matrix = np.array([business_feature_dict[bus_id] for bus_id in business_ids])
      dist_matrix = pairwise_distances(feature_matrix,metric='cosine')
      diversity_list.append(_mean_upper_triangle(dist_matrix))
    diversity = np.nanmean(diversity_list)

    # Personalization: the stored value is the mean pairwise cosine
    # SIMILARITY (1 - distance) between users' binary recommendation vectors.
    recommend_df = recommend_df.explode('business_id')
    recommend_df['recommended'] = 1
    user_item_df = recommend_df.pivot_table(index = 'user_id', columns = 'business_id', values = 'recommended', aggfunc='mean').fillna(0)
    pairwise_dist = pairwise_distances(user_item_df, metric='cosine')
    personalization = 1 - _mean_upper_triangle(pairwise_dist)

    # Satisfaction: number of recommended pairs that occur in the actual test
    # ratings, and how far their mean stars lie above the overall test mean.
    intersect_df = actual_df[['user_id','business_id','stars_r']].merge(recommend_df,on=['user_id','business_id'])
    num_intersect = len(intersect_df)
    star_above = intersect_df.stars_r.mean()-actual_df.stars_r.mean()

    # Coverage: share of training businesses recommended to at least one user.
    num_recommend_bus = recommend_df['business_id'].unique().shape[0]
    coverage = num_recommend_bus/num_total_bus

    metric_dict[(version*30,num)] = [diversity, personalization, num_intersect, star_above, coverage]

In [None]:
# Rows: metrics (in the order they were appended); columns: (pool size, list length).
metric_names = ['diversity','personalization','number_in_actual','satisfaction','coverage']
result = pd.DataFrame(metric_dict,index=metric_names)
result

Unnamed: 0_level_0,30,30,60,60,90,90
Unnamed: 0_level_1,5,10,5,10,5,10
diversity,0.000138,0.000136,0.000157,0.000154,0.000169,0.000166
personalization,0.013921,0.016082,0.021032,0.02444,0.027368,0.031338
number_in_actual,124.0,250.0,90.0,174.0,75.0,146.0
satisfaction,0.39254,0.386927,0.502038,0.518514,0.604261,0.423804
coverage,0.393256,0.573634,0.283567,0.418647,0.225269,0.340849


In [None]:
# 'satisfaction' is stored relative to the overall test mean; adding that mean
# back gives the mean stars of the recommended-and-visited pairs.
result.loc['satisfaction'] + actual_df['stars_r'].mean()

30  5     4.201613
    10    4.196000
60  5     4.311111
    10    4.327586
90  5     4.413333
    10    4.232877
Name: satisfaction, dtype: float64