### Load Libraries

In [101]:
# Colab only: mount Google Drive and switch into the shared project data folder,
# so that the relative file names used by the loading cells below resolve.
from google.colab import drive
drive.mount('/content/drive')
%cd '/content/drive/Shared drives/Fall2020DSGirlPower/SI650/project/DATA/TO_data/'

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
/content/drive/Shared drives/Fall2020DSGirlPower/SI650/project/DATA/TO_data


In [102]:
# Install the third-party packages imported in the next cell.
# NOTE(review): versions are not pinned, so a fresh runtime may resolve newer,
# incompatible releases — consider pinning.
!pip install rank_bm25 nltk
!pip install pyspark
!pip install annoy
!pip install dynaconf 
!pip install flask_googlemaps
!pip install flask-ngrok



In [103]:
import pandas as pd
import numpy as np
import re
from math import sqrt
import random
from sklearn.metrics.pairwise import cosine_similarity

# PySpark import
from pyspark.mllib.recommendation import ALS, Rating
from pyspark import SparkContext
!export PYSPARK_PYTHON=python3.7

# language processing
import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')
from nltk.tokenize import word_tokenize 
nltk.download('punkt')
from nltk.stem.snowball import SnowballStemmer

# Annoy: approximate nearest-neighbour index used to find the most similar items
from annoy import AnnoyIndex

# gensim
from gensim.models import KeyedVectors
from gensim.models import Word2Vec

# bm25
from rank_bm25 import BM25Okapi

# flask
from flask import Flask, render_template, request
from flask_googlemaps import GoogleMaps, Map, icons
from dynaconf import FlaskDynaconf
import requests
from flask_ngrok import run_with_ngrok

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [104]:
# Spark context (master 'local', application name 'CF') used by the
# collaborative-filtering cells further down.
# NOTE(review): re-running this cell while a context is still active will
# presumably fail; stop the old one first or restart the runtime.
context = SparkContext('local', 'CF')
# Silence Spark's log output.
context.setLogLevel('OFF')

### Load Data

In [105]:
def converter(instr):
    """Parse a bracketed, whitespace-separated number string into a 1-D array.

    The first and last characters (expected to be the surrounding brackets)
    are discarded and the remainder is parsed as space-separated numbers.
    """
    inner = instr[1:-1]
    return np.fromstring(inner, sep=' ')

In [106]:
# Business metadata; filtered by latitude/longitude in the recommenders below.
bus = pd.read_csv("business.csv")
# User/business/rating rows; passed to get_rank as the collaborative-filtering input.
data = pd.read_csv('user_res_rating.csv') 
# Per-business review summary; the 'vector' column is parsed into a numpy array by `converter`.
ReviewSummary = pd.read_csv("input_data_ver4.csv",converters={'vector':converter})

# Get the business id information
# (lookup table merged onto recommendation results to attach name, coordinates and rating)
table = bus[["business_id", "name", "latitude", 'longitude',"rating"]]

## First Step: Content based models

### Word2Vec Results

In [107]:
# Annoy: approximate nearest-neighbour index used to find the most similar items
from annoy import AnnoyIndex

In [108]:
# Load the pre-trained word vectors (word2vec text format); `model` is read by sentence_vector.
model = KeyedVectors.load_word2vec_format('word2vec_ver4.vector')

In [109]:
# Text normalisation shared by the query pipeline.
# `regEx` matches every run of characters that is not a lowercase ASCII letter.
regEx = re.compile('[^a-z]+')
def tokenize_reviews(text):
  """Lower-case `text` and collapse every non-letter run into a single space.

  Despite the name, the result is one cleaned string (trimmed at both ends),
  not a list of tokens.
  """
  return regEx.sub(" ", text.lower()).strip()

In [110]:
# Stop-word removal and stemming.
stop_words = set(stopwords.words("english"))
stemmer = SnowballStemmer("english")

def remove_stopwords_stem(text):
  """Tokenise `text`, drop English stop words and return the stemmed tokens.

  Each token is first stripped of non-letter characters with `regEx`; tokens
  that end up empty or that are stop words are skipped.
  """
  stems = []
  for token in word_tokenize(str(text)):
    cleaned = regEx.sub(" ", token).strip()
    if not cleaned or cleaned in stop_words:
      continue
    stems.append(stemmer.stem(cleaned))
  return stems

In [111]:
# Sentence embedding = mean of the word vectors of its tokens.
# A token that is not in the vocabulary contributes an all-zero vector.
def sentence_vector(sentence, kv=None):
  """Average the word vectors of `sentence`.

  Args:
    sentence: iterable of (stemmed) tokens.
    kv: optional word-vector lookup supporting `kv[word]` and `kv.vector_size`.
      Defaults to the module-level `model`. A full Word2Vec model (which keeps
      its vectors under `.wv`) is accepted as well.

  Returns:
    1-D numpy array of length `vector_size`; all zeros when `sentence` is
    empty (the previous implementation produced NaN in that case).
  """
  if kv is None:
    kv = model
  # Newer gensim KeyedVectors have no `.wv`; older ones and full models do.
  kv = getattr(kv, "wv", kv)
  dim = kv.vector_size

  vecs = []
  for word in sentence:
    try:
      vecs.append(kv[word])
    except KeyError:
      # Only "word not in vocabulary" is expected here; anything else should surface.
      vecs.append(np.zeros(dim))

  if not vecs:
    return np.zeros(dim)
  return np.mean(vecs, axis=0)

In [112]:
def keyword_recommend_loc_simhash(input_str, df_old, lat,log):
  """Recommend up to 10 nearby businesses whose review vector is closest to the query.

  Despite the name, the lookup uses an Annoy approximate nearest-neighbour
  index (angular distance), not simhash.

  Args:
    input_str: free-text query.
    df_old: frame with a `business_id` column and a 100-dimensional `vector` column.
    lat, log: centre of the search box (latitude, longitude). The box spans
      0.07 degrees of latitude and 0.08 degrees of longitude in total.

  Returns:
    DataFrame with a single `business_id` column, nearest first. Empty when
    the query has no usable tokens or no business lies inside the box.
  """
  lat_min = lat-0.07/2
  lat_max = lat+0.07/2
  log_min = log-0.08/2
  log_max = log+0.08/2
  in_box = ((bus.latitude>=lat_min)&(bus.latitude<=lat_max)
            &(bus.longitude<=log_max)&(bus.longitude>=log_min))
  bus_id = set(bus[in_box].business_id)

  query_tokens = remove_stopwords_stem(tokenize_reviews(input_str))
  if not query_tokens:
    # Nothing to embed: averaging zero word vectors gives no meaningful query point.
    return pd.DataFrame(columns=["business_id"])
  vector = sentence_vector(query_tokens)
  df = df_old[df_old["business_id"].isin(bus_id)]

  # Build the Annoy index over the candidate vectors; item i == i-th row of df.
  f = 100 # dim
  t = AnnoyIndex(f, 'angular') # metric given explicitly; omitting it is deprecated
  for i, item_vec in enumerate(df.vector):
    t.add_item(i, item_vec)
  t.build(30) # 30 trees

  # Get top 10 recommendations (positions into df), then map them back to ids in one step
  # instead of growing a DataFrame row by row.
  indexes = t.get_nns_by_vector(vector, 10)
  nearest_ids = df["business_id"].iloc[indexes].tolist()
  return pd.DataFrame({"business_id": nearest_ids}, columns=["business_id"])

In [113]:
def top_10_loc_hash(kword, df10,lat,log):
  """Thin alias for keyword_recommend_loc_simhash with the same four arguments."""
  return keyword_recommend_loc_simhash(kword, df10, lat, log)

In [114]:
def cos_similarity(v1,v2):
  """Cosine similarity between two 1-D vectors.

  Returns 0.0 when either vector has zero length (the plain formula would
  divide by zero and yield NaN), which is the case for the all-zero vectors
  produced for out-of-vocabulary text.
  """
  norm_1 = np.sqrt(np.dot(v1, v1))
  norm_2 = np.sqrt(np.dot(v2, v2))
  if norm_1 == 0 or norm_2 == 0:
    return 0.0
  return np.dot(v1, v2) / norm_1 / norm_2

### BM25


In [115]:
# Corpus-wide BM25 index over every row of ReviewSummary["text_sw_removed"].
# NOTE(review): BM25_recommend builds its own local `sentences`/`bm25` per query, so these
# module-level objects look unused in the visible cells — confirm before removing.
# NOTE(review): the column is read from CSV without a converter, so each entry is presumably a
# plain string rather than a token list; BM25Okapi iterates each document, so it would then
# score single characters as terms — verify the stored format.
sentences = [row for row in ReviewSummary["text_sw_removed"]]
bm25 = BM25Okapi(sentences)

In [116]:
def BM25_recommend(input_str, df_old, lat,log):
  """Recommend up to 10 nearby businesses ranked by BM25 score against the query.

  Args:
    input_str: free-text query.
    df_old: frame with `business_id` and `text_sw_removed` columns.
    lat, log: centre of the search box (latitude, longitude). The box spans
      0.07 degrees of latitude and 0.08 degrees of longitude in total.

  Returns:
    The selected business ids merged with `table` (name, coordinates, rating),
    best BM25 score first. Empty when no business lies inside the box.
  """
  lat_min = lat-0.07/2
  lat_max = lat+0.07/2
  log_min = log-0.08/2
  log_max = log+0.08/2
  in_box = ((bus.latitude>=lat_min)&(bus.latitude<=lat_max)
            &(bus.longitude<=log_max)&(bus.longitude>=log_min))
  bus_id = set(bus[in_box].business_id)

  text_token_sw_removed = remove_stopwords_stem(tokenize_reviews(input_str))
  df = df_old[df_old["business_id"].isin(bus_id)]
  if df.empty:
    # BM25Okapi cannot be built on an empty corpus (average document length is undefined).
    return pd.DataFrame(columns=["business_id"]).merge(table, on="business_id")

  # NOTE(review): BM25Okapi expects each document as a list of tokens; if this column holds
  # plain strings (as a CSV round-trip would produce) scores are computed per character — verify.
  sentences = list(df["text_sw_removed"])
  bm25 = BM25Okapi(sentences)

  # Same selection rule as BM25Okapi.get_top_n (highest scores first), but keeping row
  # positions so ids are looked up directly instead of re-matching document text row by row
  # (which lost the ranking and could return extra rows for duplicate texts).
  scores = bm25.get_scores(text_token_sw_removed)
  top_positions = np.argsort(scores)[::-1][:10]
  top_ids = df["business_id"].iloc[top_positions].tolist()

  result = pd.DataFrame({"business_id": top_ids}, columns=["business_id"])
  recommendation = result.merge(table, on="business_id")
  return recommendation

In [117]:
# Smoke test: BM25 recommendations for the query "Fried chicken" around a fixed latitude/longitude,
# then the resulting business ids as a plain list.
restaurents_bm25 = BM25_recommend("Fried chicken", ReviewSummary, 43.66311072883239, -79.39440813687946)
business_id_list_bm25 = restaurents_bm25.business_id.to_list()

  import sys


## Second Step: Collaborative Filtering Models

### Build User Business Matrix

In [118]:
def build_rating_table(data):
    """ Collect an RDD of (user, business, rating) rows into a dict keyed by (user, business) """
    return {
        (user, business): rating
        for user, business, rating in data.toLocalIterator()
    }


def build_user_sets(data):
    """ Map every user to the set of businesses that user has rated """
    grouped = data.map(lambda row: (row[0], row[1])).groupByKey()
    return {user: set(businesses) for user, businesses in grouped.toLocalIterator()}


def build_business_sets(data):
    """ Map every business to the set of users who have rated it """
    grouped = data.map(lambda row: (row[1], row[0])).groupByKey()
    return {business: set(users) for business, users in grouped.toLocalIterator()}

### Deal with cold start

In [119]:
def find_median(attri_sets, ratings, case):
    """ Middle value of the per-user (or per-business) mean ratings.

    `case` is 'user' when `attri_sets` maps user -> businesses and 'business'
    when it maps business -> users. The element at index len // 2 of the sorted
    means is returned, i.e. the upper of the two middle values for an even count.
    Any other `case` (or an empty `attri_sets`) leaves no means and raises IndexError.
    """
    if case == 'user':
        means = [
            sum(ratings[(user, b)] for b in businesses) / len(businesses)
            for user, businesses in attri_sets.items()
        ]
    elif case == 'business':
        means = [
            sum(ratings[(u, business)] for u in users) / len(users)
            for business, users in attri_sets.items()
        ]
    else:
        means = []
    ordered = sorted(means)
    return ordered[len(ordered) // 2]

### Collaborative Filtering

In [120]:
def predict_rating(train_data, test_data, case_num):
    """ Dispatch rating prediction to one of the three CF variants.

    case_num 1 -> model-based (ALS), 2 -> user-based, anything else -> item-based.
    Returns an RDD of (user, business, rating) triples for the rows of `test_data`.
    """
    rating_table = build_rating_table(train_data)
    user_sets = build_user_sets(train_data)
    median_user_rating = find_median(user_sets, rating_table, 'user')
    if case_num == 1:
        return predict_rating_model_based(train_data, test_data, median_user_rating)

    business_sets = build_business_sets(train_data)
    median_business_rating = find_median(business_sets, rating_table, 'business')

    # Memo dicts handed to the per-row predictor (pairwise weights and average ratings).
    weights, averages = {}, {}

    if case_num == 2:
        predictor = predict_rating_user_based
    else:
        predictor = predict_rating_item_based

    def _predict_row(row):
        return predictor(row[0], row[1], rating_table, user_sets, business_sets,
                         weights, averages, median_user_rating,
                         median_business_rating, case_num)

    return test_data.map(_predict_row)

#### 1. ALS algorithm

In [121]:
def predict_rating_model_based(train_data, test_data, median_rating):
    """ Predict rating for user, business pair using model-based CF

    Trains an ALS model on `train_data` and predicts every (user, business)
    pair of `test_data`. Test pairs for which the model returns no prediction
    receive `median_rating` instead.

    Args:
        train_data: RDD of (user, business, rating) rows.
        test_data: RDD of (user, business, rating) rows; only the pair is used.
        median_rating: fallback rating for pairs without a prediction.

    Returns:
        RDD of (user, business, rating), with the rating clamped by
        _round_rating_model_based.
    """
    # Two-way id <-> integer index mappings; Rating objects are built from the integer side.
    user_mapping, business_mapping = _create_mapping(train_data, test_data)

    train_data = train_data.map(lambda x: Rating(user_mapping[x[0]], business_mapping[x[1]], x[2]))
    # Key each test row by its (user, business) pair so it can be diffed against the predictions below.
    test_data = test_data.map(lambda x: ((x[0], x[1]), 1))
    mapped_test_data = test_data.map(lambda x: (user_mapping[x[0][0]], business_mapping[x[0][1]]))

    # create model
    # ALS.train positional arguments: rank=5, iterations=10.
    # Note: this local `model` shadows the module-level word-vector `model` inside this function only.
    model = ALS.train(train_data, 5, 10)
    # Translate the predicted integer indices back to the original ids.
    predictions = model.predictAll(mapped_test_data)\
                  .map(lambda x: ((user_mapping[x[0]], business_mapping[x[1]]), x[2]))

    # collect all the cold start
    # (test pairs whose key is absent from the predictions)
    cold_starts = test_data.subtractByKey(predictions).map(lambda x: (x[0], median_rating))

    return predictions.union(cold_starts)\
           .map(lambda x: (x[0][0], x[0][1], _round_rating_model_based(x[1])))

#### 2. User_based CF

In [122]:
def predict_rating_user_based(user, business, ratings, user_sets, business_sets,
                              user_weights, user_averages, median_user_rating,
                              median_business_rating, case_num):
    """ User-based CF prediction for one (user, business) pair.

    Resolution order: known rating -> unseen business (median business rating)
    -> unseen user (median user rating) -> the user's mean rating plus the
    weighted deviation of the neighbours who rated the business.
    Returns a (user, business, rating) tuple.
    """
    key = (user, business)
    if key in ratings:
        return (user, business, ratings[key])

    # cold start: business never seen in training
    if business not in business_sets:
        return (user, business, median_business_rating)

    # cold start: user never seen in training
    if user not in user_sets:
        return (user, business, median_user_rating)

    neighbor_weights = _get_neighbor_weights(user, business_sets[business], ratings,
                                             user_sets, user_weights, case_num)
    baseline = _get_ave_rating(user, ratings, user_sets, user_averages)
    adjustment = _get_weighted_rating(business, neighbor_weights, ratings,
                                      user_sets, user_averages)
    return (user, business, _round_rating_user_based(baseline + adjustment))

#### 3. Item-based CF

In [123]:
def predict_rating_item_based(user, business, ratings, user_sets, business_sets,
                              business_weights, business_averages, median_user_rating,
                              median_business_rating, case_num):
    """ Predict rating for user, business pair using item-based CF

    Resolution order: known rating -> unseen business (median business rating)
    -> unseen user (median user rating) -> similarity-weighted mean of the
    user's ratings of neighbouring businesses. When there are no neighbours,
    or their weights sum to zero, the business's own mean rating is used
    (previously an all-zero weight list raised ZeroDivisionError).

    Returns:
        (user, business, rating) with the rating clamped by _round_rating_item_based.
    """
    if (user, business) in ratings:
        return (user, business, ratings[(user, business)])

    # cold start for new business
    if business not in business_sets:
        return (user, business, median_business_rating)

    # cold start for new user
    if user not in user_sets:
        return (user, business, median_user_rating)

    weights = _get_neighbor_weights(business, user_sets[user], ratings,
                                   business_sets, business_weights, case_num)

    total_weights = sum(abs(weight) for _, weight in weights)
    if total_weights:
        weighted_sum = sum(ratings[(user, neighbor)] * weight
                           for neighbor, weight in weights)
        weighted_rating = weighted_sum / total_weights
    else:
        # No usable neighbour information: fall back to the (memoised) business mean.
        if business not in business_averages:
            all_users = business_sets[business]
            average_rating = sum(ratings[(u, business)] for u in all_users) / len(all_users)
            business_averages[business] = average_rating
        weighted_rating = business_averages[business]

    return (user, business, _round_rating_item_based(weighted_rating))

### Supporting functions

In [124]:
def _create_mapping(train_data, test_data):
    """ Create two-way mapping between ID and indices """
    user_mapping = {}
    business_mapping = {}
    user_counter = business_counter = 0
    for user, business, _ in train_data.toLocalIterator():
        if user not in user_mapping:
            user_mapping[user] = user_counter
            user_mapping[user_counter] = user
            user_counter += 1
        if business not in business_mapping:
            business_mapping[business] = business_counter
            business_mapping[business_counter] = business
            business_counter += 1

    for user, business, _ in test_data.toLocalIterator():
        if user not in user_mapping:
            user_mapping[user] = user_counter
            user_mapping[user_counter] = user
            user_counter += 1
        if business not in business_mapping:
            business_mapping[business] = business_counter
            business_mapping[business_counter] = business
            business_counter += 1
    
    return user_mapping, business_mapping

In [125]:
def _get_neighbor_weights(target, neighbors, ratings, attri_sets, prev_weights,
                         case_num):
    """ Calculate weight between target and neighbor """
    all_weights = []
    for neighbor in neighbors:
        pair = (target, neighbor)
        weight = 0.0
        if pair in prev_weights:
            weight = prev_weights[pair]
        else:
            weight = _get_weight(target, neighbor, ratings, attri_sets, case_num)
            prev_weights[pair] = weight

    return all_weights


def _get_weight(target, neighbor, ratings, attri_sets, case_num):
    """ Calculate weight for both users """
    co_rated = attri_sets[target].intersection(attri_sets[neighbor])
    vec_1 = []
    vec_2 = []
    if case_num == 2:
        vec_1 = [ratings[(target, attri)] for attri in co_rated]
        vec_2 = [ratings[(neighbor, attri)] for attri in co_rated]
    elif case_num == 3:
        vec_1 = [ratings[(attri, target)] for attri in co_rated]
        vec_2 = [ratings[(attri, neighbor)] for attri in co_rated]
    else:
        raise ValueError("Wrong case number")

    weight = 0.0
    num = len(co_rated)
    if num > 1:
        # find average
        ave_1 = sum(vec_1) / num
        ave_2 = sum(vec_2) / num

        # normalize vector
        vec_1 = [n - ave_1 for n in vec_1]
        vec_2 = [n - ave_2 for n in vec_2]
        numerator = sum(vec_1[i] * vec_2[i] for i in range(num))

        # numerator is 0
        if not numerator:
            return 0.0

        # calculate magnitude of both vectors
        denominator = sqrt(sum(n * n for n in vec_1) * sum(n * n for n in vec_2))
        weight = numerator / denominator

    return weight


def _get_ave_rating(target, ratings, user_sets, averages):
    """ Calculate target average rating """
    if target not in averages:
        attri = user_sets[target]
        sum_ratings = sum(ratings[(target, a)] for a in attri)
        n_ratings = len(attri)
        averages[target] = (sum_ratings, n_ratings)
    return averages[target][0] / averages[target][1]


def _get_weighted_rating(business, weights, ratings, user_sets, user_averages):
    """ Calculate the weighted rating of neighbors """
    # if no similar neighbors
    if not weights:
        return 0.0

    # find weighted rating of each neighbor
    total_weights = weighted_rating = 0.0

    for neighbor, weight in weights:
        if neighbor not in user_averages:
            neighbor_b = user_sets[neighbor]
            sum_ratings = sum(ratings[(neighbor, b)] for b in neighbor_b)
            n_ratings = len(neighbor_b)
            user_averages[neighbor] = (sum_ratings, n_ratings)
        neighbor_ave_rating = (user_averages[neighbor][0] - ratings[(neighbor, business)]) / (user_averages[neighbor][1] - 1)

        weighted_rating += (ratings[(neighbor, business)] - neighbor_ave_rating) * weight
        total_weights += abs(weight)

    return weighted_rating / total_weights


def _round_rating_model_based(rating):
    """ Round rating to reasonable value """
    if rating > 5.0:
        return 5.0

    if rating < 2.0:
        return 2.0

    return rating


def _round_rating_user_based(rating):
    """ Round rating to reasonable value """
    if rating > 4.5:
        return 4.5

    if rating < 2.0:
        return 2.0

    return rating


def _round_rating_item_based(rating):
    """ Round rating to reasonable value """
    if rating > 4.0:
        return 4.0

    if rating < 2.0:
        return 2.0

    return rating

### Utility functions

In [126]:
def parse_csv(data, header=False):
    """ Split the lines of a text RDD on commas.

    With header=True, every row equal to the first row is dropped and the
    remaining rows become (field0, field1, float(field2)) tuples.
    With header=False the rows are returned as raw lists of strings
    (no tuple/float conversion is applied on that path).
    """
    rows = data.map(lambda line: line.split(','))
    if not header:
        return rows

    header_row = rows.first()
    body = rows.filter(lambda fields: fields != header_row)
    return body.map(lambda fields: (fields[0], fields[1], float(fields[2])))


## Get rank

In [127]:
# PYSPARK_PYTHON=/usr/bin/python3
# PYSPARK_DRIVER_PYTHON=ipython3

In [128]:
def get_rank(business_id_list,user_id,data,case_num=3):
    """Rank candidate businesses for one user by predicted rating.

    The user's existing ratings of the candidates are held out of the training
    data, the remaining ratings are used to train collaborative filtering, and
    each (user, candidate) pair is then predicted.

    Args:
        business_id_list: candidate business ids (any length, may be empty).
        user_id: id of the user to rank for.
        data: ratings frame with `user_id` and `business_id` columns; it is
            NOT modified (the previous version dropped rows from it in place,
            so every call permanently shrank the caller's frame).
        case_num: CF variant passed to predict_rating (1 ALS, 2 user-based,
            3 item-based). Defaults to 3, the value that used to be hard-coded.

    Returns:
        DataFrame with columns `user`, `business_id`, `rating`, highest
        predicted rating first.

    Side effects:
        Writes 'val_data.csv' and 'train_data.csv' to the working directory;
        they are read back through the Spark `context`.
    """
    business_id_list = list(business_id_list)
    n_candidates = len(business_id_list)

    # One validation row per candidate. The rating is a placeholder: only the
    # (user, business) pair is used when predicting. The length follows the
    # candidate list instead of assuming exactly 10 entries.
    val_df = pd.DataFrame({
        'user_id': [user_id] * n_candidates,
        'business_id': business_id_list,
        'rating': [5] * n_candidates,
    })

    # Hold out the user's known ratings for the candidates without touching `data`.
    held_out = (data['user_id'] == user_id) & data['business_id'].isin(business_id_list)
    train_df = data[~held_out]

    train_file = 'train_data.csv'
    test_file = 'val_data.csv'
    val_df.to_csv(test_file, index=False)
    train_df.to_csv(train_file, index=False)

    train_data = parse_csv(context.textFile(train_file), header=True)
    test_data = parse_csv(context.textFile(test_file), header=True)

    predictions = predict_rating(train_data, test_data, case_num)

    rows = list(predictions.toLocalIterator())
    pred = pd.DataFrame(rows, columns=['user', 'business_id', 'rating'])
    return pred.sort_values(by='rating', ascending=False)

In [129]:
def get_location(df):
    """Attach coordinates and a rank-prefixed name to each row of `df`.

    `df` is merged with `table` on `business_id`; the merged rows are numbered
    from 1 in their resulting order and the label 'No. <i> - ' is prepended to
    the business name. Returns a list of (latitude, longitude, name) records.
    """
    rec = df.merge(table, on="business_id")
    rec['rank'] = [f'No. {position} - ' for position in range(1, len(rec) + 1)]
    rec['name'] = rec['rank'] + rec['name']
    marker_columns = rec[["latitude", 'longitude', "name"]]
    return list(marker_columns.to_records(index=False))

## **Interface**

In [130]:
# Fixed demo inputs. `lat`, `lng`, `result_bm25` and `result_w2v` defined in this cell
# are read as globals by the Flask `/recommendation` route below.
lat = 43.6532
lng = -79.3832
user_id = 'OKX-V5j_qfuOebeHeWGrHg'
query = 'Chinese Food'

# bm25 result
# content-based candidates -> CF re-ranking for the user -> map markers
restaurents_bm25 = BM25_recommend(query, ReviewSummary, lat, lng)
business_id_list_bm25 = restaurents_bm25.business_id.to_list()  
df = get_rank(business_id_list_bm25,user_id,data)
result_bm25 = get_location(df)
# w2v
# same pipeline, with candidates from the word-vector nearest-neighbour search
restaurents_w2v = top_10_loc_hash(query, ReviewSummary, lat, lng)
business_id_list_w2v = restaurents_w2v.business_id.to_list()
df2 = get_rank(business_id_list_w2v,user_id,data)
result_w2v = get_location(df2)



  import sys
  import sys
  from ipykernel import kernelapp as app


In [131]:
# # from google.colab.output import eval_js
# # print(eval_js("google.colab.kernel.proxyPort(8080)"))
# !python -m http.server 8080

In [132]:
# Stop Spark before the web server starts; the markers served by the routes
# below were computed in the previous cell.
context.stop()
# Flask application; HTML templates are looked up in the 'templates' folder.
app = Flask(__name__, template_folder='templates')
# Register the Google Maps and Dynaconf extensions on the app.
GoogleMaps(app)
FlaskDynaconf(app)
# Have app.run() also open an ngrok tunnel to the local server.
run_with_ngrok(app)

@app.route("/")
def map_created_in_view():
    """Landing page: render 'simple.html' with an empty map at the default centre."""
    landing_map = Map(
        identifier="gmap",
        varname="gmap",
        lat=43.6532,
        lng=-79.3832,
        style="height:500px;width:800px;margin:0;",
    )
    return render_template("simple.html", gmap=landing_map)

@app.route('/recommendation', methods=['POST'])
def give_rec():
    """Render the map with the recommendation markers.

    The submitted form is currently ignored: the markers come from the
    module-level `result_bm25` / `result_w2v` (and the centre from `lat` /
    `lng`) computed in the interface cell. BM25 results are shown as green
    dots, word-vector results as blue dots.

    NOTE(review): the per-request pipeline kept below as comments calls
    get_rank, which uses the Spark `context` — and that context is stopped
    before the app is created, which is presumably why it is disabled.
    """
    # Intended per-request implementation (disabled):
    # lat = float(request.form['lat'])
    # lng = float(request.form['lng'])
    # user_id = request.form['user']
    # query = request.form['query']
    #
    # restaurents_bm25 = BM25_recommend(query, ReviewSummary, lat, lng)
    # business_id_list_bm25 = restaurents_bm25.business_id.to_list()
    # df = get_rank(business_id_list_bm25,user_id,data)
    # result_bm25 = get_location(df)
    #
    # restaurents_w2v = top_10_loc_hash(query, ReviewSummary, lat, lng)
    # business_id_list_w2v = restaurents_w2v.business_id.to_list()
    # df2 = get_rank(business_id_list_w2v,user_id,data)
    # result_w2v = get_location(df2)

    markers_by_icon = {
        icons.dots.green: result_bm25,
        icons.dots.blue: result_w2v,
    }
    result_map = Map(
        identifier="gmap",
        varname="gmap",
        lat=lat,
        lng=lng,
        markers=markers_by_icon,
        style="height:500px;width:800px;margin:0;",
    )
    return render_template("simple.html", gmap=result_map)



In [None]:
# Start the Flask development server; this call blocks the cell until it is interrupted.
if __name__ == "__main__":
    app.run()


 * Serving Flask app "__main__" (lazy loading)
 * Environment: production
[2m   Use a production WSGI server instead.[0m
 * Debug mode: off


 * Running on http://127.0.0.1:5000/ (Press CTRL+C to quit)


 * Running on http://6541893e642e.ngrok.io
 * Traffic stats available on http://127.0.0.1:4040


127.0.0.1 - - [15/Dec/2020 15:21:41] "[37mGET / HTTP/1.1[0m" 200 -
127.0.0.1 - - [15/Dec/2020 15:21:44] "[33mGET /favicon.ico HTTP/1.1[0m" 404 -
127.0.0.1 - - [15/Dec/2020 15:21:49] "[37mPOST /recommendation HTTP/1.1[0m" 200 -
