In [2]:
from google.colab import drive 
drive.mount('/content/gdrive')

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&response_type=code&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly

Enter your authorization code:
··········
Mounted at /content/gdrive


In [0]:
from collections import Counter, defaultdict
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
import sys
import json
import math
import numpy as np
import os
import pandas as pd
import re
from scipy.sparse import csr_matrix
import time
import pickle
import csv
import os.path

In [0]:


my_path = ''

def append_to_data(data, profile, predict_on):
    profile = json.loads(profile)
    dish_ids = list(map(int, profile.keys()))
    ratings = list(map(int, profile.values()))

    d = pd.DataFrame(columns = ['dishId', 'userId', 'rating'])
    d['dishId'] = dish_ids
    d['rating'] = ratings
    d['userId'] = predict_on

    data = data.append(d)
    return data

def tokenize_string(my_string):
    return re.findall('[\w\-]+', my_string.lower())


def tokenize(db):
    """
    The meta tags associated with each dish is broken down (tokenized) as a list of tags
    Eg: egg|flour|ghee|paratha will be tokenized as [egg, flour, ghee, paratha]
    """
    tokenlist=[]
    for index,row in db.iterrows():
        tokenlist.append(tokenize_string(row.tags))
    db['tokens']=tokenlist
    return db

def featurize(db, include_flavours):
    """
    Each row will contain a csr_matrix of shape (1, num_features).
    Each entry in this matrix will contain the tf-idf value of the term
    Formula : tfidf(i, d) := tf(i, d) / max_k tf(k, d) * log10(N/df(i))
    where:
    i is a term
    d is a document 
    tf(i, d) is the frequency of term i in document d
    max_k tf(k, d) is the maximum frequency of any term in document d
    N is the number of documents
    """
    def tf(word, doc):
        return doc.count(word) / Counter(doc).most_common()[0][1]

    def df(word, doclist):
        return sum(1 for d in doclist if word in d)

    def tfidf(word, doc, dfdict, N):
        return tf(word, doc) * math.log10((N / dfdict[word]))

    def getcsrmatrix(tokens,dfdict,N,vocab, dish_flavours, max_vocab):
        matrixRow_list = []
        if include_flavours:
            matrixRow_list = np.zeros((1,len(vocab) + len(dish_flavours) - 1),dtype='float')
        else:
            matrixRow_list = np.zeros((1,len(vocab)),dtype='float')
        for t in tokens:
            if t in vocab:
                matrixRow_list[0][vocab[t]] = tfidf(t,tokens,dfdict,N)

        if include_flavours:
            matrixRow_list[0][max_vocab] = dish_flavours['bitter']
            matrixRow_list[0][max_vocab] = dish_flavours['rich']
            matrixRow_list[0][max_vocab + 1] = dish_flavours['salt']
            matrixRow_list[0][max_vocab + 3] = dish_flavours['spicy']
            matrixRow_list[0][max_vocab + 2] = dish_flavours['sweet']
            matrixRow_list[0][max_vocab + 5] = dish_flavours['umami']

        return csr_matrix(matrixRow_list)

    flavour = pd.read_csv(os.path.join(my_path,'/content/gdrive/My Drive/Final Year Project/ContentBasedFiltering/Utilities/Team 3/tastes.csv'), names = ['dishId', 'bitter', 'rich', 'salt', 'spicy', 'sweet', 'umami'])

    N=len(db)

    doclist = db['tokens'].tolist()
    vocab = { i:x for x,i in enumerate(sorted(list(set(i for s in doclist for i in s)))) }
    max_vocab = max(vocab.values()) + 1
    print("max vocab",max_vocab)
    dfdict = {}
    for v in vocab.items():
        dfdict[v[0]] = df(v[0],doclist)

    csrlist = []
    for index, row in db.iterrows():
        dish_flavours = flavour[flavour.dishId == row['dishId']].to_dict(orient = 'record')[0]
        csrlist.append(getcsrmatrix(row['tokens'],dfdict,N,vocab, dish_flavours, max_vocab)) # row['dishId'] and df with flavour scores

    db['features'] =  csrlist
    return (db,vocab)


def my_train_test_split(ratings):
    """
    Returns a random split of the ratings matrix into a training and testing set.
    """
    train_set, test_set = train_test_split(ratings, test_size = 0.20, random_state = 42)
    return train_set, test_set
    '''
    test = set(range(len(ratings))[::10])
    train = sorted(set(range(len(ratings))) - test)
    test = sorted(test)
    return ratings.iloc[train], ratings.iloc[test]
    '''

def cosine_sim(a, b, include_flavours):
    """
    """

    v1 = a.toarray()[0]
    v2  = b.toarray()[0]
    def cos_sim(v1, v2):
        x = (math.sqrt(sum([i*i for i in v1]))*math.sqrt(sum([i*i for i in v2])))
        if x:
            return sum(i[0] * i[1] for i in zip(v1, v2)) / x
        else:
            return 0
    # s1 = cos_sim(v1, v2)
    # return s1
    s1 = cos_sim(v1[:-6], v2[:-6])
    if include_flavours:
        s2 = cos_sim(v1[-6:], v2[-6:])
        return s1 * 0.5 + s2 * 0.5
    else:
        return s1

def make_predictions(db, ratings_train, ratings_test, include_flavours):
    """
    Using the ratings in ratings_train, prediction is made on the ratings for each
    row in ratings_test.
    This is done by computing the weighted average
    rating for every other dish that the user has rated.
    """
    result = []
    x = 0
    for index,row in ratings_test.iterrows():
        # mlist contains dishIds rated by the user in the train set
        mlist = list(ratings_train.loc[ratings_train['userId'] == row['userId']]['dishId'])
        # csr list contains tfidf scores of tags for dishes rated by the user
        csrlist = list(db.loc[db['dishId'].isin(mlist)]['features'])
        # mrlist contains scores of dishes rated by the user (dishes in mlist)
        mrlist = list(ratings_train.loc[ratings_train['userId'] == row['userId']]['rating'])
        # computing similarity between dishes user rated and the current dish in the test set

        sim = [cosine_sim(c,db.loc[db['dishId'] ==row['dishId']]['features'].values[0], include_flavours) for c in csrlist]
        # computing similarity times the rating for known dish
        wan = sum([ v*mrlist[i] for i,v in enumerate(sim) if v>0])
        wadlist = [i for i in sim if i>0]
        ## check for sum(wadlist) > 1
        if len(wadlist)>0 and sum(wadlist) >= 1:
            result.append(wan/sum(wadlist))
            x = x + 1
        else:
            result.append(np.mean(mrlist)) # if dish did not match with anything approx as average of users rating
    return np.array(result)

def main(data, db, predict_on, include_flavours):
    """
    """
    total_dishes = db.shape[0]

    db = tokenize(db)
    db, vocab = featurize(db, include_flavours)
    
    ratings_train, ratings_test = my_train_test_split(data)
    predictions = make_predictions(db, ratings_train, ratings_test, include_flavours)

    predicted_test_error = mean_squared_error(ratings_test.rating, predictions) ** 0.5

    def predict_on_user(predict_on):
        ratings_test = pd.DataFrame(columns = ['userId', 'dishId'])
        ratings_test['userId'] = [predict_on] * total_dishes
        ratings_test.dishId = range(1, total_dishes + 1)
           
        predictions_uid = make_predictions(db, ratings_train, ratings_test, include_flavours)

        predictions_uid = list(enumerate(predictions_uid))

        predictions_uid = sorted(predictions_uid, key = lambda x: x[1], reverse = True)

        predictions_uid = list(map(lambda x: (x[0] + 1, x[1]), predictions_uid))

        return predictions_uid

    return (predicted_test_error, predict_on_user(predict_on = predict_on))
    

def start(profile = None, type = 'all', predict_on = 100, flavours = False, retrain = False):
    time_start = time.time()
    data = pd.read_csv(os.path.join(my_path,'/content/gdrive/My Drive/Final Year Project/ContentBasedFiltering/Utilities/Team 3/review.csv'))
    data = data[data['userId'].isin(data['userId'].value_counts()[data['userId'].value_counts() >= 5].index)]
    print(data.shape)

    if not retrain:
        if flavours:
            final_scores = pickle.load(open(os.path.join(my_path,"/content/gdrive/My Drive/Final Year Project/ContentBasedFiltering/Utilities/Team 3/tfidf_final_flavour_scores.pickle"), "rb" ))
            predictions = final_scores[predict_on]

        else:
            final_scores = pickle.load(open(os.path.join(my_path,"/content/gdrive/My Drive/Final Year Project/ContentBasedFiltering/Utilities/Team 3/tfidf_final_scores.pickle"), "rb" ))
            predictions = final_scores[predict_on]

        predicted_test_error = None

    else:
        if profile:
            data = append_to_data(data, profile, predict_on)

        if type == 'all':
            db = pd.read_csv(os.path.join(my_path,'/content/gdrive/My Drive/Final Year Project/ContentBasedFiltering/Utilities/Team 3/meta_cuisine.csv'))
        elif type == 'meta':
            db = pd.read_csv(os.path.join(my_path,'/content/gdrive/My Drive/Final Year Project/ContentBasedFiltering/Utilities/Team 3/database.csv'), names = ['dishId', 'tags'])

        dishes = pd.read_csv(os.path.join(my_path,'/content/gdrive/My Drive/Final Year Project/ContentBasedFiltering/Utilities/Team 3/id_name_mapping.csv'), names = ['dishId', 'dish_name'])

        predicted_test_error, predictions = main(data, db, predict_on = predict_on, include_flavours = flavours)
        
        predictions = pd.DataFrame(predictions, columns = ['dishId', 'rating'])
        predictions = predictions.merge(dishes, on = 'dishId', how = 'left')
        predictions.columns = ['dishId', 'rating', 'dishName']

        if flavours:
            if os.path.exists(os.path.join(my_path,"/content/gdrive/My Drive/Final Year Project/ContentBasedFiltering/Utilities/Team 3/tfidf_final_flavour_scores.pickle")):
                final_scores = pickle.load(open(os.path.join(my_path,"/content/gdrive/My Drive/Final Year Project/ContentBasedFiltering/Utilities/Team 3/tfidf_final_flavour_scores.pickle"), "rb" ))
                final_scores[predict_on] = predictions
            else:
                final_scores = {}
                final_scores[predict_on] = predictions

            pickle.dump(final_scores, open(os.path.join(my_path,'/content/gdrive/My Drive/Final Year Project/ContentBasedFiltering/Utilities/Team 3/tfidf_final_flavour_scores.pickle'), 'wb'))

        else:
            if os.path.exists(os.path.join(my_path,"/content/gdrive/My Drive/Final Year Project/ContentBasedFiltering/Utilities/Team 3/tfidf_final_scores.pickle")):
                final_scores = pickle.load(open(os.path.join(my_path,"/content/gdrive/My Drive/Final Year Project/ContentBasedFiltering/Utilities/Team 3/tfidf_final_scores.pickle"), "rb" ))
                final_scores[predict_on] = predictions

            else:
                final_scores = {}
                final_scores[predict_on] = predictions

            pickle.dump(final_scores, open(os.path.join(my_path,'/content/gdrive/My Drive/Final Year Project/ContentBasedFiltering/Utilities/Team 3/tfidf_final_scores.pickle'), 'wb'))

    data = data[data.userId == predict_on]
    original_rating = data.merge(predictions, how = 'left', on = 'dishId')
    original_rating.columns = ['dishId', 'userId', 'rating', 'reformed', 'dishName']
    
    time_end = time.time()

    answer = {"user" : predict_on, "predicted_test_error": predicted_test_error, "time" : round(time_end - time_start, 2), "predicted_rating" : predictions, "original_rating" : original_rating}
    return answer



In [5]:
start(retrain=True)

(4019, 3)
max vocab 382


{'original_rating':    dishId  userId  rating  reformed             dishName
 0     276     100       5  4.333195          mango lassi
 1      15     100       5  3.923997        chicken curry
 2     176     100       4  4.444444        nariyal burfi
 3      16     100       5  4.527952      chicken makhani
 4     150     100       5  4.579875     strawberry lassi
 5      13     100       5  4.756489     vegetarian korma
 6      31     100       5  4.511668          mango lassi
 7     206     100       3  4.561917                pulao
 8      92     100       3  4.201004  curried cauliflower
 9     245     100       4  4.511552          mango lassi,
 'predicted_rating':       dishId    rating                               dishName
 0         54  4.881832                           saffron rice
 1         13  4.756489                       vegetarian korma
 2        341  4.713186  saffron rice with raisins and cashews
 3       1117  4.702834                           mexican rice
 4     