In [1]:
from collections import Counter, defaultdict
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
import sys
import json
import math
import numpy as np
import os
import pandas as pd
import re
from scipy.sparse import csr_matrix
import time
import pickle
import csv
import os.path

In [2]:
def append_to_data(data, profile, predict_on):
    profile = json.loads(profile)
    dish_ids = list(map(int, profile.keys()))
    ratings = list(map(int, profile.values()))

    d = pd.DataFrame(columns = ['dishId', 'userId', 'rating'])
    d['dishId'] = dish_ids
    d['rating'] = ratings
    d['userId'] = predict_on

    data = data.append(d)
    return data

In [21]:


def tokenize(db):
    
    tokenlist=[]
    for index,row in db.iterrows():
        tokenlist.append(row.tags.lower().split("|"))
    db['tokens']=tokenlist
    return db
def featurize(db, include_flavours):
    
    def tf(word, doc):
        return doc.count(word) / Counter(doc).most_common()[0][1]

    def df(word, doclist):
        return sum(1 for d in doclist if word in d)

    def tfidf(word, doc, dfdict, N):
        return tf(word, doc) * math.log10((N / dfdict[word]))

    def getcsrmatrix(tokens,dfdict,N,vocab, dish_flavours, max_vocab):
        matrixRow_list = []
        if include_flavours:
            matrixRow_list = np.zeros((1,len(vocab) + len(dish_flavours) - 1),dtype='float')
        else:
            matrixRow_list = np.zeros((1,len(vocab)),dtype='float')
        for t in tokens:
            if t in vocab:
                matrixRow_list[0][vocab[t]] = tfidf(t,tokens,dfdict,N)

        if include_flavours:
            matrixRow_list[0][max_vocab] = dish_flavours['bitter']
            matrixRow_list[0][max_vocab] = dish_flavours['rich']
            matrixRow_list[0][max_vocab + 1] = dish_flavours['salt']
            matrixRow_list[0][max_vocab + 3] = dish_flavours['spicy']
            matrixRow_list[0][max_vocab + 2] = dish_flavours['sweet']
            matrixRow_list[0][max_vocab + 5] = dish_flavours['umami']

        return csr_matrix(matrixRow_list)

    flavour = pd.read_csv('../dataset/Utilities/tastes.csv', names = ['dishId', 'bitter', 'rich', 'salt', 'spicy', 'sweet', 'umami'])
    print("flavour shape= ",flavour.shape)
    print(flavour.head())
    
    N=len(db)

    doclist = db['tokens'].tolist()
    #print("tokens = ",doclist[:6])
    
    vocab = { i:x for x,i in enumerate(sorted(list(set(i for s in doclist for i in s)))) }
    #print(vocab)
    max_vocab = max(vocab.values()) + 1
    print("max vocab= ",max_vocab)
    
    dfdict = {}
    for v in vocab.items():
        dfdict[v[0]] = df(v[0],doclist)

    csrlist = []
    for index, row in db.iterrows():
        dish_flavours = flavour[flavour.dishId == row['dishId']].to_dict(orient = 'record')[0]
        csrlist.append(getcsrmatrix(row['tokens'],dfdict,N,vocab, dish_flavours, max_vocab)) # row['dishId'] and df with flavour scores

    db['features'] =  csrlist
    print("after including ifidf features")
    print(db.head())
    return (db,vocab)

def my_train_test_split(ratings):
    
    train_set, test_set = train_test_split(ratings, test_size = 0.20, random_state = 42)
    return train_set, test_set


def cosine_sim(a, b, include_flavours):
    """
    """

    v1 = a.toarray()[0]
    v2  = b.toarray()[0]
    def cos_sim(v1, v2):
        x = (math.sqrt(sum([i*i for i in v1]))*math.sqrt(sum([i*i for i in v2])))
        if x:
            return sum(i[0] * i[1] for i in zip(v1, v2)) / x
        else:
            return 0
    # s1 = cos_sim(v1, v2)
    # return s1
    '''
    s1 = cos_sim(v1[:-6], v2[:-6])
    if include_flavours:
        s2 = cos_sim(v1[-6:], v2[-6:])
        return s1 * 0.5 + s2 * 0.5
    else:
        return s1
    '''
    s1=cos_sim(v1,v2)
    return s1

def make_predictions(db, ratings_train, ratings_test, include_flavours):
    
    result = []
    x = 0
    for index,row in ratings_test.iterrows():
        # mlist contains dishIds rated by the user in the train set
        mlist = list(ratings_train.loc[ratings_train['userId'] == row['userId']]['dishId'])
        #print("dishes rated by user ",row["userId"]," ",mlist)
        # csr list contains tfidf scores of tags for dishes rated by the user
        csrlist = list(db.loc[db['dishId'].isin(mlist)]['features'])
        #print("csrlist ",csrlist)
        # mrlist contains scores of dishes rated by the user (dishes in mlist)
        mrlist = list(ratings_train.loc[ratings_train['userId'] == row['userId']]['rating'])
        #print("mrlist ",mrlist)
        # computing similarity between dishes user rated and the current dish in the test set
        l=[0]*len(db["dishId"])
        for i,ele in enumerate(db["dishId"]):
            if(int(ele)==int(row["dishId"])):
                l[i]=1
        sim = [cosine_sim(c,db.loc[l]['features'].values[0], include_flavours) for c in csrlist]
        # computing similarity times the rating for known dish
        wan = sum([ v*mrlist[i] for i,v in enumerate(sim) if v>0])
        wadlist = [i for i in sim if i>0]
        ## check for sum(wadlist) > 1
        if len(wadlist)>0 and sum(wadlist) >= 1:
            result.append(wan/sum(wadlist))
            x = x + 1
        else:
            result.append(np.mean(mrlist)) # if dish did not match with anything approx as average of users rating
    return np.array(result)

def main(data, db, predict_on, include_flavours):
    """
    """
    total_dishes = db.shape[0]
    print("In main total dishes= ",total_dishes)
    
    db = tokenize(db)
    print("after tokenizing db= ",db.shape)
    print(db.head())
    
    db, vocab = featurize(db, include_flavours)
    
    ratings_train, ratings_test = my_train_test_split(data)
    print("trainig shape= ",ratings_train.shape)
    print("testing shape= ",ratings_test.shape)

    predictions = make_predictions(db, ratings_train, ratings_test, include_flavours)

    predicted_test_error = mean_squared_error(ratings_test.rating, predictions) ** 0.5

    def predict_on_user(predict_on):
        ratings_test = pd.DataFrame(columns = ['userId', 'dishId'])
        ratings_test['userId'] = [predict_on] * total_dishes
        ratings_test.dishId = range(1, total_dishes + 1)
           
        predictions_uid = make_predictions(db, ratings_train, ratings_test, include_flavours)

        predictions_uid = list(enumerate(predictions_uid))

        predictions_uid = sorted(predictions_uid, key = lambda x: x[1], reverse = True)

        predictions_uid = list(map(lambda x: (x[0] + 1, x[1]), predictions_uid))

        return predictions_uid

    return (predicted_test_error, predict_on_user(predict_on = predict_on))

In [22]:
def start(profile = None, type = 'meta', predict_on = 9974, flavours = False, retrain = False):
    time_start = time.time()
    data = pd.read_csv('../dataset/Utilities/newReview.csv')
    data=data.mask(data.eq('None')).dropna()
    print("before elimination =",data.shape)
    print(data.head())
    data = data[data['userId'].isin(data['userId'].value_counts()[data['userId'].value_counts() >= 5].index)]
    print("after elimination =",data.shape)

    if not retrain:
        if flavours:
            final_scores = pickle.load(open("../dataset/Utilities/tfidf_final_flavour_scores.pickle"), "rb" )
            predictions = final_scores[predict_on]

        else:
            final_scores = pickle.load(open("../dataset/Utilities/tfidf_final_scores.pickle"), "rb" )
            predictions = final_scores[predict_on]

        predicted_test_error = None

    else:
        if profile:
            data = append_to_data(data, profile, predict_on)

        if type == 'all':
            db = pd.read_csv('../dataset/Utilities/meta_cuisine.csv')
        elif type == 'meta':
            db = pd.read_csv('../dataset/Utilities/newDatabase.csv', names = ['dishId', 'tags'])
        db=db.dropna()
        print("food db size= ",db.shape)
        print(db.head())
        
        dishes = pd.read_csv('../dataset/Utilities/id_name_mapping.csv', names = ['dishId', 'dish_name'])
        dishes=dishes.dropna()
        print("dishes size= ",dishes.shape)
        print(dishes.head())
        
        predicted_test_error, predictions = main(data, db, predict_on = predict_on, include_flavours = flavours)
        
        predictions = pd.DataFrame(predictions, columns = ['dishId', 'rating'])
        predictions = predictions.merge(dishes, on = 'dishId', how = 'left')
        predictions.columns = ['dishId', 'rating', 'dishName']

        if flavours:
            if os.path.exists("../dataset/Utilities/tfidf_final_flavour_scores.pickle"):
                final_scores = pickle.load(open("../dataset/Utilities/tfidf_final_flavour_scores.pickle", "rb" ))
                final_scores[predict_on] = predictions
            else:
                final_scores = {}
                final_scores[predict_on] = predictions

            pickle.dump(final_scores, open('../dataset/Utilities/tfidf_final_flavour_scores.pickle', 'wb'))

        else:
            if os.path.exists("../dataset/Utilities/tfidf_final_scores.pickle"):
                final_scores = pickle.load(open("../dataset/Utilities/tfidf_final_scores.pickle", "rb" ))
                final_scores[predict_on] = predictions

            else:
                final_scores = {}
                final_scores[predict_on] = predictions

            pickle.dump(final_scores, open('../dataset/Utilities/tfidf_final_scores.pickle', 'wb'))

    data = data[data.userId == predict_on]
    data["dishId"]=data["dishId"].astype(np.int64)
    original_rating = data.merge(predictions, how = 'left', on = 'dishId')
    original_rating.columns = ['dishId', 'userId', 'rating', 'reformed', 'dishName']
    
    time_end = time.time()

    answer = {"user" : predict_on, "predicted_test_error": predicted_test_error, "time" : round(time_end - time_start, 2), "predicted_rating" : predictions, "original_rating" : original_rating}
    #answer = {"user" : predict_on, "predicted_test_error": predicted_test_error, "time" : round(time_end - time_start, 2)}
    return answer


In [14]:
start(retrain=True)

  res_values = method(rvalues)


before elimination = (106489, 3)
  dishId  userId  rating
0    998    9974       5
1    998   10340       5
2    998   12047       5
3    998   13451       5
4    998    9974       5
after elimination = (69913, 3)
food db size=  (1380, 2)
   dishId                                               tags
0       1  Ethyl Lactate|3,4-Dihydroxybenzaldehyde|DL-Liq...
1       2  AC1LDI49|56424-87-4|3,4-Dihydroxybenzaldehyde|...
2       3  3-Methyl-1-butanol|Thymol|2-Nonanone|Pyrrolidi...
3       4  AC1LDI49|56424-87-4|2-Hexenyl propanoate|3,4-D...
4       5  3,4-Dihydroxybenzaldehyde|DL-Liquiritigenin|2-...
dishes size=  (1381, 2)
   dishId                  dish_name
0       1   curried green bean salad
1       2                 keema aloo
2       3                    paratha
3       4    black chana with potato
4       5  tomato cucumber kachumbar
In main total dishes=  1380
after tokenizing db=  (1380, 3)
   dishId                                               tags  \
0       1  Ethyl Lactate|

  out=out, **kwargs)
  ret = ret.dtype.type(ret / rcount)


{'user': 100,
 'predicted_test_error': 0.49264960426279847,
 'time': 1355.84,
 'predicted_rating':       dishId  rating                      dishName
 0          1     NaN      curried green bean salad
 1          2     NaN                    keema aloo
 2          3     NaN                       paratha
 3          4     NaN       black chana with potato
 4          5     NaN     tomato cucumber kachumbar
 ...      ...     ...                           ...
 1375    1376     NaN              watermelon juice
 1376    1377     NaN              watermelon juice
 1377    1378     NaN         white sauce for pasta
 1378    1379     NaN         white sauce for pasta
 1379    1380     NaN  white sauce recipe for pasta
 
 [1380 rows x 3 columns],
 'original_rating': Empty DataFrame
 Columns: [dishId, userId, rating, reformed, dishName]
 Index: []}

In [15]:
data = pd.read_csv('../dataset/Utilities/newReview.csv')
data=data.mask(data.eq('None')).dropna()
print("before elimination =",data.shape)
print(data.head())
data = data[data['userId'].isin(data['userId'].value_counts()[data['userId'].value_counts() >= 5].index)]
print("after elimination =",data.shape)


  res_values = method(rvalues)


before elimination = (106489, 3)
  dishId  userId  rating
0    998    9974       5
1    998   10340       5
2    998   12047       5
3    998   13451       5
4    998    9974       5
after elimination = (69913, 3)


In [23]:
profile = None;type = 'meta';predict_on = 9974;flavours = False;retrain = True

if profile:
    data = append_to_data(data, profile, predict_on)

if type == 'all':
    db = pd.read_csv('../dataset/Utilities/meta_cuisine.csv')
elif type == 'meta':
    db = pd.read_csv('../dataset/Utilities/newDatabase.csv', names = ['dishId', 'tags'])
db=db.dropna()
print("food db size= ",db.shape)
print(db.head())



food db size=  (1380, 2)
   dishId                                               tags
0       1  Ethyl Lactate|3,4-Dihydroxybenzaldehyde|DL-Liq...
1       2  AC1LDI49|56424-87-4|3,4-Dihydroxybenzaldehyde|...
2       3  3-Methyl-1-butanol|Thymol|2-Nonanone|Pyrrolidi...
3       4  AC1LDI49|56424-87-4|2-Hexenyl propanoate|3,4-D...
4       5  3,4-Dihydroxybenzaldehyde|DL-Liquiritigenin|2-...


In [17]:
dishes = pd.read_csv('../dataset/Utilities/id_name_mapping.csv', names = ['dishId', 'dish_name'])
dishes=dishes.dropna()
print("dishes size= ",dishes.shape)
print(dishes.head())


dishes size=  (1381, 2)
   dishId                  dish_name
0       1   curried green bean salad
1       2                 keema aloo
2       3                    paratha
3       4    black chana with potato
4       5  tomato cucumber kachumbar


In [24]:

predicted_test_error, predictions = main(data, db, predict_on = predict_on, include_flavours = flavours)

predictions = pd.DataFrame(predictions, columns = ['dishId', 'rating'])
predictions = predictions.merge(dishes, on = 'dishId', how = 'left')
predictions.columns = ['dishId', 'rating', 'dishName']

if flavours:
    if os.path.exists("../dataset/Utilities/tfidf_final_flavour_scores.pickle"):
        final_scores = pickle.load(open("../dataset/Utilities/tfidf_final_flavour_scores.pickle", "rb" ))
        final_scores[predict_on] = predictions
    else:
        final_scores = {}
        final_scores[predict_on] = predictions

    pickle.dump(final_scores, open('../dataset/Utilities/tfidf_final_flavour_scores.pickle', 'wb'))

else:
    if os.path.exists("../dataset/Utilities/tfidf_final_scores.pickle"):
        final_scores = pickle.load(open("../dataset/Utilities/tfidf_final_scores.pickle", "rb" ))
        final_scores[predict_on] = predictions

    else:
        final_scores = {}
        final_scores[predict_on] = predictions

    pickle.dump(final_scores, open('../dataset/Utilities/tfidf_final_scores.pickle', 'wb'))



In main total dishes=  1380
after tokenizing db=  (1380, 3)
   dishId                                               tags  \
0       1  Ethyl Lactate|3,4-Dihydroxybenzaldehyde|DL-Liq...   
1       2  AC1LDI49|56424-87-4|3,4-Dihydroxybenzaldehyde|...   
2       3  3-Methyl-1-butanol|Thymol|2-Nonanone|Pyrrolidi...   
3       4  AC1LDI49|56424-87-4|2-Hexenyl propanoate|3,4-D...   
4       5  3,4-Dihydroxybenzaldehyde|DL-Liquiritigenin|2-...   

                                              tokens  
0  [ethyl lactate, 3,4-dihydroxybenzaldehyde, dl-...  
1  [ac1ldi49, 56424-87-4, 3,4-dihydroxybenzaldehy...  
2  [3-methyl-1-butanol, thymol, 2-nonanone, pyrro...  
3  [ac1ldi49, 56424-87-4, 2-hexenyl propanoate, 3...  
4  [3,4-dihydroxybenzaldehyde, dl-liquiritigenin,...  
flavour shape=  (1381, 7)
   dishId  bitter  rich   salt  spicy  sweet  umami
0       1   0.961  0.71  4.567   5.20   3.84      1
1       2   3.876  4.50  0.240   4.56   0.38      8
2       3   0.000  2.00  2.725  10.00   0.1

ValueError: You are trying to merge on object and int64 columns. If you wish to proceed you should use pd.concat

In [41]:
data = data[data.userId == predict_on]
data["dishId"]=data["dishId"].astype(np.int64)
original_rating = data.merge(predictions, how = 'left', on = 'dishId')
original_rating.columns = ['dishId', 'userId', 'rating', 'reformed', 'dishName']

time_end = time.time()

answer = {"user" : predict_on, "predicted_test_error": predicted_test_error, "time" : round(time_end - 0, 2), "predicted_rating" : predictions, "original_rating" : original_rating}
print(answer)

{'user': 9974, 'predicted_test_error': 0.49264960426279847, 'time': 1585329269.24, 'predicted_rating':       dishId  rating                      dishName
0          1     5.0      curried green bean salad
1          2     5.0                    keema aloo
2          3     5.0                       paratha
3          4     5.0       black chana with potato
4          5     5.0     tomato cucumber kachumbar
...      ...     ...                           ...
1375    1376     5.0              watermelon juice
1376    1377     5.0              watermelon juice
1377    1378     5.0         white sauce for pasta
1378    1379     5.0         white sauce for pasta
1379    1380     5.0  white sauce recipe for pasta

[1380 rows x 3 columns], 'original_rating':     dishId  userId  rating  reformed              dishName
0      998    9974       5       5.0       acai fruit bowl
1      998    9974       5       5.0       acai fruit bowl
2     1188    9974       5       5.0  broccoli pesto pasta
3   