In [1]:
from google.colab import drive 
drive.mount('/content/gdrive')

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&response_type=code&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly

Enter your authorization code:
··········
Mounted at /content/gdrive


In [2]:
!pip install prettytable
from collections import Counter, defaultdict
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
import sys
import json
import math
import numpy as np
import os
import pandas as pd
import re
from scipy.sparse import csr_matrix
import time
import pickle
import csv
import os.path
from prettytable import PrettyTable



In [0]:
def append_to_data(data, profile, predict_on):
    profile = json.loads(profile)
    dish_ids = list(map(int, profile.keys()))
    ratings = list(map(int, profile.values()))

    d = pd.DataFrame(columns = ['dishId', 'userId', 'rating'])
    d['dishId'] = dish_ids
    d['rating'] = ratings
    d['userId'] = predict_on

    data = data.append(d)
    return data

In [0]:
def tokenize_string(my_string):
    #return re.findall('[\w\-]+', my_string.lower())
    return [ele.strip() for ele in my_string.split("|")]


def tokenize(db):
    """
    The meta tags associated with each dish is broken down (tokenized) as a list of tags
    Eg: egg|flour|ghee|paratha will be tokenized as [egg, flour, ghee, paratha]
    """
    tokenlist=[]
    for index,row in db.iterrows():
        tokenlist.append(tokenize_string(row.tags))
    db['tokens']=tokenlist
    return db

In [0]:
def featurize(db, include_flavours):
    
    def tf(word, doc):
        return doc.count(word) / Counter(doc).most_common()[0][1]

    def df(word, doclist):
        return sum(1 for d in doclist if word in d)

    def tfidf(word, doc, dfdict, N):
        return tf(word, doc) * (math.log10((N / dfdict[word])))

    def getcsrmatrix(tokens,dfdict,N,vocab, dish_flavours, max_vocab):
        matrixRow_list = []
        if include_flavours:
            matrixRow_list = np.zeros((1,len(vocab) + len(dish_flavours) - 1),dtype='float')
        else:
            matrixRow_list = np.zeros((1,len(vocab)),dtype='float')
        for t in tokens:
            if t in vocab:
                matrixRow_list[0][vocab[t]] = tfidf(t,tokens,dfdict,N)

        if include_flavours:
            matrixRow_list[0][max_vocab] = dish_flavours['bitter']
            matrixRow_list[0][max_vocab] = dish_flavours['rich']
            matrixRow_list[0][max_vocab + 1] = dish_flavours['salt']
            matrixRow_list[0][max_vocab + 3] = dish_flavours['spicy']
            matrixRow_list[0][max_vocab + 2] = dish_flavours['sweet']
            matrixRow_list[0][max_vocab + 5] = dish_flavours['umami']

        return csr_matrix(matrixRow_list)

    flavour = pd.read_csv('/content/gdrive/My Drive/Final Year Project/ContentBasedFiltering/Utilities/tastes.csv', names = ['dishId', 'bitter', 'rich', 'salt', 'spicy', 'sweet', 'umami'])
    print("flavour shape= ",flavour.shape)
    print(flavour.head())
    
    N=len(db)

    doclist = db['tokens'].tolist()
    #print("tokens = ",doclist[:6])
    
    vocab = { i:x for x,i in enumerate(sorted(list(set(i for s in doclist for i in s)))) }
    #print(vocab)
    max_vocab = max(vocab.values()) + 1
    print("max vocab= ",max_vocab)
    
    dfdict = {}
    for v in vocab.items():
        dfdict[v[0]] = df(v[0],doclist)

    csrlist = []
    for index, row in db.iterrows():
        dish_flavours = flavour[flavour.dishId == row['dishId']].to_dict(orient = 'record')[0]
        csrlist.append(getcsrmatrix(row['tokens'],dfdict,N,vocab, dish_flavours, max_vocab)) # row['dishId'] and df with flavour scores
    
    db['features'] =  csrlist
    print("after including ifidf features")
    print(db.head())
    return (db,vocab)

In [0]:
def my_train_test_split(ratings):
    
    # train_set, test_set = train_test_split(ratings, test_size = 0.20, random_state = 42,stratify=ratings["rating"])
    train_set, test_set = train_test_split(ratings, test_size = 0.20, random_state = 42)
    return train_set, test_set


def cosine_sim(a, b, include_flavours):
    """
    """

    v1 = a.toarray()[0]
    v2  = b.toarray()[0]
    def cos_sim(v1, v2):
        x = (math.sqrt(sum([i*i for i in v1]))*math.sqrt(sum([i*i for i in v2])))
        if x:
            return sum(i[0] * i[1] for i in zip(v1, v2)) / x
        else:
            return 0
    # s1 = cos_sim(v1, v2)
    # return s1
    
    s1 = cos_sim(v1[:-6], v2[:-6])
    if include_flavours:
        s2 = cos_sim(v1[-6:], v2[-6:])
        return s1 * 0.5 + s2 * 0.5
    else:
        return s1
    
    #s1=cos_sim(v1,v2)
    return s1

def make_predictions(db, ratings_train, ratings_test, include_flavours):
    
    result = []
    x = 0
    for index,row in ratings_test.iterrows():
        # mlist contains dishIds rated by the user in the train set
        mlist = list(ratings_train.loc[ratings_train['userId'] == row['userId']]['dishId'])
        #print("dishes rated by user ",row["userId"]," ",mlist)
        # csr list contains tfidf scores of tags for dishes rated by the user
        csrlist = list(db.loc[db['dishId'].isin(mlist)]['features'])
        #print("csrlist ",csrlist)
        # mrlist contains scores of dishes rated by the user (dishes in mlist)
        mrlist = list(ratings_train.loc[ratings_train['userId'] == row['userId']]['rating'])
        #print("mrlist ",mrlist)
        # computing similarity between dishes user rated and the current dish in the test set

        # l=[0]*len(db["dishId"])
        # for i,ele in enumerate(db["dishId"]):
        #     if(int(ele)==int(row["dishId"])):
        #         l[i]=1

        sim = [cosine_sim(c,db.loc[db['dishId'] ==row['dishId']]['features'].values[0], include_flavours) for c in csrlist]
        # computing similarity times the rating for known dish
        wan = sum([ v*mrlist[i] for i,v in enumerate(sim) if v>0])
        wadlist = [i for i in sim if i>0]
        ## check for sum(wadlist) > 1
        if len(wadlist)>0 and sum(wadlist) >= 1:
            result.append(wan/sum(wadlist))
            x = x + 1
        else:
            #print("here")
            result.append(np.mean(mrlist)) # if dish did not match with anything approx as average of users rating
    return np.array(result)


In [0]:
def main(data, db, predict_on, include_flavours):
    """
    """
    total_dishes = db.shape[0]
    print("In main total dishes= ",total_dishes)
    
    db = tokenize(db)
    print("after tokenizing db= ",db.shape)
    print(db.head())
    
    db, vocab = featurize(db, include_flavours)
    def dummy_fun(doc):
      return doc

    tfidf = TfidfVectorizer(
        analyzer='word',
        tokenizer=dummy_fun,
        preprocessor=dummy_fun,
        token_pattern=None,smooth_idf=True)
    tfidf.fit(db["tokens"])
    tfidf_features=tfidf.transform(db["tokens"])
    print("after tfidf ",tfidf_features.shape)
    db["features"]=list(tfidf_features)

    ratings_train, ratings_test = my_train_test_split(data)
    
    print("trainig shape= ",ratings_train.shape)
    print("testing shape= ",ratings_test.shape)

    predictions = make_predictions(db, ratings_train, ratings_test, include_flavours)

    predicted_test_error = mean_squared_error(ratings_test.rating, predictions) ** 0.5

    def predict_on_user(predict_on):
        ratings_test = pd.DataFrame(columns = ['userId', 'dishId'])
        ratings_test['userId'] = [predict_on] * total_dishes
        ratings_test.dishId = range(1, total_dishes + 1)
           
        predictions_uid = make_predictions(db, ratings_train, ratings_test, include_flavours)

        predictions_uid = list(enumerate(predictions_uid))

        predictions_uid = sorted(predictions_uid, key = lambda x: x[1], reverse = True)

        predictions_uid = list(map(lambda x: (x[0] + 1, x[1]), predictions_uid))

        return predictions_uid
    print("predicted_on",predict_on)
    return (predicted_test_error, predict_on_user(predict_on = predict_on))

In [0]:
import time
def start(profile = None, type = 'meta', predict_on = 9974, flavours = False, retrain = False):
    time_start = time.time()
    data = pd.read_csv('/content/gdrive/My Drive/Final Year Project/ContentBasedFiltering/Utilities/review.csv')
    #data=pd.read_csv('/content/gdrive/My Drive/Final Year Project/ContentBasedFiltering/Utilities/unembedded_grouped_cleaned_data.csv')
    # data=data.rename(columns={"Unnamed: 0":"#","user_id":"userId","food_id":"dishId","rating":"rating"})
    # data=pd.concat([data[data["rating"]==i].sample(1300) for i in range(1,6)])
    # data=data[["userId","dishId","rating"]]
    
    print(pd.value_counts(data["rating"]))

    data=data.mask(data.eq('None')).dropna()
    data=data[data["dishId"]!=1381]
    print("before elimination =",data.shape)
    print(data.head())
    
    data = data[data['userId'].isin(data['userId'].value_counts()[data['userId'].value_counts() >= 5].index)]
    
    # predict_on=data.sample(1).iat[0,0]
    # print("predict_on",predict_on)
    #predict_on=9974
    print("after elimination =",data.shape)

    if not retrain:
        if flavours:
            final_scores = pickle.load(open("/content/gdrive/My Drive/Final Year Project/ContentBasedFiltering/Utilities/tfidf_final_flavour_scores.pickle", "rb" ))
            predictions = final_scores[predict_on]

        else:
            final_scores = pickle.load(open("/content/gdrive/My Drive/Final Year Project/ContentBasedFiltering/Utilities/tfidf_final_scores.pickle", "rb" ))
            predictions = final_scores[predict_on]

        predicted_test_error = None

    else:
        if profile:
            data = append_to_data(data, profile, predict_on)

        if type == 'all':
            db = pd.read_csv('/content/gdrive/My Drive/Final Year Project/ContentBasedFiltering/Utilities/meta_cuisine.csv')
        elif type == 'meta':
            db = pd.read_csv('/content/gdrive/My Drive/Final Year Project/ContentBasedFiltering/Utilities/newDatabase.csv', names = ['dishId', 'tags'])
            #db = pd.read_csv('/content/gdrive/My Drive/Final Year Project/ContentBasedFiltering/Utilities/Team 3/database.csv', names = ['dishId', 'tags'])
            # newDb=pd.concat([db,db2],axis=1,sort=False)
            # newDb.head()
        #db=db.dropna()
        print("food db size= ",db.shape)
        print(db.head())
        
        dishes = pd.read_csv('/content/gdrive/My Drive/Final Year Project/ContentBasedFiltering/Utilities/id_name_mapping.csv', names = ['dishId', 'dish_name'])
        print("dishes size= ",dishes.shape)
        print(dishes.head())
        
        predicted_test_error, predictions = main(data, db, predict_on = predict_on, include_flavours = flavours)
        
        predictions = pd.DataFrame(predictions, columns = ['dishId', 'rating'])
        predictions = predictions.merge(dishes, on = 'dishId', how = 'left')
        predictions.columns = ['dishId', 'rating', 'dishName']

        if flavours:
            if os.path.exists("/content/gdrive/My Drive/Final Year Project/ContentBasedFiltering/Utilities/tfidf_final_flavour_scores.pickle"):
                final_scores = pickle.load(open("/content/gdrive/My Drive/Final Year Project/ContentBasedFiltering/Utilities/tfidf_final_flavour_scores.pickle", "rb" ))
                final_scores[predict_on] = predictions
            else:
                final_scores = {}
                final_scores[predict_on] = predictions

            pickle.dump(final_scores, open('/content/gdrive/My Drive/Final Year Project/ContentBasedFiltering/Utilities/tfidf_final_flavour_scores.pickle', 'wb'))

        else:
            if os.path.exists("/content/gdrive/My Drive/Final Year Project/ContentBasedFiltering/Utilities/tfidf_final_scores.pickle"):
                final_scores = pickle.load(open("/content/gdrive/My Drive/Final Year Project/ContentBasedFiltering/Utilities/tfidf_final_scores.pickle", "rb" ))
                final_scores[predict_on] = predictions

            else:
                final_scores = {}
                final_scores[predict_on] = predictions

            pickle.dump(final_scores, open('/content/gdrive/My Drive/Final Year Project/ContentBasedFiltering/Utilities/tfidf_final_scores.pickle', 'wb'))

    data = data[data.userId == predict_on]
    data["dishId"]=data["dishId"].astype(np.int64)
    original_rating = data.merge(predictions, how = 'left', on = 'dishId')
    original_rating.columns = ['dishId', 'userId', 'rating', 'reformed', 'dishName']
    
    time_end = time.time()
    
    answer = {"user" : predict_on, "predicted_test_error": predicted_test_error, "time" : round(time_end - time_start, 2), "predicted_rating" : predictions, "original_rating" : original_rating}
    #answer = {"user" : predict_on, "predicted_test_error": predicted_test_error, "time" : round(time_end - time_start, 2)}
    return answer

In [91]:
x=start(flavours=False,retrain=True,predict_on=100)

5    18210
4     7515
3     2545
2     1173
1      765
Name: rating, dtype: int64
before elimination = (30165, 3)
   dishId  userId  rating
0     291       0       5
1     291       1       4
2     291       2       4
3     291       3       5
4     291       4       4
after elimination = (3975, 3)
food db size=  (1380, 2)
   dishId                                               tags
0       1  Ethyl Lactate|3,4-Dihydroxybenzaldehyde|DL-Liq...
1       2  AC1LDI49|56424-87-4|3,4-Dihydroxybenzaldehyde|...
2       3  3-Methyl-1-butanol|Thymol|2-Nonanone|Pyrrolidi...
3       4  AC1LDI49|56424-87-4|2-Hexenyl propanoate|3,4-D...
4       5  3,4-Dihydroxybenzaldehyde|DL-Liquiritigenin|2-...
dishes size=  (1381, 2)
   dishId                  dish_name
0       1   curried green bean salad
1       2                 keema aloo
2       3                    paratha
3       4    black chana with potato
4       5  tomato cucumber kachumbar
In main total dishes=  1380


  res_values = method(rvalues)


after tokenizing db=  (1380, 3)
   dishId  ...                                             tokens
0       1  ...  [Ethyl Lactate, 3,4-Dihydroxybenzaldehyde, DL-...
1       2  ...  [AC1LDI49, 56424-87-4, 3,4-Dihydroxybenzaldehy...
2       3  ...  [3-Methyl-1-butanol, Thymol, 2-Nonanone, Pyrro...
3       4  ...  [AC1LDI49, 56424-87-4, 2-Hexenyl propanoate, 3...
4       5  ...  [3,4-Dihydroxybenzaldehyde, DL-Liquiritigenin,...

[5 rows x 3 columns]
flavour shape=  (1381, 7)
   dishId  bitter  rich   salt  spicy  sweet  umami
0       1   0.961  0.71  4.567   5.20   3.84      1
1       2   3.876  4.50  0.240   4.56   0.38      8
2       3   0.000  2.00  2.725  10.00   0.14      0
3       4   4.672  0.87  0.294   3.37   1.91      6
4       5   0.813  0.00  6.173   8.02   3.23      6
max vocab=  1405
after including ifidf features
   dishId  ...                                           features
0       1  ...    (0, 0)\t0.5555478620337058\n  (0, 4)\t0.0249...
1       2  ...    (0, 0)\t0.5555

In [92]:
print(x)

{'user': 100, 'predicted_test_error': 1.0598732671609836, 'time': 112.2, 'predicted_rating':       dishId    rating                  dishName
0        584  4.558417          strawberry lassi
1       1373  4.550255                   waffles
2        884  4.548975         chocolate brownie
3        885  4.548975  fudgy chocolate brownies
4          3  4.543339                   paratha
...      ...       ...                       ...
1375     983  4.269363         fish steaks dijon
1376    1376  4.268147          watermelon juice
1377    1045  4.222747              lemon pickle
1378     981  4.215693       grilled fish steaks
1379    1377  4.037594          watermelon juice

[1380 rows x 3 columns], 'original_rating':    dishId  userId  rating  reformed             dishName
0     276     100       5  4.328285          mango lassi
1      15     100       5  4.318062        chicken curry
2     176     100       4  4.325148        nariyal burfi
3      16     100       5  4.399579      chick

In [93]:
predict_on=x["user"]
predicted_test_error=x["predicted_test_error"]
time=x["time"]
predictions=x["predicted_rating"]
original_rating=x["original_rating"]
y=PrettyTable()
y.field_names=["MSE","TIME"]
y.add_row([predicted_test_error,time])
print(y)
y=PrettyTable()
y.field_names=["Predicted On","dish","Predicted Rating"]
#print(original_rating)
for i in range(predictions.shape[0]):
  
  y.add_row([predict_on,predictions["dishName"][i],predictions["rating"][i]])
print(y)

+--------------------+-------+
|        MSE         |  TIME |
+--------------------+-------+
| 1.0598732671609836 | 112.2 |
+--------------------+-------+
+--------------+-------------------------------------------------------------------+--------------------+
| Predicted On |                                dish                               |  Predicted Rating  |
+--------------+-------------------------------------------------------------------+--------------------+
|     100      |                          strawberry lassi                         | 4.5584172839542365 |
|     100      |                              waffles                              | 4.550254902340791  |
|     100      |                         chocolate brownie                         | 4.548975127260314  |
|     100      |                      fudgy chocolate brownies                     | 4.548975127260314  |
|     100      |                              paratha                              | 4.543338809152074 

In [0]:
df=pd.read_csv('/content/gdrive/My Drive/Final Year Project/ContentBasedFiltering/Utilities/unembedded_grouped_cleaned_data.csv')

In [0]:
df=df.rename(columns={"Unnamed: 0":"#","user_id":"userId","food_id":"dishId","rating":"rating"})
df=pd.concat([df[df["rating"]==i].sample(1300) for i in range(1,6)])
df.head()


Unnamed: 0,#,userId,dishId,rating,userReviews,foodReviews
48006,87994,6257,594,1,"['followed', 'instructions', 'to', 'the', 'let...","['very', 'good', 'easy', 'to', 'make', 'and', ..."
19462,36845,2518,714,1,"['meat', 'was', 'dry', 'and', 'veggies', 'were...","['very', 'good', 'easy', 'to', 'make', 'and', ..."
30984,55987,777,672,1,"['i', 'didn’t', 'find', 'this', 'enjoyable', '...","['very', 'good', 'easy', 'to', 'make', 'and', ..."
23666,42400,7195,549,1,"['this', 'was', 'incredibly', 'bland']","['very', 'good', 'easy', 'to', 'make', 'and', ..."
13522,25143,4944,857,1,"['this', 'was', 'absolutely', 'great']","['i', 'have', 'made', 'this', 'three', 'times'..."


In [0]:
pd.value_counts(df["rating"])

3    1300
2    1300
5    1300
1    1300
4    1300
Name: rating, dtype: int64

In [0]:
df[["userId","dishId","rating"]].head()

Unnamed: 0,userId,dishId,rating
48006,6257,594,1
19462,2518,714,1
30984,777,672,1
23666,7195,549,1
13522,4944,857,1


In [0]:
df.sample(1).iat[0,0]

51847

In [47]:
db2 = pd.read_csv('/content/gdrive/My Drive/Final Year Project/ContentBasedFiltering/Utilities/newDatabase.csv', names = ['dishId', 'tags'])
db = pd.read_csv('/content/gdrive/My Drive/Final Year Project/ContentBasedFiltering/Utilities/Team 3/database.csv', names = ['dishId', 'tags'])
newDb=pd.concat([db,db2],axis=1,sort=False)
print(newDb.shape)
newDb.head()


(1381, 4)


Unnamed: 0,dishId,tags,dishId.1,tags.1
0,1,bean|clove|curry|garlic|masala|onion|salad|sal...,1.0,"Ethyl Lactate|3,4-Dihydroxybenzaldehyde|DL-Liq..."
1,2,beef|chili|chilli|clove|coriander|cumin|garlic...,2.0,"AC1LDI49|56424-87-4|3,4-Dihydroxybenzaldehyde|..."
2,3,flour|ghee|paratha|salt|sugar,3.0,3-Methyl-1-butanol|Thymol|2-Nonanone|Pyrrolidi...
3,4,chana|corn|cumin|ginger|mango|oil|onion|potato...,4.0,"AC1LDI49|56424-87-4|2-Hexenyl propanoate|3,4-D..."
4,5,cucumber|juice|lemon|onion|salt|sugar|tomato,5.0,"3,4-Dihydroxybenzaldehyde|DL-Liquiritigenin|2-..."


  res_values = method(rvalues)


ValueError: ignored

In [0]:
data = pd.read_csv('/content/gdrive/My Drive/Final Year Project/ContentBasedFiltering/Utilities/review.csv')

In [0]:
db = pd.read_csv('/content/gdrive/My Drive/Final Year Project/ContentBasedFiltering/Utilities/newDatabase.csv', names = ['dishId', 'tags'])

In [83]:
data.head()

Unnamed: 0,dishId,userId,rating
0,291,0,5
1,291,1,4
2,291,2,4
3,291,3,5
4,291,4,4


In [84]:
db.head()

Unnamed: 0,dishId,tags
0,1,"Ethyl Lactate|3,4-Dihydroxybenzaldehyde|DL-Liq..."
1,2,"AC1LDI49|56424-87-4|3,4-Dihydroxybenzaldehyde|..."
2,3,3-Methyl-1-butanol|Thymol|2-Nonanone|Pyrrolidi...
3,4,"AC1LDI49|56424-87-4|2-Hexenyl propanoate|3,4-D..."
4,5,"3,4-Dihydroxybenzaldehyde|DL-Liquiritigenin|2-..."


In [88]:
set(data["dishId"]).difference(set(db["dishId"]))

{1381}

In [89]:
data[data["dishId"]!=1381].shape

(30165, 3)