In [1]:
import json  # for storing and managing data
import re

import nltk
import numpy as np
import pandas as pd
from nltk.corpus import stopwords
from sklearn.metrics.pairwise import cosine_similarity
# note: the package to install is "scikit-surprise", but the importable module is just "surprise"
from surprise import SVD, Reader, Dataset, accuracy
from surprise.model_selection import train_test_split

# sources:
#    https://towardsdatascience.com/recommendation-systems-explained-a42fc60591ed
#    https://www.datacamp.com/tutorial/recommender-systems-python
       
# TODO:
    # figure out how to include this into the repo
    # figure out how we want to store the CSVs into the database
    # figure out how the below code might need to be modified to make it work with the database
    # get more test/verif data to improve rmse
    # process NAME field and include it in the content similarity portion
    # figure out why libreCalc is constantly overwriting the CSVs and getting them mixed up
    # set up pre/post filtering to restrict results to things like "low-carb" "vegetarian" etc

In [2]:

if __name__ == '__main__':
    PATH = "MLData\dummyDataForApp.csv"#TODO: enter path to the data
    PATH_RATINGS = "MLData\mealRatings.csv"
    

In [3]:
#import data
df = pd.read_csv(PATH)
df_ratings = pd.read_csv(PATH_RATINGS)
#PANDAS -> DataFrame: 
    # can generate df from table or json, excel file, sql, etc
    # it seems like firebase can have an array-typed field, so we can assume that they are going to be stored as an array of strings

# this is all processing that should be done separately, like, when it is entered into the db as arrays
# loop through each entry
for i in range(0,df.shape[0]):
    # tokenize string to make a list, remove stop words, stemm, etc
    df.at[i, 'MAIN FLAVORS'] = df.at[i, 'MAIN FLAVORS'].split(',')
    df.at[i, 'TAGS'] = df.at[i, 'TAGS'].split(',')
    
    # readd to df as lists instead

#df.explode('MAIN FLAVORS') # does not work, not intrepreted as a list-like
#print(df_ratings) # data extracted correctly
#print(df.dtypes)

In [4]:
#content based similarity

#rmat = df_ratings.pivot_table(columns = 'meal_id', index= 'user_id', values='rating').fillna(0)
#print(rmat)

#cosine_sim = cosine_similarity(rmat, rmat)

In [5]:
#from sklearn.feature_extraction.text import TfidfVectorizer
def clean_data(x):
    if isinstance(x, list):
        return[str.lower(i.replace(" ", "")) for i in x]
    
features = ['MAIN FLAVORS', 'TAGS']

for f in features:
    df[f] = df[f].apply(clean_data)


#print(df)

In [21]:
# remove stop words and symbols from NAME so we can factor it into the features
#nltk.download("stopwords")

def processName(x):
    #print(x)
    stop = set(stopwords.words('english'))
    good_words = []
    for word in x:
        if word not in stop:
            good_words.append(word)
    return good_words

df['NAME'] = df['NAME'].apply(processName)

print(df)

    meal_id                                               NAME  CALORIES  \
0         0                        [r, e, n,  , n, l, e,  , l]       292   
1         1                           [C, h, u, l, e,  , G, u]       516   
2         2                              [r, e, l, l, n,  , l]       186   
3         3      [b, e, e, f,  , n,  ,  , r, c, e,  , b, w, l]       612   
4         4  [p, e, n, u,  , n, l, e,  , w, h,  , c, h, c, ...       727   
5         5            [u, h, w, e,  , f, u,  , c, r, b, l, e]       176   
6         6            [S, u, g, e,  , p, e, p, p, e, r,  , p]       517   
7         7      [p, n, c, h,  , r, e, l, l, n,  , k, l, l, e]       648   
8         8                           [n, k, e,  , f, h,  , c]       293   
9         9  [B, l, c, k, e, n, e,  , l, p,  , w, h,  , z, ...       203   
10       10               [A, p, r, g, u, -, u, h, r,  , f, r]       130   
11       11                  [g, e,  , r, u, b, b, e,  , l, n]       220   
12       12 

In [6]:
# combine the features we'll be using to find content similarity
# TODO: include 'name' in here at some point. Will need to strip out stop words and symbols like '&' and 'w/'
#     once that processing has been done though, can probably just be concatenated with the rest
def combine(x):
    return ' '.join(x['MAIN FLAVORS']) + ' '+ ' '.join(x['TAGS'])

# Build one text document per meal by calling combine on every row (axis=1 passes rows).
df['combinedFeatures'] = df.apply(combine, axis=1)

# quick check: print(df[['combinedFeatures']].head(2))

In [7]:
from sklearn.feature_extraction.text import CountVectorizer

# Bag-of-words encoding of the combined feature text: one row per meal, one column per
# distinct token. stop_words='english' asks scikit-learn to drop common English words.
count = CountVectorizer(stop_words='english')
count_matrix = count.fit_transform(df['combinedFeatures'])

In [8]:
cosine_sim = cosine_similarity(count_matrix, count_matrix) # this is passed into hybrid()
#cosine_sim = surprise.Dataset.load_from_df()

In [9]:
#collaborative filtering
reader = Reader()
data=Dataset.load_from_df(df_ratings[['user_id', 'meal_id', 'rating']], reader)

In [10]:
# Hold out 30% of the ratings for evaluation; random_state fixes the split between runs.
trainset, testset = train_test_split(data, test_size=0.3, random_state=10)

In [11]:
svd = SVD()
svd.fit(trainset)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x24d6d43e700>

In [12]:
# Predict every held-out rating with the fitted model; the result feeds accuracy.rmse below.
test_pred = svd.test(testset)

In [13]:
# Root-mean-square error of the held-out predictions (lower is better).
accuracy.rmse(test_pred, verbose=True) # an RMSE of about 1.4 isn't great; more rating data is needed

RMSE: 1.4420


1.4420373180059407

In [14]:
r_id = df_ratings['user_id'].values[0]
m_id = df_ratings['meal_id'].values[0]
n_recs = 5

print(m_id)

5


In [15]:
def hybrid(user_id, meal_id, n_recs, df, ratings, cosine_sim, svd_model):
    
        # find 50 most similar meals
        similarity = list(enumerate(cosine_sim[int(meal_id)]))
        similarity = sorted(similarity, key=lambda x:x[1], reverse=True)
        similarity = similarity[1:50] # select the first 50 similar values, likely don't want so many but this is a good starting point
    
        #print(similarity)
        
        # get the metadata of THIS meal
        meal_index = [i[0] for i in similarity]
        
        #print(meal_index)
        #print(df)
        meals = df.iloc[meal_index][['meal_id']]# get all the meal_ids#, 'user_id', 'rating']] 
        meals = meals.merge(ratings, on='meal_id') # merge with the ratings
        print(meals)
  
        
        # predict similar values using svd_model
        meals['est'] = meals.apply(lambda x: svd_model.predict(user_id, x['meal_id'], x['rating']).est, axis =1)
        
        #sort predictions in decreasing order; return the number of recommendations requested
        meals = meals.sort_values('est', ascending=False)
        return meals.head(n_recs) 
        

In [16]:
df.merge(df_ratings, on="meal_id")

hybrid(r_id, m_id, n_recs, df, df_ratings, cosine_sim, svd)

    meal_id  user_id  rating
0        14        2     4.5
1        44        8     4.5
2         7        1     4.5
3        17        8     3.5
4        20        1     5.0
5        20        8     4.0
6        20        9     5.0
7        42        8     5.0
8        15        5     2.0
9        15        8     1.0
10       15        9     2.0
11       16        8     3.0
12       24        1     2.0
13       24        5     2.5
14       24        6     4.5
15       26        6     4.0
16       34        2     4.5
17       38        6     2.0
18       25        1     4.5
19       33        8     3.0
20       10        1     4.0
21       10        2     3.0
22       37        1     2.0
23       28        1     3.5
24       28        2     2.0
25       28        9     2.0
26        0        9     4.0
27        1        6     5.0
28        3        2     4.0
29        3        9     4.0
30        4        9     3.5
31        8        9     1.0
32       11        5     4.5
33       11   

Unnamed: 0,meal_id,user_id,rating,est
2,7,1,4.5,3.670316
18,25,1,4.5,3.600909
44,41,5,4.0,3.565734
45,41,9,4.0,3.565734
16,34,2,4.5,3.505866


In [None]:
print(df)
print(df_ratings)
