In [11]:
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from sklearn.feature_extraction.text import CountVectorizer
import re
from sklearn.metrics.pairwise import cosine_similarity
import nltk
import random
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')

# Reference: https://www.kaggle.com/code/artemkalinin/hybrid-recommendation-system-cb-keras/notebook
# for uploading a model to tensorflow: https://www.tensorflow.org/guide/keras/save_and_serialize?authuser=4

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\hanna\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\hanna\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\hanna\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\hanna\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

In [12]:
meals = pd.read_csv("MLData\dummyDataForApp.csv")
ratings = pd.read_csv("MLData\mealRatings.csv")

In [13]:
# Preprocess data -- create a 'bag of words' to find similarity between meals
for i in range(0,meals.shape[0]):
    # tokenize string to make a list, remove stop words, stemm, etc
    meals.at[i, 'MAIN FLAVORS'] = meals.at[i, 'MAIN FLAVORS'].split(',')
    meals.at[i, 'TAGS'] = meals.at[i, 'TAGS'].split(',')
    
def clean_data(x):
    if isinstance(x, list):
        return[str.lower(i.replace(" ", "")) for i in x]
    
features = ['MAIN FLAVORS', 'TAGS']

for f in features:
    meals[f] = meals[f].apply(clean_data)
    
def processName(x):
    lem = WordNetLemmatizer()
    #print(x)
    stop = set(stopwords.words('english'))
    stop.add('ground') # add ground to the stopwords list, want ground beef to just be beef
    good_words = []
    x = x.split(" ")
    for word in x:
        if word not in stop:
            if word: # remove any empty strings as a result of "  " 
                good_words.append(lem.lemmatize(word)) # add the base version of the word to the list
    return good_words

# TODO: spell check
# TODO: normalize verious things, IE: wholegrain and wholewheat should be the same
meals['NAME'] = meals['NAME'].apply(processName)

print(meals)

def combine(x):
    return ' '.join(x['MAIN FLAVORS']) + ' '+ ' '.join(x['TAGS']) + ' '+ ' '.join(x['NAME'])

meals['combinedFeatures'] = meals.apply(combine, axis=1)

    meal_id                                               NAME  CALORIES  \
0         0                             [ramen, noodle, salad]       292   
1         1                               [Chuletas, Guisadas]       516   
2         2                                [tortellini, salad]       186   
3         3                         [beef, tomato, rice, bowl]       612   
4         4                          [peanut, noodle, chicken]       727   
5         5                        [southwest, tofu, scramble]       176   
6         6                           [Sausage, pepper, pasta]       517   
7         7                     [spinach, tortellini, skillet]       648   
8         8                                [naked, fish, taco]       293   
9         9                     [Blackened, tilaipia, zoodles]       203   
10       10                     [Asparagus-mushroom, frittata]       130   
11       11                             [sage, rubbed, salmon]       220   
12       12 

In [14]:
# create a vector of all words
vectorizer = CountVectorizer()
x = vectorizer.fit_transform(meals['combinedFeatures'].values)
feature_names = vectorizer.get_feature_names_out()

In [15]:
meal_feats = pd.DataFrame(x.toarray(), columns=feature_names)
meal_feats['combined'] = meal_feats.values.tolist()

In [16]:
#print(meal_feats)
meals['combinedFeatures'] = meal_feats['combined']

In [17]:
# calculates the silimarity between THIS meal and all the others, returns the N most similar
def get_cossim(meal_id, number):
    search = meals[['meal_id', 'combinedFeatures']]
    search = search[search.meal_id != meal_id] # remove this meal from the available set (obv will be 1 because they are the same)
    search['distance'] = search['combinedFeatures'].apply(lambda x: cosine_similarity(
        np.array(x).reshape(1,-1), 
        np.array(meals.loc[meals['meal_id'] == meal_id]['combinedFeatures'].values[0]).reshape(1,-1)))
    search = search.drop(columns=['combinedFeatures']) # don't care about the features in this anymore
    search = search.explode('distance').explode('distance') # distance is list of list, this simplifies it
    return search.sort_values(by=['distance'], ascending=False)['meal_id'].head(number).values # return the N most similar

In [18]:
# calculates similarity based on user reviews (particularly THIS user's reviews, will need another method to do collaborative filtering)
def get_similar(user_id):
    rated_meals = ratings[ratings.user_id == user_id] # get a list of all the meals this user has rated
    rated_meals = rated_meals[rated_meals['rating'] >= 4.0] # discard everything that's rated less then 4.0 (can change as needed)
    top_ratings = (rated_meals.sort_values(by='rating', ascending=False).head(20)) # get the top 20 for this user
    top_ratings['rated_meal_id'] = top_ratings['meal_id']
    top_ratings = top_ratings[['user_id', 'rated_meal_id']]
    top_ratings['similar'] = top_ratings['rated_meal_id'].apply(lambda x: (get_cossim(x, 5))) # find similarity between the top rated meals
    result = [x for x in np.concatenate(top_ratings['similar'].values, axis=0).tolist() if x not in top_ratings.rated_meal_id.values.tolist()]
    return result

In [19]:

def get_top(id, top):
    # limits results to those with 'top' number of user reviews. Not useful for our dataset at present
    similar = get_similar(id)
    meal_data = pd.merge(ratings, meals, on='meal_id')
    mean_ratings = pd.DataFrame(meal_data.groupby('meal_id')['rating'].mean())
    mean_ratings['rating_count'] = pd.DataFrame(meal_data.groupby('meal_id')['rating'].count())
    mean_ratings = mean_ratings[mean_ratings['rating_count'] > 10] # gets only those ratings with more then 10 ratings (this won't work for our data, because we don't have that many)
    return mean_ratings[mean_ratings.index.isin(similar)].sort_values(by=['rating'],ascending=False).head(top)

In [20]:
user_ids = ratings['user_id'].unique().tolist()

# creates a mapping between user_ids and a normalized 0-indexed value. IE: removes gaps so 1 = 0, 2 = 1, 5 = 3
u2u_enco = {x : i for i, x in enumerate(user_ids)}
u_enco2u = {i : x for i, x in enumerate(user_ids)} # reversed mapping so 0 returns 1, 1 returns 2, etc

#print(u2u_enco)
#print(u_enco2u)

meal_ids = ratings['meal_id'].unique().tolist()
m2m_enco = {x : i for i, x in enumerate(meal_ids)} # do the same thing for meal id (note: for our datasets, this part is largely unneeded because the meal_ids are already normalized)
m_enco2m = {i : x for i, x in enumerate(meal_ids)}

ratings["user"] = ratings['user_id'].map(u2u_enco)
ratings["meal"] = ratings['meal_id'].map(m2m_enco)

num_users = len(u2u_enco)  
num_meals = len(m2m_enco)
ratings['rating'] = ratings['rating'].values.astype(np.float32)

min_rating = min(ratings['rating'])
max_rating = max(ratings['rating'])

In [21]:
ratings = ratings.sample(frac=1, random_state=42)
x = ratings[['user', 'meal']].values

y = ratings['rating'].apply(lambda x: (x-min_rating)/(max_rating - min_rating)).values

training_indices = int(0.9 * ratings.shape[0]) # take 90% for training data
x_train, x_val, y_train, y_val = (
    x[:training_indices],
    x[training_indices:],
    y[:training_indices],
    y[training_indices:],
)

In [22]:
# the keras part; custom model created through subclassing
class RecommenderNet(keras.Model):
    def __init__(self, num_users, num_meals, embedding_size, **kwargs):
        super(RecommenderNet, self).__init__(**kwargs)
        self.num_users = num_users
        self.num_meals = num_meals
        self.embedding_size = embedding_size
        self.user_embedding = layers.Embedding(num_users, embedding_size, embeddings_initializer='he_normal', embeddings_regularizer=keras.regularizers.l2(1e-6))
        self.user_bias = layers.Embedding(num_users, 1)
        self.meal_embedding=layers.Embedding(num_meals, embedding_size, embeddings_initializer='he_normal', embeddings_regularizer=keras.regularizers.l2(1e-6))
        self.meal_bias = layers.Embedding(num_meals,1)
        
    def call(self, inputs):
        user_vector = self.user_embedding(inputs[:,0])
        user_bias = self.user_bias(inputs[:,0])
        meal_vector = self.meal_embedding(inputs[:,1])
        meal_bias = self.meal_bias(inputs[:,1])
        dot_user_meal = tf.tensordot(user_vector, meal_vector, 2)
        x = dot_user_meal + user_bias + meal_bias
        return tf.nn.sigmoid(x)
    
model = RecommenderNet(num_users, num_meals, 50) # 50 = embedding_size
model.compile(loss = tf.keras.losses.BinaryCrossentropy(), optimizer=keras.optimizers.Adam(learning_rate=0.0005))

In [23]:
# apply the model to the data
history = model.fit(x = x_train, y = y_train, batch_size = 64, epochs=15, verbose =1, validation_data=(x_val, y_val))

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


In [24]:
# training done, now verify with a test
user_id = 2

In [25]:
# get recommendations based on content
top = get_top(user_id, 20)
content_recommendation = top.index.values.tolist()

In [26]:

meal_df = pd.read_csv("MLData\mealRatings.csv") # have played with the initial ratings df, so refresh it
meals_tried = ratings[ratings.user_id == user_id]
not_tried = meal_df[~meal_df['meal_id'].isin(meals_tried.meal_id.values)]["meal_id"]
not_tried = list(set(not_tried).intersection(set(m2m_enco.keys())))

not_tried = [[m2m_enco.get(x)] for x in not_tried]
user_encoder = u2u_enco.get(user_id)
user_meal_array = np.hstack(([[user_encoder]]*len(not_tried), not_tried))

# predict ratings for meals not yet tried using the model we made earlier
ratings = model.predict(user_meal_array).flatten()
#sort/rank
top_rating_indices = ratings.argsort()[-20:][::-1]
recommended_meal_ids = [m_enco2m.get(not_tried[x][0]) for x in top_rating_indices]



In [27]:
# prints the top rated for this particular user, their history, don't really care about that for this
#top_meals_user = (not_tried.sort_values(by="rating", ascending=False)
 #                .head(10)
  #               .meal_id.values)
#meal_df_rows = meal_df[meal_df["meal_id"].isin(top_meals_user)]
#for row in meal_df_rows.itertuples():
 #   print(row.name, ":", row.combinedFeatures)
 

to_rec = random.sample((content_recommendation + recommended_meal_ids), 10)
recommended_meals = meal_df[meal_df["meal_id"].isin(to_rec)]
for row in recommended_meals.itertuples():
    print(row) # currently prints all the things, because not enough data

Pandas(Index=1, user_id=1, meal_id=7, rating=4.5)
Pandas(Index=7, user_id=1, meal_id=37, rating=2.0)
Pandas(Index=15, user_id=5, meal_id=11, rating=4.5)
Pandas(Index=16, user_id=5, meal_id=15, rating=2.0)
Pandas(Index=17, user_id=5, meal_id=22, rating=1.5)
Pandas(Index=20, user_id=6, meal_id=22, rating=2.0)
Pandas(Index=22, user_id=6, meal_id=11, rating=3.0)
Pandas(Index=24, user_id=6, meal_id=36, rating=4.0)
Pandas(Index=26, user_id=6, meal_id=1, rating=5.0)
Pandas(Index=36, user_id=8, meal_id=15, rating=1.0)
Pandas(Index=37, user_id=8, meal_id=33, rating=3.0)
Pandas(Index=40, user_id=9, meal_id=0, rating=4.0)
Pandas(Index=41, user_id=9, meal_id=4, rating=3.5)
Pandas(Index=46, user_id=9, meal_id=15, rating=2.0)
