## Data getting

In [15]:
import pandas as pd

# Load the dataset
data = pd.read_csv('./data/foods-en-us.csv')

# Display the first few rows
print(data.head())


   Food_ID                  Name        C_Type  Veg_Non  \
0        1   summer squash salad  Healthy Food      veg   
1        2  chicken minced salad  Healthy Food  non-veg   
2        3  sweet chilli almonds         Snack      veg   
3        4       tricolour salad  Healthy Food      veg   
4        5        christmas cake       Dessert      veg   

                                            Describe  
0  white balsamic vinegar, lemon juice, lemon rin...  
1  olive oil, chicken mince, garlic (minced), oni...  
2  almonds whole, egg white, curry leaves, salt, ...  
3  vinegar, honey/sugar, soy sauce, salt, garlic ...  
4  christmas dry fruits (pre-soaked), orange zest...  


## Data preprocessing

I am going to use One-Hot Encoding for the categorical features to numerical features to be used to train the model. I will also use the StandardScaler to scale the numerical features.

In [16]:
# One-Hot Encoding of C_Type and Veg_Non
data_encoded = pd.get_dummies(data, columns=['C_Type', 'Veg_Non'])

In [17]:
data_encoded.head()

Unnamed: 0,Food_ID,Name,Describe,C_Type_ Korean,C_Type_Beverage,C_Type_Chinese,C_Type_Dessert,C_Type_French,C_Type_Healthy Food,C_Type_Indian,...,C_Type_Japanese,C_Type_Korean,C_Type_Mexican,C_Type_Nepalese,C_Type_Snack,C_Type_Spanish,C_Type_Thai,C_Type_Vietnames,Veg_Non_non-veg,Veg_Non_veg
0,1,summer squash salad,"white balsamic vinegar, lemon juice, lemon rin...",False,False,False,False,False,True,False,...,False,False,False,False,False,False,False,False,False,True
1,2,chicken minced salad,"olive oil, chicken mince, garlic (minced), oni...",False,False,False,False,False,True,False,...,False,False,False,False,False,False,False,False,True,False
2,3,sweet chilli almonds,"almonds whole, egg white, curry leaves, salt, ...",False,False,False,False,False,False,False,...,False,False,False,False,True,False,False,False,False,True
3,4,tricolour salad,"vinegar, honey/sugar, soy sauce, salt, garlic ...",False,False,False,False,False,True,False,...,False,False,False,False,False,False,False,False,False,True
4,5,christmas cake,"christmas dry fruits (pre-soaked), orange zest...",False,False,False,True,False,False,False,...,False,False,False,False,False,False,False,False,False,True


For the Ingredients column, I decided to use the Text Vectorization TF-IDF (Term Frequency-Inverse Document Frequency) to convert the list of ingredients into a numerical feature vector

In [18]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [19]:
# Initialize TF-IDF Vectorizer
tfidf = TfidfVectorizer(stop_words='english')

# Fit and transform the 'Describe' column
tfidf_matrix = tfidf.fit_transform(data['Describe'])

# Convert the TF-IDF matrix to a DataFrame
tfidf_df = pd.DataFrame(tfidf_matrix.toarray(), columns=tfidf.get_feature_names_out())

# Concatenate TF-IDF features with the original data
data_final = pd.concat([data_encoded, tfidf_df], axis=1)

# Drop the original 'Describe' column as it's now represented in TF-IDF
data_final.drop(columns=['Describe', 'Name'], inplace=True)

In [20]:
# Display the processed data
print(data_final.head())


   Food_ID  C_Type_ Korean  C_Type_Beverage  C_Type_Chinese  C_Type_Dessert  \
0        1           False            False           False           False   
1        2           False            False           False           False   
2        3           False            False           False           False   
3        4           False            False           False           False   
4        5           False            False           False            True   

   C_Type_French  C_Type_Healthy Food  C_Type_Indian  C_Type_Italian  \
0          False                 True          False           False   
1          False                 True          False           False   
2          False                False          False           False   
3          False                 True          False           False   
4          False                False          False           False   

   C_Type_Japanese  ...  yolk  yolks  yougurt  young  yut  zeera      zest  \
0            F

In [21]:
from sklearn.model_selection import train_test_split

In [22]:
from sklearn.metrics.pairwise import cosine_similarity

In [23]:
# Compute the cosine similarity matrix
cosine_sim = cosine_similarity(tfidf_matrix, tfidf_matrix)

In [24]:
# Function to get recommendations based on food name
def get_recommendations(food_name, cosine_sim=cosine_sim):
    # Find the index of the food in the data
    idx = data[data['Name'] == food_name].index[0]

    # Get the similarity scores for all foods with that food
    sim_scores = list(enumerate(cosine_sim[idx]))

    # Sort the foods based on similarity scores
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)

    # Get the indices of the most similar foods
    food_indices = [i[0] for i in sim_scores[1:6]]

    # Return the top 5 most similar foods
    return data['Name'].iloc[food_indices]


In [39]:
# Example: Get recommendations for 'summer squash salad'
recommendations = get_recommendations('summer squash salad')
print(recommendations)

16               baked namakpara with roasted almond dip
143                            shrimp & cilantro ceviche
160                                     spanish fish fry
163                                 green cucumber shots
220    amaranthus granola with lemon yogurt, berries ...
Name: Name, dtype: object


## MVP user profile recommendation

In [34]:
user_profile = tfidf.transform(["basil leaves, olive oil, garlic cloves"])

In [35]:
# Compute similarity between user profile and all foods
user_similarity = cosine_similarity(user_profile, tfidf_matrix)

In [36]:
# Get top recommendations
user_top_indices = user_similarity.argsort()[0][-5:]
user_recommendations = data['Name'].iloc[user_top_indices]
print(user_recommendations)

193            egg and cheddar cheese sandwich
380                   Vietnamese Chicken Salad
16     baked namakpara with roasted almond dip
0                          summer squash salad
109               chicken and mushroom lasagna
Name: Name, dtype: object
