---

# Turn Nutrients/ Moods into Matrix 

In [1]:
import pandas as pd

Define moods for nutrient deficiency and surplus

# Deficiency 

In [2]:
nutrient_deficiency_moods = {
    "Water": ["Exhaustion", "Headache", "Confusion"],
    "Calories": ["Apathy", "Weakness", "Lethargy"],
    "Protein": ["Anxiety", "Sadness", "Tension"],
    "Fats (Lipid Tot)": ["Forgetfulness", "Low_energy", "Mood_swings"],
    "Carbohydrates": ["Brain_fog", "Lethargy", "Moodiness"],
    "Fiber": ["Irritation", "Discomfort"],
    "Sugars": ["Weariness", "Drained"],
    "Calcium": ["Jittery", "Tension", "Mood_swings"],
    "Iron": ["Weariness", "Listlessness", "Boredom"],
    "Magnesium": ["Irritability", "Worry", "Obsessive_thoughts"],
    "Phosphorus": ["Weakness", "Confusion"],
    "Potassium": ["Drained", "Flat"],
    "Sodium": ["Tension", "Foggy"],
    "Zinc": ["Disinterest", "Gloominess", "Irritability"],
    "Copper": ["Fatigue", "Sadness"],
    "Selenium": ["Sadness", "Worry", "Lethargy"],
    "Vitamin C": ["Irritability", "Boredom", "Fatigue"],
    "Thiamin (Vitamin B1)": ["Anxiety", "Moodiness"],
    "Riboflavin (Vitamin B2)": ["Irritability", "Low_energy"],
    "Niacin (Vitamin B3)": ["Mood_swings", "Sadness", "Lethargy"],
    "Pantothenic Acid (Vitamin B5)": ["Irritability", "Restive"],
    "Vitamin B6": ["Nervousness", "Sadness", "Self_critical"],
    "Folate": ["Confusion", "Depression"],
    "Folic Acid": ["Low_mood", "Guilt", "Irritability"],
    "Food Folate": ["Lethargy", "Confusion"],
    "Folate (DFE)": ["Listless", "Detached"],
    "Choline": ["Foggy", "Indifferent"],
    "Vitamin B12": ["Discouraged", "Depressed", "Guilt"],
    "Vitamin A (Iu)": ["Confusion", "Sadness"],
    "Vitamin A": ["Irritability", "Mood_swings"],
    "Retinol": ["Unfocused", "Listless"],
    "Vitamin E": ["Unsettled", "Blunted"],
    "Vitamin D": ["Gloom", "Dismal", "Depression", "Sleeplessness"],
    "Vitamin D (Iu)": ["Somber", "Unmotivated"],
    "Saturated Fats": ["Edgy", "Anxious"],
    "Monounsaturated Fats": ["Diminished", "Flat"],
    "Polyunsaturated Fats": ["Fatigue", "Moodiness"],
    "Cholesterol": ["Underwhelmed", "Low_energy"]
}

In [3]:
# List all deficiency moods
deficiency_moods = list(set(mood for moods in nutrient_deficiency_moods.values() for mood in moods))
print("All deficiency moods:", deficiency_moods)

All deficiency moods: ['Detached', 'Confusion', 'Jittery', 'Irritation', 'Gloom', 'Dismal', 'Restive', 'Lethargy', 'Depressed', 'Somber', 'Mood_swings', 'Flat', 'Discomfort', 'Foggy', 'Low_energy', 'Brain_fog', 'Forgetfulness', 'Exhaustion', 'Boredom', 'Irritability', 'Anxiety', 'Sleeplessness', 'Listlessness', 'Tension', 'Diminished', 'Self_critical', 'Nervousness', 'Sadness', 'Obsessive_thoughts', 'Low_mood', 'Guilt', 'Underwhelmed', 'Moodiness', 'Headache', 'Discouraged', 'Anxious', 'Drained', 'Unmotivated', 'Fatigue', 'Listless', 'Weariness', 'Unfocused', 'Worry', 'Unsettled', 'Gloominess', 'Edgy', 'Indifferent', 'Disinterest', 'Blunted', 'Apathy', 'Weakness', 'Depression']


In [4]:
#count
len(deficiency_moods)

52

In [5]:
#turn into df
df_deficiency = pd.DataFrame(index=nutrient_deficiency_moods.keys(), data={'moods':nutrient_deficiency_moods.values()})
df_deficiency['moods']=df_deficiency['moods'].apply(lambda x: ' '.join(x))
df_deficiency

Unnamed: 0,moods
Water,Exhaustion Headache Confusion
Calories,Apathy Weakness Lethargy
Protein,Anxiety Sadness Tension
Fats (Lipid Tot),Forgetfulness Low_energy Mood_swings
Carbohydrates,Brain_fog Lethargy Moodiness
Fiber,Irritation Discomfort
Sugars,Weariness Drained
Calcium,Jittery Tension Mood_swings
Iron,Weariness Listlessness Boredom
Magnesium,Irritability Worry Obsessive_thoughts


# Surplus

In [6]:
nutrient_surplus_moods = {
    "Water": ["Restlessness", "Confusion", "Irritability"],
    "Calories": ["Lethargy", "Moodiness", "Sluggishness"],
    "Protein": ["Irritability", "Fatigue", "Dullness"],
    "Fats (Lipid Tot)": ["Sluggishness", "Moodiness", "Dullness"],
    "Carbohydrates": ["Sluggishness", "Fatigue", "Moodiness"],
    "Fiber": ["Bloating", "Discomfort"],
    "Sugars": ["Fatigue", "Irritability"],
    "Calcium": ["Irritability", "Confusion"],
    "Iron": ["Anxiety", "Restlessness", "Irritability"],
    "Magnesium": ["Drowsiness", "Confusion", "Lethargy"],
    "Sodium": ["Irritability", "Moodiness"],
    "Zinc": ["Dullness", "Irritability"],
    "Copper": ["Nausea", "Fatigue"],
    "Selenium": ["Anxiety", "Moodiness"],
    "Vitamin C": ["Nausea", "Discomfort"],
    "Thiamin (Vitamin B1)": ["Restlessness", "Irritability"],
    "Niacin (Vitamin B3)": ["Flushing", "Discomfort"],
    "Pantothenic Acid (Vitamin B5)": ["Nausea", "Restlessness"],
    "Vitamin B6": ["Irritability", "Restlessness"],
    "Folate": ["Confusion", "Dullness"],
    "Folic Acid": ["Confusion", "Lethargy"],
    "Choline": ["Nausea", "Fatigue"],
    "Vitamin B12": ["Restlessness", "Moodiness"],
    "Vitamin A (Iu)": ["Dizziness", "Nausea"],
    "Vitamin A": ["Dullness", "Lethargy"],
    "Vitamin E": ["Fatigue", "Nausea"],
    "Vitamin D": ["Irritability", "Confusion"],
    "Saturated Fats": ["Irritability", "Moodiness"],
    "Monounsaturated Fats": ["Bloating", "Discomfort"],
    "Polyunsaturated Fats": ["Dizziness", "Moodiness"],
    "Cholesterol": ["Fatigue", "Moodiness"]
}

In [7]:
# List all deficiency moods
surplus_moods = list(set(mood for moods in nutrient_surplus_moods.values() for mood in moods))
print("All surplus moods:", surplus_moods)

All surplus moods: ['Confusion', 'Flushing', 'Dizziness', 'Lethargy', 'Drowsiness', 'Restlessness', 'Dullness', 'Bloating', 'Sluggishness', 'Nausea', 'Discomfort', 'Fatigue', 'Irritability', 'Anxiety', 'Moodiness']


In [8]:
#count
len(surplus_moods)

15

In [9]:
# turn into df
df_surplus = pd.DataFrame(index=nutrient_surplus_moods.keys(), data={'moods':nutrient_surplus_moods.values()})
df_surplus['moods']=df_surplus['moods'].apply(lambda x: ' '.join(x))
df_surplus

Unnamed: 0,moods
Water,Restlessness Confusion Irritability
Calories,Lethargy Moodiness Sluggishness
Protein,Irritability Fatigue Dullness
Fats (Lipid Tot),Sluggishness Moodiness Dullness
Carbohydrates,Sluggishness Fatigue Moodiness
Fiber,Bloating Discomfort
Sugars,Fatigue Irritability
Calcium,Irritability Confusion
Iron,Anxiety Restlessness Irritability
Magnesium,Drowsiness Confusion Lethargy


In [None]:
# corpus
all_moods = list(set(deficiency_moods + surplus_moods))

In [11]:
len(all_moods)

60

In [12]:
all_moods_corpus = list(df_deficiency.moods) + list(df_surplus.moods)
all_moods_corpus

['Exhaustion Headache Confusion',
 'Apathy Weakness Lethargy',
 'Anxiety Sadness Tension',
 'Forgetfulness Low_energy Mood_swings',
 'Brain_fog Lethargy Moodiness',
 'Irritation Discomfort',
 'Weariness Drained',
 'Jittery Tension Mood_swings',
 'Weariness Listlessness Boredom',
 'Irritability Worry Obsessive_thoughts',
 'Weakness Confusion',
 'Drained Flat',
 'Tension Foggy',
 'Disinterest Gloominess Irritability',
 'Fatigue Sadness',
 'Sadness Worry Lethargy',
 'Irritability Boredom Fatigue',
 'Anxiety Moodiness',
 'Irritability Low_energy',
 'Mood_swings Sadness Lethargy',
 'Irritability Restive',
 'Nervousness Sadness Self_critical',
 'Confusion Depression',
 'Low_mood Guilt Irritability',
 'Lethargy Confusion',
 'Listless Detached',
 'Foggy Indifferent',
 'Discouraged Depressed Guilt',
 'Confusion Sadness',
 'Irritability Mood_swings',
 'Unfocused Listless',
 'Unsettled Blunted',
 'Gloom Dismal Depression Sleeplessness',
 'Somber Unmotivated',
 'Edgy Anxious',
 'Diminished Flat',


---
# Apply CountVectorizer & get 2 matrices: full_matrix_def and full_matrix_sur 
# 2 matrices with same amount of columns ready for cos sim

In [13]:
from sklearn.feature_extraction.text import CountVectorizer
count_vectorizer = CountVectorizer(
    #
    ###################################################################
    # convert all characters to lowercase before tokenizing
    #
    #lowercase = True, 
    #
    ###################################################################
    # choose words to create features
    #
    analyzer = 'word',
    #
    ###################################################################
    # use built-in stop word list for English (default=None)
    #
    stop_words = 'english',
    #
    ###################################################################
    # select tokens of 2 or more word characters (punctuation ignored) 
    #
    token_pattern = r"(?u)\b\w\w+\b",
    #
    ###################################################################
    # consider only unigrams of tokens
    #
    ngram_range = (1, 1)
    #
    ###################################################################  
)

In [14]:
vec = count_vectorizer.fit(all_moods_corpus)

In [15]:
import pickle

# Assuming `vec` is your fitted CountVectorizer object
with open("vec.pkl", "wb") as file:
    pickle.dump(vec, file)

In [16]:
# create vec.transform for deficiency moods
t = vec.transform(df_deficiency.moods).todense()

In [17]:
full_matrix_def = pd.DataFrame(t,
             columns=vec.get_feature_names_out(),
             index=df_deficiency.index
             )
full_matrix_def

Unnamed: 0,anxiety,anxious,apathy,bloating,blunted,boredom,brain_fog,confusion,depressed,depression,...,sluggishness,somber,tension,underwhelmed,unfocused,unmotivated,unsettled,weakness,weariness,worry
Water,0,0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
Calories,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
Protein,1,0,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
Fats (Lipid Tot),0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
Carbohydrates,0,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
Fiber,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
Sugars,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
Calcium,0,0,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
Iron,0,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
Magnesium,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1


In [18]:
full_matrix_def.rename(index=lambda x: x.lower(), inplace=True)

In [19]:
type(full_matrix_def)

pandas.core.frame.DataFrame

In [20]:
full_matrix_def.to_csv("full_matrix_def.csv", index=True)

In [21]:
common_moods_def = set(full_matrix_def.columns).intersection(set([m.lower() for m in all_moods]))


In [22]:
set(full_matrix_def.columns) - common_moods_def

set()

In [23]:
# create vec. for surplus moods
k = vec.transform(df_surplus.moods).todense()

In [24]:
full_matrix_sur = pd.DataFrame(k,
             columns=vec.get_feature_names_out(),
             index=df_surplus.index
             )
full_matrix_sur

Unnamed: 0,anxiety,anxious,apathy,bloating,blunted,boredom,brain_fog,confusion,depressed,depression,...,sluggishness,somber,tension,underwhelmed,unfocused,unmotivated,unsettled,weakness,weariness,worry
Water,0,0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
Calories,0,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
Protein,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
Fats (Lipid Tot),0,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
Carbohydrates,0,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
Fiber,0,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
Sugars,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
Calcium,0,0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
Iron,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
Magnesium,0,0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0


In [25]:
full_matrix_sur.rename(index=lambda x: x.lower(), inplace=True)

In [26]:
full_matrix_sur.to_csv("full_matrix_sur.csv", index=True)
#convert index to a normal column 
# fix index when I read the csv 

In [27]:
full_matrix_sur

Unnamed: 0,anxiety,anxious,apathy,bloating,blunted,boredom,brain_fog,confusion,depressed,depression,...,sluggishness,somber,tension,underwhelmed,unfocused,unmotivated,unsettled,weakness,weariness,worry
water,0,0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
calories,0,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
protein,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
fats (lipid tot),0,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
carbohydrates,0,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
fiber,0,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
sugars,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
calcium,0,0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
iron,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
magnesium,0,0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0


In [28]:
matrix_sur = pd.read_csv("full_matrix_sur.csv", index_col=0)
matrix_sur.head()

Unnamed: 0,anxiety,anxious,apathy,bloating,blunted,boredom,brain_fog,confusion,depressed,depression,...,sluggishness,somber,tension,underwhelmed,unfocused,unmotivated,unsettled,weakness,weariness,worry
water,0,0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
calories,0,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
protein,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
fats (lipid tot),0,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
carbohydrates,0,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0


In [29]:
common_moods_sur = set(full_matrix_sur.columns).intersection(set([m.lower() for m in all_moods]))

In [30]:
set(full_matrix_sur.columns) - common_moods_sur

set()

In [31]:
vec.transform(df_surplus.moods)

<Compressed Sparse Row sparse matrix of dtype 'int64'
	with 69 stored elements and shape (31, 60)>

---

# Compute Cosine Similarity with User Input

## Deficiency

In [32]:
#define user input for def
user_mood_1 = ["disinterest", "depressed", "anxiety"]
user_query_vector_1 = vec.transform([' '.join(user_mood_1)])
user_query_vector_1.todense()

matrix([[1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]])

In [33]:
#def
from sklearn.metrics.pairwise import cosine_similarity

# Calculate cosine similarity between the user vector and each nutrient's mood profile
similarity_scores_input = cosine_similarity(full_matrix_def, user_query_vector_1)

# Convert similarity scores to a DataFrame for readability
similarity_input_df = pd.DataFrame(similarity_scores_input, index=full_matrix_def.index, columns=["Similarity"])

# Sort by similarity score to find the best matches
similarity_input_df_sorted = similarity_input_df.sort_values(by="Similarity", ascending=False)

# Only show the top 3 results
top_3_nutrients_def_df = similarity_input_df_sorted.head(3)

print("Top 3 Deficiency Nutrients' Similarity for User Input:\n", top_3_nutrients_def_df)


Top 3 Deficiency Nutrients' Similarity for User Input:
                       Similarity
thiamin (vitamin b1)    0.408248
protein                 0.333333
zinc                    0.333333


In [34]:
top_3_nutrients_def_df

Unnamed: 0,Similarity
thiamin (vitamin b1),0.408248
protein,0.333333
zinc,0.333333


In [35]:
type(top_3_nutrients_def_df)

pandas.core.frame.DataFrame

In [36]:
top_3_nutrients_def_list = top_3_nutrients_def_df.index.tolist()
top_3_nutrients_def_list

['thiamin (vitamin b1)', 'protein', 'zinc']

## Surplus

In [37]:
#define user input
user_mood_2 = ["Confusion"]
user_query_vector_2 = vec.transform([' '.join(user_mood_2)])
user_query_vector_2.todense()

matrix([[0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]])

In [38]:
# Calculate cosine similarity between the user vector and each nutrient's mood profile
similarity_scores_input_sur = cosine_similarity(full_matrix_sur, user_query_vector_2)

# Convert similarity scores to a DataFrame for readability
similarity_input_df_sur = pd.DataFrame(similarity_scores_input_sur, index=full_matrix_sur.index, columns=["Similarity"])

# Sort by similarity score to find the best matches
similarity_input_df_sorted_sur = similarity_input_df_sur.sort_values(by="Similarity", ascending=False)

# Filter for nutrients with similarity > 0
top_matches_sur = similarity_input_df_sorted_sur[similarity_input_df_sorted_sur["Similarity"] > 0]

top_3_nutrients_sur_df = top_matches_sur.head(3)
top_3_nutrients_sur_list = top_3_nutrients_sur_df.index.tolist()
top_3_nutrients_sur_list

['vitamin d', 'calcium', 'folic acid']

In [39]:
top_3_nutrients_def_list

['thiamin (vitamin b1)', 'protein', 'zinc']

In [40]:
top_3_nutrients_sur_list

['vitamin d', 'calcium', 'folic acid']

# Get food input

In [41]:
#read data 
food_df =  pd.read_csv('food_data_lower_corrected.csv')
food_df

Unnamed: 0,Nutrient,Foods
0,alpha carotene,"carrot, carrot juice, pumpkin, carrots, chili ..."
1,ash,"salt, seasoning mix, leavening agents, pace, j..."
2,beta carotene,"carrot, paprika, grape leaves, chili powder, t..."
3,beta cryptoxanthin,"paprika, chili powder, persimmons, papayas, ro..."
4,calcium,"leavening agents, spices, savory, marjoram, di..."
5,calories,"fat, lard, animal fat, meat drippings (lard, c..."
6,carbohydrates,"sweetener, sugar, sugars, chewing gum, flan, c..."
7,cholesterol,"caviar, pate, roe, egg, sea lion, eggs, egg cu..."
8,choline,"caviar, roe, egg, eggs, pate, soy prot isolate..."
9,copper,"pepeao, sesame seeds, cocoa, baking chocolate,..."


In [42]:

# Convert all nutrient names and food items to lowercase and strip any whitespace
food_dict = food_df.groupby('Nutrient')['Foods'].apply(
    lambda foods: [food.lower().strip() for food in foods]
).to_dict()

# Display a sample of the transformed data to confirm the format
sample_food_dict = {k: food_dict[k] for k in list(food_dict)[:5]}
sample_food_dict

{'alpha carotene': ["carrot, carrot juice, pumpkin, carrots, chili powder, egg rolls, grape leaves, paprika, plantains, dandelion greens, fiddlehead ferns, dandelion grns, pimento, babyfood  dinner  chick stew  todd, tomato&veg juc, kumquats, chicken pot pie, campbell's, stinging nettles, spices, rice bowl w/ chick, tangerines, vegetable juc cocktail, split pea w/ ham soup, hyacinth-beans, tomatoes, corn grain, corn flr, cornmeal, popcorn"],
 'ash': ['salt, seasoning mix, leavening agents, pace, jellyfish, sisymbrium sp. seeds, chervil, spices, coriander leaf, celery flakes, yeast extract spread, meat extender, miso, dill weed, marjoram, chili powder, parsley, steelhead trout, rice bran, savory, celery seed, tomato powder, fennel seed, capers, sage, papad, paprika, cumin seed, kraft foods, bologna  pork  turkey & bf'],
 'beta carotene': ['carrot, paprika, grape leaves, chili powder, tomato powder, carrot juice, sweet potato, carrots, mustard grns, turnip greens, kale, dandelion greens,

# Refine food data
---

In [43]:
food_dict.keys()

dict_keys(['alpha carotene', 'ash', 'beta carotene', 'beta cryptoxanthin', 'calcium', 'calories', 'carbohydrates', 'cholesterol', 'choline', 'copper', 'fats (lipid tot)', 'fiber', 'folate', 'folate (dfe)', 'folic acid', 'food folate', 'iron', 'lutein and zeaxanthin', 'lycopene', 'magnesium', 'manganese', 'monounsaturated fats', 'niacin (vitamin b3)', 'pantothenic acid (vitamin b5)', 'phosphorus', 'polyunsaturated fats', 'potassium', 'protein', 'retinol', 'riboflavin (vitamin b2)', 'saturated fats', 'selenium', 'sodium', 'sugars', 'thiamin (vitamin b1)', 'vitamin a', 'vitamin a (iu)', 'vitamin b12', 'vitamin b6', 'vitamin c', 'vitamin d', 'vitamin d (iu)', 'vitamin e', 'vitamin k', 'water', 'zinc'])

In [44]:
len(food_dict.keys())

46

In [45]:
full_matrix_def.rename(index=lambda x: x.lower(), inplace=True)

In [46]:
full_matrix_sur.rename(index=lambda x: x.lower(), inplace=True)

In [47]:
full_matrix_def.index

Index(['water', 'calories', 'protein', 'fats (lipid tot)', 'carbohydrates',
       'fiber', 'sugars', 'calcium', 'iron', 'magnesium', 'phosphorus',
       'potassium', 'sodium', 'zinc', 'copper', 'selenium', 'vitamin c',
       'thiamin (vitamin b1)', 'riboflavin (vitamin b2)',
       'niacin (vitamin b3)', 'pantothenic acid (vitamin b5)', 'vitamin b6',
       'folate', 'folic acid', 'food folate', 'folate (dfe)', 'choline',
       'vitamin b12', 'vitamin a (iu)', 'vitamin a', 'retinol', 'vitamin e',
       'vitamin d', 'vitamin d (iu)', 'saturated fats', 'monounsaturated fats',
       'polyunsaturated fats', 'cholesterol'],
      dtype='object')

In [48]:
full_matrix_sur.index

Index(['water', 'calories', 'protein', 'fats (lipid tot)', 'carbohydrates',
       'fiber', 'sugars', 'calcium', 'iron', 'magnesium', 'sodium', 'zinc',
       'copper', 'selenium', 'vitamin c', 'thiamin (vitamin b1)',
       'niacin (vitamin b3)', 'pantothenic acid (vitamin b5)', 'vitamin b6',
       'folate', 'folic acid', 'choline', 'vitamin b12', 'vitamin a (iu)',
       'vitamin a', 'vitamin e', 'vitamin d', 'saturated fats',
       'monounsaturated fats', 'polyunsaturated fats', 'cholesterol'],
      dtype='object')

In [49]:
sur_def_set = set(list(full_matrix_sur.index) + list(full_matrix_def.index))

In [50]:
food_set = set(food_dict.keys())

In [51]:
nutrients_to_keep = food_set.intersection(sur_def_set)
nutrients_to_keep

{'calcium',
 'calories',
 'carbohydrates',
 'cholesterol',
 'choline',
 'copper',
 'fats (lipid tot)',
 'fiber',
 'folate',
 'folate (dfe)',
 'folic acid',
 'food folate',
 'iron',
 'magnesium',
 'monounsaturated fats',
 'niacin (vitamin b3)',
 'pantothenic acid (vitamin b5)',
 'phosphorus',
 'polyunsaturated fats',
 'potassium',
 'protein',
 'retinol',
 'riboflavin (vitamin b2)',
 'saturated fats',
 'selenium',
 'sodium',
 'sugars',
 'thiamin (vitamin b1)',
 'vitamin a',
 'vitamin a (iu)',
 'vitamin b12',
 'vitamin b6',
 'vitamin c',
 'vitamin d',
 'vitamin d (iu)',
 'vitamin e',
 'water',
 'zinc'}

In [52]:
# dictionary comprehension
food_dict_refined = {k:v for k,v in food_dict.items() if k in nutrients_to_keep}
len(food_dict_refined)

38

In [53]:
# revert dictionary 
# melt/ flip  pandasframe 

food_dict_refined

{'calcium': ['leavening agents, spices, savory, marjoram, dill weed, celery seed, sage, sisymbrium sp. seeds, dill seed, poppy seed, chervil, rosemary, coriander leaf, fennel seed, parsley, cinnamon, poultry seasoning, sesame seeds, cumin seed, kraft free singles american nonfat past process chs product, coriander seed, imitation chs, caraway seed, pumpkin pie spice, cheese food, anise seed, cloves, chia seeds, toddl form, incaparina'],
 'calories': ['fat, lard, animal fat, meat drippings (lard, caribou, margarine, macadamia nuts, margarine-like, butter, pecans, mayonnaise drsng, brazilnuts, hickorynuts, baking chocolate, pine nuts, hazelnuts or filberts, hazelnuts, formulated, walnuts, sunflower sd butter, almond butter, butternuts, peanut butter w/ omega-3, mixed nuts, sesame butter, peanut butter, cashew butter, pork, sunflower sd krnls, almonds'],
 'carbohydrates': ['sweetener, sugar, sugars, chewing gum, flan, cornstarch, pectin, sweeteners, butter replcmnt, carob flour, tapioca, 

In [54]:

# Convert to a DataFrame if it’s not already one
food_dict_refined_df = pd.DataFrame(food_dict_refined)

# Save as a CSV file
food_dict_refined_df.to_csv("food_dict_refined.csv", index=False)

In [55]:
#wrap in function
top_3_nutrients_def_df.index

Index(['thiamin (vitamin b1)', 'protein', 'zinc'], dtype='object')

In [56]:
type(top_3_nutrients_def_df)

pandas.core.frame.DataFrame

In [57]:
type(top_3_nutrients_def_list)

list

In [58]:
type(top_3_nutrients_sur_list)

list

In [59]:
top_3_nutrients_sur_list

['vitamin d', 'calcium', 'folic acid']

In [60]:
full_matrix_def.index

Index(['water', 'calories', 'protein', 'fats (lipid tot)', 'carbohydrates',
       'fiber', 'sugars', 'calcium', 'iron', 'magnesium', 'phosphorus',
       'potassium', 'sodium', 'zinc', 'copper', 'selenium', 'vitamin c',
       'thiamin (vitamin b1)', 'riboflavin (vitamin b2)',
       'niacin (vitamin b3)', 'pantothenic acid (vitamin b5)', 'vitamin b6',
       'folate', 'folic acid', 'food folate', 'folate (dfe)', 'choline',
       'vitamin b12', 'vitamin a (iu)', 'vitamin a', 'retinol', 'vitamin e',
       'vitamin d', 'vitamin d (iu)', 'saturated fats', 'monounsaturated fats',
       'polyunsaturated fats', 'cholesterol'],
      dtype='object')

In [61]:
full_matrix_sur.index

Index(['water', 'calories', 'protein', 'fats (lipid tot)', 'carbohydrates',
       'fiber', 'sugars', 'calcium', 'iron', 'magnesium', 'sodium', 'zinc',
       'copper', 'selenium', 'vitamin c', 'thiamin (vitamin b1)',
       'niacin (vitamin b3)', 'pantothenic acid (vitamin b5)', 'vitamin b6',
       'folate', 'folic acid', 'choline', 'vitamin b12', 'vitamin a (iu)',
       'vitamin a', 'vitamin e', 'vitamin d', 'saturated fats',
       'monounsaturated fats', 'polyunsaturated fats', 'cholesterol'],
      dtype='object')

In [62]:
#list comprehension
food_contents_to_add = [v for k,v in food_dict_refined.items() if k in top_3_nutrients_def_df.index]

list comprehension break down 
Purpose: It creates a new list, food_contents_to_add, that contains certain values from a dictionary, food_dict_refined, based on a condition.
Structure:
Expression (v): v is the value from each key-value pair in food_dict_refined. This is the value that will be added to food_contents_to_add if the condition is met.
Loop (for k, v in food_dict_refined.items()): This part iterates through each key-value pair in food_dict_refined. Here, k represents the key, and v represents the value.
Condition (if k in top_3_similar_nutrients_def.index): This is a filter condition. It checks if the key k is present in top_3_similar_nutrients_def.index. Only if this condition is True will the value v be included in the resulting list.
Usage:
This comprehension is filtering values from food_dict_refined based on whether their corresponding keys are in top_3_similar_nutrients_def.index.
The resulting list food_contents_to_add will contain only those values from food_dict_refined whose keys match the indices in top_3_similar_nutrients_def.index.

In [63]:
# list of list
food_contents_to_add 

[['soy prot isolate, soy protein isolate, gelatins, seal, steelhead trout, vital wheat gluten, whale, soy prot conc, pork skins, walrus, peanut flour, sesame flour, soy meal, cottonseed meal, sunflower sd flr, meat extender, cottonseed flr, sesame flr, soy flr, soy flour, lupins, safflower sd meal, mutton, beef jerky, cottonseed krnls, bacon bits, hormel pillow pak sliced turkey pepperoni, chckn, pumpkin&squash sd krnls, elk'],
 ['yeast extract spread, worthington foods, worthington stripples, morningstar farms brkfst bacon strips, luncheon slices, worthington prosage links, worthington wham (roll), worthington meatless corned bf roll, worthington smoked turkey roll, sunflower sd flr, worthington fripats, morningstar farms grillers original, rice bran, veggie burgers or soyburgers  unprep, sesame meal, sesame flr, sesame flour, cereals rte, cottonseed meal, cottonseed flr, worthington dinner rst, incaparina, wheat germ, worthington stakelets, sunflower sd krnls, schiff, loma linda swis

In [64]:
# list of list to list
food_contents_to_add = list(set([item for k, v in food_dict_refined.items() if k in top_3_nutrients_def_df.index for item in v]))
food_contents_to_add

['oyster, incaparina, cereals rte, cottonseed meal, wheat germ, cottonseed flr, sesame flr, pumpkin&squash seeds, sesame meal, sesame sd krnls, sesame flour, baking chocolate, hyacinth bns, chervil, beef jerky, poppy seed, pumpkin&squash sd krnls, whale, elk, sesame seeds, pepeao, wheat bran, spices, celery seed, cocoa, wocas, rice bran, kashi black bean mango, cottonseed krnls, wild rice',
 'soy prot isolate, soy protein isolate, gelatins, seal, steelhead trout, vital wheat gluten, whale, soy prot conc, pork skins, walrus, peanut flour, sesame flour, soy meal, cottonseed meal, sunflower sd flr, meat extender, cottonseed flr, sesame flr, soy flr, soy flour, lupins, safflower sd meal, mutton, beef jerky, cottonseed krnls, bacon bits, hormel pillow pak sliced turkey pepperoni, chckn, pumpkin&squash sd krnls, elk',
 'yeast extract spread, worthington foods, worthington stripples, morningstar farms brkfst bacon strips, luncheon slices, worthington prosage links, worthington wham (roll), wo

# Vectorize food data 

In [65]:
# Convert `food_contents_to_add` into a format suitable for vectorization
food_texts = [' '.join(food_contents_to_add)]  # Each food item as a single "document"

# Initialize and fit the vectorizer
vectorizer_food = CountVectorizer()
food_vectors = vectorizer_food.fit_transform(food_texts).todense()
food_vectors

matrix([[ 2,  1,  1,  2,  1,  1,  1,  1,  3,  1,  1,  1,  3,  1,  1,  1,
          1,  1,  1,  1,  8,  1,  2,  1,  1,  2,  5,  9,  1,  1,  1,  2,
          1,  1,  1,  1,  1,  2,  2,  2,  1,  6,  1,  1,  1,  1,  1,  1,
          1,  7,  1,  1,  2,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
          1,  1,  1,  2,  1,  3,  3,  3,  1,  2,  1,  1,  7,  1,  2,  2,
         10,  1,  1,  1,  1,  6,  1,  1,  1,  3,  1,  1,  1,  1,  1,  3,
          1,  1,  2,  1,  1,  1,  1,  2,  1,  4,  1,  1,  9,  1]])

### Load credentials 

#### [Groq](https://groq.com/about-us/)

> engine providing fast AI inference (conclusion from brand new data) in the cloud

In [69]:
#load credentials
from dotenv import load_dotenv
load_dotenv()

True

In [70]:
#define llm

import warnings
warnings.filterwarnings("ignore")
from langchain_groq import ChatGroq

llm = ChatGroq(
    model="llama3-8b-8192",
    temperature=0,
    max_tokens=None,
    timeout=None,
    max_retries=2
)

**What is a Prompt?**
>- set of instructions or input for an LLM provided by a user to guide its response
>- helps it understand the context and generate relevant and coherent language-based output

In [71]:
# Define prompt template 
from langchain.prompts.prompt import PromptTemplate

In [72]:
query = """
    given the information {food_contents_to_add} about foods that should be contained in the recipe I want you to give:
    1. A suitable recipe
    2. Two interesting facts about them
    3. Most important nutrients they contain
    """

In [73]:
prompt_template = PromptTemplate(
    input_variables=["food_contents_to_add"],
    template=query
)

In [74]:
#Define chain

# allows to link the output of one LLM call as the input of another
# The `|` symbol chains together the different components, feeding the output from one component as input into the next component.
# In this chain the user input is passed to the prompt template, then the prompt template output is passed to the model. 
chain = prompt_template | llm

In [75]:
# invoke chain 

text_data ="""
The recipe contains all relevant nutrients and on purpose not the ones that should be avoided. 
"""

In [76]:
output = chain.invoke(input={"food_contents_to_add": text_data})

In [77]:
print(output.content)

Based on the information that the recipe contains all relevant nutrients and on purpose excludes the ones that should be avoided, I'm assuming that the recipe is a balanced and healthy one. Here's a suitable recipe, two interesting facts, and the most important nutrients for each ingredient:

**Recipe:**

**Quinoa and Vegetable Stir-Fry with Chicken**

Ingredients:

* 1 cup cooked quinoa
* 1 cup mixed vegetables (bell peppers, carrots, broccoli)
* 1 lb boneless, skinless chicken breast
* 2 tablespoons olive oil
* 2 cloves garlic, minced
* 1 teaspoon grated ginger
* Salt and pepper to taste
* Fresh cilantro leaves for garnish

**Ingredients:**

1. **Quinoa**

Interesting fact 1: Quinoa is a pseudo-cereal, meaning it's not a true cereal, but rather a flowering plant that produces edible seeds. It's a complete protein, meaning it contains all nine essential amino acids.

Most important nutrients: Quinoa is rich in protein, fiber, iron, magnesium, and manganese.

Interesting fact 2: Quinoa

# Summarize and chunk text data

In [78]:
# recipe generator 
from langchain_community.document_loaders import PyPDFLoader

def load_pdf_data(pdf_path):
    """
    this function loads text data from pdf file
    """
    loader = PyPDFLoader(file_path=pdf_path)
    documents = loader.load()
    return documents

In [81]:
recipe_docs = load_pdf_data(pdf_path = "german_recipe_book.pdf")

In [82]:
print(f"number of loaded pages: {len(recipe_docs)}")

number of loaded pages: 108


show page content

In [83]:
print(recipe_docs[7].page_content)

Page viiVegetables and Side Dishes 62Baby Peas & Carrots (Erbsen und Karotten) 62Red Cabbage (Rotkohl) 62Sweet-Sour Red Cabbage 62Curly Kale (Grünkohl) 62Black Forest Asparagus 63Deep-Fried Asparagus (Spargeln in Backteig) 63Sautéed Cucumbers (Schmorgurken) 63Cauliflower in White Sauce (Blumenkohl) 64Deep-Fried Vegetables (Gemüse in Backteig) 64Vegetable Croquettes (Gemüsekroketten) 64Puréed Peas (Erbsenpüree) 64Caraway Potatoes (Backofenkartoffeln) 65Sautéed Potatoes (Butterkartoffeln) 65Boiled Potatoes (Geschmeltzte Kartoffeln) 65Potatoes, Curds and Oil 65Potato Ring (Kartoffelring) 65Potato Croquettes (Kroketten) 66Potato Pancakes #1 (Kartoffelpuffer) 66Potato Pancakes #2 with Rosy Applesauce 66Westphalian Potato Pancakes with Smoked Salm-on and Chive Sour Cream 67Nudel Gratin (German Macaroni and Cheese) 67Potato Dumplings (Semmelknödel) 68Thuringian Dumplings (Rohe Kartoffelklösse) 68Braised Sauerkraut (Geschmortes Sauerkraut) 69Savoy Cabbage (Wirsing) 69Baked Cauliflower (Blumenk

### Split Document into Chunks

- not possible to feed the whole content into the LLM at once because of finite context window
- even models with large window sizes may struggle to find information in very long inputs and perform very badly
- chunk the document into pieces: helps retrieve only the relevant information from the corpus

In [84]:
from langchain_text_splitters import RecursiveCharacterTextSplitter

def split_documents(documents, chunk_size=800, chunk_overlap=80):
    """
    this function splits documents into chunks of given size and overlap
    """
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap
    )
    chunks = text_splitter.split_documents(documents=documents)
    return chunks

In [85]:
recipe_chunks = split_documents(recipe_docs)

number of chunks created

In [86]:
# number of chunks created 
print(f"number of chunks created: {len(recipe_chunks)}")

number of chunks created: 358


## Create Embeddings 
### Finding numerical representations of text chunks

In [105]:
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_community.vectorstores import FAISS #Facebook AI Similarity Search

def create_embedding_vector_db(chunks, db_name):
    """
    this function uses the open-source embedding model HuggingFaceEmbeddings 
    to create embeddings and store those in a vector database called FAISS, 
    which allows for efficient similarity search
    """
    # instantiate embedding model
    embedding = HuggingFaceEmbeddings(
        model_name='sentence-transformers/all-mpnet-base-v2'
    )
    # create the vector store 
    vectorstore = FAISS.from_documents(
        documents=chunks,
        embedding=embedding
    )
    # save vector database locally
    vectorstore.save_local(f"../vector_databases/vector_db_{db_name}")

In [88]:
type(recipe_docs)

list

In [89]:
recipe_docs

[Document(metadata={'source': 'german_recipe_book.pdf', 'page': 0}, page_content='Great German RecipesGerman Style Recipes200+ Classic Dishes\nPresented by:The International German American Society'),
 Document(metadata={'source': 'german_recipe_book.pdf', 'page': 1}, page_content='Presented by: The International German American SocietyCopyright © 2014-2023 Lake Buena Vista PublishingGreat German Recipes&German Style Recipes200+ Classic Dishes'),
 Document(metadata={'source': 'german_recipe_book.pdf', 'page': 2}, page_content='Page iiGreat German Recipes&German Style Recipes200+ Classic DishesCopyright © 2014-2023Lake Buena Vista PublishingISBN: 979-8-9880533-0-9DEDICATIONWe dedicate this book to the countless German-speaking immigrants who risked traveling to the New World, bringing their honored culture, traditions, and great food for all to share.Presented By:The International German American Society\nACKNOWLEDGMENTSWe thank the many people who worked together to compile the recipes

In [90]:
print([type(chunk) for chunk in recipe_chunks])


[<class 'langchain_core.documents.base.Document'>, <class 'langchain_core.documents.base.Document'>, <class 'langchain_core.documents.base.Document'>, <class 'langchain_core.documents.base.Document'>, <class 'langchain_core.documents.base.Document'>, <class 'langchain_core.documents.base.Document'>, <class 'langchain_core.documents.base.Document'>, <class 'langchain_core.documents.base.Document'>, <class 'langchain_core.documents.base.Document'>, <class 'langchain_core.documents.base.Document'>, <class 'langchain_core.documents.base.Document'>, <class 'langchain_core.documents.base.Document'>, <class 'langchain_core.documents.base.Document'>, <class 'langchain_core.documents.base.Document'>, <class 'langchain_core.documents.base.Document'>, <class 'langchain_core.documents.base.Document'>, <class 'langchain_core.documents.base.Document'>, <class 'langchain_core.documents.base.Document'>, <class 'langchain_core.documents.base.Document'>, <class 'langchain_core.documents.base.Document'>,

In [106]:
create_embedding_vector_db(chunks=recipe_chunks, db_name="recipe")

## Retrieve from vector database

In [92]:
def retrieve_from_vector_db(vector_db_path):
    """
    this function splits out a retriever object from a local vector database
    """
    # instantiate embedding model
    embeddings = HuggingFaceEmbeddings(
        model_name='sentence-transformers/all-mpnet-base-v2'
    )
    recipe_vectorstore = FAISS.load_local(
        folder_path=vector_db_path,
        embeddings=embeddings,
        allow_dangerous_deserialization=True
    )
    retriever = recipe_vectorstore.as_retriever()
    return retriever

In [93]:
recipe_retriever = retrieve_from_vector_db("../vector_databases/vector_db_recipe")

In [94]:
type(recipe_retriever)

langchain_core.vectorstores.base.VectorStoreRetriever

## Generation

### chain passing documents to llm 

[`create_stuff_documents_chain`](https://api.python.langchain.com/en/latest/chains/langchain.chains.combine_documents.stuff.create_stuff_documents_chain.html#langchain.chains.combine_documents.stuff.create_stuff_documents_chain)
- takes a list of documents and formats them all into a prompt, then passes that prompt to an LLM
- passes ALL documents, so you should make sure it fits within the context window of the LLM being used

In [95]:
from langchain import hub
from langchain.chains.combine_documents import create_stuff_documents_chain


#### chain passing user inquiry to retriever object 

[`create_retrieval_chain`](https://api.python.langchain.com/en/latest/chains/langchain.chains.retrieval.create_retrieval_chain.html#langchain.chains.retrieval.create_retrieval_chain)

- takes in a user inquiry, which is then passed to the retriever to fetch relevant documents
- those documents (and original inputs) are then passed to an LLM to generate a response


In [96]:
from langchain.chains.retrieval import create_retrieval_chain

### connect chains

In [97]:
def connect_chains(retriever):
    """
    this function connects stuff_documents_chain with retrieval_chain
    """
    stuff_documents_chain = create_stuff_documents_chain(
        llm=llm,
        prompt=hub.pull("langchain-ai/retrieval-qa-chat")
    )
    retrieval_chain = create_retrieval_chain(
        retriever=retriever,
        combine_docs_chain=stuff_documents_chain
    )
    return retrieval_chain

### output generations

In [98]:
recipe_retrieval_chain = connect_chains(recipe_retriever)

In [99]:
output = recipe_retrieval_chain.invoke(
    {f"input": "Give me one recipe suggestion that contains {food_contents_to_add} but not {food_contents_not_to_add}."}
)

In [100]:
type(output)

dict

In [101]:
output.keys()

dict_keys(['input', 'context', 'answer'])

In [102]:
print(output['answer'])

Based on the context, I'm going to suggest a recipe that contains yogurt, herbs, chili sauce, or other similar ingredients but does not contain bacon grease, onions, or flour.

Here's a recipe that fits the bill:

Page 12: Amuse-bouche - pre-appetizer

Pretzel

This recipe seems to be a simple, bite-sized snack that can be enjoyed before the main meal. It doesn't contain any ingredients from the list you provided, so it should meet your requirements.
