# Feature Engineering

Prepping data for model training. For now, we will stick with a simple approach using XGBoost, and likely use old functions from the previous version of the app to do feature engineering.

In [1]:
from sqlalchemy import create_engine, Column, Integer, String, JSON, Float
from sqlalchemy.orm import sessionmaker, declarative_base
import os
import numpy as np
import pandas as pd
from dotenv import load_dotenv
from sklearn.model_selection import train_test_split
from sklearn.decomposition import TruncatedSVD
from sklearn.feature_extraction.text import TfidfVectorizer
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('punkt')

load_dotenv()

postgresql_password = os.environ["POSTGRESQL_IIFYMATE_PASSWORD"]
engine = create_engine(f'postgresql://iifymate:{postgresql_password}@localhost/clean_recipes')

[nltk_data] Downloading package stopwords to /home/ravib/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /home/ravib/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to /home/ravib/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [2]:
df = pd.read_sql('SELECT * FROM clean_recipes', engine)
df.head(2)

Unnamed: 0,id,label,serving_size,dietLabels,healthLabels,ingredientLines,ingredients,calories,totalWeight,totalTime,cuisineType,mealType,dishType,totalNutrients,tags
0,1,Mom’s Swedish Potatoes recipes,4,[],"[Sugar-Conscious, Vegetarian, Pescatarian, Egg...","[4 potatoes - 4, 1/2 cup Parmesan cheese grate...","[{'text': '4 potatoes - 4', 'quantity': 4.0, '...",1867.94925,1066.853125,0,[nordic],[lunch/dinner],[condiments and sauces],"{'Energy': {'quantity': 1867.9492500000001, 'u...","[potatoes, potato, potato dishes, swedish, swe..."
1,2,Soft Chocolate Chip Cookies,125,[Low-Sodium],"[Low Potassium, Kidney-Friendly, Vegetarian, P...","[4.5 c. white flour, 2 tsp. baking soda, 2 c. ...","[{'text': '4.5 c. white flour', 'quantity': 4....",13300.936001,2778.9,36,[american],[teatime],[biscuits and cookies],"{'Energy': {'quantity': 13300.936000549316, 'u...","[Dessert, Other, Desserts Dessert, Other Desse..."


## Ingredients Lines

The main feature is the ingredients list which is a list of ingredients, so we need it to be a full string.

In [3]:
ingredientLines = df['ingredientLines']
ingredientLines[0]

['4 potatoes - 4',
 '1/2 cup Parmesan cheese grated or shredded (optional) - (more or less to taste)',
 '1/4 cup breadcrumbs - 1/4 optional, more or less to taste',
 '1/2 cup garlic butter melted - more or less to taste)']

In [4]:
def comma_to_bracket(ingredient_list):
    """
    Input: ingredient_list (list): a list of strings, like ingredients of a recipe.
    Output: recipe (str): commas in individual elements from input string are removed, then they are all joined together with a comma, so commas seperate each ingredient now.
    """
    processed_ingredients = []
    
    for ingredient in ingredient_list:
        parts = ingredient.split(',', 1)  # Split at the first comma
        if len(parts) > 1:  # Check if there is a comma
            # Check if the part after the comma is already in brackets
            if '(' not in parts[1] and ')' not in parts[1]:
                parts[1] = f'({parts[1].strip()})'  # Put it in brackets
        processed_ingredients.append(' '.join(parts))

    # Join the processed strings with a comma and space now that we removed the commas in the individual strings
    recipe = ', '.join(processed_ingredients)

    return recipe

In [5]:
comma_to_bracket(ingredientLines[0])

'4 potatoes - 4, 1/2 cup Parmesan cheese grated or shredded (optional) - (more or less to taste), 1/4 cup breadcrumbs - 1/4 optional (more or less to taste), 1/2 cup garlic butter melted - more or less to taste)'

In [6]:
ingredientLines = ingredientLines.apply(comma_to_bracket)

## Health Labels

In [7]:
def replace_with_priority(labels):
    priority_order = ['Vegan', 'Vegetarian', 'Pescatarian', 'Paleo', 'Red-Meat-Free', 'Mediterranean']
    for label in priority_order:
        if label in labels:
            return label
    return 'Balanced'  # Handle case where no label matches priority_order, in which case the diet is balanced

Note the search parameter for Pescatarian is pecatarian, not pescatarian.

In [8]:
healthLabels = df['healthLabels']

In [9]:
for i in range(5):
    print(replace_with_priority(healthLabels[i]))

Vegetarian
Vegetarian
Vegetarian
Vegetarian
Vegan


In [10]:
healthLabels = healthLabels.apply(replace_with_priority)

## Macros

In [11]:
def get_macros(nutrients_row: dict):
    macros_dct = {}

    macros_dct['Fat'] = nutrients_row['Fat']['quantity']
    macros_dct['Protein'] = nutrients_row['Protein']['quantity']
    macros_dct['Carbohydrates (net)'] = nutrients_row['Carbohydrates (net)']['quantity']
    
    return macros_dct

In [12]:
sample_nutrients = df['totalNutrients'][0]
print(type(sample_nutrients))
sample_nutrients['Fat']

<class 'dict'>


{'quantity': 112.83458124999999, 'unit': 'g'}

In [13]:
get_macros(sample_nutrients)

{'Fat': 112.83458124999999,
 'Protein': 48.66716875,
 'Carbohydrates (net)': 151.89527062499997}

## Independent Variables

Here we want our features that will be the input for the ML model to be the recipe name, ingredents, and health labels. They will all be concatenated together to have it all as just one string.

In [14]:
healthLabels

0     Vegetarian
1     Vegetarian
2     Vegetarian
3     Vegetarian
4          Vegan
         ...    
95    Vegetarian
96    Vegetarian
97      Balanced
98    Vegetarian
99    Vegetarian
Name: healthLabels, Length: 100, dtype: object

In [15]:
X = healthLabels + " " + df['label'] + " " + ingredientLines
X[0]

'Vegetarian Mom’s Swedish Potatoes recipes 4 potatoes - 4, 1/2 cup Parmesan cheese grated or shredded (optional) - (more or less to taste), 1/4 cup breadcrumbs - 1/4 optional (more or less to taste), 1/2 cup garlic butter melted - more or less to taste)'

In [16]:
X = X.rename('fullRecipeInput')
X

0     Vegetarian Mom’s Swedish Potatoes recipes 4 po...
1     Vegetarian Soft Chocolate Chip Cookies 4.5 c. ...
2     Vegetarian Zucchini Bread with Lemon Honey But...
3     Vegetarian Crispy Zucchini Sticks with Spicy M...
4     Vegan Bourbon BBQ Sauce, Perfect for Summer Gr...
                            ...                        
95    Vegetarian Semolina Halva with Currants 125g b...
96    Vegetarian Almond Joy Muffins * 2 cups flour, ...
97    Balanced Puff Pastry Pasty 2 sheets puff pastr...
98    Vegetarian Yogurt Bowl With Citrus And Rosemar...
99    Vegetarian Chive Buttered Carrots Recipe 4 2-1...
Name: fullRecipeInput, Length: 100, dtype: object

In [17]:
def remove_stop_words(review):
    english_stop_words = stopwords.words('english')

    #get the words in the review as a list
    review_words = review.split()
    
    #make a new list with the same words but only if they are not a stop word
    removed_stop_words_list = [word for word in review_words if word not in english_stop_words]
    
    removed_stop_words = ' '.join(removed_stop_words_list)
    
    return removed_stop_words

def lemmatizing_reviews(review):
    lemmatizer = WordNetLemmatizer()

    #get review text as a list of words
    review_list = review.split()
    
    #lemmatize the words
    lemmatized_list = [lemmatizer.lemmatize(word) for word in review_list]
    
    #make it into a string again
    lemmatized_review = ' '.join(lemmatized_list)
    
    return lemmatized_review

def get_tfidf_splits(X, y, test_size=0.25, random_state=42):
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, random_state=random_state)

    tfidf = TfidfVectorizer()

    tfidf_fitted = tfidf.fit(X_train.str.join(' '))

    tfidf_X_train_labels = tfidf_fitted.transform(X_train.str.join(' '))
    tfidf_X_test_labels = tfidf_fitted.transform(X_test.str.join(' '))
    tfidf_train_df = pd.DataFrame(tfidf_X_train_labels.toarray(), columns=tfidf.get_feature_names_out())
    tfidf_test_df = pd.DataFrame(tfidf_X_test_labels.toarray(), columns=tfidf.get_feature_names_out())

    return tfidf_train_df, tfidf_test_df, y_train, y_test, tfidf_fitted

def SVD_reduction(X_train, X_test, n_components=1000):
    svd = TruncatedSVD(n_components=n_components)
    svd_fitted = svd.fit(X_train)
    X_train_reduced, X_test_reduced = svd.transform(X_train), svd.transform(X_test)

    #getting column names just to convert to dataframe
    column_names = [f"component_{i+1}" for i in range(X_train_reduced.shape[1])]
    X_train_reduced_df = pd.DataFrame(X_train_reduced, columns=column_names, index=X_train.index)
    X_test_reduced_df = pd.DataFrame(X_test_reduced, columns=column_names, index=X_test.index)

    return X_train_reduced_df, X_test_reduced_df, svd_fitted


In [18]:
X = X.apply(remove_stop_words)
X = X.apply(lemmatizing_reviews)
X = X.apply(lambda x: word_tokenize(x))

In [19]:
nutrients = df['totalNutrients']

y = pd.DataFrame(list(nutrients.apply(lambda row: get_macros(row))))

In [20]:
y.head()

Unnamed: 0,Fat,Protein,Carbohydrates (net)
0,112.834581,48.667169,151.895271
1,756.21682,163.18242,1446.628
2,333.545213,48.968544,410.634439
3,93.649808,59.859046,141.431198
4,43.784801,18.865246,212.03927


## Training Splits

In [21]:
X_train, X_test, y_train, y_test, tfidf_fitted = get_tfidf_splits(X, y)


X_train, X_test, svd_fitted = SVD_reduction(X_train, X_test, n_components=500)

y_train, y_test = np.log1p(y_train), np.log1p(y_test)

In [22]:
X_train.head()

Unnamed: 0,component_1,component_2,component_3,component_4,component_5,component_6,component_7,component_8,component_9,component_10,...,component_66,component_67,component_68,component_69,component_70,component_71,component_72,component_73,component_74,component_75
0,0.22648,-0.140697,0.113005,0.114324,0.044962,0.075223,-2.7e-05,-0.141355,0.324709,-0.107398,...,0.0429,0.022293,-0.010431,0.040666,0.089013,-0.012655,-0.062293,0.045922,0.002983,0.052368
1,0.340595,-0.000927,-0.251554,-0.145342,-0.310863,-0.088318,0.034694,0.082445,0.045782,0.105308,...,-0.036608,0.007042,-8.5e-05,-0.097754,0.001296,-0.047521,0.039682,0.021428,0.059801,0.016525
2,0.498234,0.392715,-0.018806,-0.052989,0.105894,0.06518,0.020539,-0.104278,-0.124184,0.012171,...,-0.071309,-0.089109,-0.027824,0.228905,-0.182257,0.185085,0.061021,-0.021868,-0.047261,0.005234
3,0.258781,-0.27675,0.016255,0.324764,-0.137843,-0.001772,0.053871,-0.319701,-0.117181,-0.138065,...,-0.111904,-0.085179,0.03548,-0.052594,0.016307,0.04531,-0.044218,0.007663,0.000672,-0.018177
4,0.294368,-0.253503,-0.161766,-0.125963,-0.006361,0.168606,-0.060922,0.011016,-0.028583,-0.296809,...,-0.141183,0.070582,-0.139994,0.010694,-0.060555,0.00372,-0.019441,-0.055849,-0.009477,-0.022512


In [23]:
y_train.head()

Unnamed: 0,Fat,Protein,Carbohydrates (net)
15,5.832761,5.550206,5.759905
40,4.935185,4.001894,5.984178
96,5.397143,3.992935,6.204824
9,5.205701,5.085594,6.362206
72,1.983302,2.361232,3.293822


# Model Training

With the features from above, we can now train our model.

In [24]:
from sklearn.metrics import mean_squared_error, r2_score
from xgboost import XGBRegressor

In [52]:
fat_args = {'learning_rate': 0.01, 'max_depth': 3, 'n_estimators': None}
carbs_args = {'learning_rate': 0.01, 'max_depth': 3, 'n_estimators': None}
protein_args = {'learning_rate': 0.01, 'max_depth': 3, 'n_estimators': None}

In [53]:
def get_xgb_macro_model(X_train, X_test, y_train, y_test, macro, args):
    """
    Trains an XGBoost regressor model for predicting a specific macronutrient (carbs, fat, or protein) 
    using the recipe data from the Edamam API. Returns the trained model along with evaluation metrics.

    Parameters:
    X_train : array-like or sparse matrix, shape (n_samples, n_features)
        Training data.
    X_test : array-like or sparse matrix, shape (n_samples, n_features)
        Test data.
    y_train : DataFrame, shape (n_samples, n_targets)
        Target values for training data.
    y_test : DataFrame, shape (n_samples, n_targets)
        Target values for test data.
    macro : str
        Name of the target macronutrient variable (column) in y_train and y_test.
    args : dict
        Dictionary containing arguments to be passed to the XGBRegressor constructor.

    Returns:
    xgb_model : XGBRegressor object
        Trained XGBoost regressor model.
    r2 : float
        R-squared score on the test data.
    mse : float
        Mean squared error on the test data.
    """
    xgb = XGBRegressor(**args)
    xgb.fit(X_train, y_train[macro])
    y_pred = xgb.predict(X_test)
    r2 = r2_score(y_test[macro], y_pred)
    mse = mean_squared_error(y_test[macro], y_pred)

    return xgb, r2, mse

In [54]:
fat_xgb, fat_r2, fat_mse = get_xgb_macro_model(X_train, X_test, y_train, y_test, 'Fat', fat_args)

carbs_xgb, carbs_r2, carbs_mse = get_xgb_macro_model(X_train, X_test, y_train, y_test, 'Carbohydrates (net)', carbs_args)

protein_xgb, protein_r2, protein_mse = get_xgb_macro_model(X_train, X_test, y_train, y_test, 'Protein', protein_args)

In [55]:
y_pred = fat_xgb.predict(X_test)
r2 = r2_score(y_test['Fat'], y_pred)
mse = mean_squared_error(y_test['Fat'], y_pred)

In [56]:
print(fat_xgb)

XGBRegressor(base_score=None, booster=None, callbacks=None,
             colsample_bylevel=None, colsample_bynode=None,
             colsample_bytree=None, device=None, early_stopping_rounds=None,
             enable_categorical=False, eval_metric=None, feature_types=None,
             gamma=None, grow_policy=None, importance_type=None,
             interaction_constraints=None, learning_rate=0.01, max_bin=None,
             max_cat_threshold=None, max_cat_to_onehot=None,
             max_delta_step=None, max_depth=3, max_leaves=None,
             min_child_weight=None, missing=nan, monotone_constraints=None,
             multi_strategy=None, n_estimators=None, n_jobs=None,
             num_parallel_tree=None, random_state=None, ...)
