# Regression using Linear Regression (Baseline Model)

## ⬇️ Imports

In [1]:
import pandas as pd
import os
import numpy as np
from nltk.stem import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, r2_score
from sklearn.model_selection import train_test_split

from base_preprocessing import BasePreprocessing

## 🛠️ Preprocessing

### Base Preprocessing

In [7]:
data = None
if not os.path.exists('../../data/processed/preprocessed_recipe.csv'):
    os.makedirs('../../data/processed', exist_ok=True)
    base_preprocessor = BasePreprocessing()
    raw_data = base_preprocessor.load('../../data/RAW_recipes.csv')
    data = base_preprocessor.transform(raw_data)
    data.to_csv('../../data/processed/preprocessed_recipe.csv')
else:
    data = pd.read_csv('../../data/processed/preprocessed_recipe.csv')

In [8]:
data.head()

Unnamed: 0.1,Unnamed: 0,id,name,minutes,n_steps,description,n_ingredients,steps_string_standardize,ingredients_text,tags_text,calories,total_fat,sugar,sodium,protein,saturated_fat,carbohydrates
0,0,137739,arriba baked squash mexican,55,11,autumn is my favorite time of year to cook! th...,7,make a choic and proceed with recip depend on ...,"['winter squash', 'mexican seasoning', 'mixed ...","['60-minutes-or-less', 'time-to-make', 'course...",51.5,0.0,13.0,0.0,2.0,0.0,4.0
1,1,31490,breakfast pizza,30,9,this recipe calls for the crust to be prebaked...,6,preheat oven to 103.33 celsius °c press dough ...,"['prepared pizza crust', 'sausage patty', 'egg...","['30-minutes-or-less', 'time-to-make', 'course...",173.4,18.0,0.0,17.0,22.0,35.0,1.0
2,2,112140,chili,130,6,this modified version of 'mom's' chili was a h...,13,brown ground beef in larg pot add chop onion t...,"['ground beef', 'yellow onions', 'diced tomato...","['time-to-make', 'course', 'preparation', 'mai...",269.8,22.0,32.0,48.0,39.0,27.0,5.0
3,3,59389,alouette potato,45,11,"this is a super easy, great tasting, make ahea...",11,place potato in a larg pot of lightli salt wat...,"['spreadable cheese with garlic and herbs', 'n...","['60-minutes-or-less', 'time-to-make', 'course...",368.1,17.0,10.0,2.0,14.0,8.0,20.0
4,4,44061,amish tomato ketchup canning,190,5,my dh's amish mother raised him on this recipe...,8,"mix all ingredients& boil for 2 30.0 minute , ...","['tomato juice', 'apple cider vinegar', 'sugar...","['weeknight', 'time-to-make', 'course', 'main-...",352.9,1.0,337.0,23.0,3.0,0.0,28.0


### TF-IDF Preprocessing

In [None]:
steps_vectorizer = TfidfVectorizer(max_features=500, stop_words='english')
steps_features = steps_vectorizer.fit_transform(data["steps_string_standardize"])

ingredients_vectorizer = TfidfVectorizer(max_features=300, stop_words='english')
ingredients_features = ingredients_vectorizer.fit_transform(data["ingredients_text"])

tags_vectorizer = TfidfVectorizer(max_features=300, stop_words='english')
tags_features = tags_vectorizer.fit_transform(data["tags_text"])

In [52]:
#numerical_features = data[['n_steps', 'n_ingredients', 'token_count', 'avg_token_length', 'calories', 'total_fat', 'sugar', 'sodium', 'saturated_fat', 'carbohydrates']].values
numerical_features = data[['n_steps', 'n_ingredients', 'calories', 'total_fat', 'sugar', 'sodium', 'saturated_fat', 'carbohydrates']].values

feature_matrices = [steps_features.toarray(), ingredients_features.toarray(),
                    tags_features.toarray(),
                   numerical_features]

In [53]:
X = np.hstack(feature_matrices)
y = data["minutes"].values

In [54]:
X_train, X_test, y_train, y_test, X_train_indices, X_test_indices = train_test_split(
    X, y, data.index, test_size=0.2, random_state=42
)

## Training our model and Defining Metrics


- MAE :
    - Represents how much is it off on average
    - Ex : You make predictions that are off by 15 minutes on average

- R^2 :
    - Ranges from negative to 1
    - The closer it is to 1 the better it is
    - Under 0 it performs worse then random

In [56]:
def get_metrics(y_test, y_pred):
    # MAE
    print("MAE:", mean_absolute_error(y_test, y_pred))

    # R²
    print("R²:", r2_score(y_test, y_pred))

print("Linear Regression Results:")
lr_model = LinearRegression()
lr_model.fit(X_train, y_train)
y_pred_lr = lr_model.predict(X_test)
lr_metrics = get_metrics(y_test, y_pred_lr)

Linear Regression Results:
MAE: 14.74811046744496
R²: 0.7028954518576291


# Small tests

In [None]:
original_row = data.loc[X_test_indices[33]]
display(original_row)

print("y_test[33]:", y_test[33])
print("y_pred_lr[33]:", y_pred_lr[33])


id                                                                     322833
name                                            cucumber cilantro pasta salad
minutes                                                                    30
n_steps                                                                     4
description                 so cool and refreshing. if you like spicy, you...
n_ingredients                                                               9
steps_string_standardize    cook pasta , drain combin lime juic , cilantro...
ingredients_text            ['cucumber', 'roma tomato', 'red onion', 'lime...
tags_text                   ['30-minutes-or-less', 'time-to-make', 'course...
calories                                                                 34.8
total_fat                                                                 0.0
sugar                                                                    18.0
sodium                                                          

y_test[33]: 30
y_pred_lr[33]: 26.619497648857614


In [25]:
data.head()

Unnamed: 0,id,name,minutes,n_steps,description,n_ingredients,steps_string_standardize,ingredients_text,tags_text,calories,total_fat,sugar,sodium,protein,saturated_fat,carbohydrates
0,137739,arriba baked squash mexican,55,11,autumn is my favorite time of year to cook! th...,7,make a choic and proceed with recip depend on ...,"['winter squash', 'mexican seasoning', 'mixed ...","['60-minutes-or-less', 'time-to-make', 'course...",51.5,0.0,13.0,0.0,2.0,0.0,4.0
1,31490,breakfast pizza,30,9,this recipe calls for the crust to be prebaked...,6,preheat oven to 103.33 celsius °c press dough ...,"['prepared pizza crust', 'sausage patty', 'egg...","['30-minutes-or-less', 'time-to-make', 'course...",173.4,18.0,0.0,17.0,22.0,35.0,1.0
2,112140,chili,130,6,this modified version of 'mom's' chili was a h...,13,brown ground beef in larg pot add chop onion t...,"['ground beef', 'yellow onions', 'diced tomato...","['time-to-make', 'course', 'preparation', 'mai...",269.8,22.0,32.0,48.0,39.0,27.0,5.0
3,59389,alouette potato,45,11,"this is a super easy, great tasting, make ahea...",11,place potato in a larg pot of lightli salt wat...,"['spreadable cheese with garlic and herbs', 'n...","['60-minutes-or-less', 'time-to-make', 'course...",368.1,17.0,10.0,2.0,14.0,8.0,20.0
4,44061,amish tomato ketchup canning,190,5,my dh's amish mother raised him on this recipe...,8,"mix all ingredients& boil for 2 30.0 minute , ...","['tomato juice', 'apple cider vinegar', 'sugar...","['weeknight', 'time-to-make', 'course', 'main-...",352.9,1.0,337.0,23.0,3.0,0.0,28.0
