In [100]:
import pandas as pd
import zipfile
import os
import random

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

import re

In [63]:
def save_file(df, name):
    

    current_dir = os.getcwd()
    # if not exists make the directory
    save_folder = current_dir+'\Data'

    if not os.path.exists(save_folder):
        os.makedirs(save_folder)
    # specify file name
    file_name = name
    
    # concatenate directory and file
    file_path = os.path.join(save_folder,file_name)
    try: 
        df.to_csv(rf"{file_path}", index = False)
        print(f"File '{file_name}' saved to '{save_folder}'")
    except Exception as e:
        print(f"Error saving file:{e}")

## Prior to running the notebook you must download the data
- Put it into another folder, at the same level of your cloned repository
- datafile name is full_dataset.csv
#### Hugging Face Link
- https://huggingface.co/datasets/recipe_nlg

In [22]:
# change the below path to match your saved location
# note that my assignments folder has a clone of the repo
path_to_file = r"C:\Users\Stephanie\Documents\School\8_Fall_2023\5_Natural_Language_Processing\assignments\NLP_Recipe_Guide_Data\full_dataset.csv"
path = r"C:\Users\Stephanie\Documents\School\8_Fall_2023\5_Natural_Language_Processing\assignments\NLP_Recipe_Guide_Data"

In [23]:
# read in data
recipe_nlg_df = pd.read_csv(path_to_file)

In [24]:
recipe_nlg_df.head()

Unnamed: 0.1,Unnamed: 0,title,ingredients,directions,link,source,NER
0,0,No-Bake Nut Cookies,"[""1 c. firmly packed brown sugar"", ""1/2 c. eva...","[""In a heavy 2-quart saucepan, mix brown sugar...",www.cookbooks.com/Recipe-Details.aspx?id=44874,Gathered,"[""brown sugar"", ""milk"", ""vanilla"", ""nuts"", ""bu..."
1,1,Jewell Ball'S Chicken,"[""1 small jar chipped beef, cut up"", ""4 boned ...","[""Place chipped beef on bottom of baking dish....",www.cookbooks.com/Recipe-Details.aspx?id=699419,Gathered,"[""beef"", ""chicken breasts"", ""cream of mushroom..."
2,2,Creamy Corn,"[""2 (16 oz.) pkg. frozen corn"", ""1 (8 oz.) pkg...","[""In a slow cooker, combine all ingredients. C...",www.cookbooks.com/Recipe-Details.aspx?id=10570,Gathered,"[""frozen corn"", ""cream cheese"", ""butter"", ""gar..."
3,3,Chicken Funny,"[""1 large whole chicken"", ""2 (10 1/2 oz.) cans...","[""Boil and debone chicken."", ""Put bite size pi...",www.cookbooks.com/Recipe-Details.aspx?id=897570,Gathered,"[""chicken"", ""chicken gravy"", ""cream of mushroo..."
4,4,Reeses Cups(Candy),"[""1 c. peanut butter"", ""3/4 c. graham cracker ...","[""Combine first four ingredients and press in ...",www.cookbooks.com/Recipe-Details.aspx?id=659239,Gathered,"[""peanut butter"", ""graham cracker crumbs"", ""bu..."


In [25]:
recipe_nlg_df.shape

(2231142, 7)

In [26]:
recipe_nlg_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2231142 entries, 0 to 2231141
Data columns (total 7 columns):
 #   Column       Dtype 
---  ------       ----- 
 0   Unnamed: 0   int64 
 1   title        object
 2   ingredients  object
 3   directions   object
 4   link         object
 5   source       object
 6   NER          object
dtypes: int64(1), object(6)
memory usage: 119.2+ MB


In [27]:
recipe_nlg_df.isnull()

Unnamed: 0.1,Unnamed: 0,title,ingredients,directions,link,source,NER
0,False,False,False,False,False,False,False
1,False,False,False,False,False,False,False
2,False,False,False,False,False,False,False
3,False,False,False,False,False,False,False
4,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...
2231137,False,False,False,False,False,False,False
2231138,False,False,False,False,False,False,False
2231139,False,False,False,False,False,False,False
2231140,False,False,False,False,False,False,False


In [28]:
#! pip install datasets

In [29]:
from datasets import load_dataset

dataset =load_dataset('recipe_nlg',data_dir=path)
train_data = dataset['train']
example_recipe = train_data[0]
print(example_recipe)

{'id': 0, 'title': 'No-Bake Nut Cookies', 'ingredients': ['1 c. firmly packed brown sugar', '1/2 c. evaporated milk', '1/2 tsp. vanilla', '1/2 c. broken nuts (pecans)', '2 Tbsp. butter or margarine', '3 1/2 c. bite size shredded rice biscuits'], 'directions': ['In a heavy 2-quart saucepan, mix brown sugar, nuts, evaporated milk and butter or margarine.', 'Stir over medium heat until mixture bubbles all over top.', 'Boil and stir 5 minutes more. Take off heat.', 'Stir in vanilla and cereal; mix well.', 'Using 2 teaspoons, drop and shape into 30 clusters on wax paper.', 'Let stand until firm, about 30 minutes.'], 'link': 'www.cookbooks.com/Recipe-Details.aspx?id=44874', 'source': 0, 'ner': ['brown sugar', 'milk', 'vanilla', 'nuts', 'butter', 'bite size shredded rice biscuits']}


# Exploratory Data Analysis

## Inspect the Data
- Size
- Display first few rows
- Data types of columns
- Look for missing or null values

In [30]:
# NOTE: will take a while to run (several minutes)
train_data_df = pd.DataFrame(train_data)

In [31]:
train_data_df.set_index('id',inplace=True)

In [32]:
train_data_df.tail()

Unnamed: 0_level_0,title,ingredients,directions,link,source,ner
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2231137,Sunny's Fake Crepes,[1/2 cup chocolate hazelnut spread (recommende...,[Spread hazelnut spread on 1 side of each tort...,www.foodnetwork.com/recipes/sunny-anderson/sun...,1,"[chocolate hazelnut spread, tortillas, butter,..."
2231138,Devil Eggs,"[1 dozen eggs, 1 paprika, 1 salt and pepper to...","[Boil eggs on medium for 30mins., Then cool eg...",cookpad.com/us/recipes/355411-devil-eggs,1,"[eggs, paprika, salt, choice, miracle whip, re..."
2231139,Extremely Easy and Quick - Namul Daikon Salad,"[150 grams Daikon radish, 1 tbsp Sesame oil, 1...",[Julienne the daikon and squeeze out the exces...,cookpad.com/us/recipes/153324-extremely-easy-a...,1,"[radish, Sesame oil, White sesame seeds, Salt,..."
2231140,Pan-Roasted Pork Chops With Apple Fritters,"[1 cup apple cider, 6 tablespoons sugar, 4 tab...","[In a large bowl, mix the apple cider with 4 c...",cooking.nytimes.com/recipes/1015164,1,"[apple cider, sugar, kosher salt, bay leaves, ..."
2231141,Polpette in Spicy Tomato Sauce,"[1 pound ground veal, 1/2 pound sweet Italian ...","[Preheat the oven to 350., In a bowl, mix the ...",www.foodandwine.com/recipes/polpette-spicy-tom...,1,"[ground veal, sausage, bread crumbs, milk, gar..."


In [33]:
train_data_df.columns = train_data_df.columns.str.strip()

In [49]:
train_data_df[train_data_df['title']=='Brazilian Grilled Pineapple']

Unnamed: 0_level_0,title,ingredients,directions,link,source,ner
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
935936,Brazilian Grilled Pineapple,"[1 cup brown sugar, 2 teaspoons ground cinnamo...",[Preheat an outdoor grill for medium-high heat...,www.allrecipes.com/recipe/235932/brazilian-gri...,0,"[brown sugar, ground cinnamon, pineapple]"


In [23]:
train_data_df.shape # (2231142, 7)

(2231142, 6)

In [24]:
train_data_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2231142 entries, 0 to 2231141
Data columns (total 6 columns):
 #   Column       Dtype 
---  ------       ----- 
 0   title        object
 1   ingredients  object
 2   directions   object
 3   link         object
 4   source       int64 
 5   ner          object
dtypes: int64(1), object(5)
memory usage: 119.2+ MB


In [34]:
rows_with_null = train_data_df[train_data_df.isnull().any(axis=1)]

In [35]:
len(rows_with_null)
# no null values

0

# Subset Based on All Recipes

In [36]:
all_recipes_df = train_data_df[train_data_df['link'].str.contains('www.allrecipes.com/recipe/')]

In [37]:
all_recipes_df.shape
# 61k rows

(61398, 6)

In [38]:
#save_file(all_recipes_df, 'all_recipes_df.csv')

## Exploring Categorical Variables
- count the requency of each category
- visualize the distributions using bar charts or pie charts

In [39]:
!pip install textblob



In [40]:
import nltk
from nltk import word_tokenize, pos_tag
from nltk.util import ngrams
from nltk.corpus import stopwords
import string
from collections import Counter
from textblob import TextBlob


# Download NLTK resources if not already downloaded
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Stephanie\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\Stephanie\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Stephanie\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

# Adding Some Variables

In [41]:
def analyze_recipe(title, ingredients, directions):
    # making stop words
    stop_words = set(stopwords.words('english'))
    punctuation = set(string.punctuation)
    
    ######### TITLE #################
    # tokenize title into words
    words_in_title = word_tokenize(title)
    
    # remove stop words
    filtered_words_title = [word for word in words_in_title if word.lower() not in stop_words]
    
    # number of non-stop-words in title
    num_words_title = len(filtered_words_title)
    
    # making a string of the filtered title
    filtered_words_title_str = ' '.join(filtered_words_title)
    
    # perform sentiment analysis on the filtered title    
    blob = TextBlob(filtered_words_title_str)
    sentiment = blob.sentiment
    
    polarity = sentiment.polarity
    subjectivity = sentiment.polarity  
    
    # Create bigrams from the words in title
    bigrams_title = list(ngrams(filtered_words_title,2))
        
    ######### INGREDIENTS ###############
    # Number of ingredients
    num_ingredients = len(ingredients)
        
    ######## DIRECTIONS #################
    # merge directions into a single string
    merged_directions = ' '.join(directions)
    
    # tokenize merged directions into words
    words_in_directions = word_tokenize(merged_directions)    
    
    # remove stop words
    filtered_words_directions = [
        word for word in words_in_directions 
        if word.lower() not in stop_words and word not in punctuation and not word.lower().startswith("'")
    ]
 
    # Number of words in directions
    num_words_directions = len(filtered_words_directions)

    # Create bigrams from the words in directions
    bigrams_directions = list(ngrams(filtered_words_directions, 2))

    # POS tagging for parts of speech analysis
    pos_tags = pos_tag(filtered_words_directions)

    # Count the occurrences of each part of speech
    pos_counts = Counter(tag for word, tag in pos_tags)

    # Number of steps (approximated by counting verbs 'VB')
    num_steps = len([word for word, tag in pos_tags if tag == 'VB'])
    
    return num_words_title, bigrams_title, polarity, subjectivity,  num_ingredients, num_words_directions, bigrams_directions, pos_counts, num_steps

In [42]:
def add_features(df):
    # Example recipe data
    num_words_title = []
    bigrams_title = []
    polarity = []
    subjectivity = []
    num_ingredients = []
    num_words_directions = []
    bigrams_directions = []
    pos_counts = []
    num_steps = []

    for index, row in df.iterrows():
        title = row['title']
        ingredients = row['ner']
        directions = row['directions']
        # Analyze the recipe
        #analysis_result = analyze_recipe(ingredients, directions)
        num_title, bi_title, pol, sub, num_ing, num_wd, bi_dir, pos, steps = analyze_recipe(title, ingredients, directions)
        polarity.append(pol)
        subjectivity.append(sub)
        num_words_title.append(num_title)
        bigrams_title.append(bi_title)
        num_ingredients.append(num_ing)
        num_words_directions.append(num_wd)
        bigrams_directions.append(bi_dir)
        pos_counts.append(pos)
        num_steps.append(steps)
        
    df['num_words_title'] = num_words_title
    df['bigrams_title'] = bigrams_title
    df['subjectivity'] = subjectivity
    df['polarity'] = polarity
    df['num_ingredients'] = num_ingredients
    df['num_words_directions'] = num_words_directions
    df['bigrams_directions'] = bigrams_directions
    df['pos_counts'] = pos_counts
    df['num_steps'] = num_steps
    return df


In [43]:
all_recipes_featured_df = add_features(all_recipes_df)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['num_words_title'] = num_words_title
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['bigrams_title'] = bigrams_title
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['subjectivity'] = subjectivity
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,c

In [44]:
all_recipes_featured_df.head(2)

Unnamed: 0_level_0,title,ingredients,directions,link,source,ner,num_words_title,bigrams_title,subjectivity,polarity,num_ingredients,num_words_directions,bigrams_directions,pos_counts,num_steps
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
896341,Deconstructed Screwdriver (The Raw Egg),"[1/4 cup orange juice, or as desired, 2 (1.5 f...",[Fill a food-safe silicon-based round ice mold...,www.allrecipes.com/recipe/241895/deconstructed...,0,"[orange juice, jiggers vodka]",6,"[(Deconstructed, Screwdriver), (Screwdriver, (...",-0.230769,-0.230769,2,25,"[(Fill, food-safe), (food-safe, silicon-based)...","{'NNP': 1, 'JJ': 4, 'NN': 12, 'VBD': 3, 'CD': ...",0
896342,Kettle Corn,"[1/4 cup vegetable oil, 1/4 cup white sugar, 1...",[Heat the vegetable oil in a large pot over me...,www.allrecipes.com/recipe/20808/kettle-corn/,0,"[vegetable oil, white sugar, popcorn kernels]",2,"[(Kettle, Corn)]",0.0,0.0,3,42,"[(Heat, vegetable), (vegetable, oil), (oil, la...","{'NNP': 3, 'JJ': 7, 'NN': 11, 'VBN': 2, 'VBP':...",4


In [71]:
#save_file(all_recipes_featured_df, 'all_recipes_featured_df.csv')

# Original All Recipe Data

In [45]:
file_path = r'\Data\allrecipes-complete-recipes-list-by-dmitriy-zub.json'
directory = os.getcwd()
# go into data folder and find the json file
json_file_path = directory+file_path
#print(json_file_path)
recipes = pd.read_json(json_file_path, orient='records')

In [46]:
recipes.head(2)

Unnamed: 0,state,basic_info,prep_data,ingridients,nutritions
0,Texas,"{'title': 'Slow Cooker Texas Pulled Pork', 'ca...","{'prep_time:': '15 mins', 'cook_time:': '5 hrs...","[1 teaspoon vegetable oil, 1 (4 pound) pork sh...","{'calories': '528', 'fat': '23g', 'carbs': '46..."
1,Texas,"{'title': 'Brazilian Grilled Pineapple', 'cate...","{'prep_time:': '10 mins', 'cook_time:': '10 mi...","[1 cup brown sugar, 2 teaspoons ground cinnamo...","{'calories': '255', 'fat': '0g', 'carbs': '66g..."


In [47]:
def extract_info(row):
    try:
        # Extract 'prep_time', 'cook_time', 'total_time', 'servings', and 'yield' dynamically
        prep_time_key = next((key for key in row['prep_data'] if 'prep_time' in key), None)
        cook_time_key = next((key for key in row['prep_data'] if 'cook_time' in key), None)
        total_time_key = next((key for key in row['prep_data'] if 'total_time' in key), None)
        servings_key = next((key for key in row['prep_data'] if 'servings' in key), None)

        # Extract nutrition information if available        
        calories = row['nutritions'].get('calories')
        fat = row['nutritions'].get('fat')
        carbs = row['nutritions'].get('carbs')
        protein = row['nutritions'].get('protein')

        return pd.Series({
            'state': row['state'],
            'title': row['basic_info']['title'],
            'ingredients': row['ingridients'],
            'category': row['basic_info']['category'],
            'rating': row['basic_info']['rating'], 
            'reviews': row['basic_info']['reviews'], 
            'recipe creator': row['basic_info']['recipe_by'],
            'prep_time': row['prep_data'][prep_time_key] if prep_time_key else None,
            'cook_time': row['prep_data'][cook_time_key] if cook_time_key else None,
            'total_time': row['prep_data'][total_time_key] if total_time_key else None,
            'servings': row['prep_data'][servings_key] if servings_key else None,
            'calories': calories,
            'fat': fat,
            'carbs': carbs,
            'protein': protein
        })
    except Exception as e:
        print("Error:", e)
        return pd.Series({})

# Apply the function to each row of the DataFrame
extracted_data = recipes.apply(extract_info, axis=1)

# Display the extracted data
extracted_data.head()

Unnamed: 0,state,title,ingredients,category,rating,reviews,recipe creator,prep_time,cook_time,total_time,servings,calories,fat,carbs,protein
0,Texas,Slow Cooker Texas Pulled Pork,"[1 teaspoon vegetable oil, 1 (4 pound) pork sh...",Main Dishes,\n4.5,"\n2,214 Reviews",cmccreight,15 mins,5 hrs,5 hrs 15 mins,8,528,23g,46g,32g
1,Texas,Brazilian Grilled Pineapple,"[1 cup brown sugar, 2 teaspoons ground cinnamo...",Side Dish,\n4.9,\n68 Reviews,SoccerNut,10 mins,10 mins,20 mins,6,255,0g,66g,1g
2,Texas,Cowboy Caviar,"[1 (15.5 ounce) can black beans, drained, 1 (1...",Appetizers and Snacks,\n4.7,\n193 Reviews,Cooknik,15 mins,,35 mins,8,233,9g,32g,8g
3,Texas,Soul Smothered Chicken,"[½ cup butter, 1 whole chicken, cut into piece...",Meat and Poultry,\n4.7,\n375 Reviews,Veronica Rockett,15 mins,1 hrs,1 hrs 15 mins,8,372,23g,22g,19g
4,Texas,Slow Cooker Texas Smoked Beef Brisket,"[3 tablespoons smoked paprika, 2 tablespoons g...",Main Dishes,\n4.3,\n81 Reviews,Sandy Clark Gerhardt,10 mins,6 hrs,6 hrs 50 mins,4,342,16g,29g,22g


In [48]:
# Removing \n from the rating and reviews column data
extracted_data['rating'] = extracted_data['rating'].apply(lambda x: x.replace('\n', '') if x else None)
extracted_data['reviews'] = extracted_data['reviews'].apply(lambda x: x.replace('\n', '') if x else None)
# changing review count to be the number rather than string (from 12 Recipes to 12)
extracted_data['reviews'] = extracted_data['reviews'].str.replace(' Reviews', '')
extracted_data = extracted_data.rename(columns={'reviews': 'review_count'})
# changing variables to numeric
extracted_data['rating'] = pd.to_numeric(extracted_data['rating'], errors='coerce')
extracted_data['review_count'] = pd.to_numeric(extracted_data['review_count'], errors='coerce')
extracted_data['servings'] = pd.to_numeric(extracted_data['servings'], errors='coerce')
extracted_data['calories'] = pd.to_numeric(extracted_data['calories'], errors='coerce')
# removing all recipes with null values
no_null = extracted_data.dropna()
no_null_df = no_null.reset_index(drop=True)
no_null_df.head(2)

Unnamed: 0,state,title,ingredients,category,rating,review_count,recipe creator,prep_time,cook_time,total_time,servings,calories,fat,carbs,protein
0,Texas,Brazilian Grilled Pineapple,"[1 cup brown sugar, 2 teaspoons ground cinnamo...",Side Dish,4.9,68.0,SoccerNut,10 mins,10 mins,20 mins,6.0,255.0,0g,66g,1g
1,Texas,Soul Smothered Chicken,"[½ cup butter, 1 whole chicken, cut into piece...",Meat and Poultry,4.7,375.0,Veronica Rockett,15 mins,1 hrs,1 hrs 15 mins,8.0,372.0,23g,22g,19g


In [49]:
def clean_ingredient(ingredient):
    # This function removes numbers and extra spaces from the ingredient
    return ' '.join([part for part in ingredient.split() if not part.isdigit()])

def find_recipes_with_ingredients(ingredients_to_search, dataframe):
    # Clean the ingredients in the dataframe
    no_null_df['cleaned_ingredients'] = no_null_df['ingredients'].apply(lambda x: [clean_ingredient(ingredient) for ingredient in x])

    matching_recipes = no_null_df[no_null_df['cleaned_ingredients'].apply(
        lambda x: all(ingredient.lower() in ' '.join(x).lower() for ingredient in ingredients_to_search)
    )]

    return matching_recipes


In [50]:
ingredients_to_search = ['carrot']
matching_recipes = find_recipes_with_ingredients(ingredients_to_search, no_null_df)

In [51]:
matching_recipes
# TO DO ####
# add feature to sort by category
# add feature to sort by rating
# add feature to sort by

Unnamed: 0,state,title,ingredients,category,rating,review_count,recipe creator,prep_time,cook_time,total_time,servings,calories,fat,carbs,protein,cleaned_ingredients
1,Texas,Soul Smothered Chicken,"[½ cup butter, 1 whole chicken, cut into piece...",Meat and Poultry,4.7,375.0,Veronica Rockett,15 mins,1 hrs,1 hrs 15 mins,8.0,372.0,23g,22g,19g,"[½ cup butter, whole chicken, cut into pieces,..."
77,Colorado,Colorado Mexican Rice,"[2 cups water, 1 cup uncooked white rice, 4 me...",Side Dish,3.9,15.0,ivette,20 mins,20 mins,40 mins,6.0,359.0,18g,40g,9g,"[cups water, cup uncooked white rice, medium t..."
82,Colorado,Rocky Mountain Stew,"[2 tablespoons vegetable oil, 2 pounds sirloin...","Soups, Stews and Chili Recipes",4.6,37.0,Debbie F.,15 mins,3 hrs,3 hrs 15 mins,10.0,281.0,11g,26g,17g,"[tablespoons vegetable oil, pounds sirloin tip..."
97,Arizona,Arizona Roadhouse Chili,"[1 pound dried pinto beans, 1 pound 90%-lean g...","Soups, Stews and Chili Recipes",3.8,3.0,Karen Barris Calabro,20 mins,4 hrs 10 mins,12 hrs 30 mins,8.0,393.0,8g,53g,27g,"[pound dried pinto beans, pound 90%-lean groun..."
101,North Carolina,Hatteras Style Clam Chowder,"[1 tablespoon vegetable oil, 1 large onion, ch...","Soups, Stews and Chili Recipes",4.4,35.0,O. Romaine,20 mins,30 mins,50 mins,8.0,265.0,4g,24g,32g,"[tablespoon vegetable oil, large onion, choppe..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1286,Nevada,Ox Roast,"[½ cup chopped onion, ½ cup chopped celery, wi...",Main Dishes,4.7,9.0,Jon,20 mins,3 hrs,3 hrs 20 mins,30.0,229.0,10g,1g,31g,"[½ cup chopped onion, ½ cup chopped celery, wi..."
1288,Nevada,Harvest Beef Stew,"[4 tablespoons bacon drippings, ¼ cup flour, S...","Soups, Stews and Chili Recipes",4.2,10.0,CULINARYJEN,30 mins,1 hrs 30 mins,2 hrs,6.0,918.0,53g,44g,56g,"[tablespoons bacon drippings, ¼ cup flour, Sal..."
1316,Hawaii,Hawaiian Macaroni Salad,"[6 cups elbow macaroni, 2 cups mayonnaise, or ...",Salad,3.5,2.0,Jessica Daulton,10 mins,15 mins,2 hrs 25 mins,12.0,465.0,30g,41g,8g,"[cups elbow macaroni, cups mayonnaise, or more..."
1319,Hawaii,Hawaiian Bruddah Potato Mac (Macaroni) Salad,"[5 eggs, 7 large potatoes, peeled and cubed, 1...",Salad,4.7,39.0,Tuihalalalala,30 mins,20 mins,8 hrs 50 mins,20.0,387.0,28g,30g,6g,"[eggs, large potatoes, peeled and cubed, cup e..."


# Merging Two DF

In [15]:
#all_recipes_featured_df_sample = pd.read_csv('Data/all_recipes_featured_df_sample.csv')

In [52]:
print(no_null_df.shape)
print(all_recipes_featured_df.shape)

(1329, 16)
(61398, 15)


In [60]:
# merge
whole_df = pd.merge(no_null_df, 
                    all_recipes_featured_df, 
                    on = 'title',
                    how = 'inner')

In [61]:
whole_df.shape

(800, 30)

In [64]:
save_file(whole_df, 'merged_df.csv')

File 'merged_df.csv' saved to 'C:\Users\Stephanie\Documents\School\8_Fall_2023\5_Natural_Language_Processing\assignments\NLP_Recipe_Guide\Data'


## New Start

In [65]:
#### new start
whole_df = pd.read_csv('Data\merged_df.csv')

In [66]:
whole_df.columns

Index(['state', 'title', 'ingredients_x', 'category', 'rating', 'review_count',
       'recipe creator', 'prep_time', 'cook_time', 'total_time', 'servings',
       'calories', 'fat', 'carbs', 'protein', 'cleaned_ingredients',
       'ingredients_y', 'directions', 'link', 'source', 'ner',
       'num_words_title', 'bigrams_title', 'subjectivity', 'polarity',
       'num_ingredients', 'num_words_directions', 'bigrams_directions',
       'pos_counts', 'num_steps'],
      dtype='object')

In [67]:
whole_df.drop(['ingredients_x','cleaned_ingredients','ingredients_y','recipe creator'], axis=1,inplace=True)

In [68]:
whole_df.head(2)

Unnamed: 0,state,title,category,rating,review_count,prep_time,cook_time,total_time,servings,calories,...,ner,num_words_title,bigrams_title,subjectivity,polarity,num_ingredients,num_words_directions,bigrams_directions,pos_counts,num_steps
0,Texas,Brazilian Grilled Pineapple,Side Dish,4.9,68.0,10 mins,10 mins,20 mins,6.0,255.0,...,"['brown sugar', 'ground cinnamon', 'pineapple']",3,"[('Brazilian', 'Grilled'), ('Grilled', 'Pineap...",0.0,0.0,3,39,"[('Preheat', 'outdoor'), ('outdoor', 'grill'),...","Counter({'NN': 15, 'NNP': 5, 'JJ': 3, 'RB': 3,...",2
1,Texas,Soul Smothered Chicken,Meat and Poultry,4.7,375.0,15 mins,1 hrs,1 hrs 15 mins,8.0,372.0,...,"['butter', 'chicken', 'salt', 'ground black pe...",3,"[('Soul', 'Smothered'), ('Smothered', 'Chicken')]",-0.6,-0.6,14,78,"[('Melt', 'butter'), ('butter', 'large'), ('la...","Counter({'NN': 27, 'JJ': 12, 'NNP': 9, 'NNS': ...",1


# Machine Learning Preprocessing
## Train/Test Split

In [69]:
def convert_to_minutes(time_string):
    if pd.isnull(time_string) or time_string.strip() == '':  # Handle missing values and empty strings
        return None
    
    time_string = time_string.lower().replace('.', '')  # Normalize the string by removing periods and converting to lowercase
    time_units = time_string.split()
    
    total_minutes = 0
    days = 0
    hours = 0
    minutes = 0
    
    for idx, unit in enumerate(time_units):
        if 'day' in unit or 'days' in unit:
            days = int(time_units[idx - 1]) 
        if 'hrs' in unit:
            hours = int(time_units[idx - 1])
        if 'mins' in unit:
            minutes = int(time_units[idx - 1])
    total_minutes = (days * 24 * 60) + (hours * 60) + minutes
    return total_minutes

In [70]:
whole_df['total_time']=whole_df['total_time'].apply(convert_to_minutes)

In [71]:
whole_df['cook_time']=whole_df['cook_time'].apply(convert_to_minutes)

In [72]:
whole_df['prep_time']=whole_df['prep_time'].apply(convert_to_minutes)

In [73]:
whole_df.head(2)

Unnamed: 0,state,title,category,rating,review_count,prep_time,cook_time,total_time,servings,calories,...,ner,num_words_title,bigrams_title,subjectivity,polarity,num_ingredients,num_words_directions,bigrams_directions,pos_counts,num_steps
0,Texas,Brazilian Grilled Pineapple,Side Dish,4.9,68.0,10,10,20,6.0,255.0,...,"['brown sugar', 'ground cinnamon', 'pineapple']",3,"[('Brazilian', 'Grilled'), ('Grilled', 'Pineap...",0.0,0.0,3,39,"[('Preheat', 'outdoor'), ('outdoor', 'grill'),...","Counter({'NN': 15, 'NNP': 5, 'JJ': 3, 'RB': 3,...",2
1,Texas,Soul Smothered Chicken,Meat and Poultry,4.7,375.0,15,60,75,8.0,372.0,...,"['butter', 'chicken', 'salt', 'ground black pe...",3,"[('Soul', 'Smothered'), ('Smothered', 'Chicken')]",-0.6,-0.6,14,78,"[('Melt', 'butter'), ('butter', 'large'), ('la...","Counter({'NN': 27, 'JJ': 12, 'NNP': 9, 'NNS': ...",1


In [75]:
# category to number

whole_df.category.value_counts()

Main Dishes                       107
Desserts                           88
Side Dish                          86
Soups, Stews and Chili Recipes     86
Appetizers and Snacks              68
Meat and Poultry                   58
Seafood                            46
Cuisine                            38
Bread                              36
Pork                               19
Breakfast and Brunch               18
Salad                              17
Everyday Cooking                   13
Chicken                            12
Soup Recipes                       11
Vegetables                         11
Sauces and Condiments              10
Cookies                            10
Pies                               10
Beef                                7
Seafood Main Dishes                 6
European                            4
Dips and Spreads Recipes            4
Squash                              4
Cakes                               4
Drinks Recipes                      4
Sauces      

In [77]:
label_encoder = LabelEncoder()
encoded_labels = label_encoder.fit_transform(whole_df['category'])

In [80]:
whole_df['encoded_category'] = encoded_labels

In [81]:
whole_df.head(1)

Unnamed: 0,state,title,category,rating,review_count,prep_time,cook_time,total_time,servings,calories,...,num_words_title,bigrams_title,subjectivity,polarity,num_ingredients,num_words_directions,bigrams_directions,pos_counts,num_steps,encoded_category
0,Texas,Brazilian Grilled Pineapple,Side Dish,4.9,68.0,10,10,20,6.0,255.0,...,3,"[('Brazilian', 'Grilled'), ('Grilled', 'Pineap...",0.0,0.0,3,39,"[('Preheat', 'outdoor'), ('outdoor', 'grill'),...","Counter({'NN': 15, 'NNP': 5, 'JJ': 3, 'RB': 3,...",2,31


In [92]:
# converting health grams

cols = ['fat','carbs','protein']
for col in cols:
    whole_df[col] = whole_df[col].str.replace('g', '').astype(int)
    


In [94]:
whole_df['fat']

0       0
1      23
2      16
3      12
4      17
       ..
795    16
796     3
797    24
798     4
799     8
Name: fat, Length: 800, dtype: int32

In [95]:
# train testing split

# Feature selection - select relevant columns as features and target
features = ['review_count',
       'prep_time', 'cook_time', 'total_time', 'servings',
       'calories', 'fat', 'carbs', 'protein', 'subjectivity', 'polarity',
       'num_ingredients', 'num_words_directions', 'num_steps','encoded_category'] 

target = 'rating'  # Replace with your target variable

#### account for category of food

In [96]:
X= whole_df[features]
y=whole_df[target]

In [97]:
X.head()

Unnamed: 0,review_count,prep_time,cook_time,total_time,servings,calories,fat,carbs,protein,subjectivity,polarity,num_ingredients,num_words_directions,num_steps,encoded_category
0,68.0,10,10,20,6.0,255.0,0,66,1,0.0,0.0,3,39,2,31
1,375.0,15,60,75,8.0,372.0,23,22,19,-0.6,-0.6,14,78,1,19
2,81.0,10,360,410,4.0,342.0,16,29,22,-0.3,-0.3,14,62,0,18
3,186.0,25,5,40,10.0,262.0,12,36,6,0.0,0.0,13,45,0,10
4,126.0,25,45,70,6.0,309.0,17,12,27,0.0,0.0,13,45,0,8


In [98]:
y.head()

0    4.9
1    4.7
2    4.3
3    4.7
4    4.6
Name: rating, dtype: float64

In [101]:
# Splitting the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize and train the Linear Regression model
model = LinearRegression()
model.fit(X_train, y_train)

# Predict ratings for test data
predictions = model.predict(X_test)

# Evaluate model performance using Mean Squared Error
mse = mean_squared_error(y_test, predictions)
print(f"Mean Squared Error: {mse}")

Mean Squared Error: 0.1109360402481514


In [None]:
# Predict rating for new recipe features
new_recipe_features = [...]  # Replace [...] with actual new recipe features
predicted_rating = model.predict([new_recipe_features])
print(f"Predicted Rating for the new recipe: {predicted_rating[0]}")

# Other Ideas


##  Recipe Title Analysis:
- Similar to the analysis done on the recipe directions, you can tokenize the recipe titles, count words, extract bigrams, or even perform sentiment analysis on the titles.

## Ingredient Analysis:
- You can further analyze ingredients, such as identifying common ingredients, categorizing them (e.g., protein, vegetable, seasoning), or conducting sentiment analysis to understand the tone associated with certain ingredients.

## Recipe Difficulty or Complexity:
- You might develop a metric to estimate the difficulty or complexity of the recipe based on the number of steps, variety of ingredients, or the sophistication of the language used in the instructions.

## Time Estimation:
- Analyze the recipe steps to estimate the time required to prepare the dish.

## Tools or Equipment Required:
- Extract information about tools or equipment needed (e.g., saucepan, baking sheet) by analyzing the recipe steps.

## Nutritional Analysis: 
- If ingredient quantities are included, you could estimate nutritional information for the recipe, such as calorie count, macronutrients, etc.

## Perform text preprocessing (lowercasing, tokenization, etc.)
- Analyze most frequent words or ingredients using word clouds, frequency distributions, or bar charts.
- Explore n-grams (sequences of words) to understand common ingredient combinations or recipe titles.

In [8]:
from collections import Counter

# Ingredients
words = ' '.join(dataset['ner']).lower().split()
word_freq = Counter(words)
# Get top N most common words
top_n_words = word_freq.most_common(N)

NameError: name 'dataset' is not defined

In [None]:
import matplotlib.pyplot as plt
from wordcloud import WordCloud

# Generate word cloud
wordcloud = WordCloud().generate_from_frequencies(word_freq)

# Display word cloud
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis("off")
plt.show()

# Or create a bar chart for top N words
top_words, frequencies = zip(*top_n_words)
plt.bar(top_words, frequencies)
plt.xticks(rotation=45)
plt.xlabel('Words')
plt.ylabel('Frequency')
plt.show()

In [8]:
from collections import Counter

# Directions
words = ' '.join(dataset['directions']).lower().split()
word_freq = Counter(words)
# Get top N most common words
top_n_words = word_freq.most_common(N)

NameError: name 'dataset' is not defined

In [None]:
import matplotlib.pyplot as plt
from wordcloud import WordCloud

# Generate word cloud
wordcloud = WordCloud().generate_from_frequencies(word_freq)

# Display word cloud
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis("off")
plt.show()

# Or create a bar chart for top N words
top_words, frequencies = zip(*top_n_words)
plt.bar(top_words, frequencies)
plt.xticks(rotation=45)
plt.xlabel('Words')
plt.ylabel('Frequency')
plt.show()

In [None]:
# Exploring N-grams
from nltk import ngrams

# Assuming 'text_column' contains the text data
tokens = ' '.join(dataset['text_column']).lower().split()
# Create n-grams
n = 2  # Example for bigrams
ngrams_list = list(ngrams(tokens, n))

# Analyze and explore n-grams
# You can use Counter or other methods to analyze the frequency of n-grams

Count the frequency of each category.
Visualize the distribution using bar charts or pie charts.

Text Data Analysis (Ingredients, Recipe Titles, etc.):



Visualization:
Create visualizations to explore relationships or patterns:

Correlation matrix for numeric features.
Pair plots for numerical features.
Plotting graphs or histograms to visualize distributions.
Additional Analysis:

Identify outliers or anomalies.
Explore relationships between variables (e.g., cuisine and ingredients).
Perform any specific analysis based on your project goals.

In [None]:
## What are the groupings of different recipes

## Segment them based on the different ingredients
## May help with cuisine identification

# generate variables
- health score
- cuisine type

In [None]:
# finding similar recipes
# cosine similarity

# Combine 'Title', 'Ingredients', 'Directions' into a single text column for each recipe
df['combined_features'] = df['title'] + ' ' + df['ingredients'] + ' ' + df['directions']

# Use TF-IDF to vectorize the combined features
vectorizer = TfidfVectorizer(stop_words='english')
tfidf_matrix = vectorizer.fit_transform(df['combined_features'])

# Calculate cosine similarity between recipes
cosine_sim = cosine_similarity(tfidf_matrix, tfidf_matrix)

# Function to get similar recipes based on cosine similarity
def get_similar_recipes(recipe_id, similarity_matrix, threshold=0.5):
    similar_recipes = []
    for idx, score in enumerate(similarity_matrix[recipe_id]):
        if idx != recipe_id and score >= threshold:
            similar_recipes.append((idx, score))
    return similar_recipes

# Example: Get similar recipes for Recipe 1 (adjust the recipe_id accordingly)
recipe_id = 0  # Change the recipe ID to find similar recipes for a different recipe
similar_recipes = get_similar_recipes(recipe_id, cosine_sim)

# Display similar recipes
print(f"Similar recipes for {df['title'][recipe_id]}:")
for recipe in similar_recipes:
    print(f"Recipe: {df['title'][recipe[0]]}, Similarity Score: {recipe[1]}")

# To train a recipe generation model
- https://www.youtube.com/shorts/x0hInMGWgWU

# Categorization
## predict rating
## predict type of dish category
## predict the state

# pull up ingredients 
## simple search
- all elements of a list are in the list of ingredients
- make one definition to help filter down the recipe ideas

