Parsing the ingredients.

1. First converting the string data into a list
2. Removing stop words, measures, junk words, etc., and keeping only the actual ingredients in the column, using some NLP techniques


On inspecting the data_cleaned file (in MS Word), I made a list of measures and junk words that need to be removed; that list is used in the parser below.

In [5]:
import pandas as pd
import string
import re
import unidecode
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet
from sklearn.feature_extraction.text import TfidfVectorizer
import pickle
import ast

Defining the ingredient parser function

In [3]:

# Measuring units and quantity words that are dropped from every ingredient.
# Tokens are compared AFTER punctuation removal and lower-casing, so the
# multi-word, upper-case and punctuated entries kept from the original list can
# never match on their own; the punctuation-free abbreviations ('tsp', 'tbl',
# 'tbsp') are listed explicitly so that those abbreviations are really removed.
_MEASURES = frozenset([
    'teaspoon', 't', 'tsp.', 'tsp', 'tablespoon', 'T', 'tbl.', 'tbl', 'tb', 'tbsp.', 'tbsp',
    'fluid ounce', 'fl oz', 'gill', 'cup', 'c', 'pint', 'p', 'pt', 'fl pt',
    'quart', 'q', 'qt', 'fl qt', 'gallon', 'g', 'gal', 'ml', 'milliliter', 'millilitre',
    'cc', 'mL', 'l', 'liter', 'litre', 'L', 'dl', 'deciliter',
    'decilitre', 'dL', 'bulb', 'level', 'heaped', 'rounded', 'whole', 'pinch', 'medium',
    'slice', 'pound', 'lb', '#', 'ounce', 'oz', 'mg', 'milligram',
    'milligramme', 'g', 'gram', 'gramme', 'kg', 'kilogram', 'kilogramme', 'x', 'of', 'mm',
    'millimetre', 'millimeter', 'cm', 'centimeter', 'centimetre',
    'm', 'meter', 'metre', 'inch', 'in', 'milli', 'centi', 'deci', 'hecto', 'kilo',
])

# Common descriptive / filler words that are not ingredients themselves.
_WORDS_TO_REMOVE = frozenset([
    'fresh', 'oil', 'a', 'red', 'bunch', 'and', 'clove', 'or', 'leaf', 'chilli', 'large', 'extra', 'sprig', 'ground', 'handful', 'free',
    'small', 'pepper', 'virgin', 'range', 'from', 'dried', 'sustainable', 'black', 'peeled', 'higher', 'welfare', 'seed', 'for', 'finely',
    'freshly', 'sea', 'quality', 'white', 'ripe', 'few', 'piece', 'source', 'to', 'organic', 'flat', 'smoked', 'ginger', 'sliced', 'green',
    'picked', 'the', 'stick', 'plain', 'plus', 'mixed', 'mint', 'bay', 'basil', 'your', 'cumin', 'optional', 'fennel', 'serve', 'mustard',
    'unsalted', 'baby', 'paprika', 'fat', 'ask', 'natural', 'skin', 'roughly', 'into', 'such', 'cut', 'good', 'brown', 'grated', 'trimmed',
    'oregano', 'powder', 'yellow', 'dusting', 'knob', 'frozen', 'on', 'deseeded', 'low', 'runny', 'balsamic', 'cooked', 'streaky', 'nutmeg',
    'sage', 'rasher', 'zest', 'pin', 'groundnut', 'breadcrumb', 'turmeric', 'halved', 'grating', 'stalk', 'light', 'tinned', 'dry', 'soft',
    'rocket', 'bone', 'colour', 'washed', 'skinless', 'leftover', 'splash', 'removed', 'dijon', 'thick', 'big', 'hot', 'drained', 'sized',
    'chestnut', 'watercress', 'fishmonger', 'english', 'dill', 'caper', 'raw', 'worcestershire', 'flake', 'cider', 'cayenne', 'tbsp', 'leg',
    'pine', 'wild', 'if', 'fine', 'herb', 'almond', 'shoulder', 'cube', 'dressing', 'with', 'chunk', 'spice', 'thumb', 'garam', 'new', 'little',
    # 'other' and 'chopped' were accidentally fused into 'otherchopped' by a missing comma.
    'punnet', 'peppercorn', 'shelled', 'saffron', 'other', 'chopped', 'salt', 'olive', 'taste', 'can', 'sauce', 'water', 'diced', 'package', 'italian',
    'shredded', 'divided', 'parsley', 'vinegar', 'all', 'purpose', 'crushed', 'juice', 'more', 'coriander', 'bell', 'needed', 'thinly', 'boneless',
    'half', 'thyme', 'cubed', 'cinnamon', 'cilantro', 'jar', 'seasoning', 'rosemary', 'extract', 'sweet', 'baking', 'beaten', 'heavy', 'seeded', 'tin',
    'vanilla', 'uncooked', 'crumb', 'style', 'thin', 'nut', 'coarsely', 'spring', 'chili', 'cornstarch', 'strip', 'cardamom', 'rinsed', 'honey', 'cherry',
    'root', 'quartered', 'head', 'softened', 'container', 'crumbled', 'frying', 'lean', 'cooking', 'roasted', 'warm', 'whipping', 'thawed', 'corn',
    'pitted', 'sun', 'kosher', 'bite', 'toasted', 'lasagna', 'split', 'melted', 'degree', 'lengthwise', 'romano', 'packed', 'pod', 'anchovy', 'rom',
    'prepared', 'juiced', 'fluid', 'floret', 'room', 'active', 'seasoned', 'mix', 'deveined', 'lightly', 'anise', 'thai', 'size', 'unsweetened',
    'torn', 'wedge', 'sour', 'basmati', 'marinara', 'dark', 'temperature', 'garnish', 'bouillon', 'loaf', 'shell', 'reggiano', 'canola', 'parmigiano',
    'round', 'canned', 'ghee', 'crust', 'long', 'broken', 'ketchup', 'bulk', 'cleaned', 'condensed', 'sherry', 'provolone', 'cold', 'soda',
    'cottage', 'spray', 'tamarind', 'pecorino', 'shortening', 'part', 'bottle', 'sodium', 'cocoa', 'grain', 'french', 'roast', 'stem', 'link',
    'firm', 'asafoetida', 'mild', 'dash', 'boiling', 'ADVERTISEMENT', 'advertisement', 'chopped', 'only', 'minced', 'weed',
])

# Deletes every ASCII punctuation character from a token.
_PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)

# Tokens are separated by single spaces or hyphens.
_TOKEN_SPLIT = re.compile(r'[ -]')


def ingredient_parser(ingreds):
    """Reduce raw ingredient text to a space-separated string of ingredient words.

    Each entry is split on spaces and hyphens, stripped of punctuation,
    filtered to purely alphabetic tokens, lower-cased, de-accented and
    lemmatized. Tokens that are measuring words or common filler words are
    then discarded.

    Args:
        ingreds: Either a list of ingredient strings, or a single string
            (which is split on whitespace before processing).

    Returns:
        A single string containing the surviving words, in their original
        order, joined by single spaces. Empty when nothing survives.
    """
    # A list is used as-is; any other input is treated as a whitespace-separated string.
    ingredients = ingreds if isinstance(ingreds, list) else ingreds.split()

    lemmatizer = WordNetLemmatizer()  # lemmatization to get the base word
    kept_words = []

    for entry in ingredients:
        # Split BEFORE stripping punctuation: string.punctuation contains '-',
        # so stripping first would glue hyphenated words together
        # ('extra-virgin' -> 'extravirgin') and defeat the hyphen split.
        for piece in _TOKEN_SPLIT.split(entry):
            word = piece.translate(_PUNCTUATION_TABLE)
            # Drop empty pieces and anything containing digits or other non-letters.
            if not word.isalpha():
                continue
            # Lower-case, remove accents, then lemmatize so the word can be
            # compared against the (singular, lower-case) filter sets.
            word = lemmatizer.lemmatize(unidecode.unidecode(word.lower()))
            if word and word not in _MEASURES and word not in _WORDS_TO_REMOVE:
                kept_words.append(word)

    return " ".join(kept_words)

Parsing the cleaned_data CSV file and storing the result in a new file, 'parsed_data'

In [15]:
if __name__ == "__main__":
    # Input: the cleaned recipes; output: the same recipes with parsed ingredients.
    source_csv = 'C:/Users/ACER/Desktop/Recipe_Recommendation/jupyter notebooks/cleaned_data.csv'
    target_csv = 'C:/Users/ACER/Desktop/Recipe_Recommendation/jupyter notebooks/parsed_data.csv'

    recipe_df = pd.read_csv(source_csv)

    # Force the raw ingredients to strings, then run every row through the parser.
    recipe_df['Ingredients'] = recipe_df['Ingredients'].astype(str)
    recipe_df['Parsed_Ingredients'] = recipe_df['Ingredients'].apply(ingredient_parser)

    # Discard rows with a missing value in any column, then keep only the
    # columns that are written out.
    specified_columns = ['Title', 'Parsed_Ingredients', 'Instructions']
    df = recipe_df.dropna()
    df = df[specified_columns]

    df.to_csv(target_csv)
   

In [17]:
# Quick sanity check: show the column index, then the first parsed row.
print(df.columns, df.head(1), sep='\n')


Index(['Title', 'Parsed_Ingredients', 'Instructions'], dtype='object')
                               Title  \
0  Slow Cooker Chicken and Dumplings   

                                  Parsed_Ingredients  \
0  chicken breast butter cream chicken soup onion...   

                                        Instructions  
0  Place the chicken, butter, soup, and onion in ...  


Encoding the data using TF-IDF  

In [18]:
# Ensure the data is treated as unicode strings before vectorizing.
df['Parsed_Ingredients'] = df.Parsed_Ingredients.values.astype('U')
tfidf = TfidfVectorizer()
# fit_transform makes the vectorizer go through the column, build its list of
# unique words and encode every recipe in one pass. It is equivalent to calling
# fit() and then transform() on the same column, without processing it twice.
tfidf_recipes = tfidf.fit_transform(df['Parsed_Ingredients'])


Saving the fitted vectorizer and the encoded recipes so we don't have to recompute them every time

In [19]:
# Persist the fitted vectorizer first, then the encoded recipe matrix, each to
# its own pickle file.
for artifact_path, artifact in (
    ('C:/Users/ACER/Desktop/Recipe_Recommendation/jupyter notebooks/trained_model.pickle', tfidf),
    ('C:/Users/ACER/Desktop/Recipe_Recommendation/jupyter notebooks/encoding.pickle', tfidf_recipes),
):
    with open(artifact_path, "wb") as f:
        pickle.dump(artifact, f)