In [36]:
import os

#core dependencies
import pandas as pd
import numpy as np
import json
import re


#sklearn machine learning libraries
from sklearn.manifold import TSNE
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
from IPython.display import display, HTML
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from wordcloud import WordCloud

#visualizations
import matplotlib
import matplotlib.patches as mpatches
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

#LDA Libraries
from sklearn.decomposition import LatentDirichletAllocation, TruncatedSVD
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

#NLP Libraries
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer 
from nltk.tokenize import word_tokenize
import spacy

#Visualizations
import pyLDAvis
import pyLDAvis.sklearn




# 1. Load Data

In [37]:
#Function for loading and reading files
def load_data(lists_folder='data/lists/'):
    
    # Load the basic information on the recipes
    df = None
    for file in os.listdir(lists_folder):
        if 'DS_' in file:
            continue
        file_path = os.path.join(lists_folder, file)
        with open(file_path) as sd:
            data = json.load(sd)['matches']
        cdf = pd.DataFrame(data)
        cdf['cuisine'] = file.split('.')[0].split('_')[-2]
        df = (cdf if df is None else pd.concat([df, cdf]))
    df = df[['flavors', 'id', 'ingredients', 'recipeName', 'cuisine']]

    return df

In [38]:
#Simple function that lemmatizes a description
scy = spacy.load("en_core_web_sm")
def desc_prepro(desc):
    doc = scy(desc)
    words = [token.lemma_ for token in doc]
    return ' '.join(words)

In [39]:
df = load_data()

#  2. Data Transformation

In [40]:
#drop dupliate recipes

df.drop_duplicates('id', inplace = True)
df.head()

Unnamed: 0,flavors,id,ingredients,recipeName,cuisine
0,"{'piquant': 0.6666666666666666, 'meaty': 0.166...",Easy-Seafood-Paella-2077829,"[lobster tails, water, olive oil, yellow onion...",Easy Seafood Paella,spanish
1,,Skillet-Grilled-Seafood-and-Chorizo-Paella_-21...,"[olive oil, sweet onion, garlic, spanish chori...",Skillet Grilled Seafood and Chorizo Paella.,spanish
2,"{'piquant': 0.8333333333333334, 'meaty': 0.833...",Spanish-style-garlic-shrimp-298317,"[I Can't Believe It's Not Butter!® Spread, unc...",Spanish-style Garlic Shrimp,spanish
3,"{'piquant': 0.16666666666666666, 'meaty': 0.16...",Classic-Paella-898818,"[extra-virgin olive oil, skinless chicken brea...",Classic Paella,spanish
4,"{'piquant': 0.16666666666666666, 'meaty': 0.16...",One-Pot-Spanish-Chicken-and-Rice-2237989,"[olive oil, chicken thighs, salt, pepper, onio...",One Pot Spanish Chicken and Rice,spanish


In [41]:
#need to drop nulls for pca on flavors later
df['flavors'].isnull().value_counts()

False    11533
True      5053
Name: flavors, dtype: int64

In [42]:
#dropping recipes without flavor information
df = df.dropna(subset=['flavors'], axis = 0).reset_index( drop = True)
df['flavors'].isnull().value_counts()

False    11533
Name: flavors, dtype: int64

In [43]:
#need to seperate flavors into seperate features
df['flavors'][0]

{'piquant': 0.6666666666666666,
 'meaty': 0.16666666666666666,
 'bitter': 0.3333333333333333,
 'sweet': 0.16666666666666666,
 'sour': 0.16666666666666666,
 'salty': 0.16666666666666666}

In [44]:
#creating seperate columns for flavors
df['piquant'] = df['flavors'].apply(lambda x: x['piquant'])
df['meaty'] = df['flavors'].apply(lambda x: x['meaty'])
df['bitter'] = df['flavors'].apply(lambda x: x['bitter'])
df['sweet'] = df['flavors'].apply(lambda x: x['sweet'])
df['sour'] = df['flavors'].apply(lambda x: x['sour'])
df['salty'] = df['flavors'].apply(lambda x: x['salty'])

#  3. NLP and Machine Learning Prep

In [45]:
#Funtion that cleans strings for mlp
def nlp_ml_prep(string):
    #remove digits
    new_string = re.sub(r'\b\d+\b', '', string)
    #remove punctuation
    new_string = re.sub(r'[^\w\s]','',new_string)
    #set all to lower case
    new_string = new_string.lower()
    
    #need to standardize ingredients, some words like lowfat, reduced fat, black in black pepper etc need to be removed
    stopwords =['gluten free', 'sugar free', 'sugarfree','low fat', 'reduced fat', 'fat free', 'fatfree', 'nonfat',
         'reduced sodium', 'low sodium','salt free','sodium free', 'sweetened','unsweetened','large','extra large','oz','free range',
         'black','yellow','skinless','fresh','chopped','diced','sliced', 'uncooked', 'peeled', 'uncook','peel', 'and', 'or', 'devein', 'deveined',
          'medium','extravirgin', 'extra', 'virgin', 'extra virgin', 'brown', 'white','all-purpose','allpurpose', 'all purpose','clove','cold','hot','red']
    for word in stopwords:
        new_string = new_string.replace(word,'')
    
    #trim leading and trailing white spaces
    new_string = new_string.strip()
    
    #join whitespace into underscore, for two word ingredients such as olive oil => olive_oil
    new_string = re.sub(' ','_',new_string)
    
    return new_string

In [46]:
#applying cleaning function to ingredients column
df.ingredients = df.ingredients.apply(lambda x: list(map(lambda y: nlp_ml_prep(y), x)))

In [47]:
df.head()

Unnamed: 0,flavors,id,ingredients,recipeName,cuisine,piquant,meaty,bitter,sweet,sour,salty
0,"{'piquant': 0.6666666666666666, 'meaty': 0.166...",Easy-Seafood-Paella-2077829,"[lobster_tails, water, olive_oil, onion, spani...",Easy Seafood Paella,spanish,0.666667,0.166667,0.333333,0.166667,0.166667,0.166667
1,"{'piquant': 0.8333333333333334, 'meaty': 0.833...",Spanish-style-garlic-shrimp-298317,"[i_cant_believe_its_not_butter_spread, shrimp,...",Spanish-style Garlic Shrimp,spanish,0.833333,0.833333,0.833333,0.166667,0.666667,0.833333
2,"{'piquant': 0.16666666666666666, 'meaty': 0.16...",Classic-Paella-898818,"[olive_oil, chicken_breasts, chicken_stock, sa...",Classic Paella,spanish,0.166667,0.166667,0.333333,0.166667,0.666667,0.166667
3,"{'piquant': 0.16666666666666666, 'meaty': 0.16...",One-Pot-Spanish-Chicken-and-Rice-2237989,"[olive_oil, chicken_thighs, salt, pepper, onio...",One Pot Spanish Chicken and Rice,spanish,0.166667,0.166667,0.166667,0.166667,0.833333,0.166667
4,"{'piquant': 0.5, 'meaty': 0.16666666666666666,...",Portuguese-fish-stew-306395,"[plum_tomatoes, savoy_cabbage, olive_oil, chiz...",Portuguese Fish Stew,spanish,0.5,0.166667,0.166667,0.166667,0.5,0.333333


In [48]:
X = df.iloc[0:,5:11].values #extracting flavors

In [49]:
from sklearn.metrics import pairwise_distances

In [50]:
#Function to find pairwise distances of one recipe and return the most similar recipes
#Takes in indexes of recipe of interest

def get_similiar_flavor_recipes(recipe,number):
    X = df.iloc[0:,5:11].values
    dist_recipe = pairwise_distances(X, X[recipe].reshape(1,-1))
    matches = np.argsort(dist_recipe.transpose()[0], axis = -1)
    return df.iloc[matches[:number]].drop(recipe,axis = 0)

In [16]:
get_similiar_flavor_recipes(2,-1)

Unnamed: 0,flavors,id,ingredients,recipeName,cuisine,piquant,meaty,bitter,sweet,sour,salty
6620,"{'piquant': 0.16666666666666666, 'meaty': 0.16...",Stuffed-shells-308656,"[lemon, olive_oil, crushed__pepper_flakes, fin...",Stuffed Shells,italian,0.166667,0.166667,0.333333,0.166667,0.666667,0.166667
3747,"{'piquant': 0.16666666666666666, 'meaty': 0.16...",Easy-Pumpkin-Masala-1532616,"[pumpkin, small_potatoes, ginger, olive_oil, g...",Easy Pumpkin Masala,indian,0.166667,0.166667,0.333333,0.166667,0.666667,0.166667
3416,"{'piquant': 0.16666666666666666, 'meaty': 0.16...",Weeknight-Tikka-Masala-622193,"[oil, onions, garlic, chicken_pieces, smoked_p...",Weeknight Tikka Masala,indian,0.166667,0.166667,0.333333,0.166667,0.666667,0.166667
3999,"{'piquant': 0.16666666666666666, 'meaty': 0.16...",Grilled-steak-and-tomato-salad-with-rum-vinaig...,"[flank_steak, olive_oil, salt, ly_ground_peppe...",Grilled Steak and Tomato Salad with Rum Vinaig...,american,0.166667,0.166667,0.333333,0.166667,0.666667,0.166667
6407,"{'piquant': 0.16666666666666666, 'meaty': 0.16...",Pasta-AllArrabbiata-1035521,"[olive_oil, garlic, pepper_flakes, plum_tomato...",Pasta All’Arrabbiata,italian,0.166667,0.166667,0.333333,0.166667,0.666667,0.166667
3966,"{'piquant': 0.16666666666666666, 'meaty': 0.16...",Mushroom_-Spinach-_-Chickpea-Curry-1230721,"[rapeseed_oil, onions, ginger, curry_powder, m...","Mushroom, Spinach & Chickpea Curry",indian,0.166667,0.166667,0.333333,0.166667,0.666667,0.166667
2018,"{'piquant': 0.16666666666666666, 'meaty': 0.16...",Irish-Lamb-Stew-895797,"[leg_of_lamb, potatoes, leeks, carrots, celery...",Irish Lamb Stew,irish,0.166667,0.166667,0.333333,0.166667,0.666667,0.166667
266,"{'piquant': 0.16666666666666666, 'meaty': 0.16...",Guiso-de-pulpo-324966,"[onions, garlic, bay_leaves, sweet_pepper, ita...",Guiso de Pulpo,spanish,0.166667,0.166667,0.333333,0.166667,0.666667,0.166667
10107,"{'piquant': 0.16666666666666666, 'meaty': 0.16...",Seafood-Pasta-_portuguese-Style_-1477561,"[shrimp, clams, onions, garlic, salt, cier, to...",Seafood Pasta (portuguese Style),portuguese,0.166667,0.166667,0.333333,0.166667,0.666667,0.166667
3516,"{'piquant': 0.16666666666666666, 'meaty': 0.16...",Slow-Cooker-Chicken-Tikka-Masala-2262252,"[ginger, garlic, tomato_passata, coconut_milk,...",Slow Cooker Chicken Tikka Masala,indian,0.166667,0.166667,0.333333,0.166667,0.666667,0.166667


In [17]:
dist_recipe_3 = pairwise_distances(X, X[3].reshape(1,-1)) #respective distance for each reciepe in regards to X[3] can use cosine distances 

In [18]:
dist_recipe_3[0] #need to flatten this array

array([0.84983659])

In [19]:
dist_recipe_3.transpose()[0]

array([0.84983659, 1.34370962, 0.23570226, ..., 0.95742711, 0.89752747,
       1.01379376])

In [20]:
matches = np.argsort(dist_recipe_3.transpose()[0], axis = -1) #returns index of the dishes with the closest distance
#untransposed distances gives you a row of distances for each X

In [21]:
#make list of ingredients into a single string to be analyzed
ingredient_strings =[]

for ingredients in df['ingredients']:
    ingredients = ' '.join(ingredients)
    ingredient_strings.append(ingredients)

In [22]:
df['nlp_ingredients'] = ingredient_strings



In [23]:
#Lemmmatizing Process -- for futher standardization and NLP preperation. Takes about 7 Minutes
df['nlp_ingredients'] = df['nlp_ingredients'].map(desc_prepro)


In [53]:
#drop duplicate recipes

df.drop_duplicates(subset = 'id', inplace = True)

In [54]:
df.head()

Unnamed: 0,flavors,id,ingredients,recipeName,cuisine,piquant,meaty,bitter,sweet,sour,salty
0,"{'piquant': 0.6666666666666666, 'meaty': 0.166...",Easy-Seafood-Paella-2077829,"[lobster_tails, water, olive_oil, onion, spani...",Easy Seafood Paella,spanish,0.666667,0.166667,0.333333,0.166667,0.166667,0.166667
1,"{'piquant': 0.8333333333333334, 'meaty': 0.833...",Spanish-style-garlic-shrimp-298317,"[i_cant_believe_its_not_butter_spread, shrimp,...",Spanish-style Garlic Shrimp,spanish,0.833333,0.833333,0.833333,0.166667,0.666667,0.833333
2,"{'piquant': 0.16666666666666666, 'meaty': 0.16...",Classic-Paella-898818,"[olive_oil, chicken_breasts, chicken_stock, sa...",Classic Paella,spanish,0.166667,0.166667,0.333333,0.166667,0.666667,0.166667
3,"{'piquant': 0.16666666666666666, 'meaty': 0.16...",One-Pot-Spanish-Chicken-and-Rice-2237989,"[olive_oil, chicken_thighs, salt, pepper, onio...",One Pot Spanish Chicken and Rice,spanish,0.166667,0.166667,0.166667,0.166667,0.833333,0.166667
4,"{'piquant': 0.5, 'meaty': 0.16666666666666666,...",Portuguese-fish-stew-306395,"[plum_tomatoes, savoy_cabbage, olive_oil, chiz...",Portuguese Fish Stew,spanish,0.5,0.166667,0.166667,0.166667,0.5,0.333333


In [55]:
df.shape

(11533, 11)

In [51]:
df.iloc[4]

flavors        {'piquant': 0.5, 'meaty': 0.16666666666666666,...
id                                   Portuguese-fish-stew-306395
ingredients    [plum_tomatoes, savoy_cabbage, olive_oil, chiz...
recipeName                                  Portuguese Fish Stew
cuisine                                                  spanish
piquant                                                      0.5
meaty                                                   0.166667
bitter                                                  0.166667
sweet                                                   0.166667
sour                                                         0.5
salty                                                   0.333333
Name: 4, dtype: object

In [26]:
#saving dataframe to csv
df.to_csv('nlp_ml_ready_recipes.csv')

#  4. Pairwise Distance on Flavors

In [27]:
df.head()

Unnamed: 0,flavors,id,ingredients,recipeName,cuisine,piquant,meaty,bitter,sweet,sour,salty,nlp_ingredients
0,"{'piquant': 0.6666666666666666, 'meaty': 0.166...",Easy-Seafood-Paella-2077829,"[lobster_tails, water, olive_oil, onion, spani...",Easy Seafood Paella,spanish,0.666667,0.166667,0.333333,0.166667,0.166667,0.166667,lobster_tail water olive_oil onion spanish_ric...
1,"{'piquant': 0.8333333333333334, 'meaty': 0.833...",Spanish-style-garlic-shrimp-298317,"[i_cant_believe_its_not_butter_spread, shrimp,...",Spanish-style Garlic Shrimp,spanish,0.833333,0.833333,0.833333,0.166667,0.666667,0.833333,i_cant_believe_its_not_butter_spread shrimp sa...
2,"{'piquant': 0.16666666666666666, 'meaty': 0.16...",Classic-Paella-898818,"[olive_oil, chicken_breasts, chicken_stock, sa...",Classic Paella,spanish,0.166667,0.166667,0.333333,0.166667,0.666667,0.166667,olive_oil chicken_breast chicken_stock saffron...
3,"{'piquant': 0.16666666666666666, 'meaty': 0.16...",One-Pot-Spanish-Chicken-and-Rice-2237989,"[olive_oil, chicken_thighs, salt, pepper, onio...",One Pot Spanish Chicken and Rice,spanish,0.166667,0.166667,0.166667,0.166667,0.833333,0.166667,olive_oil chicken_thighs salt pepper onion gar...
4,"{'piquant': 0.5, 'meaty': 0.16666666666666666,...",Portuguese-fish-stew-306395,"[plum_tomatoes, savoy_cabbage, olive_oil, chiz...",Portuguese Fish Stew,spanish,0.5,0.166667,0.166667,0.166667,0.5,0.333333,plum_tomatoe savoy_cabbage olive_oil chizo_sau...


In [28]:
df.columns

Index(['flavors', 'id', 'ingredients', 'recipeName', 'cuisine', 'piquant',
       'meaty', 'bitter', 'sweet', 'sour', 'salty', 'nlp_ingredients'],
      dtype='object')

In [29]:
#create a new dataframe with just recipe name cuisine and flavors

df_flavors = df.drop(['flavors', 'id', 'ingredients', 'nlp_ingredients'], axis = 1)

In [30]:
df_flavors.head()

Unnamed: 0,recipeName,cuisine,piquant,meaty,bitter,sweet,sour,salty
0,Easy Seafood Paella,spanish,0.666667,0.166667,0.333333,0.166667,0.166667,0.166667
1,Spanish-style Garlic Shrimp,spanish,0.833333,0.833333,0.833333,0.166667,0.666667,0.833333
2,Classic Paella,spanish,0.166667,0.166667,0.333333,0.166667,0.666667,0.166667
3,One Pot Spanish Chicken and Rice,spanish,0.166667,0.166667,0.166667,0.166667,0.833333,0.166667
4,Portuguese Fish Stew,spanish,0.5,0.166667,0.166667,0.166667,0.5,0.333333


In [31]:
df_flavors.to_csv('name_cuisine_flavors.csv')

In [32]:
#Find most simliar recipe based on flavor profile
#Designate Data
X = df_flavors

# 5. Creating CSV for Topic Modeling

In [33]:
#create new data frame with just ingredients ready for nlpn name, and cuisine
df_ing = df.drop(['flavors', 'id', 'ingredients', 'piquant',
       'meaty', 'bitter', 'sweet', 'sour', 'salty'], axis = 1)

In [34]:
df_ing.head()

Unnamed: 0,recipeName,cuisine,nlp_ingredients
0,Easy Seafood Paella,spanish,lobster_tail water olive_oil onion spanish_ric...
1,Spanish-style Garlic Shrimp,spanish,i_cant_believe_its_not_butter_spread shrimp sa...
2,Classic Paella,spanish,olive_oil chicken_breast chicken_stock saffron...
3,One Pot Spanish Chicken and Rice,spanish,olive_oil chicken_thighs salt pepper onion gar...
4,Portuguese Fish Stew,spanish,plum_tomatoe savoy_cabbage olive_oil chizo_sau...


In [35]:
df_ing.to_csv('name_cuisine_nlptopicmodeling.csv')