This is my first attempt at building a Word2Vec model, inspired by the "What's Cooking" challenge on Kaggle.
If you like it, please upvote. Expert reviews, comments, and suggestions are welcome.

In [None]:
import pandas as pd
import numpy as np
import collections
import gensim 
from gensim.models import word2vec, phrases
from gensim.parsing.preprocessing import remove_stopwords, strip_punctuation, strip_numeric,\
                    strip_non_alphanum, strip_multiple_whitespaces, strip_short
from textblob import TextBlob, Word

import re
import matplotlib.pyplot as plt
import matplotlib.patches as mpatches
import seaborn as sns
%matplotlib inline


In [None]:
# Load the recipes dataset.
filepath = "/kaggle/input/foodrecipes/recipes.csv"
df_recipes = pd.read_csv(filepath, encoding="ISO-8859-1")

# Drop rows where cuisine or ingredients are NA, then renumber the remaining
# rows 0..n-1. Without the renumbering the dropped rows leave gaps in the
# index, and any later positional label lookup or index-aligned assignment
# would hit missing or wrong rows.
df_recipes = df_recipes.dropna(subset=['cuisine', 'ingredients']).reset_index(drop=True)
df_recipes

In [None]:
# Convert to lower case; the descriptor pattern below is written in lower case.
df_recipes['ingredients'] = df_recipes['ingredients'].str.lower()

# Descriptor words/fragments removed from every ingredient. Compiled once
# because it is applied to every ingredient of every recipe.
# NOTE(review): `(.*ed)` and `(.*ly)` are greedy and unanchored, so they delete
# everything up to the last "ed"/"ly" in the string (e.g. "garlic minced" is
# removed entirely) - confirm whether whole-word matching was intended.
_DESCRIPTOR_RE = re.compile(
    r'\(.*oz.\)|(®)|(.*ed)|(.*ly)|boneless|skinless|chunks|fresh|large|cook drain|green|frozen|ground'
)


def _clean_ingredient(raw):
    """Normalise one raw ingredient string.

    Applies, in order: stop-word removal, digit removal, descriptor removal,
    short-token removal, whitespace collapsing, punctuation and
    non-alphanumeric stripping, and finally singularisation
    (e.g. tomatoes --> tomato).

    Returns the cleaned ingredient, which is '' when nothing survives.
    """
    ing = remove_stopwords(raw)
    ing = strip_numeric(ing)
    ing = _DESCRIPTOR_RE.sub('', ing).strip()
    ing = strip_short(ing, 2)
    ing = strip_multiple_whitespaces(ing)
    ing = strip_punctuation(ing)
    ing = strip_non_alphanum(ing)
    # Convert plurals to singular and re-join the tokens with single spaces.
    return " ".join(TextBlob(ing).words.singularize())


total_ingredients = []          # every cleaned ingredient, with repetition
all_receipes_ingredients = []   # one list of cleaned ingredients per recipe

# Iterate over the column values themselves rather than `.loc[i]` with
# `i in range(len(df_recipes))`: `.loc` is label-based, so it raises KeyError
# (or reads a different row) whenever the index is not exactly 0..n-1.
for raw_recipe in df_recipes['ingredients']:
    # Drop the first and last character of the field
    # (presumably the enclosing brackets of a list literal - TODO confirm).
    raw_items = raw_recipe[1:-1].split(',')

    recipe_ingredients = [
        cleaned
        for cleaned in map(_clean_ingredient, raw_items)
        if cleaned  # skip items that were stripped down to nothing
    ]

    all_receipes_ingredients.append(recipe_ingredients)
    total_ingredients.extend(recipe_ingredients)

counts_ingr = collections.Counter(total_ingredients)

print('Total Ingredients (with repetition):  \t{}'.format(len(total_ingredients)))
print('Unique Ingredients : \t\t\t{}'.format(len(counts_ingr)))
print('Total Receipes:  \t\t\t{}'.format(len(all_receipes_ingredients)))


In [None]:
# Add the cleaned ingredient lists back to the original dataframe. The Series
# is given the dataframe's own index so that the k-th list lands on the k-th
# row; a bare `pd.Series(list)` is labelled 0..n-1 and column assignment aligns
# on labels, which yields NaN / shifted rows whenever the frame's index has
# gaps. A length mismatch now fails loudly with ValueError.
df_recipes['clean_ingredients'] = pd.Series(all_receipes_ingredients, index=df_recipes.index)

# Record the number of ingredients for each recipe.
df_recipes['ingredient_count'] = df_recipes['clean_ingredients'].apply(len)

# Convert time in seconds to minutes.
df_recipes['timeMins'] = df_recipes['totalTimeInSeconds'] / 60

In [None]:
# Summary statistics of how many times each distinct ingredient occurs.
ingredient_frequencies = list(counts_ingr.values())

# Mean, standard deviation and median, printed in that order.
for summarise in (np.mean, np.std, np.median):
    print(summarise(ingredient_frequencies))

# Quartiles plus the 99th percentile.
print(np.percentile(ingredient_frequencies, [25., 50., 75., 99.]))

In [None]:
# Rank every ingredient once, most frequent first, then show both ends.
ranked_ingredients = counts_ingr.most_common()

# The ten most common ingredients used across all recipes.
print("---- Most Common Ingredients ----")
print(ranked_ingredients[:10])

print("\n")

# The ten least common ingredients used across all recipes.
print("---- Least Common Ingredients ----")
print(ranked_ingredients[-10:])


In [None]:
#visualize the ingredients in WordCloud 

from wordcloud import WordCloud

def plot_wordcloud(text, title=None, max=1000, size=(12, 8), title_size=16):
    """Render a word cloud of `text` on a new matplotlib figure.

    text       -- whitespace-separated words to build the cloud from
    title      -- figure title (None for no title)
    max        -- passed to WordCloud as `max_words`; the name shadows the
                  builtin inside this function but is kept for callers that
                  pass it by keyword
    size       -- figure size passed to matplotlib as `figsize`
    title_size -- font size of the title
    """
    # Build the cloud before creating the figure so a failure here does not
    # leave an empty figure behind.
    cloud = WordCloud(max_words=max)
    cloud.generate(text)

    _fig, axes = plt.subplots(figsize=size)
    axes.set_title(title, size=title_size)
    axes.imshow(cloud, interpolation='bilinear')
    axes.axis("off")


# One cloud over every cleaned ingredient of every recipe.
plot_wordcloud(' '.join(total_ingredients), title='Ingredients')

In [None]:
# Train the Word2Vec model on the per-recipe ingredient lists.

num_features = 300    # Word vector dimensionality
min_word_count = 4    # Passed as min_count
num_workers = 4       # Passed as workers
context = 10          # Context window size
downsampling = 1e-2   # Threshold for configuring which
                      # higher-frequency words are randomly downsampled
num_epochs = 20       # Number of training passes over the corpus

# gensim 4.0 renamed two constructor arguments (`size` -> `vector_size`,
# `iter` -> `epochs`) and rejects the old names with a TypeError, so choose
# the spelling that matches the installed version.
if int(gensim.__version__.split('.')[0]) >= 4:
    version_kwargs = {'vector_size': num_features, 'epochs': num_epochs}
else:
    version_kwargs = {'size': num_features, 'iter': num_epochs}

# Initialize and train the model.
model = word2vec.Word2Vec(
    all_receipes_ingredients,
    workers=num_workers,
    min_count=min_word_count,
    window=context,
    sample=downsampling,
    **version_kwargs
)

# If you don't plan to train the model any further, calling
# init_sims will make the model much more memory-efficient.
# NOTE(review): init_sims is marked deprecated in gensim 4 - confirm it is
# still available in the installed version before relying on it.
model.init_sims(replace=True)

In [None]:
# check the similar ingredients returned by the model for search_terms

# For each probe ingredient keep only the names of its five nearest
# neighbours, discarding the similarity scores.
# NOTE(review): presumably raises KeyError for a probe term that did not make
# it into the vocabulary (e.g. pruned by min_count) - confirm.
search_terms = ['paneer', 'egg', 'mango', 'bread', 'rice']
similar_words = {}
for search_term in search_terms:
    neighbours = model.wv.most_similar([search_term], topn=5)
    similar_words[search_term] = [word for word, _similarity in neighbours]
similar_words

In [None]:
#visualization with Tsne
from sklearn.manifold import TSNE

# Flatten {term: [neighbours]} into [term, n1, ..., term2, n1, ...],
# keeping each probe term immediately ahead of its own neighbours.
words = [
    word
    for term, neighbours in similar_words.items()
    for word in [term] + neighbours
]
wvs = model.wv[words]

np.set_printoptions(suppress=True)

# Project the word vectors down to 2-D for plotting.
# NOTE(review): newer scikit-learn releases rename `n_iter` to `max_iter` -
# confirm against the installed version.
tsne = TSNE(n_components=2, random_state=0, n_iter=1000, perplexity=2)
T = tsne.fit_transform(wvs)
labels = words

plt.figure(figsize=(14, 8))
plt.scatter(T[:, 0], T[:, 1], c='orange', edgecolors='r')
# Write each word next to its point, nudged by one data unit on both axes.
for row, label in enumerate(labels):
    plt.annotate(label, xy=(T[row, 0] + 1, T[row, 1] + 1),
                 xytext=(0, 0), textcoords='offset points')

In [None]:
# Analogy-style query: terms close to 'paneer' and 'chicken' but far from
# 'tomato sauce'. Multi-word tokens exist in the vocabulary because cleaned
# ingredients are joined with spaces.
model.wv.most_similar_cosmul(positive=['paneer', u'chicken'], negative=['tomato sauce'])

In [None]:
# Nearest neighbours of 'chocolate' in the learned embedding.
model.wv.most_similar('chocolate')

In [None]:
# Nearest neighbours of 'mayonnaise' in the learned embedding.
model.wv.most_similar('mayonnaise')

In [None]:
# Nearest neighbours of 'chicken' in the learned embedding.
model.wv.most_similar('chicken')

In [None]:
# Similarity score between the vectors for 'paneer' and 'chicken'.
model.wv.similarity('paneer', 'chicken')

In [None]:
# Ask the model which of the four terms is the odd one out.
# NOTE(review): ingredients were singularised during cleaning, so 'lentils'
# is presumably not in the vocabulary - confirm 'lentil' was not intended.
model.wv.doesnt_match("chicken paneer lentils meat".split())

In [None]:
# Similarity score between the vectors for 'chocolate' and 'cream'.
model.wv.similarity('chocolate', 'cream')

In [None]:
# Analogy query of the form "a is to b as x is to ?": `x` and `b` are the
# positive terms, `a` the negative one, and the top-ranked result is reported.
# NOTE(review): `a` and `x` are both 'bread', so they sit on opposite sides of
# the query - presumably one of them was meant to be a different ingredient.
a = 'bread'
b = 'cheese'
x = 'bread'
best_match = model.wv.most_similar(positive=[x, b], negative=[a])[0]
predicted = best_match[0]
analogy_template = " {} is to  {} as {} is to {} "
print(analogy_template.format(a, b, x, predicted))

In [None]:
# Apply the ggplot style to this and all later figures.
plt.style.use('ggplot')

# Share of recipes per rating. The slice labels are taken from the counts'
# own index so each label always matches its slice; a hard-coded label list
# silently mislabels slices whenever the frequency order of the ratings
# differs from the assumed one. Pie charts have no x/y axes, so no axis
# labels are set.
rating_counts = df_recipes.rating.value_counts()
labels = [
    '{:g}'.format(rating) if isinstance(rating, (int, float, np.number)) else str(rating)
    for rating in rating_counts.index
]

# NOTE(review): only five colours are supplied, so with six ratings the
# sixth slice presumably reuses the first colour - confirm this is acceptable.
plt.pie(rating_counts, labels=labels, autopct='%1.1f%%', colors=['gold', 'green', 'lightcoral', 'lightskyblue', 'red'], startangle=50, pctdistance=0.6)
plt.axis('equal')
plt.title('Recipes by Rating')
plt.show()

In [None]:
# Is there a relationship between the number of ingredients and the rating?

feature_cols = ['ingredient_count']
X = df_recipes.loc[:, feature_cols]   # single-column frame used as the x values
y = df_recipes['rating']
plt.scatter(X, y)
plt.xlabel('Ingredients')
plt.ylabel('Yummly Rating')
plt.title('Ingredient Counts vs Ratings')

In [None]:
# Cooking time against rating, using only fully populated rows.
# NOTE(review): dropna() without `subset` discards a row if ANY column is
# missing, not just the two plotted here - confirm that is intended.
feature_cols = ['timeMins']
complete_rows = df_recipes.dropna()
X = complete_rows[feature_cols]
y = complete_rows['rating']
plt.scatter(X, y)
plt.xlabel('Time in Minutes')
plt.ylabel('Yummly Rating')
plt.title('Cooking Time vs Ratings')


In [None]:
# Is there a relationship between the number of ingredients and cooking time?
# Only fully populated rows are plotted.
feature_cols = ['ingredient_count']
complete_rows = df_recipes.dropna()
X = complete_rows[feature_cols]
y = complete_rows['timeMins']
plt.scatter(X, y)
plt.xlabel('Ingredient Count')
plt.ylabel('Time in Minute')
plt.title('Ingredients vs. Cooking Times')


In [None]:
# Recipes whose cuisine mentions "indian" (case-insensitive).
df_indian = df_recipes[df_recipes.cuisine.str.contains("indian", case=False)].copy()

# Flatten the per-recipe ingredient lists into one list. A comprehension is
# used instead of `Series.sum()`, which concatenates the lists pairwise
# (quadratic) and returns the integer 0 for an empty selection, making the
# Counter below raise TypeError.
ingredients = [
    ingredient
    for recipe_ingredients in df_indian["clean_ingredients"]
    for ingredient in recipe_ingredients
]

counts = collections.Counter(ingredients)

# The 30 most frequently used ingredients in the selected recipes.
counts.most_common(30)
