# This file contains code to find recipes that are similar to other recipes in our database

## Load the recipes database

In [None]:
# Import pandas
import pandas as pd

In [None]:
# Load the recipes DataFrame from a pickle file in the working directory
# NOTE(review): unpickling can run arbitrary code - only load a file you trust
RD = pd.read_pickle('./Recipes_DataFrame.pkl')
# Look at the database
RD.head()

In [None]:
# convert ingredients and instructions to lower case
RD['Ingredients and Instructions'] = (
    RD['Ingredients and Instructions'].apply(lambda row: row.lower()))
RD.head()

import re
import string
# Remove punctuation with a regular expression.
# The two-argument form row.translate(None, string.punctuation) only exists
# for Python 2 byte strings; it raises TypeError for unicode text and on
# Python 3, whereas the regex below works for all of them.
punctuation_re = re.compile('[%s]' % re.escape(string.punctuation))
RD['Ingredients and Instructions'] = (
    RD['Ingredients and Instructions'].apply(
        lambda row: punctuation_re.sub('', row)))
RD.head()

## Calculate tf-idf (term frequency - inverse document frequency)
Tf-idf reflects how important a word (ingredient or instruction) is in a recipe compared to all other recipes

In [None]:
# import some modules for calculating tfidf
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.stem.porter import PorterStemmer
from nltk import word_tokenize

In [None]:
# Tokenizer handed to the tf-idf vectorizer: split a text into tokens
# and reduce every token to its Porter stem
stemmer = PorterStemmer()
def tokenize(text):
    """Return the stemmed tokens of ``text`` as a list.

    ``text`` is split with ``word_tokenize`` and each token is passed
    through the shared ``stemmer``; the token order is preserved.
    """
    return [stemmer.stem(word) for word in word_tokenize(text)]

In [None]:
# Vectorizer that splits texts with the stemming `tokenize` function and
# is configured with the built-in 'english' stop-word list
tfidf = TfidfVectorizer(tokenizer=tokenize, stop_words='english')

# Obtain a matrix with tf-idfs for different recipes.
# Each row corresponds to one recipe (one entry of the text column).
# The columns of the matrix will contain features,
# which are the weighted frequency of tokens
tfidf_mat = tfidf.fit_transform(RD['Ingredients and Instructions'])

## Remove rows that are entirely zero

In [None]:
import numpy as np
# remove rows that are entirely zero from the tfidf matrix and also from recipes names
# (consecutive differences of indptr give the number of stored entries per
# row - assumes tfidf_mat is in CSR format, TODO confirm)
num_nonzeros = np.diff(tfidf_mat.indptr)
# Positions of ALL empty rows. Taking only the first one would leave RD and
# tfidf_mat misaligned when several rows are empty, and would raise
# IndexError when there is none.
zeros_ind = np.nonzero(num_nonzeros == 0)[0]
tfidf_mat = tfidf_mat[num_nonzeros != 0]

# Also remove those recipes from the recipes database.
# The index is renumbered afterwards because later cells use RD's index
# labels as row positions into tfidf_mat and the similarity matrix.
RD = RD.drop(RD.index[zeros_ind]).reset_index(drop=True)

## Calculate the cosine similarity between recipes

In [None]:
# import module to calculate distances between vectors
from sklearn.metrics.pairwise import cosine_similarity

In [None]:
# similarity[i,j] will store how similar recipe i is to recipe j
# (cosine similarity between rows i and j of tfidf_mat; larger = more alike)
similarity = cosine_similarity(tfidf_mat)

## Look at a couple of examples of similar recipes

In [None]:
# Rows/columns of `similarity` are positions in tfidf_mat, so look the recipes
# up by position in RD instead of by index label (the two differ as soon as
# rows have been dropped from RD without renumbering its index).
recipe_names = RD['Name'].values
key_lime_bars = np.flatnonzero(recipe_names == 'Key Lime Bars')[0]
key_lime_cookies = np.flatnonzero(recipe_names == 'Key Lime Cookies')[0]
brussels_sprouts = np.flatnonzero(recipe_names == 'Buffalo Brussels Sprouts')[0]

similarity_checks = [
    # for sanity check, make sure that a recipe is 100% similar to itself
    ('key lime bars and itself', key_lime_bars, key_lime_bars),
    ('key lime cookies and itself', key_lime_cookies, key_lime_cookies),
    ('buffalo brussels sprouts and itself', brussels_sprouts, brussels_sprouts),
    # also check how similar these are to one another
    ('key lime bars and key lime cookies', key_lime_bars, key_lime_cookies),
    ('key lime bars and buffalo brussels sprouts', key_lime_bars, brussels_sprouts),
    ('key lime cookies and buffalo brussels sprouts', key_lime_cookies, brussels_sprouts),
]
# Single-argument print(...) runs on both Python 2 and 3; the two spaces
# after '=' reproduce the output of the former `print "... = ", value`.
for description, first, second in similarity_checks:
    print("Similarity between %s =  %s" % (description, similarity[first, second]))

## Now find the three most similar recipes to given recipes

In [None]:
# import nearest neighbors module
from sklearn.neighbors import NearestNeighbors

# number of neighbours to retrieve for every recipe
num_interest = 3
# build the model and fit it on the tf-idf rows in one go
# (fit hands back the estimator itself)
neigh = NearestNeighbors(n_neighbors=num_interest, algorithm='auto').fit(tfidf_mat)
# no query points are passed, so the neighbours of every fitted row are
# returned; only the neighbour indices are kept, not the distances
nearest_neighbors = neigh.kneighbors(return_distance=False)

In [None]:
query_names = ['Bacon Cheese Puff Balls', 'Mexican Flan',
               'Spicy Buffalo Wings', 'Key Lime Bars',
               'Key Lime Cookies', 'Cranberry Margarita']
# Both the rows of nearest_neighbors and the indices stored in it are row
# positions of tfidf_mat, so recipes are matched by position in RD rather
# than by index label (labels and positions differ once rows were dropped).
recipe_names = RD['Name'].values
# closest_recipes[k] lists the names of the neighbours of query_names[k]
closest_recipes = []
for recipe in query_names:
    # position of the first recipe with this name (IndexError if absent)
    query_ind = np.flatnonzero(recipe_names == recipe)[0]
    neighbor_inds = nearest_neighbors[query_ind, :num_interest]
    closest_recipes.append([recipe_names[ind] for ind in neighbor_inds])

In [None]:
# display this data in a table
import matplotlib.pyplot as plt
%matplotlib inline

In [None]:
# Show the closest recipes as a table: one row per query recipe,
# one column per neighbour rank
fig, ax = plt.subplots(1, 1)
# only the table should be visible, not the axes frame and ticks
ax.axis('off')
# column headers 1..num_interest as a list of strings
# (xrange does not exist on Python 3)
colLabels = [str(rank) for rank in range(1, num_interest + 1)]
table = ax.table(cellText=closest_recipes, rowLabels=query_names,
                 colLabels=colLabels, loc='center')
# switch off automatic font sizing so that the explicit size below is used
table.auto_set_font_size(False)
# NOTE(review): font size and scale are unusually large - presumably tuned
# for the notebook display; confirm
table.set_fontsize(200)
table.scale(50, 50)

# Classification

The website "Gourmetsleuth.com" has categorized these recipes into 9 different courses. The dataframe we created stores these categories in the column 'Classification'. Let's see if we can implement a learning algorithm to classify these recipes.

In [None]:
# Look at the last few entries
RD.tail()

In [None]:
# Look at the first few entries of the recipes DataFrame
RD.head()

In [None]:
# Names of the nine course categories.
# NOTE(review): the report cell prints each name next to its list position,
# so position i presumably matches the code i in RD['Classification'] - confirm
categories = [
    'Appetizers and Snacks',
    'Bread, Muffins and Rolls',
    'Breakfast and Brunch',
    'Desserts',
    'Drinks and Beverages',
    'Main Dishes',
    'Salads and Dressings',
    'Sides',
    'Soups, Stews and Chili',
]

## Let's visualize the data first

In [None]:
from sklearn import decomposition

In [None]:
# Project the tf-idf features onto two principal components and plot them,
# coloured by the recipe classification
X = tfidf_mat
pca = decomposition.PCA(n_components=2)
# the sparse tf-idf matrix is converted to a dense array before the projection
X = pca.fit_transform(X.toarray())
Y = RD['Classification'].values
fig = plt.figure(1, figsize=(10, 4))
plt.clf()
# 'spectral' was renamed to 'nipy_spectral' and the old name was later
# removed from matplotlib; the colours are the same
plt.scatter(X[:, 0], X[:, 1], c=Y, cmap=plt.cm.nipy_spectral)

In [None]:
# Same visualization as a 3D scatter of three principal components
X = tfidf_mat
pca = decomposition.PCA(n_components=3)
# the sparse tf-idf matrix is converted to a dense array before the projection
X = pca.fit_transform(X.toarray())
Y = RD['Classification'].values
# importing Axes3D registers the '3d' projection on older matplotlib versions
from mpl_toolkits.mplot3d import Axes3D
fig = plt.figure(1, figsize=(10, 4))
plt.clf()
# Create the 3D axes through the figure so that they are attached to it;
# Axes3D(fig, ...) no longer adds itself to the figure on current matplotlib,
# which leaves the plot empty.
ax = fig.add_axes([0, 0, .95, 1], projection='3d')
ax.view_init(elev=15, azim=210)
# 'spectral' was renamed to 'nipy_spectral' and the old name was later
# removed from matplotlib; the colours are the same
scatplot = ax.scatter(X[:, 0], X[:, 1], X[:, 2], c=Y, cmap=plt.cm.nipy_spectral)


## Fit a logistic classifier

In [None]:
# Features are the tf-idf rows, targets the recipe classifications
X = tfidf_mat
Y = RD['Classification'].values
# first shuffle the entries
# NOTE(review): no random seed is set, so the split differs between runs
num_recipes = X.shape[0]
ind = np.random.permutation(num_recipes)

# separate into training and test set (the first 90% of the shuffled
# entries are used for training, the remainder for testing)
train_num = int(round(.9*num_recipes))
train_ind, test_ind = ind[:train_num], ind[train_num:]

train_X, train_Y = X[train_ind, :], Y[train_ind]
test_X, test_Y = X[test_ind, :], Y[test_ind]

# Single-argument print(...) runs on both Python 2 and 3; the two spaces
# after '=' reproduce the output of the former `print "... = ", value`.
print("num_train_data =  %d" % len(train_Y))
print("num_test_data =  %d" % len(test_Y))

In [None]:
from sklearn import linear_model
# create a logistic regression classifier that uses a cross-validation set to
# find the optimum C parameter; it is configured with an L1 penalty and the
# liblinear solver
logreg = linear_model.LogisticRegressionCV(solver='liblinear', penalty='l1')

In [None]:
# fit the classifier on the training rows and their classifications
logreg.fit(train_X, train_Y)

In [None]:
# Predict the classification on the test data
# expected: the true classifications of the held-out recipes
expected = test_Y
# predicted: what the fitted classifier assigns to the same recipes
predicted = logreg.predict(test_X)

In [None]:
from sklearn import metrics
# Print the classification report for the test set
print("Classification report for classifier %s:\n%s\n"
     % (logreg, metrics.classification_report(expected, predicted)))
# Legend: position in the categories list followed by the category name
print ("The classification categories are as follows:")
num_categories = len(categories)
# enumerate replaces the xrange index loop (xrange does not exist on Python 3)
for i, category in enumerate(categories):
    print ("%d %s" % (i, category))
# Confusion matrix of expected versus predicted classifications
print("\nConfusion matrix:\n%s" % metrics.confusion_matrix(expected, predicted))

# Plot the classification result

In [None]:
# project the result to 2D
# NOTE(review): diff_results (20 for correct, 0 for wrong predictions) is not
# read anywhere in the visible cells; kept in case a later cell uses it
diff_results = 20*np.ones(test_Y.shape)
diff_results[expected != predicted] = 0
pca = decomposition.PCA(n_components=2)
test_X_2d = pca.fit_transform(test_X.toarray())
fig = plt.figure(1, figsize=(10, 4))
plt.clf()
# boolean masks over the test recipes, computed once
is_correct = expected == predicted
is_wrong = expected != predicted
# White markers get an explicit black outline: on matplotlib 2 and later
# the default marker edge takes the face colour, which would make white
# points invisible on a white background.
correct = plt.scatter(test_X_2d[is_correct, 0], test_X_2d[is_correct, 1],
                      c='white', edgecolors='black')
incorrect = plt.scatter(test_X_2d[is_wrong, 0], test_X_2d[is_wrong, 1],
                        marker='x', c='red')
plt.legend((correct, incorrect), ('correctly classified', 'incorrectly classified'),
           loc='lower left')