In [6]:
import os
import random
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import sklearn as sklearn
import seaborn as sns
import statistics
import itertools

from sklearn.metrics.pairwise import cosine_similarity
from sklearn.model_selection  import train_test_split, ShuffleSplit
from similarity_functions     import asymmetric_cosine, jaccard 

import surprise  # Package called 'scikit-surprise'

%matplotlib inline

In [100]:
class MatrixFactorization():
    """A basic rating prediction algorithm based on matrix factorization using SVD.
    
    """
    
    def __init__(self, learning_rate, n_epochs, n_factors, random_state = None):
        
        self.lr = learning_rate    # learning rate for SGD.
        self.n_epochs = n_epochs   # Number of iterations of SGD.
        self.n_factors = n_factors # Number of factors.
        self.random_state = random_state
        
        
    def train(self, data):
        """Learn the decomposition vectors p_u and q_i with Stochastic Gradient Descent (SGD).
        
        """
        
        self.n_users = data.shape[0]
        self.n_items = data.shape[1]
        
        if self.random_state != None:
            np.random.seed(self.random_state)
        
        # Randomly initialize the user and item factors.
        p = np.random.normal(0, .1, (self.n_users, self.n_factors))
        q = np.random.normal(0, .1, (self.n_items, self.n_factors))
        
        # SGD.
        for _ in range(self.n_epochs):
            for u, i in zip(range(self.n_users), range(self.n_items)):
                
                # Treat non-present ingredients as missing values.
                if data.iloc[u, i] == 0:
                    break
                else:
                    r_ui = data.iloc[u, i]
                
                err = r_ui - np.dot(p[u], q[i])
                
                # Update vectors p_u and q_i.
                p[u] += self.lr * err * q[i]
                q[i] += self.lr * err * p[u]
                # Note: in the update of q_i, we should actually use the previous (non-updated) value of p_u.
                # In practice it makes almost no difference.
        
        self.p, self.q = p, q
        self.data = data

        
    def recommend(self, n_recommendations):
        """For all recipes calculate the estimated ingredient scores and return the top
        *n_recommendations* best ingredients.
        
        """
        
        scores = np.dot(self.p, self.q.T)
        
        # Set ingredient scores of already present ingredients to zero.
#         scores[self.data == 1] = 0
        
        # For each recipe, get the indices of the *n_recommendations* highest-scoring ingredients in order.
        recommendations_idx = np.argsort(-scores, axis = 1)[:, :n_recommendations]

        # Convert recommendation indices to ingredient names.
        recommendations = self.data.columns[recommendations_idx]

        return recommendations

In [59]:
# Load data.
X = pd.read_csv('train_dataset.csv', sep = '\t', index_col = 0).drop('cuisine', axis = 1)
cuisines = pd.read_csv('train_dataset.csv', sep = '\t', usecols = ['cuisine'])
cuisines.set_index(X.index, inplace = True)

In [114]:
algo = MatrixFactorization(learning_rate = .001, n_epochs = 5000000, n_factors = 15, random_state = 1)
algo.train(X)
recommendations = algo.recommend(n_recommendations = 10)

In [113]:
# Print some recipe + recommendation samples.
recipe1 = 7
recipe2 = 12
recipe3 = 10

print('Recipe {} ({}): \n{}\n'.format(X.index[recipe1], 
                                      cuisines.iloc[recipe1, 0],
                                      X.columns[np.where(X.iloc[recipe1,] != 0)]))
print('Recommended ingredients: \n{}\n\n'.format(recommendations[recipe1,]))

print('Recipe {} ({}): \n{}\n'.format(X.index[recipe2], 
                                      cuisines.iloc[recipe2, 0],
                                      X.columns[np.where(X.iloc[recipe2,] != 0)]))
print('Recommended ingredients: \n{}\n\n'.format(recommendations[recipe2,]))

print('Recipe {} ({}): \n{}\n'.format(X.index[recipe3],
                                      cuisines.iloc[recipe3, 0],
                                      X.columns[np.where(X.iloc[recipe3,] != 0)]))
print('Recommended ingredients: \n{}\n\n'.format(recommendations[recipe3,]))

Recipe 16903 (mexican): 
Index(['cheddar_cheese', 'jalapeno_chilies', 'lettuce', 'lime', 'pork',
       'purple_onion', 'peppers', 'olive', 'cilantro_chopped_fresh',
       'pepper_ground_black', 'tortillas_corn'],
      dtype='object')

Recommended ingredients: 
Index(['zucchini', 'potatoes', 'paprika', 'mustard_seeds', 'tomatoes',
       'monterey_jack', 'large_eggs', 'basil_dried', 'cumin_seed',
       'crumbs_bread'],
      dtype='object')


Recipe 41995 (mexican): 
Index(['avocado', 'flank_steak', 'garlic', 'ground_cumin', 'kosher_salt',
       'onions', 'tomatoes', 'olive', 'pepper_black', 'powder_chili',
       'pepper_ground_black', 'crushed_pepper_flakes', 'cilantro_fresh',
       'cinnamon_ground', 'coriander_ground', 'juice_lime'],
      dtype='object')

Recommended ingredients: 
Index(['bacon', 'cabbage_green', 'tomato_paste', 'garlic_minced',
       'pepper_sauce', 'vegetable_cooking_spray', 'sugar_white', 'chives',
       'juice_lemon', 'grated_nutmeg'],
      dtype='obje