In [1]:
#--- load required packages
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from scipy.io import loadmat

#--- seed shared by both splits below so the partitions are reproducible
random_state = 0

#--- read the MATLAB exports; every file is indexed by the variable name it stores
ing_mat = loadmat('MATLAB/ingredients.mat')['ingredients']
cityDist_mat = loadmat('MATLAB/citiesDistMat.mat')['citiesDistMat']
labelName_mat = loadmat('MATLAB/labelNames.mat')['labelNames']
labels_mat = loadmat('MATLAB/labels.mat')['labels']
recipe_mat = loadmat('MATLAB/recipes.mat')['recipes']

#--- ingredient names, used as the column names of the recipe matrix
ing_headline = [entry[0] for entry in ing_mat[0]]

#--- build the data matrices
#--- NOTE: .values turns each pandas frame back into its numpy representation,
#--- which is what the data loader further down is fed with
recipe_frame = pd.DataFrame(recipe_mat, columns=ing_headline)
label_frame = pd.DataFrame(labels_mat, columns=['label'])
dataset_X = recipe_frame.values  # predictors
dataset_y = label_frame.values   # labels

#--- hold out 20% of all recipes as test set ...
X_train_full, X_test, y_train_full, y_test = train_test_split(
    dataset_X, dataset_y,
    test_size=0.2,
    random_state=random_state,
)
#--- ... then 25% of the remainder as validation set
X_train, X_val, y_train, y_val = train_test_split(
    X_train_full, y_train_full,
    test_size=0.25,
    random_state=random_state,
)
X_train_len = X_train.shape[0]

#--- friendlier alias for the list of ingredient names
ingredients = ing_headline

##--- additionally have cuisines as list
#cuisines = []
#for n in range(0, 12, 1):
#    idx = dataset_y.index[dataset_y.label == n+1]
#    cuisines.append(labelName_mat[idx[0]][0].item())

#--- how to get index of rows corresponding to one cuisine
# dataset_y.index[dataset_y.label == 1]

# using this quite often, could have just created a list
# with the corresponding indices, will leave it like it is for now

In [2]:
#--- after .values the data is a plain numpy array without column names,
#--- in contrast to the pandas table (with ingredient column names) it was built from
X_train

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 1],
       [0, 0, 0, ..., 0, 0, 0]], dtype=uint8)

In [3]:
#--- loading required packages

from tqdm import tqdm
from torchvision import transforms
import torch.nn as nn
from torch.utils.data import DataLoader,Dataset
import torch
import torch.optim as optim
from torch.autograd import Variable


In [4]:
#--- adding noise to noise_level% of a recipe,
#--- i.e. flip 0's to 1's and the other way around
def add_noise(batch, noise_level=0.3):
    """Return a copy of ``batch`` in which entries are randomly flipped.

    Every entry is flipped independently with probability ``noise_level``:
    a zero becomes 1 and any non-zero value becomes 0.

    Args:
        batch: tensor of 0/1 indicators (any shape, any device).
        noise_level: per-entry flip probability; 0.0 leaves the batch
            unchanged, 1.0 flips every entry.

    Returns:
        A float32 tensor with the same shape as ``batch`` holding the noisy
        0/1 values. It is computed under ``torch.no_grad()`` and therefore
        does not track gradients.
    """
    with torch.no_grad():
        # Draw the mask on the batch's own device; a CPU mask combined with a
        # batch on another device would make logical_xor fail.
        noise = torch.rand(batch.shape, device=batch.device) < noise_level
        return torch.logical_xor(batch, noise).to(torch.float32)


In [5]:
#--- denoising autoencoder
class AutoEncoder(nn.Module):
    """Autoencoder with one linear encoder layer and one linear decoder layer.

    The hidden layer has the same width as the input. ``forward`` returns the
    raw decoder output; no sigmoid is applied inside the model, so callers
    that need values in (0, 1) apply ``torch.sigmoid`` themselves.

    Args:
        input_shape: number of features per sample, used as the width of the
            input, the hidden layer and the reconstruction. Defaults to 709.
    """

    def __init__(self, input_shape=709):
        super().__init__()
        # Size both layers from the argument instead of a hard-coded 709, so
        # the constructor parameter actually takes effect.
        self.layerE0 = nn.Linear(input_shape, input_shape) #encoder
        self.layerD0 = nn.Linear(input_shape, input_shape) #decoder

    #--- define forward pass
    def forward(self, x):
        """Encode ``x`` with ReLU, then decode; returns un-squashed scores."""
        # encode
        hidden = nn.functional.relu(self.layerE0(x))

        # decode (sigmoid is applied by the caller, not here)
        return self.layerD0(hidden)

In [6]:
#--- data loader
#--- convert the numpy splits to float32 tensors
X_train_tensor = torch.tensor(X_train, dtype=torch.float32)
X_val_tensor = torch.tensor(X_val, dtype=torch.float32)
#--- shuffled mini-batches over the training tensor; batch size to be tuned
dataloader = DataLoader(X_train_tensor, shuffle=True, batch_size=16)

In [7]:
#--- just a reminder: average number of ingredients per recipe (row sums, then mean)
X_train_tensor.sum(dim=-1).mean()

tensor(10.8225)

In [8]:
#--- training model

#--- recipes contain only 10-11 ingredients on average, so noise is added to just
#--- 0.5% of the entries, i.e. roughly 3-4 of the 709 ingredient flags of a recipe
#--- get flipped.
#--- this is a hyperparameter to be tuned! (add noise to more or fewer ingredients)

#--- seed torch so weight initialisation, batch shuffling and the injected noise
#--- are repeatable from run to run
torch.manual_seed(random_state)

#--- size the network from the data instead of repeating the column count by hand
model = AutoEncoder(input_shape=X_train_tensor.shape[1])
#--- NOTE: this rebinds the name `optim`, which the imports cell bound to the
#--- torch.optim module; the name is kept so existing references keep working
optim = torch.optim.Adam(model.parameters(), lr=0.001)

# don't know which loss makes most sense here;
# swap in nn.L1Loss() to compare against the MSE objective
criterion = nn.MSELoss()

for epoch in range(50):
    for batch in dataloader:
        noisy_batch = add_noise(batch, noise_level=0.005) #0.5%
        optim.zero_grad()

        # the model returns raw scores; apply sigmoid
        # so that output is between 0 and 1
        pred = torch.sigmoid(model(noisy_batch))

        # the target is the clean batch, the input was its noisy version
        loss = criterion(pred, batch)

        loss.backward()
        optim.step()

In [12]:
#--- sanity check for add_noise:
#--- with noise_level=.0 nothing is flipped, so the result must equal the original x
torch.eq(add_noise(X_train_tensor, noise_level=.0), X_train_tensor).all()

tensor(True)

In [17]:
#--- raw model output (before sigmoid) for recipe no. 0, restricted to the
#--- ingredients that are in the recipe (the higher the value, the more
#--- strongly the model recommends that particular ingredient)
n_recipe = 0
recipe_scores = model(X_train_tensor)[n_recipe]
recipe_scores[X_train_tensor[n_recipe] == 1]

tensor([5.8278, 3.6033, 6.4118, 2.4321, 7.6689, 4.5361, 4.6442, 4.5032, 3.8904,
        3.9517, 6.5478, 8.3725, 4.1392, 6.0618, 4.5452],
       grad_fn=<IndexBackward>)

In [18]:
from itertools import compress
#--- names of the ingredients contained in recipe no. n_recipe
in_recipe = (X_train_tensor[n_recipe] == 1).tolist()
recipe1 = list(compress(ingredients, in_recipe))
recipe1

['basil',
 'brown sugar',
 'butter',
 'dehydrated onion',
 'egg',
 'flour',
 'garlic',
 'Italian breadcrumbs',
 'lemon',
 'mozzarella',
 'oregano',
 'parmesan cheese',
 'pork',
 'tomato sauce',
 'water']

In [19]:
#--- ingredient names as np.array, so they can be indexed with an array of positions
arr_ingredients = np.asarray(ingredients)
#--- top k recommendations (hopefully including all ingredients from recipe)
k = 40
topk_idx = torch.topk(model(X_train_tensor)[n_recipe], k).indices.numpy()
topk_suggestions = arr_ingredients[topk_idx]
#--- split the suggestions into new ones and recipe ingredients that were missed
not_in_recipe = topk_suggestions[~np.isin(topk_suggestions, recipe1)]
missing_from_topk = np.setdiff1d(recipe1, topk_suggestions)
print("top k suggestions\n", topk_suggestions)
print("\ntop suggestions which are not in recipe\n", not_in_recipe)
print("\ningredients in original recipe that do not appear in recommendations\n", 
      missing_from_topk)


top k suggestions
 ['parmesan cheese' 'egg' 'oregano' 'butter' 'tomato sauce' 'basil'
 'garlic' 'water' 'flour' 'Italian breadcrumbs' 'pork' 'mozzarella'
 'lemon' 'brown sugar' 'dehydrated onion' 'red food coloring' 'prosciutto'
 'eggplant' 'onion' 'cottage cheese' 'pesto' 'pepper' 'veal'
 'marinara sauce' 'beef' 'ricotta cheese' 'garlic powder' 'salt'
 'tomato paste' 'Italian salad dressing' 'olive oil' 'parsley'
 'white bread' 'breadcrumb' 'milk' 'pasta sauce' 'chicken' 'mushroom'
 'garlic salt' 'greens']

top suggestions which are not in recipe
 ['red food coloring' 'prosciutto' 'eggplant' 'onion' 'cottage cheese'
 'pesto' 'pepper' 'veal' 'marinara sauce' 'beef' 'ricotta cheese'
 'garlic powder' 'salt' 'tomato paste' 'Italian salad dressing'
 'olive oil' 'parsley' 'white bread' 'breadcrumb' 'milk' 'pasta sauce'
 'chicken' 'mushroom' 'garlic salt' 'greens']

ingredients in original recipe that do not appear in recommendations
 []


In [16]:
#--- single highest-scoring ingredient for training recipe no. 0
np.asarray(ingredients)[torch.topk(model(X_train_tensor)[0], 1).indices.numpy()]

array(['parmesan cheese'], dtype='<U30')

In [None]:
#--- boolean mask (wrapped in a list) of the entries equal to 1 in training recipe no. 0
[X_train_tensor[0]==1]

In [None]:
#--- raw model output for the ingredients that are NOT in training recipe no. 0
model(X_train_tensor)[0][X_train_tensor[0]==0]

In [None]:
#--- same look at the raw output, but for validation recipe no. 0
model(X_val_tensor)[0][X_val_tensor[0]==0]

In [None]:
#--- presumably the last mini-batch left over from the training loop (its loop variable)
batch