In [23]:
import pickle
import sys
sys.path.append('../')
from pathlib import Path
import json
import math
from collections import OrderedDict
import numpy as np
import torch
import torch.nn as nn
from sklearn.metrics import accuracy_score


In [3]:
with open('../meta/cuisines.json','r')as f:
    raw = f.read()
    data = json.loads(raw)

cuisines = data["cuisines"]
cuisines.sort()

In [4]:
file = open('../data/validation.sav', 'rb')
# dump information to that file
data = pickle.load(file)
# close the file
file.close()

In [5]:
X_test = data["ingredients"]
Y_test = data["cuisines"]
test_vectors = data["vectors"]

In [6]:
file = open('../data/train.sav', 'rb')
# dump information to that file
data = pickle.load(file)
# close the file
file.close()

In [7]:
X_train = data["ingredients"]
Y_train = data["cuisines"]
train_vecotrs = data["vectors"]

In [8]:
def get_ingredients(X,Y,cuisine):

    ingredients = []

    for x,y in zip(X,Y):
        if y == cuisine:
            ingredients.extend(x)

    ing_set = list(set(ingredients))
    result = {ing:ingredients.count(ing) for ing in ing_set}
    result = dict(sorted(result.items(), key=lambda item: item[1],reverse=True))

    return result

In [9]:
def get_top_ingredients(X,Y,cuisine,i):
    result = list(get_ingredients(X,Y,cuisine).keys())
    
    _result = []
    j = 0
    while j < i and j < len(result):
        _result.append(result[j])

        j+= 1

    return _result 

In [10]:
def get_train_test_match(X_train,Y_train,X_test,Y_test,cuisine,top):
    train_result = get_top_ingredients(X_train,Y_train,cuisine,top)
    test_result = get_top_ingredients(X_test,Y_test,cuisine,top)
    train_set = list(set(train_result))
    test_set = list(set(test_result))

    i = 0
    for elem in train_set:
        if elem in test_set:
            i += 1
    if i > 0:
        return i/len(test_set)
    else:
        return 0


Data analysis
-----------

In [11]:
top = 10
tot = 0
for cuisine in cuisines:
    p = get_train_test_match(X_train,Y_train,X_test,Y_test,cuisine,top)
    tot += p
    print(cuisine,p)


print(tot/len(cuisines))

brazilian 0.1
british 0.4
cajun_creole 0
chinese 0.6
filipino 0.5
french 0.6
greek 0.7
indian 0.4
irish 0.7
italian 0.8
jamaican 0
japanese 0.6
korean 0.5
mexican 0.8
moroccan 0.5
russian 0.7
southern_us 0
spanish 0.6
thai 0.8
vietnamese 0.6
0.495


The percentage match between top 10 ingredients of a cuisine between train and test set 
-----

brazilian 0.1 <br>
british 0.4<br>
cajun_creole 0<br>
chinese 0.6<br>
filipino 0.5<br>
french 0.6<br>
greek 0.8<br>
indian 0.4<br>
irish 0.8<br>
italian 0.8<br>
jamaican 0<br>
japanese 0.5<br>
korean 0.7<br>
mexican 0.8<br>
moroccan 0.4<br>
russian 0.7<br>
southern_us 0<br>
spanish 0.6<br>
thai 0.8<br>
vietnamese 0.5<br>

average match = 0.5<br>

In [12]:
def tf(word,doc):
    count = doc.count(word)
    return count / len(doc)

def df(word,corpus):
    #document frequency
    count = 0
    for doc in corpus:
        if word in doc:
            count += 1
    return count/len(corpus)

def idf(word,corpus):
    return math.log(len(corpus)/(df(word,corpus)+1))

def td_idf(word,doc,corpus):
    return tf(word,doc)*idf(word,corpus)



In [13]:
# doc = X[93]
# print(doc)
# word = doc[5]
# word = "salt"

# scores = []

# valid = []
# invalid = []
# for word in doc:
#     score = df(word,X)
#     if score < 0.09613523566543522:
#         valid.append((word,score))
#     else: 
#         invalid.append((word,score))
#     scores.append(score)

# for r in valid:
#     print(r)

# print(' ')

# for r in invalid:
#     print(r)



# for doc in X:
#     for word in doc:
#         score = df(word,X)
#         scores.append(score)

# scores = np.array(scores)
# print(scores.mean())


0.09613523566543522

In [14]:
result = get_ingredients(X_train,Y_train,'italian')
for r in result:
    print(f'{r}: {result[r]}')

olive oil: 4477
salt: 4247
parmesan cheese: 2344
black pepper: 2146
garlic clove: 1912
garlic: 1905
onion: 1732
tomato: 1698
butter: 1626
water: 1440
basil: 1388
egg: 1269
flour: 1087
pepper: 1052
mozzarella cheese: 1023
oregano: 953
parsley: 949
sugar: 835
white wine: 834
lemon juice: 687
red pepper: 615
leaf parsley: 589
milk: 567
basil leave: 532
chicken breast: 498
cook spray: 491
cream: 467
mushroom: 466
red pepper flake: 461
ricotta cheese: 453
rosemary: 445
carrot: 418
beef: 414
thyme: 411
pasta: 384
tomato paste: 380
bread crumb: 374
tomato sauce: 367
spinach: 364
balsamic vinegar: 357
red bell pepper: 356
purple onion: 350
italian season: 348
plum tomato: 340
zucchini: 332
freshly pepper: 316
shallot: 316
parmigiano reggiano cheese: 306
caper: 306
spaghetti: 303
sea salt: 297
celery: 273
chicken broth: 267
lasagna noodle: 265
italian sausage: 263
arborio rice: 261
prosciutto: 257
pasta sauce: 257
vegetable oil: 253
pinenut: 252
garlic powder: 252
vanilla extract: 241
lemon: 24

In [15]:
class NeurelNetwork(nn.Module):
    def __init__(self, input_size, output_size):
        super(NeurelNetwork, self).__init__()

        self.l1 = nn.Linear(input_size, 150)
        self.l2 = nn.Linear(150,125)
        self.l3 = nn.Linear(125,125)
        self.l4 = nn.Linear(125, output_size)

    def forward(self, X):
        out = torch.rrelu(self.l1(X))
        out = torch.sigmoid(self.l2(out))
        out = torch.rrelu(self.l3(out))
        out = torch.rrelu(self.l4(out))

        return out

In [19]:
INVALID = True
NN_PATH = f'../models/NeuralNetwork.pth'
if INVALID:
    NN_PATH = f'../models/NeuralNetwork_with_invalid.pth'
    cuisines.append('invalid')


In [22]:
n_features = len(train_vecotrs[0])
n_outputs = len(cuisines)
model = NeurelNetwork(n_features, n_outputs)

model = NeurelNetwork(n_features, n_outputs)
model.load_state_dict(torch.load(NN_PATH))
model.eval()
loss = nn.CrossEntropyLoss()


In [29]:
X = test_vectors
Y = Y_test
Y = [cuisines.index(y) for y in Y]


In [35]:
X = torch.tensor(X,dtype=torch.float32)
Y = torch.tensor(Y)

Y_pred = model(X)

l = loss(Y_pred, Y)

_, predicted = torch.max(Y_pred.data, 1)

score = accuracy_score(predicted, Y)

sm = torch.nn.Softmax()
probabilities = sm(Y_pred) 

i = 0
tot = 0
for prob,y in zip(probabilities,Y):
    if y == cuisines.index("invalid"):
        # print(prob)
        # print(max(prob))
        # print('\n')
        tot += max(prob)
        i += 1

print(tot/i)


  X = torch.tensor(X,dtype=torch.float32)
  Y = torch.tensor(Y)
  probabilities = sm(Y_pred)


tensor([7.6895e-14, 1.1123e-13, 5.6370e-12, 3.3833e-15, 6.1350e-15, 9.9966e-01,
        2.4797e-04, 2.7898e-12, 8.2846e-14, 7.9189e-10, 1.5932e-09, 8.8702e-13,
        3.4725e-13, 5.9139e-13, 5.4586e-13, 8.5552e-05, 9.1176e-06, 3.4567e-09,
        1.3177e-09, 2.5649e-15, 2.9491e-14], grad_fn=<UnbindBackward>)
tensor(0.9997, grad_fn=<UnbindBackward>)


tensor([6.3461e-22, 8.7305e-11, 6.7404e-16, 2.5673e-23, 2.8215e-23, 9.9960e-01,
        9.5689e-21, 1.7830e-23, 9.4804e-23, 4.0454e-04, 2.1765e-15, 1.6369e-21,
        8.7935e-22, 7.8983e-22, 3.2412e-18, 1.2896e-21, 1.0398e-14, 3.8001e-08,
        5.4572e-12, 2.5739e-24, 9.0819e-24], grad_fn=<UnbindBackward>)
tensor(0.9996, grad_fn=<UnbindBackward>)


tensor([2.3816e-06, 1.2973e-06, 1.9328e-06, 1.4258e-06, 7.0156e-07, 9.7408e-01,
        8.8356e-06, 2.2567e-06, 2.1415e-06, 3.6363e-05, 2.5721e-02, 1.5451e-06,
        4.1830e-06, 2.2863e-06, 9.8109e-06, 4.0517e-06, 4.3199e-06, 1.1593e-05,
        1.0351e-04, 9.1995e-07, 4.0458e-06], grad_fn