In [91]:
import numpy as np
import sklearn 
import pandas as pd
import nltk
import tensorflow as tf
from tensorflow.keras import layers

In [92]:
# Load the pre-processed dish table (one row per dish, with the cleaned
# description text and the binary `contains_*` label columns) and preview it.
data = pd.read_csv(filepath_or_buffer='data/processed_dishes_v2.csv')
data.head()

Unnamed: 0,dish_id,menu_section,dish_name,menu_description,cleaned_descriptions,contains_peanuts,contains_egg,contains_sesame,contains_fish,contains_shellfish,contains_soy,contains_meat
0,8c310c8c-2461-4360-b10e-d21f331d8a4f,Gluten Free Specialty Pies,Chicken Bacon Ranch,"chicken, bacon, ranch sauce, mozzarella and ch...",chicken bacon ranch sauce mozzarella cheddar,0,0,0,0,0,0,1
1,2aba477b-461d-48b9-86ab-4a488bc39940,Wraps & Roll-Ups,Chinatown Chicken Wrap,tender chicken pieces simmered in oriental hon...,tender chicken pieces simmered oriental honey ...,0,0,0,0,0,1,1
2,54678a69-b974-4391-8933-911a851351aa,Pratos Tradicionais (Traditional Dishes),Katchupa Ref (( small )),refried katchupa with eggs and linguica.,refried katchupa eggs linguica,0,1,0,0,0,0,1
3,6cb8a4f5-5ed5-4ebd-a8a9-5870a9127cf5,Club Sandwiches,Ham & Cheese Club,"served with mayo, lettuce, tomato, pickles and...",served mayo lettuce tomato pickles bacon,0,0,0,0,0,0,1
4,881bf589-645a-426b-a43e-09002cdbbb3e,Noodles (Kitchen Entrée),Yaki Udon or Soba,pan fried udon noodles or wheat flour noodles ...,pan fried udon noodles wheat flour noodles cho...,0,0,1,0,0,1,1


In [94]:
# Cast the free-text columns to plain strings so later text processing never
# sees a non-string value (any missing entry ends up as the text 'nan').
for text_column in ('menu_section', 'dish_name', 'cleaned_descriptions'):
    data[text_column] = data[text_column].values.astype('str')

## ignore

In [34]:
# Scratch single-label setup: description text as input, peanut flag as target.
X = data.loc[:, 'cleaned_descriptions']  # description only
Y = data.loc[:, 'contains_peanuts'].to_numpy()

In [32]:
from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF over unigrams and bigrams of the cleaned descriptions, with English
# stop words removed, sublinear term-frequency scaling and L2-normalised rows.
tfidf = TfidfVectorizer(sublinear_tf=True, min_df=1, norm='l2', encoding='latin-1', ngram_range=(1, 2), stop_words='english')
# Read the text column straight from `data`: `description_only` is only
# assigned in a later cell, so referencing it here fails on a fresh kernel
# run from top to bottom.
# NOTE(review): `.toarray()` densifies the matrix; at the shape shown in the
# output (20000 x 47694) that is roughly 7.6 GB as float64 -- consider keeping
# the sparse result if this scratch cell is ever revived.
features = tfidf.fit_transform(data['cleaned_descriptions']).toarray()
features.shape

(20000, 47694)

## end ignore

In [95]:
# Multi-label target matrix: every `contains_*` flag column, peanuts through meat.
Y = data.loc[:, slice('contains_peanuts', 'contains_meat')].to_numpy()
# Descriptive (non-label) columns, dish_id through cleaned_descriptions.
features = data.loc[:, slice('dish_id', 'cleaned_descriptions')]

In [96]:
# Whitespace-tokenise each cleaned description into a list of words.
split_descriptions = features['cleaned_descriptions'].astype(str).map(str.split)

In [97]:
# Collect every distinct whitespace-separated token across all descriptions
# to measure the vocabulary size.
s = set()
for d in split_descriptions:
    # `update` adds all words of one description at once; no throwaway list
    # is built just for its side effects.
    s.update(d)
print(f'Size of vocabulary: {len(s)}')

Size of vocabulary: 3587


In [98]:
# Use the Tokenizer from the tensorflow.keras namespace, matching the
# `tensorflow.keras` imports used for the model, instead of the standalone
# `keras` package.
from tensorflow.keras.preprocessing.text import Tokenizer

# Bag-of-words encoding of the cleaned descriptions: one row per dish, one
# column per word index (the matrix is `num_words` columns wide), each cell
# holding how many times that word occurs in the description.
description_only = features['cleaned_descriptions'].to_numpy()
# NOTE(review): 5252 exceeds the 3587-word vocabulary printed above, so the cap
# presumably drops nothing -- confirm, and keep it in sync with the model cell.
tokenizer = Tokenizer(num_words=5252)
tokenizer.fit_on_texts(description_only)
encoded_description = tokenizer.texts_to_matrix(description_only, mode='count')
encoded_description

array([[0., 1., 0., ..., 0., 0., 0.],
       [0., 1., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 1., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 1., ..., 0., 0., 0.]])

In [100]:
# Multi-label allergen classifier: the final layer has 7 sigmoid units, one per
# `contains_*` column of Y (peanuts, egg, sesame, fish, shellfish, soy, meat).
# NOTE(review): the training cell feeds this model the count matrix produced by
# `texts_to_matrix(mode='count')`, so the Embedding layer looks up word *counts*
# as if they were token ids. An Embedding/LSTM stack normally expects padded
# integer sequences (`texts_to_sequences` + `pad_sequences`) -- TODO confirm the
# intended input encoding.
model = tf.keras.Sequential()
model.add(layers.Embedding(5252, 128))
model.add(layers.LSTM(128))
model.add(layers.Dense(7, activation='sigmoid'))
# The labels are independent yes/no flags (a dish can have several at once), so
# each sigmoid output is scored with binary cross-entropy. Categorical
# cross-entropy assumes exactly one true class per sample and is the wrong
# objective for these targets.
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

In [101]:
# X = tf.keras.preprocessing.sequence.pad_sequences(split_descriptions, dtype=object, padding='post', value=' ')
# X

In [103]:
# Train on the encoded descriptions against the label matrix, using Keras'
# default epoch count and batch size (none are specified here).
model.fit(x=encoded_description, y=Y, verbose=1)



<tensorflow.python.keras.callbacks.History at 0x7f84eaa38550>

In [104]:
# Persist the trained model. The empty path makes Keras write its artifacts
# into the current working directory (the log reports "Assets written to: assets").
# NOTE(review): consider a dedicated output directory so model files do not mix
# with the notebook's own files -- confirm nothing loads the model from here first.
model.save(filepath='')

INFO:tensorflow:Assets written to: assets


In [105]:
# Smoke test: encode one known description with the fitted tokenizer, in the
# same count-matrix form used for training, and show the per-label scores.
test = ['chicken bacon ranch sauce mozzarella cheddar']
encoded_test = tokenizer.texts_to_matrix(texts=test, mode='count')
model.predict(x=encoded_test)

array([[0.05101454, 0.28925067, 0.13817862, 0.0783166 , 0.11783832,
        0.25789842, 0.9949615 ]], dtype=float32)