In [None]:
import numpy as np
import sklearn 
import pandas as pd
import nltk
import tensorflow as tf
from tensorflow.keras import layers

In [None]:
# Load the preprocessed dish table and preview its first five rows.
csv_path = 'data/processed_dishes_v3.csv'
data = pd.read_csv(csv_path)
data.head()

In [None]:
# Cast each free-text column to strings, going through the underlying NumPy array.
# NOTE(review): with this cast a missing value (NaN) comes out as the literal
# string 'nan' — confirm that is acceptable for the text features.
for text_column in ('menu_section', 'dish_name', 'cleaned_descriptions', 'full_description'):
    data[text_column] = data[text_column].values.astype('str')

In [None]:
# Model input: the single 'full_description' text column.
# Earlier alternative, kept for reference (all columns 'dish_id'..'cleaned_descriptions'):
# features = data.loc[:, 'dish_id':'cleaned_descriptions']
features = data['full_description']

# Targets: every column from 'contains_peanuts' through 'contains_meat'
# (label-based slice, both endpoints included), converted to a NumPy array.
label_frame = data.loc[:, 'contains_peanuts':'contains_meat']
Y = label_frame.to_numpy()

In [None]:
# Tokenise every description on whitespace, giving a Series of word lists.
# Earlier variant, from when `features` was a DataFrame:
# split_descriptions = features['cleaned_descriptions'].astype(str).apply(lambda s: s.split())
split_descriptions = features.astype(str).map(str.split)

In [None]:
# Collect every distinct whitespace-delimited token across all descriptions.
# A set comprehension replaces the previous list comprehension that was run
# only for its side effect (and built a throwaway list of None per row).
s = {word for tokens in split_descriptions for word in tokens}
print(f'Size of vocabulary: {len(s)}')

## Text Representation

Most classifiers and learning algorithms require the input data to be in numerical format rather than strings. Therefore, using a measure called Term Frequency–Inverse Document Frequency (tf-idf), I will convert the strings into vectors of real-valued weights. I have chosen a `min_df` value of 5, which means that a term must appear in at least 5 documents (dish descriptions) to be kept. This will help us remove any unnecessary words, especially since we've included the dish name as part of the features, and some names may be more fun than informative. I have also chosen the `ngram_range` to be `(1, 2)`, indicating that we want unigrams and bigrams. This is because certain food phrases may be more than 1 word long, and capturing those phrases is just as important.

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Unigrams and bigrams; terms found in fewer than 5 documents are dropped.
tfidf = TfidfVectorizer(ngram_range=(1, 2), min_df=5)

# NOTE(review): the vectorizer is fit on the full corpus before the
# train/validation split, so validation rows influence the vocabulary and IDF
# weights — consider fitting on the training split only.
tfidf_sparse = tfidf.fit_transform(features)

# Densify the sparse matrix into a plain NumPy array for the steps that follow.
tfidf_features = tfidf_sparse.toarray()
tfidf_features.shape

In [None]:
# Alternative approach (currently disabled): Keras Tokenizer count matrix instead of TF-IDF
# from keras.preprocessing.text import Tokenizer

# description_only = features['cleaned_descriptions'].to_numpy()
# tokenizer = Tokenizer(num_words=5252)
# tokenizer.fit_on_texts(description_only)
# encoded_description = tokenizer.texts_to_matrix(description_only, mode='count')
# encoded_description

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 25% of the rows for validation; the fixed seed makes the split repeatable.
x_train, x_val, y_train, y_val = train_test_split(
    tfidf_features,
    Y,
    random_state=42,
    test_size=0.25,
)

In [None]:
# Classifier over the TF-IDF matrix.
#
# Two fixes relative to the previous Embedding + LSTM definition:
#   * The inputs are dense float TF-IDF rows, not sequences of integer token
#     ids. An Embedding layer interprets its input as indices, so the
#     fractional weights carried no usable signal. A feed-forward network
#     consumes the TF-IDF vector directly.
#   * The output is one independent sigmoid per label column of Y, i.e. a
#     multi-label setup, which pairs with binary cross-entropy;
#     categorical cross-entropy assumes a single true class per sample.
#
# Layer sizes are derived from the data instead of being hard-coded.
n_features = tfidf_features.shape[1]  # width of the TF-IDF matrix
n_labels = Y.shape[1]                 # one output unit per target column

model = tf.keras.Sequential([
    tf.keras.Input(shape=(n_features,)),
    layers.Dense(128, activation='relu'),
    layers.Dropout(0.2),
    layers.Dense(n_labels, activation='sigmoid'),
])
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

In [None]:
# X = tf.keras.preprocessing.sequence.pad_sequences(split_descriptions, dtype=object, padding='post', value=' ')
# X

In [None]:
# Train for a single epoch in shuffled mini-batches of 128, scoring the
# held-out validation split at the end of the epoch.
model.fit(
    x_train,
    y_train,
    validation_data=(x_val, y_val),
    batch_size=128,
    epochs=1,
    shuffle=True,
    verbose=1,
)

In [None]:
# Sanity-check the trained model on one hand-written description.
# The sample is encoded with the fitted TF-IDF vectorizer so that its columns
# match the features the model was trained on. The previous code called
# `tokenizer`, which is never defined (its cell is commented out) and would
# raise NameError on a fresh kernel.
test = ['chicken bacon ranch sauce mozzarella cheddar']
encoded_test = tfidf.transform(test).toarray()
model.predict(encoded_test)