In [None]:
import numpy as np
import pandas as pd
import sklearn
import tensorflow as tf
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from tensorflow.keras import layers

In [None]:
# Load the pre-processed dish table; the path is relative to the notebook's working directory.
data = pd.read_csv('data/processed_dishes_v4.csv')
# Preview the first rows (rendered as this cell's output).
data.head()

In [None]:
# Cast every free-text column to str so later text processing only ever sees strings.
# NOTE(review): any missing value would become the literal text 'nan' after this cast —
# confirm that is acceptable for the TF-IDF vocabulary.
text_columns = ['menu_section', 'dish_name', 'cleaned_descriptions', 'full_description']
for column in text_columns:
    data[column] = data[column].values.astype('str')

In [None]:
# Earlier variant, kept for reference: every column from 'dish_id' through 'cleaned_descriptions'.
# features = data.loc[:, 'dish_id':'cleaned_descriptions']
# Model input: only the 'full_description' text column.
features = data['full_description']
# Label matrix: all columns from 'contains_peanuts' through 'contains_meat'
# (label-based .loc slices include both end points).
Y = data.loc[:, 'contains_peanuts':'contains_meat'].to_numpy()

In [None]:
# Tokenise each description on whitespace: one list of words per row.
split_descriptions = features.astype(str).str.split()

In [None]:
# Gather the distinct whitespace-delimited tokens seen across all descriptions.
s = set()
for tokens in split_descriptions:
    s.update(tokens)
print(f'Size of vocabulary: {len(s)}')

In [None]:
# TfidfVectorizer is imported in the notebook's top import cell.
# Vectorise each description with TF-IDF over unigrams and bigrams, dropping
# terms that appear in fewer than 5 documents (min_df=5).
tfidf = TfidfVectorizer(min_df=5, ngram_range=(1, 2))
# fit_transform returns a sparse matrix; it is densified here because the
# result is passed straight to train_test_split and the Keras models below.
tfidf_features = tfidf.fit_transform(features).toarray()
# (n_documents, n_terms) — n_terms is the input width of every model below.
tfidf_features.shape

## Predicting Peanuts (Don't need to run anything here, see below)

In [None]:
# Target for this section: the 'contains_peanuts' column as a NumPy array
# (used as the label for the binary cross-entropy model below).
peanut_Y = data['contains_peanuts'].to_numpy()

In [None]:
# Hold out 25% of the rows for evaluation; random_state=42 keeps the split reproducible.
peanut_X_train, peanut_X_test, peanut_Y_train, peanut_Y_test = train_test_split(tfidf_features, peanut_Y, test_size=0.25, random_state=42)

In [None]:
# Sanity-check the split by printing the shape of each array, one per line.
print(*(part.shape for part in (peanut_X_train, peanut_Y_train, peanut_X_test, peanut_Y_test)), sep='\n')

In [None]:
# Feed-forward classifier over the TF-IDF vectors.
# The input width is read from the training matrix instead of being hard-coded
# (it used to be 11574), so this cell keeps working when the vectoriser's
# vocabulary size changes.
peanut_model = tf.keras.Sequential([
    layers.Dense(5787, input_shape=(peanut_X_train.shape[1],), activation='relu'),
    # No activation is passed here, so this layer is purely linear.
    layers.Dense(256),
    layers.Dropout(0.1),
    # Single sigmoid unit: one score in (0, 1) per dish.
    layers.Dense(1, activation='sigmoid')
])

In [None]:
# Adam (learning rate 0.001) with binary cross-entropy loss. Besides accuracy,
# track the false-positive and false-negative counts.
peanut_model.compile(
    optimizer=tf.keras.optimizers.Adam(learning_rate=0.001), 
    loss=tf.keras.losses.BinaryCrossentropy(), 
    metrics=['accuracy', tf.keras.metrics.FalsePositives(), tf.keras.metrics.FalseNegatives()]
)

In [None]:
# Print the layer-by-layer architecture of the peanut model.
peanut_model.summary()

In [None]:
# Train on the 75% split: 5 epochs, mini-batches of 128, shuffling enabled.
peanut_model.fit(
    peanut_X_train,
    peanut_Y_train,
    epochs = 5,
    batch_size=128,
    shuffle=True
)

In [None]:
# Score the held-out 25%; the result lists the loss followed by the metrics
# passed to compile (accuracy, false positives, false negatives).
peanut_model.evaluate(
    peanut_X_test, 
    peanut_Y_test
)

In [None]:
# tf.keras.models.save_model(peanut_model, '..\..\RNNs\peanut', save_format='tf', overwrite=False)

In [None]:
# Remove this section's arrays and model from the notebook namespace,
# releasing the notebook's references to them.
del peanut_X_test, peanut_Y_test, peanut_X_train, peanut_Y_train, peanut_model

## Predicting Fish (Don't need to run anything here, see below)

In [None]:
# Target for this section: the 'contains_fish' column as a NumPy array
# (used as the label for the binary cross-entropy model below).
fish_Y = data['contains_fish'].to_numpy()

In [None]:
# Hold out 25% of the rows for evaluation; random_state=42 keeps the split reproducible.
fish_X_train, fish_X_test, fish_Y_train, fish_Y_test = train_test_split(tfidf_features, fish_Y, test_size=0.25, random_state=42)

In [None]:
# Feed-forward classifier over the TF-IDF vectors (same architecture as the peanut model).
# The input width is read from the training matrix instead of being hard-coded
# (it used to be 11574), so this cell keeps working when the vectoriser's
# vocabulary size changes.
fish_model = tf.keras.Sequential([
    layers.Dense(5787, input_shape=(fish_X_train.shape[1],), activation='relu'),
    # No activation is passed here, so this layer is purely linear.
    layers.Dense(256),
    layers.Dropout(0.1),
    # Single sigmoid unit: one score in (0, 1) per dish.
    layers.Dense(1, activation='sigmoid')
])

In [None]:
# Adam (learning rate 0.001) with binary cross-entropy loss. Besides accuracy,
# track the false-positive and false-negative counts.
fish_model.compile(
    optimizer=tf.keras.optimizers.Adam(learning_rate=0.001), 
    loss=tf.keras.losses.BinaryCrossentropy(), 
    metrics=['accuracy', tf.keras.metrics.FalsePositives(), tf.keras.metrics.FalseNegatives()]
)

In [None]:
# Train on the 75% split: 5 epochs, mini-batches of 128, shuffling enabled.
fish_model.fit(
    fish_X_train,
    fish_Y_train,
    epochs = 5,
    batch_size=128,
    shuffle=True
)

In [None]:
# Score the held-out 25%; the result lists the loss followed by the metrics
# passed to compile (accuracy, false positives, false negatives).
fish_model.evaluate(
    fish_X_test, 
    fish_Y_test
)


In [None]:
# Remove this section's arrays and model from the notebook namespace,
# releasing the notebook's references to them.
del fish_X_test, fish_Y_test, fish_X_train, fish_Y_train, fish_model

## Generalizing Predictions ... Does it work on everything?

In [None]:
def get_NN_results(ingredient, epochs=5):
    """Train and evaluate a classifier for one ``contains_<ingredient>`` label.

    Reads the target column from the notebook-level ``data`` frame and the
    inputs from ``tfidf_features``, splits them 75/25 with a fixed seed, fits
    a feed-forward network on the training part and evaluates it on the rest.

    Args:
        ingredient: Suffix of the target column; ``data['contains_<ingredient>']``
            must exist, otherwise the column lookup raises ``KeyError``.
        epochs: Number of training epochs (default 5).

    Returns:
        The list returned by ``model.evaluate`` on the held-out split: loss,
        accuracy, false-positive count and false-negative count, in that order.
    """
    print(f'Creating model to predict contains_{ingredient}...')
    ing_Y = data[f'contains_{ingredient}'].to_numpy()
    X_train, X_test, Y_train, Y_test = train_test_split(
        tfidf_features, ing_Y, test_size=0.25, random_state=42
    )

    model = tf.keras.Sequential([
        # Input width follows the feature matrix rather than a hard-coded
        # 11574, so a different TF-IDF vocabulary size does not break training.
        layers.Dense(5787, input_shape=(X_train.shape[1],), activation='relu'),
        # No activation is passed here, so this layer is purely linear.
        layers.Dense(256),
        layers.Dropout(0.1),
        layers.Dense(1, activation='sigmoid')
    ])

    model.compile(
        optimizer=tf.keras.optimizers.Adam(learning_rate=0.001),
        loss=tf.keras.losses.BinaryCrossentropy(),
        metrics=['accuracy', tf.keras.metrics.FalsePositives(), tf.keras.metrics.FalseNegatives()]
    )

    model.fit(
        X_train,
        Y_train,
        epochs=epochs,
        batch_size=128,
        shuffle=True,
        verbose=0
    )

    print(f'Validating model to predict contains_{ingredient}...')

    # The split arrays and the model are locals, so their references are
    # dropped when the function returns; no explicit `del` is needed.
    # NOTE(review): if memory still grows across repeated calls, consider
    # tf.keras.backend.clear_session() between models — confirm before adding.
    return model.evaluate(X_test, Y_test, verbose=0)

In [None]:
# Train one model per ingredient label and tabulate the validation metrics.
ingredients = ['peanuts', 'egg', 'sesame', 'fish', 'shellfish', 'soy', 'meat']
# Labels that get a longer training run (10 epochs); the rest use the
# default epoch count of get_NN_results.
longer_training = ['egg', 'soy']
losses = []
accuracies = []
fps = []
fns = []

for ingredient in ingredients:
    # A conditional expression needs both branches; with no `else` the cell
    # is a SyntaxError and cannot run at all.
    if ingredient in longer_training:
        stats = get_NN_results(ingredient, epochs=10)
    else:
        stats = get_NN_results(ingredient)
    # stats order: loss, accuracy, false positives, false negatives.
    losses.append(stats[0])
    accuracies.append(stats[1])
    fps.append(stats[2])
    fns.append(stats[3])

results = pd.DataFrame({
    'Ingredient': ingredients,
    'Validation BinaryCrossEntropy Loss': losses,
    'Validation Accuracy': accuracies,
    'Validation False Positives': fps,
    'Validation False Negatives': fns
})

In [None]:
# Display the per-ingredient validation summary table.
results

## Improvements for Egg, Soy, and Meat

In [None]:
# Deeper variant of the classifier, tried on the 'contains_meat' label.
ing_Y = data['contains_meat'].to_numpy()
X_train, X_test, Y_train, Y_test = train_test_split(tfidf_features, ing_Y, test_size=0.25, random_state=42)

model = tf.keras.Sequential([
    # Input width follows the feature matrix rather than a hard-coded 11574,
    # so a different TF-IDF vocabulary size does not break training.
    layers.Dense(5787, input_shape=(X_train.shape[1],), activation='relu'),
    # Extra hidden layer compared with the earlier models. Neither this layer
    # nor the next is given an activation, so both are purely linear.
    # NOTE(review): 2056 may have been meant as 2048 — confirm.
    layers.Dense(2056),
    layers.Dense(256),
    layers.Dropout(0.1),
    layers.Dense(1, activation='sigmoid')
])

model.compile(
    optimizer=tf.keras.optimizers.Adam(learning_rate=0.001),
    loss=tf.keras.losses.BinaryCrossentropy(),
    metrics=['accuracy', tf.keras.metrics.FalsePositives(), tf.keras.metrics.FalseNegatives()]
)

In [None]:
# Train the deeper model: 3 epochs, mini-batches of 64, with per-epoch progress output.
model.fit(
    X_train,
    Y_train,
    epochs = 3,
    batch_size=64,
    shuffle=True,
    verbose=1
)

In [None]:
# Evaluate on the held-out split; val_results holds the loss followed by the
# metrics passed to compile (accuracy, false positives, false negatives).
val_results = model.evaluate(X_test, Y_test)