In [None]:
import numpy as np
import sklearn 
import pandas as pd
import nltk
import tensorflow as tf
from tensorflow.keras import layers
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import LinearSVR, SVR
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

In [None]:
# Read the pre-processed dish table and preview its first five rows.
data = pd.read_csv(filepath_or_buffer='data/processed_dishes_v4.csv')
data.head()

In [None]:
# Coerce every free-text column to plain strings so the vectoriser never sees
# non-string cells (missing values become the literal text 'nan').
for text_column in ('menu_section', 'dish_name', 'cleaned_descriptions', 'full_description'):
    data[text_column] = data[text_column].values.astype('str')

In [None]:
# Features: the combined description text of each dish.
X = data.loc[:, 'full_description']
# Targets: every column from 'contains_peanuts' through 'contains_meat' (label slice, inclusive).
Y = data.loc[:, 'contains_peanuts':'contains_meat']

## Text Representation

Most classifiers and learning algorithms require the input data to be in numerical format rather than strings. Therefore, using a measure called Term Frequency–Inverse Document Frequency (tf-idf), I will convert the strings into vectors of real-valued weights. I have chosen a `min_df` value of 5, which means that a term must appear in at least 5 documents (dish descriptions) to be kept. This will help us remove any unnecessary words, especially since we've included the dish name as part of the features, and some names may be more fun than informative. I have also chosen the `ngram_range` to be `(1, 2)`, indicating that we want unigrams and bigrams. This is because certain food phrases may be more than 1 word long, and capturing those phrases is just as important.

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Unigram + bigram tf-idf; terms found in fewer than 5 documents are dropped.
tfidf = TfidfVectorizer(ngram_range=(1, 2), min_df=5)
tfidf_sparse = tfidf.fit_transform(X)
# Densify once here; every model below is trained on this dense array.
tfidf_features = tfidf_sparse.toarray()
tfidf_features.shape

## Generating Models

In [None]:
def get_train_test_for_ingredient(ingredient):
    """Split the tf-idf features and one ingredient's labels into train and test sets.

    Args:
        ingredient: Suffix of the label column; the `contains_<ingredient>`
            column of the global `data` frame is used as the target.

    Returns:
        Tuple `(X_train, X_test, Y_train, Y_test)` from a 75/25 split of the
        global `tfidf_features`. The seed is fixed at 42, so every call yields
        the same row partition.
    """
    labels = data[f'contains_{ingredient}'].to_numpy()
    split = train_test_split(tfidf_features, labels, test_size=0.25, random_state=42)
    return tuple(split)

In [None]:
def get_SVM_predictions(ingredient): 
    """Train and evaluate a linear support-vector regressor for one ingredient.

    The regressor is fitted on the `contains_<ingredient>` labels; a held-out
    sample counts as positive when its raw output exceeds 0.5.

    Args:
        ingredient: Suffix of the `contains_<ingredient>` label column.

    Returns:
        Tuple `(clf, predictions, accuracy)`: the fitted `LinearSVR`, its raw
        outputs on the test split, and the accuracy of the thresholded outputs.
    """
    print(f'Creating model to predict contains_{ingredient}...')
    X_train, X_test, Y_train, Y_test = get_train_test_for_ingredient(ingredient)
    # Alternative estimator: SVR(kernel='linear')
    regressor = LinearSVR(loss='squared_epsilon_insensitive', random_state=42)
    regressor.fit(X_train, Y_train)

    print(f'Validating model to predict contains_{ingredient}...')
    raw_scores = regressor.predict(X_test)
    accuracy = accuracy_score(Y_test, raw_scores > 0.5)
    return regressor, raw_scores, accuracy

In [None]:
# Ingredient order defines the index used by every per-model result list below.
ingredients = ['peanuts', 'egg', 'sesame', 'fish', 'shellfish', 'soy', 'meat']

# One SVM per ingredient; the three lists stay aligned with `ingredients`.
models_SVM, predictions_SVM, accuracies_SVM = [], [], []
for ingredient in ingredients:
    model, predictions, accuracy = get_SVM_predictions(ingredient)
    accuracies_SVM.append(accuracy)
    predictions_SVM.append(predictions)
    models_SVM.append(model)

In [None]:
def get_NN_predictions(ingredient, epochs=5):
    """Train, save and evaluate a small dense network for one ingredient.

    The network is: Dense(relu) -> Dense(256, linear) -> Dropout(0.1) ->
    Dense(1, sigmoid), trained with Adam (lr=0.001) on binary cross-entropy.
    The fitted model is written to `nn/<ingredient>`.

    Args:
        ingredient: Suffix of the `contains_<ingredient>` label column.
        epochs: Number of training epochs.

    Returns:
        Tuple `(model, predictions, accuracy)`: the fitted Keras model, its
        sigmoid outputs on the test split (one column per row, as returned by
        `model.predict`), and the accuracy after thresholding at 0.5.
    """
    print(f'Creating model to predict contains_{ingredient}...')
    X_train, X_test, Y_train, Y_test = get_train_test_for_ingredient(ingredient)

    # Take the input width from the data instead of hard-coding the vocabulary
    # size, so the model still builds when the tf-idf feature count changes.
    # The first hidden layer is half that width (5787 for 11574 features).
    n_features = X_train.shape[1]
    hidden_units = max(1, n_features // 2)

    model = tf.keras.Sequential([
        layers.Dense(hidden_units, input_shape=(n_features,), activation='relu'),
        layers.Dense(256),
        layers.Dropout(0.1),
        layers.Dense(1, activation='sigmoid')
    ])

    model.compile(
        optimizer=tf.keras.optimizers.Adam(learning_rate=0.001),
        loss=tf.keras.losses.BinaryCrossentropy(),
        metrics=['accuracy', tf.keras.metrics.FalsePositives(), tf.keras.metrics.FalseNegatives()]
    )

    model.fit(
        X_train,
        Y_train,
        epochs=epochs,
        batch_size=128,
        shuffle=True,
        verbose=0
    )

    # Persist the model so it can be reloaded later without retraining.
    model.save("nn/" + ingredient)

    print(f'Validating model to predict contains_{ingredient}...')

    predictions = model.predict(X_test)
    prediction_classes = (predictions > 0.5).astype("int32")
    accuracy = accuracy_score(Y_test, prediction_classes)

    return model, predictions, accuracy

In [None]:
# method 1: trains one neural network per ingredient for 10 epochs
# (get_NN_predictions also saves each model under nn/<ingredient>)
models_nn, accuracies_nn, predictions_nn = [], [], []
for ingredient in ingredients:
    model, predictions, accuracy = get_NN_predictions(ingredient, epochs=10)
    accuracies_nn.append(accuracy)
    predictions_nn.append(predictions)
    models_nn.append(model)

In [None]:
# NOTE(review): one-off retrain of the 'meat' network with 5 epochs. get_NN_predictions
# saves to "nn/meat", so running this cell after the method-1 loop (epochs=10) replaces
# that saved model. The returned (model, predictions, accuracy) tuple is only displayed,
# not stored in models_nn / predictions_nn / accuracies_nn.
get_NN_predictions('meat', epochs=5)

In [None]:
# method 2: if models are saved, this loads them from saved location
# and rebuilds the three NN result lists from the reloaded models
models_nn, accuracies_nn, predictions_nn = [], [], []
for ingredient in ingredients:
    print('loading ingredient ', ingredient)
    X_train, X_test, Y_train, Y_test = get_train_test_for_ingredient(ingredient)
    model = tf.keras.models.load_model('nn/' + ingredient)
    models_nn.append(model)

    # Score the reloaded model on the same held-out split used at training time.
    predictions = model.predict(X_test)
    predictions_nn.append(predictions)
    prediction_classes = (predictions > 0.5).astype("int32")
    accuracy = accuracy_score(Y_test, prediction_classes)
    accuracies_nn.append(accuracy)

In [None]:
def get_RF_predictions(ingredient):
    """Train and evaluate a random-forest classifier for one ingredient.

    Args:
        ingredient: Suffix of the `contains_<ingredient>` label column.

    Returns:
        Tuple `(clf, predictions, accuracy)`: the fitted forest, its
        `predict_proba` output on the test split (one column per class), and
        the accuracy of its hard class predictions.
    """
    print(f'Creating model to predict contains_{ingredient}...')
    X_train, X_test, Y_train, Y_test = get_train_test_for_ingredient(ingredient)
    forest = RandomForestClassifier(n_jobs=-1, random_state=42)
    forest.fit(X_train, Y_train)

    print(f'Validating model to predict contains_{ingredient}...')
    predicted_labels = forest.predict(X_test)
    class_probabilities = forest.predict_proba(X_test)
    return forest, class_probabilities, accuracy_score(Y_test, predicted_labels)

In [None]:
# One random forest per ingredient; the lists stay aligned with `ingredients`.
models_rf, predictions_rf, accuracies_rf = [], [], []
for ingredient in ingredients:
    model, predictions, accuracy = get_RF_predictions(ingredient)
    accuracies_rf.append(accuracy)
    predictions_rf.append(predictions)
    models_rf.append(model)

In [None]:
# Side-by-side hold-out accuracies, one list per model family (ordered as `ingredients`).
for label, scores in (
    ("svm: ", accuracies_SVM),
    ("neural network: ", accuracies_nn),
    ("random forest: ", accuracies_rf),
):
    print(label, scores)

## Ensembling Time!

In [None]:
import math

def sigmoid(x):
    """Return the logistic function 1 / (1 + e^-x) of a scalar.

    Uses the algebraically equivalent form e^x / (1 + e^x) for negative
    inputs so that large-magnitude negative values saturate to 0.0 instead of
    making `math.exp` raise OverflowError.

    Args:
        x: A real scalar.

    Returns:
        A float in the closed interval [0, 1].
    """
    if x >= 0:
        return 1 / (1 + math.exp(-x))
    # Here x < 0, so exp(x) is at most 1 and cannot overflow.
    exp_x = math.exp(x)
    return exp_x / (1 + exp_x)

# Element-wise version for arrays of scores.
sigmoid_v = np.vectorize(sigmoid)

In [None]:
# Soft-voting ensemble: for each ingredient, average the three models' scores
# per test sample and call the sample positive when the mean exceeds 0.5.
# Sizes come from the prediction arrays themselves rather than hard-coded
# counts, so this works for any number of ingredients or test rows.
# NOTE(review): the SVM scores come from a regressor that is thresholded at 0.5
# elsewhere in this notebook; sigmoid maps 0.5 to about 0.62, so this term leans
# positive — confirm that squashing it this way is intended.
assert len(predictions_SVM) == len(predictions_nn) == len(predictions_rf), \
    "each model family must have one prediction array per ingredient"

final_predictions = []
for svm_scores, nn_probs, rf_probs in zip(predictions_SVM, predictions_nn, predictions_rf):
    mean_pred = (
        sigmoid_v(svm_scores)               # squashed regressor output
        + np.asarray(nn_probs).reshape(-1)  # flatten the network's single output column
        + rf_probs[:, 1]                    # second predict_proba column (positive class)
    ) / 3
    final_predictions.append(mean_pred)
final_predictions = (np.array(final_predictions) > 0.5).astype("int32")

In [None]:
def get_final_stats(i, ingredient):
    """Print and return hold-out metrics for the ensemble on one ingredient.

    Prints the accuracy and a scikit-learn classification report.

    Args:
        i: Index into the global `final_predictions` for this ingredient.
        ingredient: Suffix of the `contains_<ingredient>` label column.

    Returns:
        Tuple `(accuracy, false_negatives, false_positives)`.
    """
    # Only the test labels are needed; the fixed seed reproduces the same split.
    _, _, _, Y_test = get_train_test_for_ingredient(ingredient)
    ensemble_classes = final_predictions[i]

    accuracy = accuracy_score(Y_test, ensemble_classes)
    print('accuracy =', accuracy)
    print(classification_report(Y_test, ensemble_classes))

    # Pin the label order so the matrix is always 2x2. Without `labels`, a split
    # where only one class appears yields a 1x1 matrix and the indexing below fails.
    # Assumes the contains_* labels are coded 0/1 — TODO confirm.
    CM = confusion_matrix(Y_test, ensemble_classes, labels=[0, 1])
    # Rows are true labels, columns are predictions: [1][0] = FN, [0][1] = FP.
    return accuracy, CM[1][0], CM[0][1]

In [None]:
# Accumulators for the ensemble metrics; the cells below append one entry per
# ingredient, so re-run this cell first before re-running any of them.
final_accuracies, final_fn, final_fp = [], [], []

In [None]:
# peanuts (ensemble predictions at index 0)
accuracy_peanuts, fn_peanuts, fp_peanuts = get_final_stats(0, 'peanuts')
final_accuracies += [accuracy_peanuts]
final_fn += [fn_peanuts]
final_fp += [fp_peanuts]

In [None]:
# egg (ensemble predictions at index 1)
# The results are bound to *_egg names; they were previously stored under the
# soy variable names, which mislabelled them until the soy cell ran.
accuracy_egg, fn_egg, fp_egg = get_final_stats(1, 'egg')
final_accuracies.append(accuracy_egg)
final_fn.append(fn_egg)
final_fp.append(fp_egg)

In [None]:
# sesame (ensemble predictions at index 2)
accuracy_sesame, fn_sesame, fp_sesame = get_final_stats(2, 'sesame')
final_accuracies += [accuracy_sesame]
final_fn += [fn_sesame]
final_fp += [fp_sesame]

In [None]:
# fish (ensemble predictions at index 3)
accuracy_fish, fn_fish, fp_fish = get_final_stats(3, 'fish')
final_accuracies += [accuracy_fish]
final_fn += [fn_fish]
final_fp += [fp_fish]

In [None]:
# shellfish (ensemble predictions at index 4)
accuracy_shellfish, fn_shellfish, fp_shellfish = get_final_stats(4, 'shellfish')
final_accuracies += [accuracy_shellfish]
final_fn += [fn_shellfish]
final_fp += [fp_shellfish]

In [None]:
# soy (ensemble predictions at index 5)
accuracy_soy, fn_soy, fp_soy = get_final_stats(5, 'soy')
final_accuracies += [accuracy_soy]
final_fn += [fn_soy]
final_fp += [fp_soy]

In [None]:
# meat (ensemble predictions at index 6)
accuracy_meat, fn_meat, fp_meat = get_final_stats(6, 'meat')
final_accuracies += [accuracy_meat]
final_fn += [fn_meat]
final_fp += [fp_meat]

In [None]:
# Summary table: one row per ingredient with the ensemble's hold-out metrics.
results = pd.DataFrame(
    data={
        'Ingredient': ingredients,
        'Validation Accuracy': final_accuracies,
        'Validation False Positives': final_fp,
        'Validation False Negatives': final_fn,
    },
    columns=[
        'Ingredient',
        'Validation Accuracy',
        'Validation False Positives',
        'Validation False Negatives',
    ],
)

In [None]:
# Display the per-ingredient summary table built in the previous cell.
results