In [None]:
import numpy as np
import sklearn 
import pandas as pd
import nltk
import tensorflow as tf
from tensorflow.keras import layers
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC

In [None]:
# Read the processed dishes table from disk and preview its first five rows.
DISHES_CSV_PATH = 'data/processed_dishes_v3.csv'
data = pd.read_csv(DISHES_CSV_PATH)
data.head()

In [None]:
# Cast each free-text column to plain strings.
# NOTE(review): numpy's string cast turns a missing value (NaN) into the
# literal text 'nan' -- confirm that is acceptable for these columns.
TEXT_COLUMNS = (
    'menu_section',
    'dish_name',
    'cleaned_descriptions',
    'full_description',
)
for text_column in TEXT_COLUMNS:
    data[text_column] = data[text_column].values.astype('str')

In [None]:
# Features: the `full_description` text column.
# Targets: every column from 'contains_peanuts' through 'contains_meat'
# (label-based `.loc` slicing includes both end columns).
FIRST_LABEL_COLUMN = 'contains_peanuts'
LAST_LABEL_COLUMN = 'contains_meat'

X = data['full_description']
Y = data.loc[:, FIRST_LABEL_COLUMN:LAST_LABEL_COLUMN]

## Text Representation

Most classifiers and learning algorithms require the input data to be in numerical format rather than strings. Therefore, using a measure called Term Frequency–Inverse Document Frequency (tf-idf), I will convert the strings into numeric vectors of real-valued weights. I have chosen a `min_df` value of 5, which means that a term must appear in at least 5 documents (dish descriptions) to be kept. This will help us remove any unnecessary words, especially since we've included the dish name as part of the features, and some names may be more fun than informative. I have also chosen the `ngram_range` to be `(1, 2)`, indicating that we want unigrams and bigrams. This is because certain food phrases may be more than 1 word long, and capturing those phrases is equally important.

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Unigram + bigram tf-idf; terms found in fewer than 5 documents are dropped.
tfidf = TfidfVectorizer(ngram_range=(1, 2), min_df=5)

# Fit on the description text, then densify the sparse result into an ndarray.
tfidf_sparse = tfidf.fit_transform(X)
tfidf_features = tfidf_sparse.toarray()
tfidf_features.shape

## Using OneVsRest Classifier for Multi-Label Classification
Because our problem involves multi-label classification, one suggestion was to use the OneVsRest Classifier module. According to [this source](https://towardsdatascience.com/journey-to-the-center-of-multi-label-classification-384c40229bff):
> In an “one-to-rest” strategy, one could build multiple independent classifiers and, for an unseen instance, choose the class for which the confidence is maximized. The main assumption here is that the labels are mutually exclusive. You do not consider any underlying correlation between the classes in this method.

We need to use this because models like Logistic Regression and Naive Bayes only take in a 1-D array of labels. However, we have 7 different labels since we're essentially doing multi-label classification, so below we fit and score a separate binary classifier for each label column.

In [None]:
from sklearn.model_selection import train_test_split

# Split features and labels into train/test parts; the fixed seed keeps the
# partition identical across runs.
SPLIT_SEED = 42
x_train, x_test, y_train, y_test = train_test_split(
    tfidf_features,
    Y,
    random_state=SPLIT_SEED,
)

In [None]:
# Label columns that the per-label training loops iterate over, one at a time.
categories = [
    'contains_peanuts',
    'contains_egg',
    'contains_sesame',
    'contains_fish',
    'contains_shellfish',
    'contains_soy',
    'contains_meat',
]

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.multiclass import OneVsRestClassifier

# Single-step pipeline: logistic regression wrapped in a one-vs-rest classifier.
LogReg_pipeline = Pipeline(
    steps=[('clf', OneVsRestClassifier(estimator=LogisticRegression()))]
)

# The same pipeline object is refit on each label column in turn, so after the
# loop it holds the model for the last category only.
for category in categories:
    print('**Processing {}...**'.format(category))

    # Fit on this label's training targets, then predict the held-out rows.
    LogReg_pipeline.fit(x_train, y_train[category])
    prediction = LogReg_pipeline.predict(x_test)

    # Report this label's accuracy on the test split.
    category_accuracy = accuracy_score(y_test[category], prediction)
    print('Test accuracy is {}'.format(category_accuracy))
    print("\n")

For our Naive Bayes model, I am using MultinomialNB because, as stated in [this source](https://scikit-learn.org/stable/modules/naive_bayes.html):
> MultinomialNB implements the naive Bayes algorithm for multinomially distributed data, and is one of the two classic naive Bayes variants used in text classification (where the data are typically represented as word vector counts, although tf-idf vectors are also known to work well in practice).

In [None]:
# Single-step pipeline: multinomial naive Bayes wrapped in a one-vs-rest
# classifier.
model_nb = MultinomialNB()
NB_pipeline = Pipeline(
    steps=[('clf', OneVsRestClassifier(estimator=model_nb))]
)

# The same pipeline object is refit on each label column in turn, so after the
# loop it holds the model for the last category only.
for category in categories:
    print('**Processing {}...**'.format(category))

    # Fit on this label's training targets, then predict the held-out rows.
    NB_pipeline.fit(x_train, y_train[category])
    prediction = NB_pipeline.predict(x_test)

    # Report this label's accuracy on the test split.
    category_accuracy = accuracy_score(y_test[category], prediction)
    print('Test accuracy is {}'.format(category_accuracy))
    print("\n")

In [None]:
# creating pipeline for svm model
# LinearSVC's solver uses a pseudo-random number generator, so the seed is
# pinned to make the reported accuracies reproducible across runs. The value
# matches the seed used for LinearSVC in the binary-relevance section.
model_svm = LinearSVC(random_state=42)
SVM_pipeline = Pipeline([
                ('clf', OneVsRestClassifier(model_svm)),
            ])

# The same pipeline object is refit on each label column in turn, so after the
# loop it holds the model for the last category only.
for category in categories:
    print('**Processing {}...**'.format(category))

    # Training SVM model on this label's train targets
    SVM_pipeline.fit(x_train, y_train[category])

    # calculating test accuracy for this label
    prediction = SVM_pipeline.predict(x_test)
    print('Test accuracy is {}'.format(accuracy_score(y_test[category], prediction)))
    print("\n")

## Using Binary Relevance

In [None]:
# using binary relevance
from skmultilearn.problem_transform import BinaryRelevance

In [None]:
# Binary relevance with logistic regression: fit on all label columns at once,
# predict the test rows, and score with accuracy_score on the full label matrix.
classifier_lr = BinaryRelevance(classifier=LogisticRegression(random_state=42))
classifier_lr.fit(x_train, y_train)

predictions_br_lr = classifier_lr.predict(x_test)
accuracy_br_lr = accuracy_score(y_test, predictions_br_lr)
print("Accuracy = ", accuracy_br_lr)

In [None]:
# Binary relevance with multinomial naive Bayes: fit on all label columns at
# once, predict the test rows, and score with accuracy_score on the full label
# matrix.
classifier_nb = BinaryRelevance(classifier=MultinomialNB())
classifier_nb.fit(x_train, y_train)

predictions_br_nb = classifier_nb.predict(x_test)
accuracy_br_nb = accuracy_score(y_test, predictions_br_nb)
print("Accuracy = ", accuracy_br_nb)

In [None]:
# Binary relevance with a seeded linear SVM: fit on all label columns at once,
# predict the test rows, and score with accuracy_score on the full label matrix.
classifier_svm = BinaryRelevance(classifier=LinearSVC(random_state=42))
classifier_svm.fit(x_train, y_train)

predictions_br_svm = classifier_svm.predict(x_test)
accuracy_br_svm = accuracy_score(y_test, predictions_br_svm)
print("Accuracy = ", accuracy_br_svm)

In [None]:
# ensembling
# Weighted vote over the three binary-relevance models. The SVM prediction is
# counted twice, the sum is divided by 3 and rounded, so for 0/1 predictions a
# label is set when the SVM predicts it or when both LR and NB predict it.
#
# Each prediction matrix is densified exactly once here; calling `.toarray()`
# inside a per-row loop would rebuild the full dense matrix on every
# iteration. The vote is computed on whole arrays, so it works for any number
# of label columns instead of a hard-coded 7.
lr_votes = predictions_br_lr.toarray()
nb_votes = predictions_br_nb.toarray()
svm_votes = predictions_br_svm.toarray()

weighted_mean = (lr_votes + nb_votes + 2 * svm_votes) / 3

# Keep `final_pred` as a list of per-row lists of Python ints.
final_pred = np.round(weighted_mean).astype(int).tolist()
print("Accuracy = ", accuracy_score(y_test, final_pred))

## Using KNeighbors Classifier

In [None]:
from sklearn.neighbors import KNeighborsClassifier

# Sweep k from 2 to 30 inclusive: fit a k-nearest-neighbours model on the full
# label matrix for each k, then print and record its test accuracy.
neighbor_range = range(2, 31)
accuraries_neigh = []
for i in neighbor_range:
    neigh = KNeighborsClassifier(n_neighbors=i)
    neigh.fit(x_train, y_train)
    predictions_neigh = neigh.predict(x_test)

    # Score once and reuse the value for both the printout and the record.
    accuracy_neigh = accuracy_score(y_test, predictions_neigh)
    print("Accuracy for", i, "=", accuracy_neigh)
    accuraries_neigh.append(accuracy_neigh)