In [None]:
import numpy as np
import sklearn 
import pandas as pd
import nltk
import tensorflow as tf
from tensorflow.keras import layers
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report

In [None]:
# Load the processed dish dataset (v4 supersedes the earlier v3 export)
# and preview the first five rows.
DISHES_CSV = 'data/processed_dishes_v4.csv'
data = pd.read_csv(DISHES_CSV)
data.head()

In [None]:
# Force every free-text column to plain strings so the vectorizer never sees
# non-string values (note: any missing value ends up as the literal text 'nan').
text_columns = ['menu_section', 'dish_name', 'cleaned_descriptions', 'full_description']
for column in text_columns:
    data[column] = data[column].values.astype('str')

In [None]:
# Features: the combined free-text description of each dish.
X = data.loc[:, 'full_description']
# Labels: every column from 'contains_peanuts' through 'contains_meat' (label-based, inclusive slice).
Y = data.loc[:, 'contains_peanuts':'contains_meat']

## Text Representation

Most classifiers and learning algorithms require the input data to be in numerical format rather than strings. Therefore, using a measure called Term Frequency–Inverse Document Frequency (tf-idf), I will convert the strings into vectors of real-valued weights. I have chosen a `min_df` value of 5, which means that a term must appear in at least 5 documents (dish descriptions) to be kept. This will help us remove any unnecessary words, especially since we've included the dish name as part of the features, and some names may be more fun than informative. I have also chosen the `ngram_range` to be `(1, 2)`, indicating that we want unigrams and bigrams. This is because certain food phrases may be more than 1 word long, and capturing those phrases is equally important.

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Unigram + bigram tf-idf; terms found in fewer than 5 documents are dropped.
# NOTE(review): the vectorizer is fit on the full corpus, i.e. before the
# train/test split below, so the vocabulary and IDF weights also see the
# held-out rows. Consider fitting on the training texts only.
tfidf = TfidfVectorizer(ngram_range=(1, 2), min_df=5)
tfidf_sparse = tfidf.fit_transform(X)
# Densify the sparse matrix; the rest of the notebook works on this dense array.
tfidf_features = tfidf_sparse.toarray()
tfidf_features.shape

In [None]:
from sklearn.model_selection import train_test_split

# Hold out a test set for the multi-label problem. test_size is left at the
# scikit-learn default and the seed is fixed so the split is repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    tfidf_features,
    Y,
    random_state=42,
)

In [None]:
# Baseline: an untuned random forest trained on all label columns at once.
# With more than one label column, accuracy_score is the exact-match (subset)
# accuracy: a row counts as correct only if every label is predicted correctly.
clf = RandomForestClassifier(random_state=42).fit(x_train, y_train)
predictions = clf.predict(x_test)
accuracy_score(y_test, predictions)

## Hyperparameter Tuning
There are multiple hyperparameters that can be tuned, but I will be focusing on these:
* `max_depth`: maximum depth of each tree
* `n_estimators`: specifies the number of trees in the forest of the model
* `min_samples_split`: the minimum number of samples required to split an internal node
* `min_samples_leaf`: the minimum number of samples required to be at a leaf node

In [None]:
from sklearn.model_selection import validation_curve

In [None]:
# Sweep max_depth over 15, 20, ..., 75 and record the cross-validated
# train/validation scores for each candidate (all cores, verbose progress).
depth_range_2 = range(15, 76, 5)
train_score_depth2, test_score_depth2 = validation_curve(
    RandomForestClassifier(random_state=42),
    x_train,
    y_train,
    param_name='max_depth',
    param_range=depth_range_2,
    n_jobs=-1,
    verbose=2,
)

In [None]:
# validation_curve yields one row per candidate value and one column per CV
# fold; average across folds to get a single score per max_depth.
mean_train_score_depth2 = train_score_depth2.mean(axis=1)
mean_test_score_depth2 = test_score_depth2.mean(axis=1)

# Training vs. cross-validation score on shared axes
ax = plt.gca()
ax.plot(depth_range_2, mean_train_score_depth2, color='b', label="Training Score")
ax.plot(depth_range_2, mean_test_score_depth2, color='g', label="Cross Validation Score")

# Titles, labels and legend
ax.set_title("Validation Curve with Random Forest")
ax.set_xlabel("Max Depth")
ax.set_ylabel("Accuracy")
plt.tight_layout()
ax.legend(loc='best')
plt.show()

In [None]:
# Sweep n_estimators over 100, 150, ..., 1000.
# The forest is seeded (random_state=42) like the max_depth sweep and the final
# model; without a seed every run of this cell produces a different curve, so
# the "best" tree count picked from it would not be reproducible.
num_trees_range = range(100, 1001, 50)
train_score_trees, test_score_trees = validation_curve(
                                RandomForestClassifier(random_state=42),
                                X = x_train, y = y_train,
                                param_name = 'n_estimators',
                                param_range = num_trees_range,
                                n_jobs=-1, verbose=2)

In [None]:
# Average the per-fold scores so there is one value per n_estimators candidate.
mean_train_score_trees = train_score_trees.mean(axis=1)
mean_test_score_trees = test_score_trees.mean(axis=1)

# Training vs. cross-validation score on shared axes
ax = plt.gca()
ax.plot(num_trees_range, mean_train_score_trees, color='b', label="Training Score")
ax.plot(num_trees_range, mean_test_score_trees, color='g', label="Cross Validation Score")

# Titles, labels and legend
ax.set_title("Validation Curve with Random Forest")
ax.set_xlabel("Num of Trees (n_estimators)")
ax.set_ylabel("Accuracy")
plt.tight_layout()
ax.legend(loc='best')
plt.show()

In [None]:
# Tree count whose mean cross-validation score is highest.
num_trees_range[mean_test_score_trees.argmax()]

In [None]:
# Sweep min_samples_split over 2..10.
# The forest is seeded (random_state=42) so the curve is reproducible, and
# n_jobs=-1 is passed like in the other sweeps so the folds run in parallel.
min_samples_range = range(2, 11)
train_score_split, test_score_split = validation_curve(
                                RandomForestClassifier(random_state=42),
                                X = x_train, y = y_train,
                                param_name = 'min_samples_split',
                                param_range = min_samples_range,
                                n_jobs=-1, verbose=1)

In [None]:
# Calculating mean of training and test scores (averaged over the CV folds)
mean_train_score_split = np.mean(train_score_split, axis = 1)
mean_test_score_split = np.mean(test_score_split, axis = 1)

# Plot mean accuracy scores for training and testing scores
plt.plot(min_samples_range, mean_train_score_split,
     label = "Training Score", color = 'b')
plt.plot(min_samples_range, mean_test_score_split,
   label = "Cross Validation Score", color = 'g')

# Creating the plot
plt.title("Validation Curve with Random Forest")
# min_samples_split governs splitting an *internal* node; "internal leaf node"
# was self-contradictory, so the axis label is corrected here.
plt.xlabel("Minimum Samples to Split an Internal Node (min_samples_split)")
plt.ylabel("Accuracy")
plt.tight_layout()
plt.legend(loc = 'best')
plt.show()

In [None]:
# min_samples_split value whose mean cross-validation score is highest.
min_samples_range[mean_test_score_split.argmax()]

In [None]:
# Sweep min_samples_leaf over the same 2..10 grid used for min_samples_split
# (min_samples_range comes from that earlier cell).
# The forest is seeded (random_state=42) so the curve is reproducible.
# NOTE(review): the grid starts at 2, so a leaf size of 1 is never evaluated.
train_score_leaf, test_score_leaf = validation_curve(
                                RandomForestClassifier(random_state=42),
                                X = x_train, y = y_train,
                                param_name = 'min_samples_leaf',
                                param_range = min_samples_range,
                                n_jobs=-1, verbose=2)

In [None]:
# Average the per-fold scores so there is one value per min_samples_leaf candidate.
mean_train_score_leaf = train_score_leaf.mean(axis=1)
mean_test_score_leaf = test_score_leaf.mean(axis=1)

# Training vs. cross-validation score on shared axes
ax = plt.gca()
ax.plot(min_samples_range, mean_train_score_leaf, color='b', label="Training Score")
ax.plot(min_samples_range, mean_test_score_leaf, color='g', label="Cross Validation Score")

# Titles, labels and legend
ax.set_title("Validation Curve with Random Forest")
ax.set_xlabel("Minimum Samples at Leaf Node (min_samples_leaf)")
ax.set_ylabel("Accuracy")
plt.tight_layout()
ax.legend(loc='best')
plt.show()

## Building the Random Forest Model
With my hyperparameters, I will now try to build our model and see how it does on the test set. 

In [None]:
# Refit the multi-label forest with the hyperparameters chosen above and
# score it on the held-out test set.
clf = RandomForestClassifier(
    n_estimators=900,
    max_depth=55,
    min_samples_split=4,
    min_samples_leaf=2,
    random_state=42,
)
clf.fit(x_train, y_train)
predictions = clf.predict(x_test)
accuracy_score(y_test, predictions)

## Trying Random Forest on Individual Labels

In [None]:
def get_train_test_for_ingredient(ingredient):
    """Split the tf-idf features against a single ``contains_<ingredient>`` label.

    Reads the notebook-level ``data`` frame and ``tfidf_features`` matrix.

    Args:
        ingredient: Suffix of the label column, e.g. ``'meat'`` selects
            ``data['contains_meat']``.

    Returns:
        Tuple ``(X_train, X_test, Y_train, Y_test)`` from a 75/25 split
        seeded with ``random_state=42``.
    """
    label_column = f'contains_{ingredient}'
    labels = data[label_column].to_numpy()
    splits = train_test_split(tfidf_features, labels, test_size=0.25, random_state=42)
    return tuple(splits)

In [None]:
def get_RF_results(ingredient, n_estimators=100, min_samples_split=2, min_samples_leaf=2,
                   max_depth=None):
    """Train and evaluate a random forest for one ``contains_<ingredient>`` label.

    Args:
        ingredient: Suffix of the label column (e.g. ``'soy'``).
        n_estimators: Number of trees in the forest.
        min_samples_split: Minimum samples required to split an internal node.
        min_samples_leaf: Minimum samples required at a leaf node.
        max_depth: Maximum depth of each tree. ``None`` (the default) leaves
            the depth unlimited, which is what this function always did before
            the parameter existed; it is exposed so a tuned ``max_depth`` can
            actually be applied.

    Returns:
        Accuracy of the fitted model on the held-out test split.
    """
    print(f'Creating model to predict contains_{ingredient}...')
    X_train, X_test, Y_train, Y_test = get_train_test_for_ingredient(ingredient)
    clf = RandomForestClassifier(random_state=42, n_estimators=n_estimators,
                                 max_depth=max_depth,
                                 min_samples_split=min_samples_split,
                                 min_samples_leaf=min_samples_leaf, verbose=2, n_jobs=-1)
    clf.fit(X_train, Y_train)

    print(f'Validating model to predict contains_{ingredient}...')
    predictions = clf.predict(X_test)

    return accuracy_score(Y_test, predictions)

In [None]:
# Fit one default-parameter forest per ingredient label and collect the
# test accuracies in the same order as `ingredients`.
ingredients = ['peanuts', 'egg', 'sesame', 'fish', 'shellfish', 'soy', 'meat']
accuracies = []
for ingredient in ingredients:
    ingredient_accuracy = get_RF_results(ingredient)
    accuracies.append(ingredient_accuracy)

In [None]:
# Test accuracies, in the same order as the `ingredients` list.
print(accuracies)

## Tuning Hyperparameters for Individual Random Forest Classifiers

In [None]:
def tune_hyperparam(param, values, x_train, x_test, y_train, y_test):
    """Plot a validation curve for one RandomForest hyperparameter.

    Runs ``validation_curve`` on the training data for each candidate in
    ``values``, prints the candidate with the best mean cross-validation
    score, and plots mean training vs. cross-validation score.

    Args:
        param: Name of the ``RandomForestClassifier`` parameter to vary.
        values: Indexable sequence of candidate values for ``param``.
        x_train: Training feature matrix.
        x_test: Unused; kept so existing call sites keep working.
        y_train: Training labels.
        y_test: Unused; kept so existing call sites keep working.
    """
    train_score, test_score = validation_curve(
                                RandomForestClassifier(random_state=42),
                                X = x_train, y = y_train,
                                param_name = param,
                                param_range = values,
                                n_jobs=-1, verbose=2)
    # Average over the CV folds: one score per candidate value
    mean_train_score = np.mean(train_score, axis = 1)
    mean_test_score = np.mean(test_score, axis = 1)

    # Pick the winner from the fold-averaged scores and report the parameter
    # VALUE. Taking argmax of the raw 2-D `test_score` would give a flattened
    # (candidate x fold) index, which is neither a candidate position nor a value.
    best_value = values[int(np.argmax(mean_test_score))]
    print("Highest value occurred at ", best_value)

    # Plot mean accuracy scores for training and testing scores
    plt.plot(values, mean_train_score,
         label = "Training Score", color = 'b')
    plt.plot(values, mean_test_score,
       label = "Cross Validation Score", color = 'g')

    # Creating the plot
    plt.title("Validation Curve with Random Forest")
    plt.xlabel(param)
    plt.ylabel("Accuracy")
    plt.tight_layout()
    plt.legend(loc = 'best')
    plt.show()

In [None]:
# Candidate grids for the per-ingredient sweeps.
# max_depth: steps of 5 up to 40, then steps of 10 up to 100.
max_depth_range = [*range(15, 41, 5), *range(50, 101, 10)]
# n_estimators: 100, 200, ..., 1000.
n_estimators_range = list(range(100, 1001, 100))
# min_samples_split: even values 2..12.
min_samples_split_range = list(range(2, 13, 2))
# min_samples_leaf: 2..5.
min_samples_leaf_range = list(range(2, 6))

### First Up: Meat!

In [None]:
# 1) meat
# NOTE(review): despite the `peanuts` in their names, these variables hold the
# split for the 'meat' label (see the argument below). The names are reused by
# the following tuning cells, so rename them everywhere together
# (e.g. x_meat_train, ...) rather than in this cell alone.
x_peanuts_train, x_peanuts_test, y_peanuts_train, y_peanuts_test = get_train_test_for_ingredient('meat')
# Validation curve for max_depth on the meat label.
tune_hyperparam('max_depth', max_depth_range, x_peanuts_train, x_peanuts_test, y_peanuts_train, y_peanuts_test)

In [None]:
# Validation curve for n_estimators on the meat label
# (the x_peanuts_* / y_peanuts_* variables hold the 'meat' split).
tune_hyperparam(
    'n_estimators',
    n_estimators_range,
    x_peanuts_train,
    x_peanuts_test,
    y_peanuts_train,
    y_peanuts_test,
)

In [None]:
# Validation curve for min_samples_split on the meat label
# (the x_peanuts_* / y_peanuts_* variables hold the 'meat' split).
tune_hyperparam(
    'min_samples_split',
    min_samples_split_range,
    x_peanuts_train,
    x_peanuts_test,
    y_peanuts_train,
    y_peanuts_test,
)

In [None]:
# Validation curve for min_samples_leaf on the meat label
# (the x_peanuts_* / y_peanuts_* variables hold the 'meat' split).
tune_hyperparam(
    'min_samples_leaf',
    min_samples_leaf_range,
    x_peanuts_train,
    x_peanuts_test,
    y_peanuts_train,
    y_peanuts_test,
)

In [None]:
# Refit the meat classifier with the chosen hyperparameters. Despite the plural
# name, get_RF_results returns a single accuracy score.
# NOTE(review): min_samples_split=9 is not one of the candidates in
# min_samples_split_range ([2, 4, 6, 8, 10, 12]) — confirm it is a parameter
# value taken from the curve and not an index.
meat_accuracies = get_RF_results('meat', n_estimators=500, min_samples_split=9, min_samples_leaf=2)

In [None]:
# Test accuracy of the tuned meat classifier.
print(meat_accuracies)

In [None]:
# 2) soy
# Train/test split of the tf-idf features against the 'contains_soy' label.
x_soy_train, x_soy_test, y_soy_train, y_soy_test = get_train_test_for_ingredient('soy')

In [None]:
# Validation curve for n_estimators on the soy label.
tune_hyperparam(
    'n_estimators',
    n_estimators_range,
    x_soy_train,
    x_soy_test,
    y_soy_train,
    y_soy_test,
)