## Random forest with full ingredients

This notebook uses the random forest code provided by Siraj Raval:
https://github.com/llSourcell/random_forests/blob/master/Random%20Forests%20.ipynb

It applies a random forest to the recipe data, treating each ingredient as a whole string (e.g. "olive oil") rather than splitting it into single words.

This is a work in progress.

In [1]:
# Random Forest Algorithm
#This module implements pseudo-random number generators for various distributions.
#seeding the generated number makes our results reproducible (good for debugging)
from random import seed
#Return a randomly selected element from range(start, stop, step). 
from random import randrange
#read CSV file (dataset)
from csv import reader
#square root function
from math import sqrt

### Data Loading Helper Functions

In [2]:
# Load a CSV file
def load_csv(filename):
    """Read a CSV file into a list of rows.

    Args:
        filename: Path of the CSV file to read.

    Returns:
        A list holding one list of string fields per non-empty CSV row,
        in file order.

    Raises:
        OSError: If the file cannot be opened.
    """
    # newline='' leaves line-ending handling to the csv module, as its
    # documentation requires; otherwise newlines embedded in quoted fields
    # are not interpreted correctly.
    with open(filename, 'r', newline='') as csv_file:
        # The reader yields an empty list for a blank line; drop those.
        return [row for row in reader(csv_file) if row]
 
# Convert string column to float
def str_column_to_float(dataset, column):
    """Convert one column of every row from a string to a float, in place.

    Args:
        dataset: List of rows (lists); each row is modified in place.
        column: Index of the column to convert.

    Returns:
        None.
    """
    for record in dataset:
        # Surrounding whitespace is stripped before parsing the number.
        cleaned = record[column].strip()
        record[column] = float(cleaned)
 
# Convert string column to integer
def str_column_to_int(dataset, column):
    """Replace each distinct value in a column with an integer code, in place.

    Codes are handed out in order of first appearance (0, 1, 2, ...), so the
    mapping is the same on every run. Deriving the codes from a set of
    strings would make them depend on Python's per-process hash
    randomisation.

    Args:
        dataset: List of rows (lists); each row is modified in place.
        column: Index of the column to encode.

    Returns:
        A dict mapping each original value to the integer that replaced it.
    """
    lookup = {}
    # First pass: give every distinct value the next free code.
    for row in dataset:
        lookup.setdefault(row[column], len(lookup))
    # Second pass: overwrite the column with the assigned codes.
    for row in dataset:
        row[column] = lookup[row[column]]
    return lookup

In [None]:
# we will need the recipes data as a 2d matrix

### Decision Tree Algorithm Helper Functions

In [3]:
# Split a dataset into k folds
# the original sample is randomly partitioned into k equal sized subsamples. 
#Of the k subsamples, a single subsample is retained as the validation data 
#for testing the model, and the remaining k − 1 subsamples are used as training data. 
#The cross-validation process is then repeated k times (the folds),
#with each of the k subsamples used exactly once as the validation data.
def cross_validation_split(dataset, n_folds):
    """Randomly partition the rows of a dataset into equally sized folds.

    Every fold receives int(len(dataset) / n_folds) rows drawn without
    replacement; rows left over after filling all folds are not used.
    The input list itself is not modified.

    Args:
        dataset: List of rows.
        n_folds: Number of folds to create.

    Returns:
        A list of n_folds lists of rows.
    """
    remaining = list(dataset)
    rows_per_fold = int(len(dataset) / n_folds)
    folds = []
    for _ in range(n_folds):
        # Draw a random row out of the shrinking pool for each slot.
        folds.append(
            [remaining.pop(randrange(len(remaining))) for _ in range(rows_per_fold)]
        )
    return folds

# Split a dataset based on an attribute and an attribute value
def test_split(index, value, dataset):
    """Partition rows by comparing one column against a threshold.

    Args:
        index: Column index to compare.
        value: Threshold value.
        dataset: Iterable of rows.

    Returns:
        A (left, right) tuple of lists: left holds the rows whose column
        value is strictly less than the threshold, right holds the rest.
        Row order is preserved within each list.
    """
    below, at_or_above = [], []
    for record in dataset:
        bucket = below if record[index] < value else at_or_above
        bucket.append(record)
    return below, at_or_above
 
# Calculate accuracy percentage
def accuracy_metric(actual, predicted):
    """Return the percentage of predictions that match the actual labels.

    Args:
        actual: Sequence of true labels.
        predicted: Sequence of predicted labels, indexed in step with
            actual; it must be at least as long as actual.

    Returns:
        Accuracy as a float between 0.0 and 100.0. An empty actual
        sequence gives 0.0 rather than a division by zero.

    Raises:
        IndexError: If predicted is shorter than actual.
    """
    total = len(actual)
    if total == 0:
        # No labels to score (e.g. an empty fold).
        return 0.0
    # Index into predicted so a too-short prediction list fails loudly
    # instead of being silently truncated.
    correct = sum(1 for i, label in enumerate(actual) if label == predicted[i])
    return correct / float(total) * 100.0
 
# Evaluate an algorithm using a cross validation split
def evaluate_algorithm(dataset, algorithm, n_folds, *args):
    """Score an algorithm with k-fold cross validation.

    Each fold is held out once as the test set while the rows of the
    remaining folds form the training set.

    Args:
        dataset: List of rows whose last element is the class label.
        algorithm: Callable invoked as algorithm(train_set, test_set, *args)
            that returns one prediction per test row.
        n_folds: Number of folds.
        *args: Extra positional arguments forwarded to algorithm.

    Returns:
        A list with one accuracy percentage per fold.
    """
    folds = cross_validation_split(dataset, n_folds)
    scores = []
    for held_out in folds:
        # Leave out the first fold that compares equal to the held-out one
        # and flatten the rest into a single list of training rows.
        position = folds.index(held_out)
        other_folds = folds[:position] + folds[position + 1:]
        train_set = [row for part in other_folds for row in part]

        # Test rows are copies with the label blanked out, so the algorithm
        # cannot see the answer and the fold itself keeps its labels.
        test_set = []
        for row in held_out:
            masked = list(row)
            masked[-1] = None
            test_set.append(masked)

        predicted = algorithm(train_set, test_set, *args)
        actual = [row[-1] for row in held_out]
        scores.append(accuracy_metric(actual, predicted))
    return scores
 
 
# Calculate the Gini index for a split dataset
## this is the name of the cost function used to evaluate splits in the dataset.
# this is a measure of how often a randomly chosen element from the set 
#would be incorrectly labeled if it was randomly labeled according to the distribution
#of labels in the subset. Can be computed by summing the probability
#of an item with label i being chosen times the probability 
#of a mistake in categorizing that item. 
#It reaches its minimum (zero) when all cases in the node 
#fall into a single target category.
#A split in the dataset involves one input attribute and one value for that attribute. 
#It can be used to divide training patterns into two groups of rows.
#A Gini score gives an idea of how good a split is by how mixed the classes 
#are in the two groups created by the split. A perfect separation results in 
#a Gini score of 0, whereas the worst case split that results in 50/50 classes 
#in each group results in a Gini score of 1.0 (for a 2 class problem).
#We first need to calculate the proportion of classes in each group.
def gini_index(groups, class_values):
    """Compute the Gini score of a candidate split.

    For every class and every non-empty group, the proportion p of rows in
    the group carrying that class label is measured and p * (1 - p) is added
    to the score. Groups are not weighted by their size.

    Args:
        groups: Collection of row groups; the last element of each row is
            its class label.
        class_values: The class labels to score.

    Returns:
        The summed score as a float; 0.0 means every non-empty group
        contains a single class.
    """
    # Pull the labels out of each non-empty group once, up front.
    label_lists = [[row[-1] for row in group] for group in groups if len(group) != 0]
    score = 0.0
    for label in class_values:
        for labels in label_lists:
            share = labels.count(label) / float(len(labels))
            score += share * (1.0 - share)
    return score
 
# Select the best split point for a dataset
#This is an exhaustive and greedy algorithm
def get_split(dataset, n_features):
    """Find the best split point over a random subset of feature columns.

    A set of distinct feature columns is drawn at random; for each of them
    every value occurring in the dataset is tried as a threshold, and the
    split with the lowest Gini score is kept (exhaustive and greedy).

    Args:
        dataset: Non-empty list of rows whose last element is the class
            label; all other elements are features.
        n_features: Number of distinct feature columns to consider. A value
            larger than the number of feature columns is treated as "all
            columns".

    Returns:
        A dict node with keys 'index' (chosen column), 'value' (threshold)
        and 'groups' (the (left, right) row lists produced by the split).
        If no candidate split was evaluated, 'index' and 'value' hold the
        placeholder 999 and 'groups' is None.
    """
    class_values = list(set(row[-1] for row in dataset))
    b_index, b_value, b_score, b_groups = 999, 999, 999, None

    n_available = len(dataset[0]) - 1
    if n_available > 0:
        # Asking for more distinct columns than exist would make the
        # sampling loop below spin forever, so cap the request.
        n_wanted = min(n_features, n_available)
    else:
        n_wanted = n_features

    # Draw distinct feature columns at random (sampling without replacement).
    features = list()
    while len(features) < n_wanted:
        index = randrange(n_available)
        if index not in features:
            features.append(index)

    for index in features:
        for row in dataset:
            # Try this row's value in the chosen column as the threshold.
            groups = test_split(index, row[index], dataset)
            gini = gini_index(groups, class_values)
            if gini < b_score:
                b_index, b_value, b_score, b_groups = index, row[index], gini, groups
    # The node keeps the row groups so that split() can recurse on them.
    return {'index': b_index, 'value': b_value, 'groups': b_groups}
 
# Create a terminal node value

def to_terminal(group):
    """Return the most common class label in a group of rows.

    Args:
        group: Non-empty list of rows whose last element is the class label.

    Returns:
        The label that occurs most often among the rows.

    Raises:
        ValueError: If group is empty.
    """
    labels = [row[-1] for row in group]
    # Count every distinct label, then pick the one with the highest count.
    tally = {label: labels.count(label) for label in set(labels)}
    return max(tally, key=tally.get)
 
#Create child splits for a node or make terminal
#Building a decision tree involves calling the above developed get_split() function over 
#and over again on the groups created for each node.
#New nodes added to an existing node are called child nodes. 
#A node may have zero children (a terminal node), one child (one side makes a prediction directly) 
#or two child nodes. We will refer to the child nodes as left and right in the dictionary representation 
#of a given node.
#Once a node is created, we can create child nodes recursively on each group of data from 
#the split by calling the same function again.
def split(node, max_depth, min_size, n_features, depth):
    """Recursively grow child nodes below a node, or make them terminal.

    The node's 'groups' entry is consumed (removed) and replaced by 'left'
    and 'right' entries, each of which is either a class label (terminal)
    or another node dict.

    Args:
        node: Node dict produced by get_split(); modified in place.
        max_depth: Depth at which both children become terminal.
        min_size: A child group with at most this many rows becomes
            terminal.
        n_features: Number of feature columns considered for each new split.
        depth: Depth of node; the root is passed 1.

    Returns:
        None.
    """
    # The row groups are only needed while building, so drop them from the node.
    left, right = node['groups']
    del node['groups']

    # No real split: one side is empty, so both children predict the
    # majority class of all rows that reached this node.
    if not (left and right):
        node['left'] = node['right'] = to_terminal(left + right)
        return

    # Depth limit reached: both children become terminal.
    if depth >= max_depth:
        node['left'], node['right'] = to_terminal(left), to_terminal(right)
        return

    # Grow the left branch to the bottom before starting on the right one.
    for side, rows in (('left', left), ('right', right)):
        if len(rows) <= min_size:
            node[side] = to_terminal(rows)
            continue
        node[side] = get_split(rows, n_features)
        split(node[side], max_depth, min_size, n_features, depth + 1)
 
#Build a decision tree
def build_tree(train, max_depth, min_size, n_features):
    """Build a decision tree from training rows.

    Args:
        train: List of training rows whose last element is the class label.
        max_depth: Maximum depth passed on to split().
        min_size: Minimum group size passed on to split().
        n_features: Number of feature columns considered at each split.

    Returns:
        The root node dict of the finished tree.
    """
    root_depth = 1
    tree = get_split(train, n_features)
    # split() grows the rest of the tree in place below the root.
    split(tree, max_depth, min_size, n_features, root_depth)
    return tree
 
# Make a prediction with a decision tree
def predict(node, row):
    """Predict the class of a row by walking a decision tree.

    Args:
        node: Root node dict with 'index', 'value', 'left' and 'right' keys.
        row: Row of feature values.

    Returns:
        The terminal value reached: the first child along the path that is
        not itself a dict.
    """
    current = node
    while True:
        # Go left when the row's value is below the node's threshold.
        branch = 'left' if row[current['index']] < current['value'] else 'right'
        child = current[branch]
        if not isinstance(child, dict):
            return child
        current = child
 
# Create a random subsample from the dataset with replacement
def subsample(dataset, ratio):
    """Draw a random sample of rows from the dataset with replacement.

    Args:
        dataset: List of rows to sample from.
        ratio: Sample size as a fraction of len(dataset); the product is
            rounded to the nearest integer.

    Returns:
        A list of round(len(dataset) * ratio) rows; the same row may
        appear more than once.
    """
    n_sample = round(len(dataset) * ratio)
    return [dataset[randrange(len(dataset))] for _ in range(n_sample)]

### Main code

In [4]:
# Make a prediction with a list of bagged trees
#responsible for making a prediction with each decision tree and 
#combining the predictions into a single return value. 
#This is achieved by selecting the most common prediction 
#from the list of predictions made by the bagged trees.
def bagging_predict(trees, row):
    """Predict a row's class by majority vote over a list of trees.

    Args:
        trees: List of decision tree root nodes.
        row: Row of feature values.

    Returns:
        The prediction made by the largest number of trees.
    """
    votes = []
    for tree in trees:
        votes.append(predict(tree, row))
    # Count the votes per distinct prediction and return the winner.
    tally = {label: votes.count(label) for label in set(votes)}
    return max(tally, key=tally.get)
 

In [5]:
# Random Forest Algorithm
def random_forest(train, test, max_depth, min_size, sample_size, n_trees, n_features):
    """Train a forest of bagged decision trees and predict the test rows.

    Each tree is built on its own random subsample of the training data
    (drawn with replacement); every test row is then classified by majority
    vote across all trees.

    Args:
        train: List of training rows whose last element is the class label.
        test: List of rows to classify.
        max_depth: Maximum depth of each tree.
        min_size: Minimum group size before a node becomes terminal.
        sample_size: Fraction of the training set drawn for each tree.
        n_trees: Number of trees to build.
        n_features: Number of feature columns considered at each split.

    Returns:
        A list with one predicted class per test row.
    """
    trees = [
        build_tree(subsample(train, sample_size), max_depth, min_size, n_features)
        for _ in range(n_trees)
    ]
    return [bagging_predict(trees, row) for row in test]

Now we need to get the recipe data into a 2D matrix; at this stage it is still made up of strings.

In [6]:
import csv, os, pandas as pd, pathlib, pprint, json, numpy as np
import bagOfWords as bow
from sklearn.feature_extraction.text import CountVectorizer

In [7]:
# Make sure the training and test JSON files exist before going any further

# Both files are expected in a "Dataset" folder under the current working directory.
data_directory = os.path.join(os.getcwd(), "Dataset")
train_data_file_path = os.path.join(data_directory, "train.json")
test_data_file_path = os.path.join(data_directory, "test.json")

# Fail early, naming the missing file, rather than part-way through loading.
train_file_found = pathlib.Path(train_data_file_path).is_file()
if not train_file_found:
    raise Exception("Missing train.json file in " + data_directory)

test_file_found = pathlib.Path(test_data_file_path).is_file()
if not test_file_found:
    raise Exception("Missing test.json file in " + data_directory)

In [49]:
# Read the JSON training and test data into DataFrames

# JSON is UTF-8 by specification, so open the files as UTF-8 explicitly;
# relying on the platform default encoding can mis-decode non-ASCII
# ingredient names.
with open(train_data_file_path, 'r', encoding='utf-8') as f:
    trainData = pd.read_json(f)

with open(test_data_file_path, 'r', encoding='utf-8') as f:
    testData = pd.read_json(f)

# Last expression of the cell: shows the first rows of the training data.
trainData.head()

Unnamed: 0,cuisine,id,ingredients
0,greek,10259,"[romaine lettuce, black olives, grape tomatoes..."
1,southern_us,25693,"[plain flour, ground pepper, salt, tomatoes, g..."
2,filipino,20130,"[eggs, pepper, salt, mayonaise, cooking oil, g..."
3,indian,22213,"[water, vegetable oil, wheat, salt]"
4,indian,13162,"[black pepper, shallots, cornflour, cayenne pe..."


Section 2: Analysis & Visualisation

In [9]:
%matplotlib inline

In [10]:
import matplotlib.pyplot as plt
plt.style.use('ggplot')

Count the ingredients in the cuisines

In [12]:
from collections import Counter

# Build one Counter per cuisine, counting how many of that cuisine's
# recipes list each ingredient string.
counters = {}
for cuisine in trainData['cuisine'].unique():
    # Boolean mask selecting the recipes of this cuisine.
    indices = trainData['cuisine'] == cuisine
    tally = Counter()
    for ingredients in trainData.loc[indices, 'ingredients']:
        tally.update(ingredients)
    counters[cuisine] = tally

In [13]:
# List every distinct cuisine, in order of first appearance in the data.
cuisine_names = trainData['cuisine'].unique()
for cuisine in cuisine_names:
    print(cuisine)

greek
southern_us
filipino
indian
jamaican
spanish
italian
mexican
chinese
british
thai
vietnamese
cajun_creole
brazilian
french
japanese
irish
korean
moroccan
russian


In [18]:
# Dump every ingredient count for every cuisine (very long output).
for cuisine in trainData['cuisine'].unique():
    heading = "here is the ingredients for %s" % cuisine
    print(heading)
    print("here is the items broken out")
    cuisine_counts = counters[cuisine]
    for key in cuisine_counts:
        value = cuisine_counts[key]
        print(key, value)
    

here is the ingredients for greek
here is the items broken out
romaine lettuce 39
black olives 31
grape tomatoes 26
garlic 216
pepper 203
purple onion 186
seasoning 3
garbanzo beans 23
feta cheese crumbles 252
ground pork 4
finely chopped fresh parsley 8
onions 185
salt 572
vinegar 3
caul fat 1
minced garlic 68
dried oregano 267
red wine vinegar 99
olive oil 504
boneless chop pork 1
lemon juice 183
orange 12
anise 5
cinnamon sticks 31
unflavored gelatin 2
zinfandel 2
orange blossom honey 3
sugar 77
lemon 129
calimyrna figs 1
clove 14
honey 67
whipping cream 2
plain whole-milk yogurt 5
fresh dill 108
yoghurt 10
myzithra 2
large eggs 72
cheese 15
feta cheese 191
phyllo 12
kefalotyri 9
ground black pepper 221
extra-virgin olive oil 229
hamburger buns 6
paprika 20
chopped fresh mint 68
ground cinnamon 73
balsamic vinegar 24
baby spinach leaves 9
ground lamb 68
dried mint flakes 15
tomatoes 156
dried dillweed 10
red wine 10
lamb 12
plain yogurt 56
pita bread rounds 16
cucumber 187
liquid 2


chopped walnuts 11
orange zest 4
cooked white rice 4
orange juice 12
egg yolks 14
walnuts 16
ground red pepper 10
pita wedges 4
swordfish steaks 3
granulated garlic 3
onion flakes 1
pure vanilla extract 3
sour cherries 1
farro 1
cheddar cheese 3
ricotta cheese 4
minced meat 1
waxy potatoes 2
manchego cheese 1
cherries 1
melted butter 8
processed cheese 1
shredded mozzarella cheese 4
leeks 8
bread crumbs 16
cold water 5
grated orange peel 5
semolina flour 3
cumin 8
beef stock 3
rub 1
lamb loin chops 3
shortening 2
sesame seeds 12
hummus 14
chopped tomatoes 13
cod 1
ground cayenne pepper 3
elbow macaroni 7
lean ground beef 13
pearl barley 2
fresh thyme 3
mixed nuts 2
tenderloin 2
ground white pepper 7
filet 2
sherry vinegar 4
i can't believ it' not butter! made with olive oil spread 1
prebaked pizza crusts 1
dri oregano leaves, crush 2
fresh spinach leaves, rins and pat dry 1
carrots 30
fatfree lowsodium chicken broth 1
chocolate spread 1
toasted almonds 1
roasted hazelnuts 1
roasted pis

boneless skinless chicken 1
pocket bread 1
bamboo shoots 2
hot pepper rings 1
unsalted pistachios 2
dried apple rings 1
buttermilk 1
sweet pepper 2
flat leaf spinach 1
frozen green beans 1
beef stock cubes 1
cooked shrimp 1
Swerve Sweetener 1
2% lowfat greek yogurt 2
passata 2
zesty italian dressing 1
chili powder 3
coriander 2
cilantro sprigs 1
vanilla bean paste 1
stevia extract 1
tilapia fillets 1
mushroom caps 1
watermelon seeds 1
flavored oil 1
pickled beets 1
salad 4
natural pistachios 1
syrup 2
dried apricot 2
double-acting baking powder 1
boneless chicken cutlet 1
egg noodles 1
chicken drumsticks 1
Country CrockÂ® Spread 1
currant 1
canned chicken broth 2
Mezzetta Sliced Greek Kalamata Olives 1
poblano chiles 1
hard-boiled egg 2
bibb lettuce 1
pastry 1
white onion 3
pickling spices 1
olive oil cooking spray 3
cooking oil 2
lean ground meat 1
sliced mushrooms 2
tart shells 1
Tabasco Pepper Sauce 2
dried red chile peppers 1
alfalfa sprouts 1
sliced turkey 1
whole wheat pita pocke

bacon slices 93
bread flour 12
chopped fresh chives 43
biscuits 38
lump crab meat 26
diced celery 19
vegetable juice 1
seafood seasoning 4
vegetable broth 26
applewood smoked bacon 6
red potato 21
corn kernels 46
diced tomatoes 71
sausages 21
jalapeno chilies 107
grated jack cheese 8
scallions 67
ketchup 97
Tabasco Pepper Sauce 63
heavy cream 203
molasses 33
cider vinegar 117
chopped parsley 33
large shrimp 38
Smithfield Ham 2
McCormick Parsley Flakes 2
bread 23
whole wheat flour 23
reduced fat milk 6
frozen blueberries 8
peaches 188
maple syrup 41
ground allspice 44
ground cinnamon 248
cajun seasoning 71
white wine 17
Crystal Farms Butter 2
smoked paprika 41
Crystal Farms Shredded Gouda Cheese 1
raisin bread 2
ground nutmeg 134
sweetened coconut flakes 30
bananas 67
granulated garlic 15
dry mustard 68
dark brown sugar 96
golden brown sugar 16
all purpose unbleached flour 34
roasted chestnuts 1
chopped fresh thyme 31
parmesan cheese 39
chives 25
sour cream 117
powdered sugar 108
aspara

pie dough 9
crÃ¨me fraÃ®che 6
marshmallow creme 9
green chile 12
rice 11
peeled tomatoes 3
marsala wine 1
fleur de sel 4
raw sugar 3
large free range egg 1
vanilla essence 1
banana liqueur 4
swiss steak 1
stick butter 1
firmly packed light brown sugar 28
pork baby back ribs 12
oats 7
light butter 4
toasted almonds 2
clam juice 10
french baguette 1
sorghum syrup 5
apple butter 5
beaten eggs 9
fresh sage 6
chicken bouillon granules 7
reduced fat cheddar cheese 4
prepared mustard 17
Ritz Crackers 3
mushrooms 9
Uncle Ben's Original Converted Brand rice 1
beef consomme 1
raw honey 3
almonds 6
cacao powder 1
corn 35
cocktail sauce 4
tartar sauce 12
noodles 3
fresh rosemary 20
pork shoulder 11
salted butter 26
cherry gelatin 2
rocket leaves 2
crushed pineapples in juice 6
poppy seeds 6
bing cherries 2
rolls 11
halibut 1
dark ale 1
pork ribs 4
fresh blueberries 12
roasted ground cumin 1
Saigon cinnamon 3
collard green leaves 4
cranberries 4
capers 8
fresh chives 21
bread crumb fresh 19
ground 

mild sausage 1
ground sage 5
toasted coconut 2
toasted pumpkinseeds 1
cookies 6
white peppercorns 1
fontina cheese 7
Hidden ValleyÂ® Original Ranch SaladÂ® Dressing & Seasoning Mix 1
condensed cream of chicken soup 10
condensed cream of mushroom soup 9
egg noodles 2
sliced apples 1
whipped cream cheese 2
graham cracker pie crust 2
pineapple juice 16
louisiana hot sauce 1
chicken gizzards 5
grated orange peel 3
sugar cubes 2
compote 1
dry roasted peanuts 6
pork belly 3
chanterelle 2
mint 15
Spring! Water 2
chili 5
vegetable stock 5
beef 7
lime wedges 3
brewed coffee 17
apricots 2
whole grain mustard 7
catsup 3
cake mix 4
whole okra 1
vegan butter 2
vanilla beans 13
sanding sugar 2
fresh corn 7
double cream 1
muscovado sugar 1
base 6
free range egg 3
golden syrup 4
brown mustard seeds 1
sun-dried tomatoes 4
prosciutto 4
decorating sugars 1
baby arugula 1
chocolate syrup 1
chocolate shavings 4
Godiva Chocolate Liqueur 1
marshmallow vodka 1
dates 3
gluten-free breadcrumbs 1
pepper jack 4
p

boneless chicken cutlet 2
drippings 1
broiler-fryers 4
breast of lamb 1
shanks 4
beef shank 1
veal shanks 1
cherry pie filling 1
diet dr. pepper 2
boneless pork shoulder roast 2
no-calorie sweetener 1
cream sherry 1
southern comfort 6
sour mix 1
miniature semisweet chocolate chips 1
Boursin 1
shredded coleslaw mix 4
yellow crookneck squash 1
coarse kosher salt 6
artichok heart marin 1
vegan Worcestershire sauce 1
chopped tomatoes 2
NeufchÃ¢tel 1
chayotes 1
corned beef 1
barbecued pork 3
lemon peel 2
jerusalem artichokes 2
turbot fillets 1
turkey thigh 1
potato rolls 1
oyster mushrooms 3
roasted tomatoes 2
sherry wine 1
sweet pickle relish 6
smoked ham 2
long grain and wild rice mix 3
ladys house seasoning 1
sweet pickle 2
tart shells 3
mandarin orange segments 1
pineapple chunks 3
clementine sections 1
cool whip 7
red grape 2
muffin 2
grated GruyÃ¨re cheese 6
maida flour 1
margarine spread 1
white button mushrooms 2
shrimp stock 2
skinless chicken breasts 1
caramel topping 1
cherry pre

brie cheese 1
twists 2
Angostura bitters 4
anise liqueur 1
pitted date 1
poblano chiles 2
kahlÃºa 1
Braeburn Apple 1
fresh bay leaves 2
low sodium jarred chicken soup base 1
barbecue rub 2
7 Up 1
pattypan squash 1
dipping sauces 2
sandwiches 1
slider buns 1
rye flour 1
drumstick 1
turkey breast 1
chicken schmaltz 1
celery tops 1
cut up chicken 1
italian seasoned dry bread crumbs 1
mandarin oranges 3
top sirloin 1
semolina flour 1
lean ground pork 2
chocolate glaze 1
caramels 2

KeyboardInterrupt: 

In [None]:
# Next: find each cuisine's top 50 ingredients, then keep only those that are not also popular in many other cuisines

In [38]:
# Map each cuisine to its 50 most frequent (ingredient, count) pairs.
cuisine_top50_ingredients = {}

for cuisine in trainData['cuisine'].unique():
    print(cuisine)
    # most_common(50) returns (ingredient, count) tuples, highest count first.
    listoftop50 = counters[cuisine].most_common(50)
    for item in listoftop50:
        print(item)
    cuisine_top50_ingredients[cuisine] = listoftop50

print(cuisine_top50_ingredients)

greek
('salt', 572)
('olive oil', 504)
('dried oregano', 267)
('garlic cloves', 254)
('feta cheese crumbles', 252)
('extra-virgin olive oil', 229)
('fresh lemon juice', 222)
('ground black pepper', 221)
('garlic', 216)
('pepper', 203)
('feta cheese', 191)
('cucumber', 187)
('purple onion', 186)
('onions', 185)
('lemon juice', 183)
('tomatoes', 156)
('water', 143)
('lemon', 129)
('fresh parsley', 113)
('fresh dill', 108)
('all-purpose flour', 100)
('red wine vinegar', 99)
('butter', 90)
('black pepper', 89)
('kosher salt', 85)
('greek yogurt', 84)
('fresh oregano', 83)
('eggs', 80)
('kalamata', 78)
('sugar', 77)
('ground cinnamon', 73)
('large eggs', 72)
('pitted kalamata olives', 70)
('minced garlic', 68)
('chopped fresh mint', 68)
('ground lamb', 68)
('honey', 67)
('cooking spray', 67)
('flat leaf parsley', 65)
('eggplant', 61)
('unsalted butter', 59)
('diced tomatoes', 57)
('plain yogurt', 56)
('green onions', 55)
('cherry tomatoes', 51)
('red bell pepper', 48)
('greek style plain yo

('boneless skinless chicken breasts', 294)
chinese
('soy sauce', 1363)
('sesame oil', 915)
('salt', 907)
('corn starch', 906)
('sugar', 824)
('garlic', 763)
('water', 762)
('green onions', 628)
('vegetable oil', 602)
('scallions', 591)
('ginger', 490)
('fresh ginger', 469)
('rice vinegar', 461)
('garlic cloves', 421)
('oil', 338)
('oyster sauce', 333)
('hoisin sauce', 323)
('carrots', 298)
('eggs', 288)
('onions', 280)
('peanut oil', 280)
('light soy sauce', 254)
('Shaoxing wine', 247)
('honey', 241)
('brown sugar', 235)
('dark soy sauce', 230)
('chinese five-spice powder', 223)
('white pepper', 222)
('low sodium soy sauce', 213)
('minced garlic', 212)
('boneless skinless chicken breasts', 198)
('kosher salt', 185)
('peeled fresh ginger', 182)
('ground pork', 179)
('canola oil', 177)
('chicken broth', 165)
('toasted sesame oil', 164)
('sesame seeds', 162)
('chicken stock', 161)
('pepper', 158)
('red bell pepper', 157)
('ground black pepper', 153)
('spring onions', 148)
('large eggs', 1

irish
('salt', 376)
('all-purpose flour', 219)
('butter', 219)
('onions', 132)
('sugar', 120)
('potatoes', 120)
('baking soda', 115)
('baking powder', 112)
('milk', 111)
('carrots', 105)
('water', 96)
('eggs', 90)
('pepper', 87)
('unsalted butter', 87)
('large eggs', 77)
('buttermilk', 75)
('ground black pepper', 66)
('cabbage', 54)
('garlic', 51)
('olive oil', 50)
('cooking spray', 48)
('black pepper', 44)
('vegetable oil', 43)
('flour', 42)
('raisins', 40)
('fresh parsley', 40)
('heavy cream', 39)
('leeks', 38)
('brown sugar', 38)
('whole wheat flour', 38)
('bacon', 36)
('kosher salt', 33)
('granulated sugar', 31)
('garlic cloves', 31)
('white sugar', 31)
('Irish whiskey', 30)
('russet potatoes', 30)
('vanilla extract', 29)
('ground cinnamon', 28)
('beer', 28)
('bay leaves', 27)
('green onions', 25)
('yukon gold potatoes', 24)
('corned beef', 24)
('chicken stock', 23)
('beef broth', 23)
('beef brisket', 23)
('bay leaf', 23)
('red potato', 23)
('margarine', 22)
korean
('soy sauce', 42

In [39]:
# Flatten the per-cuisine top-50 lists into one list of ingredient names.
# An ingredient appears once for every cuisine whose top 50 contains it.
allthetopingredients = []
for cuisine in trainData['cuisine'].unique():
    for ingredient, amount in cuisine_top50_ingredients[cuisine]:
        print(ingredient)
        allthetopingredients.append(ingredient)

print(len(allthetopingredients))


salt
olive oil
dried oregano
garlic cloves
feta cheese crumbles
extra-virgin olive oil
fresh lemon juice
ground black pepper
garlic
pepper
feta cheese
cucumber
purple onion
onions
lemon juice
tomatoes
water
lemon
fresh parsley
fresh dill
all-purpose flour
red wine vinegar
butter
black pepper
kosher salt
greek yogurt
fresh oregano
eggs
kalamata
sugar
ground cinnamon
large eggs
pitted kalamata olives
minced garlic
chopped fresh mint
ground lamb
honey
cooking spray
flat leaf parsley
eggplant
unsalted butter
diced tomatoes
plain yogurt
green onions
cherry tomatoes
red bell pepper
greek style plain yogurt
zucchini
plum tomatoes
milk
salt
butter
all-purpose flour
sugar
large eggs
baking powder
water
unsalted butter
milk
buttermilk
eggs
pepper
onions
ground black pepper
vanilla extract
vegetable oil
baking soda
kosher salt
olive oil
garlic cloves
black pepper
flour
brown sugar
granulated sugar
garlic
ground cinnamon
garlic powder
cayenne pepper
yellow corn meal
chopped pecans
heavy cream
chic

tapioca flour
ice cubes
bell pepper
salt
sugar
all-purpose flour
unsalted butter
olive oil
butter
water
large eggs
garlic cloves
ground black pepper
onions
shallots
extra-virgin olive oil
dry white wine
fresh lemon juice
large egg yolks
carrots
vanilla extract
garlic
heavy cream
pepper
milk
eggs
black pepper
whipping cream
dijon mustard
fresh parsley
cooking spray
kosher salt
bay leaf
leeks
large egg whites
tomatoes
whole milk
flat leaf parsley
egg yolks
bay leaves
granulated sugar
powdered sugar
chopped fresh thyme
freshly ground pepper
vegetable oil
dried thyme
sea salt
thyme sprigs
fresh thyme
white wine vinegar
capers
lemon juice
confectioners sugar
soy sauce
salt
mirin
sugar
water
sake
rice vinegar
vegetable oil
scallions
ginger
sesame oil
carrots
green onions
garlic
onions
oil
eggs
sesame seeds
fresh ginger
dashi
corn starch
garlic cloves
nori
kosher salt
large eggs
honey
ground black pepper
butter
shiitake
all-purpose flour
sushi rice
cucumber
low sodium soy sauce
white miso
pep

In [40]:
# Count in how many cuisines' top-50 lists each ingredient appears.
# Counter tallies an iterable directly, so no manual counting loop is needed.
counttopingredients = Counter(allthetopingredients)

print(counttopingredients)


Counter({'salt': 20, 'garlic cloves': 20, 'garlic': 20, 'onions': 20, 'water': 20, 'sugar': 20, 'olive oil': 19, 'ground black pepper': 19, 'pepper': 19, 'kosher salt': 18, 'vegetable oil': 17, 'butter': 16, 'black pepper': 16, 'eggs': 16, 'carrots': 16, 'all-purpose flour': 14, 'green onions': 14, 'tomatoes': 12, 'large eggs': 12, 'red bell pepper': 11, 'unsalted butter': 10, 'brown sugar': 10, 'oil': 10, 'fresh parsley': 9, 'minced garlic': 9, 'milk': 9, 'ginger': 9, 'flour': 8, 'honey': 7, 'soy sauce': 7, 'fresh ginger': 7, 'fresh lemon juice': 6, 'purple onion': 6, 'lemon juice': 6, 'lemon': 6, 'ground cinnamon': 6, 'diced tomatoes': 6, 'baking powder': 6, 'cayenne pepper': 6, 'chicken broth': 6, 'white sugar': 6, 'corn starch': 6, 'bay leaves': 6, 'potatoes': 6, 'chopped cilantro fresh': 6, 'scallions': 6, 'extra-virgin olive oil': 5, 'cucumber': 5, 'cooking spray': 5, 'flat leaf parsley': 5, 'vanilla extract': 5, 'granulated sugar': 5, 'heavy cream': 5, 'paprika': 5, 'coconut mil

In [37]:
# Collect the ingredients that show up in the top-50 list of more than
# five cuisines; those are too common to help tell cuisines apart.
ingredientstodiscard = []
for key, value in counttopingredients.items():
    print(key, value)
    if value <= 5:
        continue
    print("too popular!! %s" % key)
    ingredientstodiscard.append(key)

print(len(ingredientstodiscard))

salt 20
too popular!! salt
olive oil 14
too popular!! olive oil
dried oregano 2
garlic cloves 18
too popular!! garlic cloves
feta cheese crumbles 1
extra-virgin olive oil 5
fresh lemon juice 3
ground black pepper 15
too popular!! ground black pepper
garlic 18
too popular!! garlic
pepper 15
too popular!! pepper
feta cheese 1
cucumber 2
purple onion 2
onions 20
too popular!! onions
lemon juice 1
tomatoes 7
too popular!! tomatoes
water 20
too popular!! water
lemon 3
fresh parsley 4
fresh dill 2
all-purpose flour 8
too popular!! all-purpose flour
red wine vinegar 1
butter 12
too popular!! butter
black pepper 10
too popular!! black pepper
kosher salt 8
too popular!! kosher salt
sugar 16
too popular!! sugar
large eggs 8
too popular!! large eggs
baking powder 4
unsalted butter 6
too popular!! unsalted butter
milk 6
too popular!! milk
buttermilk 2
eggs 12
too popular!! eggs
vanilla extract 3
vegetable oil 15
too popular!! vegetable oil
baking soda 3
flour 4
brown sugar 6
too popular!! brown su

In [41]:
# Keep the top ingredients that are NOT on the discard list.
# NOTE: despite its name, ingredients_in_over5recipes ends up holding the
# ingredients that were kept (one entry per cuisine whose top 50 contains
# them), not the ones that were dropped.
ingredients_in_over5recipes = []

# A set makes each membership test constant-time instead of a list scan.
discard_lookup = set(ingredientstodiscard)

for item in allthetopingredients:
    if item in discard_lookup:
        print("losing this item %s " % item)
    else:
        ingredients_in_over5recipes.append(item)

print(len(ingredients_in_over5recipes))
        

losing this item salt 
losing this item olive oil 
losing this item garlic cloves 
losing this item ground black pepper 
losing this item garlic 
losing this item pepper 
losing this item onions 
losing this item tomatoes 
losing this item water 
losing this item all-purpose flour 
losing this item butter 
losing this item black pepper 
losing this item kosher salt 
losing this item eggs 
losing this item sugar 
losing this item large eggs 
losing this item unsalted butter 
losing this item green onions 
losing this item milk 
losing this item salt 
losing this item butter 
losing this item all-purpose flour 
losing this item sugar 
losing this item large eggs 
losing this item water 
losing this item unsalted butter 
losing this item milk 
losing this item eggs 
losing this item pepper 
losing this item onions 
losing this item ground black pepper 
losing this item vegetable oil 
losing this item kosher salt 
losing this item olive oil 
losing this item garlic cloves 
losing this item

losing this item salt 
losing this item all-purpose flour 
losing this item butter 
losing this item onions 
losing this item sugar 
losing this item milk 
losing this item carrots 
losing this item water 
losing this item eggs 
losing this item pepper 
losing this item unsalted butter 
losing this item large eggs 
losing this item ground black pepper 
losing this item garlic 
losing this item olive oil 
losing this item black pepper 
losing this item vegetable oil 
losing this item brown sugar 
losing this item kosher salt 
losing this item garlic cloves 
losing this item green onions 
losing this item soy sauce 
losing this item garlic 
losing this item green onions 
losing this item sugar 
losing this item salt 
losing this item water 
losing this item onions 
losing this item carrots 
losing this item garlic cloves 
losing this item vegetable oil 
losing this item brown sugar 
losing this item eggs 
losing this item fresh ginger 
losing this item pepper 
losing this item ground bla

In [None]:
#now we need a 2d matrix (dataframe) of counts of these ingredients in each recipe

In [None]:
# go through the recipes and see if the top ingredients are in there

# id
# cuisine
# ingredients

#new row of the sparse matrix needs to be id cuisine ingredient1 ingredient2 ingredient3

In [None]:
recipe = []
# ingredientword has to exist before the loop below appends to it;
# without this line the cell stops with a NameError.
ingredientword = []
occurrences = []
# NOTE(review): countwordsappearing is not defined in any cell shown in this
# notebook; presumably it is a Counter/dict of ingredient -> count left over
# from an earlier experiment. Confirm which counter is meant before running.
for key, value in countwordsappearing.items():
    ingredientword.append(str(key))
    occurrences.append(value)

# Two-column table of ingredient and count, most frequent first.
df = pd.DataFrame({'Ingredient': ingredientword, 'Occurrences': occurrences}).sort_values(
    by=['Occurrences'], ascending=False)
df

### Going back to the random forest construction

In [None]:
# Test the random forest algorithm
# location of the CSV dataset to load
filename = 'sonar.all-data.csv'
# seed the random module's generator so that runs are repeatable
seed(1)
# read the raw rows (lists of strings) from the CSV file
dataset = load_csv(filename)

In [None]:
# convert the feature columns (every column except the last) from strings to floats
label_column = len(dataset[0]) - 1
for i in range(0, label_column):
    str_column_to_float(dataset, i)
# encode the class labels in the last column as integers
str_column_to_int(dataset, label_column)
# evaluation settings
n_folds = 5
max_depth = 10
min_size = 1
sample_size = 1.0
# consider sqrt(number of feature columns) candidate columns at each split
n_features = int(sqrt(label_column))
# evaluate forests of increasing size
for n_trees in [1, 5, 10]:
    scores = evaluate_algorithm(dataset, random_forest, n_folds, max_depth, min_size, sample_size, n_trees, n_features)
    print('Trees: %d' % n_trees)
    print('Scores: %s' % scores)
    mean_accuracy = sum(scores) / float(len(scores))
    print('Mean Accuracy: %.3f%%' % mean_accuracy)