# Preprocess training  and test dataset

- make all letters lowercase for all ingredients
- removes extra whitespaces
- word stemming for ingredients
- remove certain words from ingredients
- remove numbers from ingredients
- remove certain special characters from ingredients
- remove useless adjectives from ingredients such as large, fat and low.
- remove the common ingredients that exist in all cuisines such as salt and water.

In [13]:
# ONLY REQUIRED FOR FIRST TIME RUNNERS
#nltk.download()

showing info https://raw.githubusercontent.com/nltk/nltk_data/gh-pages/index.xml


True

In [190]:
import numpy as np
from sklearn.preprocessing import LabelEncoder
from nltk.stem.wordnet import WordNetLemmatizer
import re
import itertools
import os.path

def remove_numbers(ingredient):    
    return [[re.sub("\d+", "", x) for x in y] for y in ingredient]

def remove_special_chars(ingredient):   
    ingredient = [[x.replace("-", " ") for x in y] for y in ingredient] 
    ingredient = [[x.replace("&", " ") for x in y] for y in ingredient] 
    ingredient = [[x.replace("'", " ") for x in y] for y in ingredient] 
    ingredient = [[x.replace("''", " ") for x in y] for y in ingredient] 
    ingredient = [[x.replace("%", " ") for x in y] for y in ingredient] 
    ingredient = [[x.replace("!", " ") for x in y] for y in ingredient] 
    ingredient = [[x.replace("(", " ") for x in y] for y in ingredient] 
    ingredient = [[x.replace(")", " ") for x in y] for y in ingredient] 
    ingredient = [[x.replace("/", " ") for x in y] for y in ingredient] 
    ingredient = [[x.replace("/", " ") for x in y] for y in ingredient] 
    ingredient = [[x.replace(",", " ") for x in y] for y in ingredient] 
    ingredient = [[x.replace(".", " ") for x in y] for y in ingredient] 
    ingredient = [[x.replace(u"\u2122", " ") for x in y] for y in ingredient] 
    ingredient = [[x.replace(u"\u00AE", " ") for x in y] for y in ingredient] 
    ingredient = [[x.replace(u"\u2019", " ") for x in y] for y in ingredient] 

    return ingredient
    
def make_lowercase(ingredient):    
    return [[x.lower() for x in y] for y in ingredient]

def remove_extra_whitespace(ingredient):    
    return [[re.sub( '\s+', ' ', x).strip() for x in y] for y in ingredient] 
    
    
def stem_words(ingredient):    
    lmtzr = WordNetLemmatizer()
    def word_by_word(strng):
        return " ".join(["".join(lmtzr.lemmatize(w)) for w in strng.split()])
    return [[word_by_word(x) for x in y] for y in ingredient] 
    
    
def remove_units(ingredient):    
    remove_list = ['g', 'lb', 's', 'n']
        
    def check_word(strng):
        
        s = strng.split()
        resw  = [word for word in s if word.lower() not in remove_list]
        
        return ' '.join(resw)

    return [[check_word(x) for x in y] for y in ingredient] 

- Read data into panda dataFrame.

In [125]:
import csv, os, pandas as pd, pathlib, pprint, json, numpy as np
import bagOfWords as bow
from sklearn.feature_extraction.text import CountVectorizer

In [214]:
# Make sure json training json file exists

data_directory = os.path.join(os.getcwd(), "Dataset")
train_data_file_path = os.path.join(data_directory, "train.json")
test_data_file_path = os.path.join(data_directory, "test.json")
    
if(not pathlib.Path(train_data_file_path).is_file()):
    raise Exception("Missing train.json file in " + data_directory)

    
if(not pathlib.Path(test_data_file_path).is_file()):
    raise Exception("Missing test.json file in " + data_directory)

In [237]:
# Read JSON training data

with open(train_data_file_path, 'r') as f:
     trainData = pd.read_json(f)
f.closed

with open(test_data_file_path, 'r') as f:
     testData = pd.read_json(f)
f.closed

trainData.head()

Unnamed: 0,cuisine,id,ingredients
0,greek,10259,"[romaine lettuce, black olives, grape tomatoes..."
1,southern_us,25693,"[plain flour, ground pepper, salt, tomatoes, g..."
2,filipino,20130,"[eggs, pepper, salt, mayonaise, cooking oil, g..."
3,indian,22213,"[water, vegetable oil, wheat, salt]"
4,indian,13162,"[black pepper, shallots, cornflour, cayenne pe..."


# section 2 : preprocessing data:

- Change the text case : 
Data is often received in irregular formats. For example: ‘Milk’ & ‘milk’. Both mean the same thing but is represented in a different manner. Therefore, it is helpful to change the case of text. Either to upper or lower case. I changed to be lower case.

- Remove numbers from ingredients.
- Remove special chars such as -,&
- Remove extra whitespace
- Remove units such as 'g', 'lb', 's', 'n'
- stemming for ingredients.

In [215]:
trainData['ingredients'] = make_lowercase(trainData['ingredients'])
trainData['ingredients'] = remove_numbers(trainData['ingredients'])
trainData['ingredients'] = remove_special_chars(trainData['ingredients'])
trainData['ingredients'] = remove_extra_whitespace(trainData['ingredients'])
trainData['ingredients'] = remove_units(trainData['ingredients'])
trainData['ingredients'] = stem_words(trainData['ingredients'])


testData['ingredients'] = make_lowercase(testData['ingredients'])
testData['ingredients'] = remove_numbers(testData['ingredients'])
testData['ingredients'] = remove_special_chars(testData['ingredients'])
testData['ingredients'] = remove_extra_whitespace(testData['ingredients'])
testData['ingredients'] = remove_units(testData['ingredients'])
testData['ingredients'] = stem_words(testData['ingredients'])


# We can also create a word cloud to check the most frequent terms. It is easy to build and gives an enhanced understanding of ingredients in this data. 

I used NLTK library in order to retrieve only the adjective words in the list of ingredients because there will be some adjectives without strong meaning such as large, low and fat. On the other hand, there will be some adjective words with a strong meaning such as Mexican, Italian, and Chinese which may mean the type of cuisine.

In [103]:
%matplotlib inline
import matplotlib.pyplot as plt
plt.style.use('ggplot')
import nltk


text=""
i =0 
for ingredients_list in trainData['ingredients']:
    i= i+1
    text +=",".join(str(x) for x in ingredients_list)
    if i==1500:  
        break

from wordcloud import WordCloud, STOPWORDS
stopwords = set(STOPWORDS)

def show_wordcloud(data, title = None):
    wordcloud = WordCloud(
        background_color='white',
        stopwords=stopwords,
        max_words=200,
        max_font_size=30, 
        scale=3,
        random_state=1
    ).generate(str(data))

    fig = plt.figure(1, figsize=(10, 10))
    plt.axis('off')
    if title: 
        fig.suptitle(title, fontsize=20)
        fig.subplots_adjust(top=2.3)

    plt.imshow(wordcloud)
    plt.show()
    
ingredients_list = text.split(",")
text =''
for ingredient in ingredients_list:
    ingredient = nltk.word_tokenize(ingredient)
    tagged = nltk.tag.pos_tag(ingredient)
    for item in tagged:
        if item[1]== 'JJ':
            text = text +" "+item[0]
show_wordcloud(text)

ModuleNotFoundError: No module named 'wordcloud'

- and also do the same for test dataset.

In [None]:
text=""
i =0 
ingredients_list = []
for ingredients_list in testData['ingredients']:
    i= i+1
    text +=",".join(str(x) for x in ingredients_list)
    if i==1500:  
        break
        
ingredients_list = text.split(",")
text =''
for ingredient in ingredients_list:
    ingredient = nltk.word_tokenize(ingredient)
    tagged = nltk.tag.pos_tag(ingredient)
    for item in tagged:
        if item[1]== 'JJ':
            text = text +" "+item[0]
show_wordcloud(text)

Advanced data cleaning could be happened by removing adjective terms that do not have a strong meaning such as large, fresh and fat. 

In [238]:
stop_adjective_words = ['large','low','fat','free','skim','fresh','sliced','light','flat']
ingredients = []

#trainData['ingredients']
i =1
for ingredients_single_cuisine in trainData['ingredients'] :
    temp_list=[]
    for item in ingredients_single_cuisine:
        if ' ' in item:
            ing_single_list = item.split(' ')
            temp = ''
            for single_term in ing_single_list:
                if single_term in stop_adjective_words:
                    print("word has been removed form recipe ",i,"= ",single_term)
                else:
                    temp =temp+" "+single_term
                    temp=temp.strip()
            temp_list.append(temp)
        else:
            temp_list.append(item)
    ingredients.append(temp_list)
    i =i +1

word has been removed form recipe  6 =  fresh
word has been removed form recipe  7 =  flat
word has been removed form recipe  9 =  fresh
word has been removed form recipe  9 =  fresh
word has been removed form recipe  10 =  fresh
word has been removed form recipe  10 =  flat
word has been removed form recipe  12 =  low
word has been removed form recipe  12 =  fresh
word has been removed form recipe  13 =  fresh
word has been removed form recipe  13 =  flat
word has been removed form recipe  14 =  fresh
word has been removed form recipe  15 =  fresh
word has been removed form recipe  15 =  fat
word has been removed form recipe  15 =  free
word has been removed form recipe  15 =  fresh
word has been removed form recipe  15 =  fat
word has been removed form recipe  15 =  free
word has been removed form recipe  20 =  sliced
word has been removed form recipe  20 =  fresh
word has been removed form recipe  21 =  fresh
word has been removed form recipe  21 =  fresh
word has been removed form 

- and also do the same for test dataset.

In [239]:
stop_adjective_words = ['large','low','fat','free','skim','fresh','sliced','light','flat']
ingredients_test = []

#trainData['ingredients']
i =1
for ingredients_single_cuisine in testData['ingredients'] :
    temp_list=[]
    for item in ingredients_single_cuisine:
        if ' ' in item:
            ing_single_list = item.split(' ')
            temp = ''
            for single_term in ing_single_list:
                if single_term in stop_adjective_words:
                    print("word has been removed form recipe ",i,"= ",single_term)
                else:
                    temp =temp+" "+single_term
                    temp=temp.strip()
            temp_list.append(temp)
        else:
            temp_list.append(item)
    ingredients_test.append(temp_list)
    i =i +1

word has been removed form recipe  2 =  light
word has been removed form recipe  4 =  flat
word has been removed form recipe  8 =  large
word has been removed form recipe  9 =  fresh
word has been removed form recipe  13 =  large
word has been removed form recipe  13 =  fresh
word has been removed form recipe  16 =  fresh
word has been removed form recipe  16 =  skim
word has been removed form recipe  21 =  fresh
word has been removed form recipe  21 =  large
word has been removed form recipe  23 =  fat
word has been removed form recipe  23 =  free
word has been removed form recipe  25 =  fresh
word has been removed form recipe  26 =  low
word has been removed form recipe  28 =  fresh
word has been removed form recipe  28 =  large
word has been removed form recipe  32 =  fresh
word has been removed form recipe  32 =  fresh
word has been removed form recipe  32 =  fresh
word has been removed form recipe  35 =  fresh
word has been removed form recipe  37 =  fresh
word has been removed fo

In [240]:
trainData['ingredients'] = ingredients
testData['ingredients'] = ingredients_test

- Finally, we can remove the high-frequency ingredients that might be common in all cuisines

From the table in the previous file(david benchmark classifiers), it will be helpful to explore data and discover the common ingredients in all cuisines.

In [241]:
from collections import Counter

counters = {}
for cuisine in trainData['cuisine'].unique():
    counters[cuisine] = Counter()
    indices = (trainData['cuisine'] == cuisine)
    for ingredients in trainData[indices]['ingredients']:
        counters[cuisine].update(ingredients)

In [242]:
top10 = pd.DataFrame([[items[0] for items in counters[cuisine].most_common(10)] for cuisine in counters],
            index=[cuisine for cuisine in counters],
            columns=['top{}'.format(i) for i in range(1, 11)])
top10

Unnamed: 0,top1,top2,top3,top4,top5,top6,top7,top8,top9,top10
greek,salt,olive oil,lemon juice,garlic cloves,dried oregano,feta cheese crumbles,extra-virgin olive oil,ground black pepper,garlic,pepper
southern_us,salt,eggs,butter,all-purpose flour,sugar,baking powder,water,milk,unsalted butter,buttermilk
filipino,salt,garlic,water,onions,soy sauce,pepper,oil,sugar,carrots,ground black pepper
indian,salt,onions,ginger,garam masala,water,ground turmeric,garlic,cumin seed,ground cumin,vegetable oil
jamaican,salt,onions,water,garlic,thyme,ground allspice,pepper,scallions,garlic cloves,dried thyme
spanish,salt,olive oil,garlic cloves,extra-virgin olive oil,onions,eggs,water,tomatoes,ground black pepper,red bell pepper
italian,salt,olive oil,garlic cloves,grated parmesan cheese,garlic,ground black pepper,extra-virgin olive oil,eggs,onions,water
mexican,salt,onions,ground cumin,garlic,chopped cilantro,olive oil,chili powder,lime juice,jalapeno chilies,sour cream
chinese,soy sauce,ginger,sesame oil,salt,corn starch,sugar,garlic,water,green onions,vegetable oil
british,salt,eggs,all-purpose flour,butter,milk,unsalted butter,sugar,onions,baking powder,water


- It can be said that ingredients such as salt, water, and onion are useless in order to predict the type of cuisine, so removing such as these ingredients might be improved the accuracy of the prediction model.

In [243]:
stop_ingredients_words = ['salt','water','onion']
ingredients = []

#trainData['ingredients']
i =1
for ingredients_single_cuisine in trainData['ingredients'] :
    temp_list=[]
    for item in ingredients_single_cuisine:
        if ' ' in item:
            ing_single_list = item.split(' ')
            temp = ''
            for single_term in ing_single_list:
                if single_term in stop_ingredients_words:
                    print("word has been removed form recipe ",i,"= ",single_term)
                else:
                    temp =temp+" "+single_term
                    temp=temp.strip()
            temp_list.append(temp)
        else:
            temp_list.append(item)
    ingredients.append(temp_list)
    i =i +1

word has been removed form recipe  1 =  onion
word has been removed form recipe  3 =  onion
word has been removed form recipe  7 =  salt
word has been removed form recipe  9 =  onion
word has been removed form recipe  10 =  salt
word has been removed form recipe  13 =  salt
word has been removed form recipe  14 =  salt
word has been removed form recipe  22 =  salt
word has been removed form recipe  22 =  onion
word has been removed form recipe  31 =  water
word has been removed form recipe  31 =  salt
word has been removed form recipe  33 =  salt
word has been removed form recipe  33 =  onion
word has been removed form recipe  45 =  onion
word has been removed form recipe  51 =  onion
word has been removed form recipe  53 =  onion
word has been removed form recipe  56 =  onion
word has been removed form recipe  56 =  water
word has been removed form recipe  56 =  onion
word has been removed form recipe  56 =  salt
word has been removed form recipe  58 =  salt
word has been removed form

- and also do the same for test dataset.

In [244]:
stop_ingredients_words = ['salt','water','onion']
ingredients_test = []

#trainData['ingredients']
i =1
for ingredients_single_cuisine in testData['ingredients'] :
    temp_list=[]
    for item in ingredients_single_cuisine:
        if ' ' in item:
            ing_single_list = item.split(' ')
            temp = ''
            for single_term in ing_single_list:
                if single_term in stop_ingredients_words:
                    print("word has been removed form recipe ",i,"= ",single_term)
                else:
                    temp =temp+" "+single_term
                    temp=temp.strip()
            temp_list.append(temp)
        else:
            temp_list.append(item)
    ingredients_test.append(temp_list)
    i =i +1

word has been removed form recipe  4 =  onion
word has been removed form recipe  13 =  onion
word has been removed form recipe  20 =  water
word has been removed form recipe  23 =  salt
word has been removed form recipe  24 =  salt
word has been removed form recipe  26 =  salt
word has been removed form recipe  27 =  water
word has been removed form recipe  28 =  salt
word has been removed form recipe  31 =  salt
word has been removed form recipe  31 =  onion
word has been removed form recipe  34 =  salt
word has been removed form recipe  35 =  salt
word has been removed form recipe  37 =  salt
word has been removed form recipe  39 =  salt
word has been removed form recipe  42 =  salt
word has been removed form recipe  44 =  onion
word has been removed form recipe  44 =  onion
word has been removed form recipe  45 =  water
word has been removed form recipe  47 =  salt
word has been removed form recipe  48 =  water
word has been removed form recipe  52 =  salt
word has been removed form

In [245]:
trainData['ingredients'] = ingredients
testData['ingredients'] = ingredients_test

## Classifier

#### Varying stop words
78.30 / 78.32 / 79.35 on 37/42/68 with full pre-proc pipeline <br>
78.27 / 78.30 / 79.17 on 37/42/68 with pre-proc but NO STOP WORDS <br>
78.14 / 78.29 / 79.26 on 37/42/68 with pre-proc and adjective stop words only <br>
78.32 / 78.42 / 79.23 on 37/42/68 with pre-proc and common ingredient stop words only <br>

#### Varying preprocessing
78.30 / 78.32 / 79.35 on 37/42/68 with full pre-proc pipeline <br>
78.38 / 78.20 / 79.25 on 37/42/68 with no stemming <br>
78.30 / 78.32 / 79.33 on 37/42/68 with no unit removal <br>
78.30 / 78.32 / 79.35 on 37/42/68 with no whitespace removal <br>
78.25 / 78.33 / 79.35 on 37/42/68 with no special char removal <br>
78.15 / 78.28 / 79.27 on 37/42/68 with no upper case removal <br>
78.32 / 78.32 / 79.32 on 37/42/68 with no numeric removal <br>
78.35 / 78.18 / 79.26 on 37/42/68 with NO PRE-PROCESSING (lowercase is applied by CV) <br>

In [246]:
trainData.head()

Unnamed: 0,cuisine,id,ingredients
0,greek,10259,"[romaine lettuce, black olives, grape tomatoes..."
1,southern_us,25693,"[plain flour, ground pepper, salt, tomatoes, g..."
2,filipino,20130,"[eggs, pepper, salt, mayonaise, cooking oil, g..."
3,indian,22213,"[water, vegetable oil, wheat, salt]"
4,indian,13162,"[black pepper, shallots, cornflour, cayenne pe..."


In [255]:
from sklearn.model_selection import train_test_split


trainData['all_ingredients'] = trainData['ingredients'].map(";".join)
cuisines = trainData['cuisine'].value_counts().index

# train-test split on fold 37
X_train, X_test, y_train, y_test = train_test_split(trainData['all_ingredients']
                                                    , trainData['cuisine'], test_size=0.2, random_state=68)

enc = LabelEncoder()
y_train_proc = enc.fit_transform(y_train.values)
y_test_proc = enc.transform(y_test.values)

In [256]:
# this is the part that should be commented out and replaced with feature extraction pipeline
cv = CountVectorizer()
X_train_proc = cv.fit_transform(X_train.values)
X_test_proc = cv.transform(X_test.values)
X_train_proc.shape

(31819, 2932)

In [257]:
from sklearn.linear_model import LogisticRegression

logistic = LogisticRegression()
logistic.fit(X_train_proc, y_train_proc)

logistic.score(X_test_proc, y_test_proc)

0.792583280955374

In [250]:
#enc.inverse_transform([x for x in range(20)])

In [251]:
from sklearn.metrics import classification_report

log_pred = logistic.predict(X_test_proc)
print(classification_report(y_test_proc, log_pred, target_names=enc.inverse_transform([x for x in range(20)])))

              precision    recall  f1-score   support

   brazilian       0.74      0.50      0.60       102
     british       0.59      0.38      0.46       184
cajun_creole       0.81      0.70      0.75       327
     chinese       0.80      0.84      0.82       529
    filipino       0.77      0.58      0.66       171
      french       0.59      0.63      0.61       508
       greek       0.78      0.69      0.74       229
      indian       0.87      0.87      0.87       605
       irish       0.67      0.51      0.58       121
     italian       0.79      0.91      0.84      1550
    jamaican       0.90      0.71      0.79       117
    japanese       0.82      0.69      0.75       276
      korean       0.81      0.76      0.79       165
     mexican       0.90      0.93      0.91      1331
    moroccan       0.86      0.75      0.80       152
     russian       0.61      0.38      0.47       104
 southern_us       0.68      0.81      0.74       828
     spanish       0.69    