# Kaggle competition: https://www.kaggle.com/c/whats-cooking-kernels-only

# Table of contents:
[CLean Training data](#Clean-Training-data)

[Build FT table](#Build-FT-table)

In [1]:
import pandas as pd
import numpy as np
import seaborn as sb
%matplotlib inline
import matplotlib.pyplot as plt 

In [2]:
df = pd.read_json('data/train.json')
df.shape
type(df)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 39774 entries, 0 to 39773
Data columns (total 3 columns):
cuisine        39774 non-null object
id             39774 non-null int64
ingredients    39774 non-null object
dtypes: int64(1), object(2)
memory usage: 932.3+ KB


## Clean Training data

In [3]:
df.head()

Unnamed: 0,cuisine,id,ingredients
0,greek,10259,"[romaine lettuce, black olives, grape tomatoes..."
1,southern_us,25693,"[plain flour, ground pepper, salt, tomatoes, g..."
2,filipino,20130,"[eggs, pepper, salt, mayonaise, cooking oil, g..."
3,indian,22213,"[water, vegetable oil, wheat, salt]"
4,indian,13162,"[black pepper, shallots, cornflour, cayenne pe..."


In [4]:
df.set_index("id", inplace=True)

In [5]:
df.head()

Unnamed: 0_level_0,cuisine,ingredients
id,Unnamed: 1_level_1,Unnamed: 2_level_1
10259,greek,"[romaine lettuce, black olives, grape tomatoes..."
25693,southern_us,"[plain flour, ground pepper, salt, tomatoes, g..."
20130,filipino,"[eggs, pepper, salt, mayonaise, cooking oil, g..."
22213,indian,"[water, vegetable oil, wheat, salt]"
13162,indian,"[black pepper, shallots, cornflour, cayenne pe..."


In [6]:
df["cuisine"].value_counts()

italian         7838
mexican         6438
southern_us     4320
indian          3003
chinese         2673
french          2646
cajun_creole    1546
thai            1539
japanese        1423
greek           1175
spanish          989
korean           830
vietnamese       825
moroccan         821
british          804
filipino         755
irish            667
jamaican         526
russian          489
brazilian        467
Name: cuisine, dtype: int64

In [7]:
df["ingredients"].head()

id
10259    [romaine lettuce, black olives, grape tomatoes...
25693    [plain flour, ground pepper, salt, tomatoes, g...
20130    [eggs, pepper, salt, mayonaise, cooking oil, g...
22213                  [water, vegetable oil, wheat, salt]
13162    [black pepper, shallots, cornflour, cayenne pe...
Name: ingredients, dtype: object

In [8]:
df["ingredients_clean"] = df["ingredients"].astype(str)
df["ingredients_clean"] = df["ingredients_clean"].str.replace(" ","")
df["ingredients_clean"] = df["ingredients_clean"].str.replace(","," ")

In [9]:
df["ingredients_clean"].head()

id
10259    ['romainelettuce' 'blackolives' 'grapetomatoes...
25693    ['plainflour' 'groundpepper' 'salt' 'tomatoes'...
20130    ['eggs' 'pepper' 'salt' 'mayonaise' 'cookingoi...
22213              ['water' 'vegetableoil' 'wheat' 'salt']
13162    ['blackpepper' 'shallots' 'cornflour' 'cayenne...
Name: ingredients_clean, dtype: object

In [10]:
df["ingredients_clean"] = df["ingredients_clean"].str.replace("salt","")

In [11]:
df["ingredients_clean"].head()

id
10259    ['romainelettuce' 'blackolives' 'grapetomatoes...
25693    ['plainflour' 'groundpepper' '' 'tomatoes' 'gr...
20130    ['eggs' 'pepper' '' 'mayonaise' 'cookingoil' '...
22213                  ['water' 'vegetableoil' 'wheat' '']
13162    ['blackpepper' 'shallots' 'cornflour' 'cayenne...
Name: ingredients_clean, dtype: object

## Build FT table

In [12]:
from sklearn.feature_extraction.text import CountVectorizer
cvec = CountVectorizer()
cvec.fit(np.array(df["ingredients_clean"]))

CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
        tokenizer=None, vocabulary=None)

In [13]:
len(cvec.get_feature_names())

6792

In [14]:
cuisines = df["cuisine"].value_counts().index.tolist()
cuisines

['italian',
 'mexican',
 'southern_us',
 'indian',
 'chinese',
 'french',
 'cajun_creole',
 'thai',
 'japanese',
 'greek',
 'spanish',
 'korean',
 'vietnamese',
 'moroccan',
 'british',
 'filipino',
 'irish',
 'jamaican',
 'russian',
 'brazilian']

In [15]:
doc_matrix = cvec.transform(df["ingredients_clean"])
tf = np.sum(doc_matrix,axis=0)
total = np.squeeze(np.asarray(tf))
term_freq_df = pd.DataFrame([total],columns=cvec.get_feature_names()).transpose()
term_freq_df.columns = ["total"]
for each in cuisines:
    print(each)
    doc_matrix = cvec.transform(df["ingredients_clean"][df["cuisine"] == each])
    tf = np.sum(doc_matrix,axis=0)
    df_new = np.squeeze(np.asarray(tf))
    term_freq_df[each] = pd.DataFrame([df_new],columns=cvec.get_feature_names()).transpose()

italian
mexican
southern_us
indian
chinese
french
cajun_creole
thai
japanese
greek
spanish
korean
vietnamese
moroccan
british
filipino
irish
jamaican
russian
brazilian


In [16]:
term_freq_df.info()
term_freq_df.head()

<class 'pandas.core.frame.DataFrame'>
Index: 6792 entries, 10oz to zucchiniblossoms
Data columns (total 21 columns):
total           6792 non-null int64
italian         6792 non-null int64
mexican         6792 non-null int64
southern_us     6792 non-null int64
indian          6792 non-null int64
chinese         6792 non-null int64
french          6792 non-null int64
cajun_creole    6792 non-null int64
thai            6792 non-null int64
japanese        6792 non-null int64
greek           6792 non-null int64
spanish         6792 non-null int64
korean          6792 non-null int64
vietnamese      6792 non-null int64
moroccan        6792 non-null int64
british         6792 non-null int64
filipino        6792 non-null int64
irish           6792 non-null int64
jamaican        6792 non-null int64
russian         6792 non-null int64
brazilian       6792 non-null int64
dtypes: int64(21)
memory usage: 1.1+ MB


Unnamed: 0,total,italian,mexican,southern_us,indian,chinese,french,cajun_creole,thai,japanese,...,spanish,korean,vietnamese,moroccan,british,filipino,irish,jamaican,russian,brazilian
10oz,5,3,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
14,3,1,2,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
14oz,2,0,1,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
15oz,3,0,3,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1inchthick,2,0,0,2,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


# Visualize frequencies

In [17]:
from __future__ import print_function
from ipywidgets import interact, interactive, fixed, interact_manual
import ipywidgets as widgets

def f(kitchen):
    y_pos = np.arange(50)
    plt.figure(figsize=(20,15))
    plt.barh(y_pos, term_freq_df.sort_values(by=kitchen, ascending=False)[kitchen][:50], align='center', alpha=0.5)
    plt.yticks(y_pos, term_freq_df.sort_values(by=kitchen, ascending=False)[kitchen][:50].index,rotation='horizontal')
    plt.ylabel('Token')
    plt.xlabel('Frequency')
    plt.title('Top 50 tokens in cuisine')

In [18]:
interact(f, kitchen={'total':'total','italian':'italian', 'mexican': 'mexican', 'southern_us': 'southern_us', 'indian': 'indian',\
 'chinese': 'chinese', 'french':'french', 'cajun_creole':'cajun_creole',  'thai':'thai', 'japanese': 'japanese', \
 'greek': 'greek', 'spanish': 'spanish', 'korean': 'korean', 'vietnamese': 'vietnamese', 'moroccan':'moroccan',\
 'british': 'british', 'filipino': 'filipino', 'irish': 'irish', 'jamaican': 'jamaican', 'russian': 'russian', \
 'brazilian': 'brazilian'});



interactive(children=(Dropdown(description='kitchen', options={'total': 'total', 'italian': 'italian', 'mexica…

In [19]:
for each in cuisines:
    term_freq_df[each+"_support"] = (term_freq_df[each] / term_freq_df["total"])*100

In [20]:
from __future__ import print_function
from ipywidgets import interact, interactive, fixed, interact_manual
import ipywidgets as widgets

def f(kitchen):
    y_pos = np.arange(50)
    plt.figure(figsize=(20,15))
    plt.barh(y_pos, term_freq_df.sort_values(by=kitchen+"_support", ascending=False)[kitchen+"_support"][:50], align='center', alpha=0.5)
    plt.yticks(y_pos, term_freq_df.sort_values(by=kitchen+"_support", ascending=False)[kitchen+"_support"][:50].index,rotation='horizontal')
    plt.ylabel('Token')
    plt.xlabel('Support (%)')
    plt.title('Top 50 tokens in cuisine')

In [21]:
interact(f, kitchen={'italian':'italian', 'mexican': 'mexican', 'southern_us': 'southern_us', 'indian': 'indian',\
                     'chinese': 'chinese', 'french':'french', 'cajun_creole':'cajun_creole',  'thai':'thai',\
                     'japanese': 'japanese', 'greek': 'greek', 'spanish': 'spanish', 'korean': 'korean', \
                     'vietnamese': 'vietnamese', 'moroccan':'moroccan', 'british': 'british', \
                     'filipino': 'filipino', 'irish': 'irish', 'jamaican': 'jamaican', 'russian': 'russian', \
                      'brazilian': 'brazilian'});

interactive(children=(Dropdown(description='kitchen', options={'italian': 'italian', 'mexican': 'mexican', 'so…

In [22]:
for each in cuisines:
    term_freq_df[each+"_coverage"] = (term_freq_df[each] / term_freq_df[each].sum())*100

In [23]:
from __future__ import print_function
from ipywidgets import interact, interactive, fixed, interact_manual
import ipywidgets as widgets

def f(kitchen):
    y_pos = np.arange(50)
    plt.figure(figsize=(20,15))
    plt.barh(y_pos, term_freq_df.sort_values(by=kitchen+"_coverage", ascending=False)[kitchen+"_coverage"][:50], align='center', alpha=0.5)
    plt.yticks(y_pos, term_freq_df.sort_values(by=kitchen+"_coverage", ascending=False)[kitchen+"_coverage"][:50].index,rotation='horizontal')
    plt.ylabel('Token')
    plt.xlabel('Coverage (%)')
    plt.title('Top 50 tokens in cuisine')

In [24]:
interact(f, kitchen={'italian':'italian', 'mexican': 'mexican', 'southern_us': 'southern_us', 'indian': 'indian',\
                     'chinese': 'chinese', 'french':'french', 'cajun_creole':'cajun_creole',  'thai':'thai',\
                     'japanese': 'japanese', 'greek': 'greek', 'spanish': 'spanish', 'korean': 'korean', \
                     'vietnamese': 'vietnamese', 'moroccan':'moroccan', 'british': 'british', \
                     'filipino': 'filipino', 'irish': 'irish', 'jamaican': 'jamaican', 'russian': 'russian', \
                      'brazilian': 'brazilian'});

interactive(children=(Dropdown(description='kitchen', options={'italian': 'italian', 'mexican': 'mexican', 'so…

In [25]:
from __future__ import print_function
from ipywidgets import interact, interactive, fixed, interact_manual
import ipywidgets as widgets

def f(support,kitchen):
    support
    y_pos = np.arange(50)
    plt.figure(figsize=(20,15))
    plt.barh(y_pos, term_freq_df[term_freq_df[kitchen+"_support"]>support].sort_values(by=kitchen+"_coverage", ascending=False)[kitchen+"_coverage"][:50], align='center', alpha=0.5)
    plt.yticks(y_pos, term_freq_df[term_freq_df[kitchen+"_support"]>support].sort_values(by=kitchen+"_coverage", ascending=False)[kitchen+"_coverage"][:50].index,rotation='horizontal')
    plt.ylabel('Token')
    plt.xlabel('Coverage (%)')
    plt.title('Top 50 tokens in cuisine')

In [26]:
interact(f, kitchen={'italian':'italian', 'mexican': 'mexican', 'southern_us': 'southern_us', 'indian': 'indian',\
                     'chinese': 'chinese', 'french':'french', 'cajun_creole':'cajun_creole',  'thai':'thai',\
                     'japanese': 'japanese', 'greek': 'greek', 'spanish': 'spanish', 'korean': 'korean', \
                     'vietnamese': 'vietnamese', 'moroccan':'moroccan', 'british': 'british', \
                     'filipino': 'filipino', 'irish': 'irish', 'jamaican': 'jamaican', 'russian': 'russian', \
                      'brazilian': 'brazilian'}, support=widgets.IntSlider(min=0,max=99,step=1,value=50));

interactive(children=(IntSlider(value=50, description='support', max=99), Dropdown(description='kitchen', opti…

## Select most relevant frequencies

### using fixed support and coverage

d = dict({})
for each in cuisines:
    d[each] = term_freq_df[(term_freq_df[each+"_support"]>90) & (term_freq_df[each+"_coverage"]>0.1)].index.tolist()
d

ingredients_list = term_freq_df[(term_freq_df[each+"_support"]>99) & (term_freq_df[each+"_coverage"]>99)].index.tolist()
for each in cuisines:
    ingredients_list.extend(term_freq_df[(term_freq_df[each+"_support"]>90) & (term_freq_df[each+"_coverage"]>0.1)].index.tolist())

ingredients_list

### using top ingredients (fixed support, top coverage)


d = dict({})
for each in cuisines:
    d[each] = term_freq_df[term_freq_df[each+"_support"]>90].sort_values(by=each+"_coverage", ascending=False)[each+"_coverage"][:5].index.tolist()
d

ingredients_list = term_freq_df[(term_freq_df[each+"_support"]>99) & (term_freq_df[each+"_coverage"]>99)].index.tolist()
for each in cuisines:
    ingredients_list.extend(term_freq_df[term_freq_df[each+"_support"]>90].sort_values(by=each+"_coverage", ascending=False)[each+"_coverage"][:5].index.tolist())

ingredients_list

### using top ingredients (fixed support, top coverage) 2

In [27]:
ingredients_list = term_freq_df[(term_freq_df[each+"_support"]>99) & (term_freq_df[each+"_coverage"]>99)].index.tolist()
for each in cuisines:
    ingredients_list.extend(term_freq_df[term_freq_df[each+"_support"]>90].sort_values(by=each+"_coverage", ascending=False)[each+"_coverage"][:5].index.tolist())
    ingredients_list.extend(term_freq_df[(term_freq_df[each+"_support"]>90) & (term_freq_df[each+"_coverage"]>0.1)].index.tolist())

In [28]:
ingredients_list = list(set(ingredients_list))

In [29]:
ingredients_list

['salsaverde',
 'tomatillos',
 'filepowder',
 'kerrygoldpureirishbutter',
 'instantpuddingmix',
 'kimchi',
 'armagnac',
 'mexicanoregano',
 'jerkseasoning',
 'whitemiso',
 'anchochilepepper',
 'granola',
 'fermentedblackbeans',
 'japaneserice',
 'blackbeans',
 'ai',
 'poblanopeppers',
 'wasabipaste',
 'lowsodiumvegetablejuice',
 'quickcookinggrits',
 'johnsonvillesmokedsausage',
 'masala',
 'harissapaste',
 'greek',
 'rioja',
 'aã',
 'paellarice',
 'merguezsausage',
 'calamansi',
 'ricottacheese',
 'jamaicanjerkseason',
 'pecorinoromanocheese',
 'pompeiancanolaoilandextravirginoliveoil',
 'norisheets',
 'buttermilkbiscuits',
 'gochugaru',
 'blackmustardseeds',
 'gochujangbase',
 'refriedbeans',
 'masaharina',
 'ackee',
 'irishwhiskey',
 'quincepaste',
 'raselhanout',
 'accentseasoning',
 'quesofresco',
 'freshcurryleaves',
 'collards',
 'redrussiankale',
 'tortillachips',
 'brownmustardseeds',
 'cilantroroot',
 'sodabread',
 'vietnamesecoriander',
 'gumbofile',
 'lasagnanoodles',
 'cal

## recode relevant ingredients do dummy vars in training set

In [30]:
def add_var(ingredient_name):
    df[ingredient_name] = 0
    for index, row in df.iterrows():
        if ingredient_name in row["ingredients_clean"]:
            df.at[index,ingredient_name] = 1

In [31]:
for each in ingredients_list:
    print(each)
    add_var(each)

salsaverde
tomatillos
filepowder
kerrygoldpureirishbutter
instantpuddingmix
kimchi
armagnac
mexicanoregano
jerkseasoning
whitemiso
anchochilepepper
granola
fermentedblackbeans
japaneserice
blackbeans
ai
poblanopeppers
wasabipaste
lowsodiumvegetablejuice
quickcookinggrits
johnsonvillesmokedsausage
masala
harissapaste
greek
rioja
aã
paellarice
merguezsausage
calamansi
ricottacheese
jamaicanjerkseason
pecorinoromanocheese
pompeiancanolaoilandextravirginoliveoil
norisheets
buttermilkbiscuits
gochugaru
blackmustardseeds
gochujangbase
refriedbeans
masaharina
ackee
irishwhiskey
quincepaste
raselhanout
accentseasoning
quesofresco
freshcurryleaves
collards
redrussiankale
tortillachips
brownmustardseeds
cilantroroot
sodabread
vietnamesecoriander
gumbofile
lasagnanoodles
calvados
parsleyroot
stiltoncheese
frozenbanana
callaloo
beefdrippings
whitecornmeal
knockwurst
freshmozzarella
driedwoodearmushrooms
dendeoil
beefkidney
dashikombu
driedbonitoflakes
galangal
risingcornmeal
dashi
aipowder
kasha
t

# Model building using K-NN

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
np.set_printoptions(precision=2)

feature_names = ingredients_list
X = df[feature_names]
y = df['cuisine']
target_names = cuisines

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 0)

from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
X_train_scaled = scaler.fit_transform(X_train)
# we must apply the scaling to the test set that we computed for the training set
X_test_scaled = scaler.fit_transform(X_test)
knn = KNeighborsClassifier(n_neighbors = 5)
knn.fit(X_train_scaled, y_train)
print('Accuracy of K-NN classifier on training set: {:.2f}'
     .format(knn.score(X_train_scaled, y_train)))
print('Accuracy of K-NN classifier on test set: {:.2f}'
     .format(knn.score(X_test_scaled, y_test)))

# Predict cuisine on test set using model

In [None]:
df_test = pd.read_json('data/test.json')
df_test.shape
type(df_test)
df_test.info()

In [None]:
df_test.head()

## Clean test data

In [None]:
df_test.set_index("id", inplace=True)

In [None]:
df_test.head()

In [None]:
df_test["ingredients"].head()

In [None]:
df_test["ingredients_clean"] = df_test["ingredients"].astype(str)
df_test["ingredients_clean"] = df_test["ingredients_clean"].str.replace(" ","")
df_test["ingredients_clean"] = df_test["ingredients_clean"].str.replace(","," ")

In [None]:
df_test["ingredients_clean"].head()

In [None]:
df_test["ingredients_clean"] = df_test["ingredients_clean"].str.replace("salt","")

In [None]:
df_test["ingredients_clean"].head()

## recode relevant ingredients do dummy vars in test set

In [None]:
def add_var_test(ingredient_name):
    df_test[ingredient_name] = 0
    for index, row in df_test.iterrows():
        if ingredient_name in row["ingredients_clean"]:
            df_test.at[index,ingredient_name] = 1

In [None]:
for each in ingredients_list:
    print(each)
    add_var_test(each)

## perform the final predictions

In [None]:
df_test["cuisine"] = knn.predict(df_test[feature_names])

In [None]:
df_test["cuisine"].value_counts()

## export predictions to csv

In [None]:
df_test.info()

In [None]:
my_df = df_test.reset_index("id")

In [None]:
my_df = my_df.loc[:,["id","cuisine"]]

In [None]:
my_df.to_csv('Submission1_SvdBroek.csv', index=False, header=True)