In [1]:
import numpy as np
import pandas as pd
import tensorflow as tf

In [6]:
import json
raw = pd.read_json("./train.json")
raw['ingredientsFlat'] = raw.ingredients.apply(lambda x: ' '.join(x))
raw.head()

Unnamed: 0,id,cuisine,ingredients,ingredientsFlat
0,10259,greek,"[romaine lettuce, black olives, grape tomatoes...",romaine lettuce black olives grape tomatoes ga...
1,25693,southern_us,"[plain flour, ground pepper, salt, tomatoes, g...",plain flour ground pepper salt tomatoes ground...
2,20130,filipino,"[eggs, pepper, salt, mayonaise, cooking oil, g...",eggs pepper salt mayonaise cooking oil green c...
3,22213,indian,"[water, vegetable oil, wheat, salt]",water vegetable oil wheat salt
4,13162,indian,"[black pepper, shallots, cornflour, cayenne pe...",black pepper shallots cornflour cayenne pepper...


In [7]:
from sklearn import preprocessing
le = preprocessing.LabelEncoder()
le.fit(raw.cuisine.values)
le.classes_

array(['brazilian', 'british', 'cajun_creole', 'chinese', 'filipino',
       'french', 'greek', 'indian', 'irish', 'italian', 'jamaican',
       'japanese', 'korean', 'mexican', 'moroccan', 'russian',
       'southern_us', 'spanish', 'thai', 'vietnamese'], dtype=object)

In [9]:
docs = raw.ingredientsFlat.values
labels_enc = le.transform(raw.cuisine.values)
labels = tf.keras.utils.to_categorical(labels_enc)
labels

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]], dtype=float32)

In [11]:
raw['ingredients_len'] = raw.ingredients.apply(len)

In [16]:
doc_lengths = raw[['ingredients_len']].values

In [17]:
from sklearn.preprocessing import StandardScaler

ss = StandardScaler()

doc_lengths_standardized = ss.fit_transform(doc_lengths)

In [18]:
doc_lengths

array([[ 9],
       [11],
       [12],
       ...,
       [12],
       [21],
       [12]], dtype=int64)

In [20]:
pad_sequences = tf.keras.preprocessing.sequence.pad_sequences

t = tf.keras.preprocessing.text.Tokenizer()
t.fit_on_texts(docs)
vocab_size = len(t.word_index)+1

encoded_docs = t.texts_to_sequences(docs)

max_length = 40
padded_docs = pad_sequences(encoded_docs, maxlen=max_length, padding='post')

vocab_size

3065

In [21]:
def bootstrap_sample_generator(batch_size):
    while True:
        batch_idx = np.random.choice(
            padded_docs.shape[0], batch_size)
        yield ({'cat_inputs': padded_docs[batch_idx],
                'numeric_inputs': doc_lengths[batch_idx]
               }, 
               {'output': labels[batch_idx] })

In [22]:
def emb_sz_rule(n_cat): 
    return min(600, round(1.6 * n_cat**0.56))

p = .1

In [23]:
cat_inputs = tf.keras.layers.Input((40,), name='cat_inputs')
numeric_inputs = tf.keras.layers.Input((1,), name='numeric_inputs')

In [24]:
embedding_layer = tf.keras.layers.Embedding(
    vocab_size,
    emb_sz_rule(vocab_size),
    input_length=40
)
cat_x = embedding_layer(cat_inputs)

global_ave = tf.keras.layers.GlobalAveragePooling1D()(cat_x)
global_max = tf.keras.layers.GlobalMaxPool1D()(cat_x)
x = tf.keras.layers.Concatenate()([global_ave, global_max])

In [25]:
x = tf.keras.layers.Concatenate()([x, numeric_inputs])

In [26]:
x = tf.keras.layers.Dropout(p)(x)
x = tf.keras.layers.Dense(100, activation='relu')(x)

x = tf.keras.layers.BatchNormalization()(x)
x = tf.keras.layers.Dropout(p)(x)
x = tf.keras.layers.Dense(20, activation='relu')(x)

x = tf.keras.layers.BatchNormalization()(x)
x = tf.keras.layers.Dropout(p)(x)
x = tf.keras.layers.Dense(10, activation='relu')(x)

x = tf.keras.layers.BatchNormalization()(x)
x = tf.keras.layers.Dropout(p)(x)
out = tf.keras.layers.Dense(20, activation='softmax', name='output')(x)

In [27]:
model = tf.keras.models.Model(inputs=[cat_inputs, numeric_inputs], outputs=out)
model.compile(optimizer='rmsprop',
              loss='categorical_crossentropy',
              metrics=['accuracy'])

In [28]:
model.summary()

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 cat_inputs (InputLayer)        [(None, 40)]         0           []                               
                                                                                                  
 embedding (Embedding)          (None, 40, 143)      438295      ['cat_inputs[0][0]']             
                                                                                                  
 global_average_pooling1d (Glob  (None, 143)         0           ['embedding[0][0]']              
 alAveragePooling1D)                                                                              
                                                                                                  
 global_max_pooling1d (GlobalMa  (None, 143)         0           ['embedding[0][0]']          

In [29]:
batch_size = 16

model.fit_generator(
    bootstrap_sample_generator(batch_size),
    steps_per_epoch=10_000 // batch_size,
    epochs=5,
    max_queue_size=10,
)

Epoch 1/5


  model.fit_generator(


Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x1e23735b370>