### Recipe Classification using simple CNN and RNN

In [1]:
import numpy as np
import pandas as pd
import json
from sklearn import preprocessing
from keras.utils.np_utils import to_categorical
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Embedding, Flatten, Dense
import os

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


### Load the Data

In [2]:
recipeRaw = pd.read_json("/home/nbuser/challanges/08-recipe-ingredients/train.json")
recipeRaw["ingredientsFlat"] = recipeRaw["ingredients"].apply(lambda x: ' '.join(x))
traindocs = recipeRaw["ingredientsFlat"].values

recipeRawTest = pd.read_json("/home/nbuser/challanges/08-recipe-ingredients/test.json")
recipeRawTest["ingredientsFlat"] = recipeRawTest["ingredients"].apply(lambda x: ' '.join(x))
testdocs = recipeRawTest["ingredientsFlat"].values

print("No. of Recipies for Training: ", len(traindocs))
print("No. of Recipies for Testing: ", len(testdocs))

No. of Recipies for Training:  39774
No. of Recipies for Testing:  9944


In [3]:
# traindocs[2]

In [4]:
df = pd.read_json("/home/nbuser/challanges/08-recipe-ingredients/train.json")['ingredients']
print('average recipe length',max( map(len, df) ))
print('average recipe length',min( map(len, df) ))

average recipe length 65
average recipe length 1


In [5]:
recipeRaw["cuisine"].value_counts()

italian         7838
mexican         6438
southern_us     4320
indian          3003
chinese         2673
french          2646
cajun_creole    1546
thai            1539
japanese        1423
greek           1175
spanish          989
korean           830
vietnamese       825
moroccan         821
british          804
filipino         755
irish            667
jamaican         526
russian          489
brazilian        467
Name: cuisine, dtype: int64

In [6]:
leb = preprocessing.LabelEncoder()
leb.fit(recipeRaw["cuisine"].values)

labels_enc = leb.transform(recipeRaw["cuisine"].values)
labels = to_categorical(labels_enc)

### Tokenizing the text (ingredients) of the recipes

In [7]:
t = Tokenizer()
t.fit_on_texts(traindocs)
vocab_size = len(t.word_index) + 1

encoded_train_docs = t.texts_to_sequences(traindocs)
encoded_test_docs = t.texts_to_sequences(testdocs)
print("Total number of Unique Ingredints:",vocab_size)

max_length = 65
padded_train_docs = pad_sequences(encoded_train_docs, maxlen=max_length, padding='post')
padded_test_docs = pad_sequences(encoded_test_docs, maxlen=max_length, padding='post')

Total number of Unique Ingredints: 3065


In [8]:
# padded_train_docs[2]

### Parsing Glove word-embeddings file

In [9]:
embeddings_index = dict()
f = open('/home/nbuser/NLP/glove.6B.100d.txt')
for line in f:
    values = line.split()
    word = values[0]
    coefs = np.asarray(values[1:], dtype='float32')
    embeddings_index[word] = coefs
f.close()
print('Loaded %s word vectors.' % len(embeddings_index))

Loaded 400000 word vectors.


In [10]:
vocab = pd.DataFrame.from_dict(t.word_index,orient="index")
vocab.drop([0],axis=1).reset_index().rename(columns={"index":"word"}).to_csv("vocab.csv",index=False)

### Preparing Glove Word-embedding matrix

In [11]:
# create a weight matrix for words in training docs
embedding_matrix = np.zeros((vocab_size, 100))
for word, i in t.word_index.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector

### Build Simple CNN-model

In [12]:
from keras.models import Sequential
from keras.layers import Dense, Flatten
from keras.layers.convolutional import Conv1D, MaxPooling1D
from keras.layers.embeddings import Embedding
from sklearn.model_selection import KFold

# fix random seed for reproducibility
seed = 42
np.random.seed(seed)

kfold = KFold(n_splits=5, shuffle=True, random_state=seed)

cvscores = []
for train, valid in kfold.split(padded_train_docs, labels):
    model = Sequential()
    model.add(Embedding(vocab_size, 100, weights=[embedding_matrix], input_length=65, trainable=False))
    model.add(Conv1D(filters=100, kernel_size=3, padding='same', activation='relu'))
    model.add(MaxPooling1D(pool_size=2))
    model.add(Flatten())
    model.add(Dense(264, activation='relu'))
    model.add(Dense(128, activation='relu'))
    model.add(Dense(20, activation='softmax'))

    model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['acc'])

    if cvscores == []:
        print(model.summary())

    model.fit(padded_train_docs[train], labels[train], epochs=10, verbose=0)
    scores = model.evaluate(padded_train_docs[valid], labels[valid], verbose=0)
    print("%s: %.2f%%" % (model.metrics_names[1], scores[1]*100))
    cvscores.append(scores[1] * 100)
print("%.2f%% (+/- %.2f%%)" % (np.mean(cvscores), np.std(cvscores)))

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 65, 100)           306500    
_________________________________________________________________
conv1d_1 (Conv1D)            (None, 65, 100)           30100     
_________________________________________________________________
max_pooling1d_1 (MaxPooling1 (None, 32, 100)           0         
_________________________________________________________________
flatten_1 (Flatten)          (None, 3200)              0         
_________________________________________________________________
dense_1 (Dense)              (None, 264)               845064    
_________________________________________________________________
dense_2 (Dense)              (None, 128)               33920     
_________________________________________________________________
dense_3 (Dense)              (None, 20)                2580      
Total para

### Test Prediction

In [14]:
predictions = model.predict(padded_test_docs)
recipeRawTest["cuisine"] = [leb.classes_[np.argmax(prediction)] for prediction in predictions]
recipeRawTest.head()
recipeRawTest.drop(["ingredients","ingredientsFlat"],axis=1).to_csv("predictRecipe-CNN.csv",index=False)

### Simple RNN-model

In [16]:
from keras.models import Sequential
from keras.layers import Dense, Flatten
from keras.layers.convolutional import Conv1D, MaxPooling1D
from sklearn.model_selection import KFold
from keras.layers import Embedding, SimpleRNN

seed = 42
np.random.seed(seed)

kfold = KFold(n_splits=5, shuffle=True, random_state=seed)

cvscores = []
for train, valid in kfold.split(padded_train_docs, labels):
    model = Sequential()
    model.add(Embedding(40, 32))
    model.add(SimpleRNN(32))
    model.add(Dense(20, activation='softmax'))
    model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['acc'])
    if cvscores == []:
        print(model.summary())

    model.fit(padded_train_docs[train], labels[train], epochs=5, verbose=0)
    scores = model.evaluate(padded_train_docs[valid], labels[valid], verbose=0)
    print("%s: %.2f%%" % (model.metrics_names[1], scores[1]*100))
    cvscores.append(scores[1] * 100)
print("%.2f%% (+/- %.2f%%)" % (np.mean(cvscores), np.std(cvscores)))

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_9 (Embedding)      (None, None, 32)          1280      
_________________________________________________________________
simple_rnn_4 (SimpleRNN)     (None, 32)                2080      
_________________________________________________________________
dense_19 (Dense)             (None, 20)                660       
Total params: 4,020
Trainable params: 4,020
Non-trainable params: 0
_________________________________________________________________
None
acc: 21.33%
acc: 15.16%
acc: 19.45%
acc: 20.24%
acc: 19.69%
19.17% (+/- 2.11%)
