In [0]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics import confusion_matrix
from sklearn.preprocessing import MultiLabelBinarizer

df_train = pd.read_json('train.json')
df_test = pd.read_json('test.json')

num_cuisines = df_train.cuisine.unique().shape[0] #Counting down number of crusines
#num_cuisines


def vect_train_test(dftrain, dftest, n_words=1000, words=None, vect=None):
    if vect == 'tfid':
        vectorizer = TfidfVectorizer(max_features = n_words)
    else:
        vectorizer = CountVectorizer(max_features = n_words)
    ingredients_train = dftrain.ingredients
    words_train = [' '.join(x) for x in ingredients_train]
    ingredients_test = dftest.ingredients
    words_test = [' '.join(x) for x in ingredients_test]
    if isinstance(words, pd.Series):
        bag_of_words = vectorizer.fit(words)
    else:
        bag_of_words = vectorizer.fit(words_train)

    ing_array_train = bag_of_words.transform(words_train).toarray()
    ing_array_test = bag_of_words.transform(words_test).toarray()
    
    df_ing_train = pd.DataFrame(ing_array_train, columns=vectorizer.vocabulary_)
    df_ing_test = pd.DataFrame(ing_array_test, columns=vectorizer.vocabulary_)
    #-----
    df_train_temp = dftrain.merge(df_ing_train, left_index=True, right_index=True).drop('ingredients', axis=1)
    print(df_train_temp)
    df_test_temp = dftest.merge(df_ing_test, left_index=True, right_index=True).drop('ingredients', axis=1)
    print(df_test_temp)
    #-----
    return df_train_temp, df_test_temp
  
  
from tensorflow.python import keras
from tensorflow.python.keras.models import Sequential
from tensorflow.python.keras.layers import Dense, Dropout, Softmax
from tensorflow.python.keras.optimizers import Adam
from tensorflow.python.keras.activations import relu
from tensorflow.python.client import device_lib

df_train_new, df_test_new = vect_train_test(df_train, df_test, n_words=1000)
print(df_train_new)
print(df_test_new)
mlb = MultiLabelBinarizer()

X1 = np.array(df_train_new.drop(['id', 'cuisine'], axis=1))
cuisine_vector = [[c] for c in df_train_new.cuisine]
y1 = mlb.fit_transform(cuisine_vector)
X_train, X_val, y_train, y_val = train_test_split(X1, y1, train_size=0.85)

df_train_new, df_test_new = vect_train_test(df_train, df_test, n_words=1000, vect='tfid')
X2 = np.array(df_train_new.drop(['id', 'cuisine'], axis=1))
cuisine_vector = [[c] for c in df_train_new.cuisine]
y2 = mlb.fit_transform(cuisine_vector)
X_train, X_val, y_train, y_val = train_test_split(X2, y2, train_size=0.85)


model = Sequential([
    Dense(500, input_dim=1000, activation='relu'),
    Dropout(0.15),
    Dense(250, input_dim=1000, activation='relu'),
    Dense(20, activation='softmax')])

model.compile(loss='categorical_crossentropy', metrics=['accuracy'], optimizer=Adam(lr=0.002))

model.fit(X_train, y_train, epochs=5, verbose=1, validation_data=(X_val, y_val))

y_pred = model.predict(df_test_new.drop('id', axis=1))
y_cat = mlb.classes_[y_pred.argmax(axis=1)]

df_sub = pd.DataFrame(np.array([df_test.id, y_cat]).T, columns=['id', 'cuisine']).set_index('id')

print(df_sub)

            cuisine     id  romaine  ...  fajita  chayotes  american
0             greek  10259        0  ...       0         0         0
1       southern_us  25693        0  ...       0         0         0
2          filipino  20130        0  ...       0         0         0
3            indian  22213        0  ...       0         0         0
4            indian  13162        0  ...       0         0         0
5          jamaican   6602        0  ...       0         0         0
6           spanish  42779        0  ...       0         0         0
7           italian   3735        0  ...       0         0         0
8           mexican  16903        0  ...       0         0         0
9           italian  12734        0  ...       0         0         0
10          italian   5875        0  ...       0         0         0
11          chinese  45887        0  ...       0         0         0
12          italian   2698        0  ...       0         0         0
13          mexican  41995        