In [22]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load:

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list the files in the input directory

import re
from nltk.stem import WordNetLemmatizer

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.multiclass import OneVsRestClassifier
from sklearn.svm import SVC
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import FunctionTransformer

import os
# Show which data files are available in the input directory.
input_files = os.listdir("../input")
print(input_files)

# Any results you write to the current directory are saved as output.

['sample_submission.csv', 'test.json', 'train.json']


In [23]:
# Read the labelled recipes and the recipes to be predicted.
train = pd.read_json('../input/train.json')
test = pd.read_json('../input/test.json')

# Distinct cuisine labels found in the training set.
class_names = train['cuisine'].unique().tolist()
class_names

['greek',
 'southern_us',
 'filipino',
 'indian',
 'jamaican',
 'spanish',
 'italian',
 'mexican',
 'chinese',
 'british',
 'thai',
 'vietnamese',
 'cajun_creole',
 'brazilian',
 'french',
 'japanese',
 'irish',
 'korean',
 'moroccan',
 'russian']

In [24]:
# Number of ingredients listed by each recipe.
train['num_ings'] = train['ingredients'].apply(len)
test['num_ings'] = test['ingredients'].apply(len)
train.head()

Unnamed: 0,cuisine,id,ingredients,num_ings
0,greek,10259,"[romaine lettuce, black olives, grape tomatoes...",9
1,southern_us,25693,"[plain flour, ground pepper, salt, tomatoes, g...",11
2,filipino,20130,"[eggs, pepper, salt, mayonaise, cooking oil, g...",12
3,indian,22213,"[water, vegetable oil, wheat, salt]",4
4,indian,13162,"[black pepper, shallots, cornflour, cayenne pe...",20


In [25]:
# Row count of the training set before filtering.
train.shape[0]

39774

In [26]:
# Keep only training recipes that list more than two ingredients.
# .copy() makes the filtered result an independent frame; without it `train`
# is a slice of the original frame and the column assignments made on it in
# later cells trigger pandas' SettingWithCopyWarning.
train = train[train['num_ings'] > 2].copy()
len(train)

39559

In [27]:
# Normalise case: replace every ingredient string with its lowercase form.
train['ingredients'] = train['ingredients'].apply(lambda items: [item.lower() for item in items])
test['ingredients'] = test['ingredients'].apply(lambda items: [item.lower() for item in items])
train.head()

Unnamed: 0,cuisine,id,ingredients,num_ings
0,greek,10259,"[romaine lettuce, black olives, grape tomatoes...",9
1,southern_us,25693,"[plain flour, ground pepper, salt, tomatoes, g...",11
2,filipino,20130,"[eggs, pepper, salt, mayonaise, cooking oil, g...",12
3,indian,22213,"[water, vegetable oil, wheat, salt]",4
4,indian,13162,"[black pepper, shallots, cornflour, cayenne pe...",20


In [28]:
def get_replacements():
    """Return a fresh dict mapping text fragments to their replacements.

    The first entries are individual fixes ('wasabe' -> 'wasabi', '-' removed,
    'sauc' -> 'sauce'). The remaining entries map a multi-word ingredient
    phrase to the same phrase with its spaces removed. Insertion order is
    fixes first, then phrases in the order listed below.
    """
    single_fixes = {'wasabe': 'wasabi', '-': '', 'sauc': 'sauce'}
    fused_phrases = (
        'baby spinach',
        'coconut cream',
        'coriander seeds',
        'corn tortillas',
        'cream cheese',
        'fish sauce',
        'purple onion',
        'refried beans',
        'rice cakes',
        'rice syrup',
        'sour cream',
        'toasted sesame seeds',
        'toasted sesame oil',
        'yellow onion',
    )

    mapping = dict(single_fixes)
    for phrase in fused_phrases:
        # Each phrase maps to itself with the spaces squeezed out.
        mapping[phrase] = phrase.replace(' ', '')
    return mapping

In [29]:
# Shared helpers for the text-cleaning step.
lemmatizer = WordNetLemmatizer()
replacements = get_replacements()
# Raw string so that \d reaches the regex engine as written; in a normal
# string literal "\d" is an invalid escape sequence and triggers a warning.
# The pattern matches one digit, a right single quote (’) or a percent sign.
stop_pattern = re.compile(r'[\d’%]')

In [30]:
def tranform_to_single_string(ingredients, lemmatizer, replacements, stop_pattern):
    """Flatten one recipe's ingredients into a single cleaned string.

    (The misspelled name is kept because other cells call it by this name.)

    Steps:
      1. Join the ingredient strings with single spaces.
      2. Apply every ``replacements`` entry, in the mapping's iteration
         order, as a substring replacement on the joined text.
      3. Split on whitespace, drop words of length <= 2 and words for which
         ``stop_pattern.match`` succeeds (i.e. the pattern matches at the
         START of the word), and lemmatize the remaining words.

    Args:
        ingredients: iterable of ingredient strings.
        lemmatizer: object with a ``lemmatize(word)`` method.
        replacements: mapping of text fragment -> replacement text.
        stop_pattern: compiled regular expression used to reject words.

    Returns:
        The kept, lemmatized words joined by single spaces.
    """
    ingredients_text = ' '.join(ingredients)

    for key, value in replacements.items():
        missing_suffix = value[len(key):]
        if key and missing_suffix and value.startswith(key):
            # The replacement merely extends the key (e.g. 'sauc' -> 'sauce').
            # A plain str.replace would also hit text that is already correct
            # and produce 'saucee', so skip occurrences that are already
            # followed by the missing suffix.
            pattern = re.escape(key) + '(?!' + re.escape(missing_suffix) + ')'
            # A function replacement keeps `value` literal (no backslash
            # processing by re.sub).
            ingredients_text = re.sub(pattern, lambda _match: value, ingredients_text)
        else:
            ingredients_text = ingredients_text.replace(key, value)

    words = [
        lemmatizer.lemmatize(word)
        for word in ingredients_text.split()
        if not stop_pattern.match(word) and len(word) > 2
    ]
    return ' '.join(words)

In [31]:
def transform(ingredients):
    """Clean one ingredient list with the notebook-level lemmatizer, replacements and stop pattern."""
    return tranform_to_single_string(ingredients, lemmatizer, replacements, stop_pattern)


# Column 'x' holds the cleaned single-string form of each recipe.
train['x'] = train['ingredients'].apply(transform)
test['x'] = test['ingredients'].apply(transform)
train.head()

Unnamed: 0,cuisine,id,ingredients,num_ings,x
0,greek,10259,"[romaine lettuce, black olives, grape tomatoes...",9,romaine lettuce black olive grape tomato garli...
1,southern_us,25693,"[plain flour, ground pepper, salt, tomatoes, g...",11,plain flour ground pepper salt tomato ground b...
2,filipino,20130,"[eggs, pepper, salt, mayonaise, cooking oil, g...",12,egg pepper salt mayonaise cooking oil green ch...
3,indian,22213,"[water, vegetable oil, wheat, salt]",4,water vegetable oil wheat salt
4,indian,13162,"[black pepper, shallots, cornflour, cayenne pe...",20,black pepper shallot cornflour cayenne pepper ...


In [32]:
# Two-step feature pipeline: TF-IDF with sublinear term frequency, followed
# by a cast of the resulting matrix to 'float'.
tfidf_step = TfidfVectorizer(sublinear_tf=True)
float_cast_step = FunctionTransformer(lambda matrix: matrix.astype('float'), validate=False)
vectorizer = make_pipeline(tfidf_step, float_cast_step)

In [33]:
# Fit the vectorizer on the training text only, then reuse it for the test text.
train_texts = train['x'].values
test_texts = test['x'].values

x_train = vectorizer.fit_transform(train_texts)
x_train.sort_indices()  # in-place call on the training matrix
x_test = vectorizer.transform(test_texts)

In [34]:
# Inspect the features of the first training recipe.
first_row = x_train[0]
print(first_row)

  (0, 163)	0.2136853139081756
  (0, 221)	0.1470338025940124
  (0, 486)	0.15557274400877014
  (0, 698)	0.34993740346216723
  (0, 902)	0.31820801786364306
  (0, 1024)	0.4063283725642091
  (0, 1029)	0.11001351466168802
  (0, 1104)	0.3579355070161128
  (0, 1446)	0.27871073086742515
  (0, 1777)	0.1399970657392681
  (0, 1896)	0.10425384282532606
  (0, 2061)	0.25027231290252794
  (0, 2174)	0.3586823384903479
  (0, 2275)	0.2407497729524336
  (0, 2626)	0.14860442635641782


In [35]:
def get_estimator():
    """Create a new, unfitted SVC with the hyper-parameters used in this notebook.

    Returns:
        An ``SVC`` using an RBF kernel with C=300 and gamma=1.5; the remaining
        settings are spelled out explicitly in the dict below.
    """
    svc_params = {
        'C': 300,
        'kernel': 'rbf',
        'gamma': 1.5,
        'shrinking': True,
        'tol': 0.001,
        'cache_size': 1000,
        'class_weight': None,
        'max_iter': -1,
        'decision_function_shape': 'ovr',
        'random_state': 42,
    }
    return SVC(**svc_params)

In [None]:
# Target labels for the training rows.
y_train = train['cuisine'].values

# Wrap the SVC in a one-vs-rest classifier and fit it with n_jobs=-1.
estimator = get_estimator()
classifier = OneVsRestClassifier(estimator=estimator, n_jobs=-1)
classifier.fit(x_train, y_train)

In [None]:
# Predict a cuisine for every test recipe and store it on the test frame.
predicted_cuisines = classifier.predict(x_test)
test['cuisine'] = predicted_cuisines

# Write the id/cuisine pairs to submission.csv without the index column.
submission = test[['id', 'cuisine']]
submission.to_csv('submission.csv', index=False)