In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import os
import matplotlib.pyplot as plt
import seaborn as sns
import random
import re
from collections import Counter
import unidecode

from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.model_selection import KFold, cross_validate, train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import LabelEncoder
from sklearn.svm import SVC
from sklearn.multiclass import OneVsRestClassifier
from sklearn import model_selection 

In [None]:
# Walk the Kaggle input directory and print the full path of every file found.
for folder, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(folder, name))

In [None]:
def random_colors(num_of_colors):
    """Generate random colors for visualization.

    Parameters
    ----------
    num_of_colors : int
        How many colors to produce.

    Returns
    -------
    list of str
        ``num_of_colors`` strings of the form ``"#RRGGBB"`` built from
        uppercase hexadecimal digits drawn with ``random.choice``.
    """
    hex_digits = '0123456789ABCDEF'
    return [
        "#" + ''.join(random.choice(hex_digits) for _ in range(6))
        for _ in range(num_of_colors)
    ]

In [None]:
# Read the competition's training and test recipes from JSON into DataFrames.
train = pd.read_json("/kaggle/input/whats-cooking-kernels-only/train.json")
test = pd.read_json("/kaggle/input/whats-cooking-kernels-only/test.json")

In [None]:
# Preview the first rows of the training data.
train.head()

In [None]:
# Distinct cuisine labels present in the training data (the prediction target).
train['cuisine'].unique()

In [None]:
# Column names, dtypes and non-null counts of the training frame.
train.info()

In [None]:
# Column names, dtypes and non-null counts of the test frame.
test.info()

In [None]:
# Number of training recipes per cuisine, most frequent first.
train['cuisine'].value_counts()

In [None]:
# Bar chart of recipes per cuisine, with one random colour per bar.
cuisine_counts = train['cuisine'].value_counts()
cuisine_counts.plot.bar(color=random_colors(len(cuisine_counts)), figsize=(16,6))

In [None]:
# Store how many ingredients each recipe lists, then preview the result.
train['ing_count'] = train['ingredients'].apply(len)
train.head()

In [None]:
# Flatten the per-recipe ingredient lists into one list (duplicates kept)
# and report how many ingredient mentions there are in total.
raw_ingredients = [
    ingredient
    for recipe in train['ingredients']
    for ingredient in recipe
]
len(raw_ingredients)

In [None]:
# Largest and smallest ingredient-list length across the training recipes.
ingredient_list_lengths = train['ingredients'].str.len()
print('Maximum Number of Ingredients in a Dish: ', ingredient_list_lengths.max())
print('Minimum Number of Ingredients in a Dish: ', ingredient_list_lengths.min())

In [None]:
# Distribution of the number of ingredients per recipe (histogram + KDE).
plt.figure(figsize=(16,6))
if hasattr(sns, 'histplot'):
    # seaborn >= 0.11 deprecates distplot; this is the documented equivalent.
    sns.histplot(train['ing_count'], bins=60, kde=True, stat='density',
                 kde_kws=dict(cut=3))
else:
    # Older seaborn releases only provide distplot.
    sns.distplot(train['ing_count'], bins=60)

**Number of long recipes (more than 35 ingredients)**

In [None]:
# Count the recipes that list more than 35 ingredients.
train.loc[train['ing_count'] > 35].shape[0]

**Number of short recipes (2 or fewer ingredients)**

In [None]:
# Count the recipes that list at most 2 ingredients.
train.loc[train['ing_count'] <= 2].shape[0]

In [None]:
# Box plot of ingredient counts, one box per cuisine.
fig, ax = plt.subplots(figsize=(16,6))
sns.boxplot(x='cuisine', y='ing_count', data=train, ax=ax)

In [None]:
# Ingredient names that are at most two characters long.
[name for name in raw_ingredients if not len(name) > 2]

In [None]:
# Check if anything other than alphabets is present: collect every distinct
# character used in the ingredient names and show the ones that are not
# ASCII letters, sorted and space-separated.
distinct_chars = set(''.join(raw_ingredients))
' '.join(sorted(ch for ch in distinct_chars if re.search('[^A-Za-z]', ch)))

In [None]:
# Sample of distinct ingredient names that contain a hyphen.
# sorted() makes the five names shown reproducible: slicing list(set(...))
# picks arbitrary items because string set order varies between sessions.
sorted({ing for ing in raw_ingredients if '-' in ing})[:5]

In [None]:
# Sample of distinct ingredient names that contain a digit.
# sorted() makes the five names shown reproducible: slicing list(set(...))
# picks arbitrary items because string set order varies between sessions.
sorted({ing for ing in raw_ingredients if re.search('[0-9]', ing)})[:5]

In [None]:
# Substrings that indicate a measurement unit inside an ingredient name.
units = ['inch ', 'oz.', ' lb.', 'ounc', '%', 'oz ']
# Distinct raw ingredient names containing at least one of those markers.
list({
    name
    for name in raw_ingredients
    if any(marker in name for marker in units)
})

In [None]:
# Tally how often each ingredient appears across all training recipes
# and show the 20 most frequent ones.
top_ing = Counter(
    item
    for recipe in train['ingredients']
    for item in recipe
)
top_ing.most_common(20)

In [None]:
# Bar chart of the 20 most common ingredients and their counts.
temp = pd.DataFrame(top_ing.most_common(20), columns=['Ingredients', 'Count'])
fig, ax = plt.subplots(figsize=(28,6))
sns.barplot(x='Ingredients', y='Count', data=temp, ax=ax)

In [None]:
lemmatizer = WordNetLemmatizer()

# Decimal quantities such as "14.5".
_DECIMAL_RE = re.compile(r'\d+\.\d+')
# Measurement markers: "oz"/"oz.", "lb"/"lbs"/"lb.", "ounce(s)", "inch(es)", "%".
# The literal dot is matched explicitly and "%" is matched without word
# boundaries; with an unescaped "." and a trailing \b the forms "oz.", "lb.",
# "ounces" and "40%" are never matched.
_UNIT_RE = re.compile(r'\b(?:oz|lbs?|ounc\w*|inch(?:es)?)\b\.?|%')


def preprocess_ing(ingredients):
    """Turn a recipe's ingredient list into one normalised text string.

    Steps: join the ingredients with spaces, lowercase, replace hyphens with
    spaces, blank out decimal numbers and measurement markers, drop words of
    two characters or fewer, transliterate to ASCII with ``unidecode`` and
    lemmatize each remaining word with the module-level ``lemmatizer``.

    Parameters
    ----------
    ingredients : iterable of str
        Ingredient names of a single recipe.

    Returns
    -------
    str
        The cleaned words joined by single spaces ('' when nothing remains).
    """
    text = ' '.join(ingredients).lower().replace('-', ' ')
    # Clean the whole string before splitting so that any residue left next
    # to a removed number/unit becomes its own token for the length filter.
    text = _DECIMAL_RE.sub(' ', text)
    text = _UNIT_RE.sub(' ', text)

    words = []
    for word in text.split():
        # Very short tokens ("1", "&", "of", ...) carry no signal here.
        if len(word) <= 2:
            continue
        word = unidecode.unidecode(word)
        word = lemmatizer.lemmatize(word)
        if word:
            words.append(word)
    return ' '.join(words)

In [None]:
# Spot-check preprocess_ing against hand-written expectations.
for ingredient, expected in [
    ('Eggs', 'egg'),
    ('all-purpose flour', 'all purpose flour'),
    ('purée', 'puree'),
    ('1% low-fat milk', 'low fat milk'),
    ('half & half', 'half half'),
    ('safetida (powder)', 'safetida (powder)')
]:
    actual = preprocess_ing([ingredient])
    print(actual)
    # Actually compare against the expectation so a regression in the
    # preprocessing stops the notebook instead of passing unnoticed.
    assert actual == expected, '{!r}: expected {!r}, got {!r}'.format(
        ingredient, expected, actual)

In [None]:
# Add the cleaned, space-joined ingredient text as column 'x' on both frames.
for frame in (train, test):
    frame['x'] = frame['ingredients'].apply(preprocess_ing)
train.head()

In [None]:
# train['x'] holds one space-joined string per recipe, so split each string
# into its words; iterating over the string itself would yield single
# characters rather than ingredient words.
raw_ingredients = [token for recipe in train['x'] for token in recipe.split()]
len(raw_ingredients)

In [None]:
# Verify that no measurement markers survive in the cleaned text.
units = ['inch ', 'oz.', ' lb.', 'ounc', '%', 'oz ']
# Scan the words of the cleaned recipe strings in train['x']. Each word is
# padded with spaces so markers that include a leading/trailing space
# ('inch ', ' lb.', 'oz ') can still match at a word edge. sorted() keeps
# the listing stable between runs.
sorted({
    token
    for recipe in train['x']
    for token in recipe.split()
    if any(unit in ' ' + token + ' ' for unit in units)
})

In [None]:
# TF-IDF features over the cleaned ingredient text; sublinear_tf=True is
# scikit-learn's option for replacing raw term frequency with 1 + log(tf).
vectorizer = TfidfVectorizer(sublinear_tf=True)

In [None]:
# Learn the vocabulary and IDF weights from the training text only.
X_train = vectorizer.fit_transform(train['x'].values)
# Sort the matrix indices in place (presumably X_train is a SciPy sparse
# matrix, as TfidfVectorizer normally returns — confirm if this matters).
X_train.sort_indices()
# Encode the test text with the already-fitted vectorizer (no refitting).
X_test = vectorizer.transform(test['x'].values)

In [None]:
# Map cuisine names to integer class ids; the fitted encoder is kept so
# predictions can be mapped back to names with inverse_transform.
label_encoder = LabelEncoder()
Y_train = label_encoder.fit_transform(train['cuisine'].values)

In [None]:
# Hold out 20% of the training rows for validation; random_state fixes the split.
# X / Y are the fitting portion, X_val / y_val the held-out portion.
X, X_val, Y, y_val = train_test_split(X_train, Y_train, test_size=0.2, random_state=1)

In [None]:
# Base estimator: support vector classifier with an RBF kernel and a fixed seed.
classifier = SVC(kernel='rbf',random_state=0)

In [None]:
# Wrap the SVC in a one-vs-rest scheme (n_jobs=4 requests four parallel jobs)
# and fit it on the training portion of the split.
model = OneVsRestClassifier(classifier, n_jobs=4)
model.fit(X, Y)

In [None]:
# Predict on the held-out validation split (X_val), not the competition test
# set, so the progress message says so. Y_test holds the predicted class ids
# and Y_pred the corresponding cuisine names.
print ("Predict on validation data ... ")
Y_test = model.predict(X_val)
Y_pred = label_encoder.inverse_transform(Y_test)

In [None]:
# Accuracy on the validation split: y_val are the true class ids and Y_test
# holds the model's predictions for X_val (despite its name).
print("Accuracy Score:", accuracy_score(y_val, Y_test))

In [None]:
# First 20 predicted cuisine names for the validation split.
Y_pred[:20]

In [None]:
# Predict class ids for the competition test set and map them back to
# cuisine names with the fitted label encoder.
y_test = model.predict(X_test)
y_pred = label_encoder.inverse_transform(y_test)

In [None]:
# Build the submission table (recipe id + predicted cuisine name) and write
# it to submission.csv without the DataFrame index.
test_id = test['id']
sub = pd.DataFrame({'id': test_id, 'cuisine': y_pred}, columns=['id', 'cuisine'])
sub.to_csv('submission.csv', index=False)