In [74]:
import re
import matplotlib
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from nltk.stem import WordNetLemmatizer
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, classification_report, confusion_matrix, hamming_loss
from sklearn.multiclass import OneVsRestClassifier
from sklearn.multiclass import OneVsOneClassifier
from sklearn.svm import LinearSVC
from sklearn.pipeline import Pipeline
from sklearn import preprocessing
from sklearn.feature_extraction.text import HashingVectorizer
from sklearn.feature_extraction.text import TfidfTransformer

In [75]:
# Read the raw recipe sets from JSON: training frame first, then the test frame.
df, df_t = (
    pd.read_json(raw_json)
    for raw_json in ('../data/raw/train.json', '../data/raw/test.json')
)

In [76]:
# Show the ingredient entry stored under row label 0.
df.loc[0, 'ingredients']

['romaine lettuce',
 'black olives',
 'grape tomatoes',
 'garlic',
 'pepper',
 'purple onion',
 'seasoning',
 'garbanzo beans',
 'feta cheese crumbles']

In [77]:
def ingre_strip(ingre):
    """Collapse one recipe's ingredient list into a single normalised string.

    Every ingredient has its spaces and hyphens removed so that a multi-word
    ingredient becomes one token (e.g. 'romaine lettuce' -> 'romainelettuce').
    Digits and '%' characters are then dropped, the token is lower-cased and
    passed through ``WordNetLemmatizer.lemmatize``.  The resulting tokens are
    joined with single spaces, in input order.

    Parameters
    ----------
    ingre : iterable of str
        Ingredient names for one recipe.  The argument is NOT modified, so the
        caller's raw data stays intact.

    Returns
    -------
    str
        The space-separated ingredient tokens.

    Raises
    ------
    TypeError
        If ``ingre`` is itself a string (for example when the cleaning cell is
        re-run on a column that has already been converted), because iterating
        a string would silently treat every character as an ingredient.
    """
    if isinstance(ingre, str):
        raise TypeError(
            "ingre_strip expects a list of ingredient names, got a single string"
        )

    # One lemmatizer per call instead of one per word.
    lemmatizer = WordNetLemmatizer()

    tokens = []
    for ingredient in ingre:
        # Glue the words of one ingredient together so it forms a single token.
        token = ingredient.replace(" ", "").replace("-", "")
        # Strip quantities / percentages such as "2%" that carry no cuisine signal.
        token = re.sub(r'\d+', '', token).replace('%', '')
        tokens.append(lemmatizer.lemmatize(token.lower()))
    return ' '.join(tokens)

# Replace the ingredient lists of both frames with their normalised strings.
df['ingredients'] = df['ingredients'].apply(ingre_strip)
ingre_new = df_t['ingredients'].apply(ingre_strip)
df_t['ingredients'] = ingre_new

# Only a cell's final expression is rendered, so preview the training frame once.
df.head()

Unnamed: 0,cuisine,id,ingredients
0,greek,10259,romainelettuce blackolives grapetomatoes garli...
1,southern_us,25693,plainflour groundpepper salt tomato groundblac...
2,filipino,20130,egg pepper salt mayonaise cookingoil greenchil...
3,indian,22213,water vegetableoil wheat salt
4,indian,13162,blackpepper shallot cornflour cayennepepper on...


In [78]:
# X is the whole cleaned training frame; y is its cuisine label column.
X, y = df, df['cuisine']


In [79]:
# Hold out 20% of the rows for evaluation; random_state pins the shuffle.
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0,
)

In [80]:
# Remove the label column from the held-out feature frame.
X_test = X_test.drop(columns='cuisine')

In [81]:
# Keep only the ingredient text as the held-out feature.
X_test = X_test.loc[:, 'ingredients']

In [82]:
# Convert the held-out ingredient strings to a NumPy array and display it.
X_test = np.array(X_test)
X_test

array(['greenchile pepper bonelesschickenbreast chilipowder reducedfatsourcream redbellpepper groundcumin whiteonion garlicpowder jalapenochilies tomatillo salt driedoregano chickenbroth whitecannellinibeans mexicancheeseblend flour dicedtomatoes enchiladasauce chorizosausage whitecorn freshcilantro flourtortillas parsley garlic oregano',
       'parsleysprigs radish seasalt pozole chickenstock whiteonion tomatillo garliccloves canolaoil bonelessporkshoulder pork shreddedcabbage romainelettuceleaves driedoregano serranochilies lime epazote greenpumpkinseeds',
       'honey garliccloves strongwhitebreadflour vegetableoil yoghurt yeast water salt',
       ...,
       'sesameseeds garlic cucumber sugar greenonions ricevinegar hardboiledegg buckwheatnoodles toastedsesameoil soysauce redpepper gochujangbase',
       'chickenbroth pepper sesameoil frozenpeas soysauce bonelessskinlesschickenbreasts salt egg garlicpowder ginger whiteonion chilipowder cookedwhiterice',
       'marinarasauce fre

In [83]:
# Convert the held-out cuisine labels to a NumPy array and display it.
y_test = np.array(y_test)
y_test

array(['mexican', 'mexican', 'indian', ..., 'korean', 'chinese',
       'italian'], dtype=object)

In [84]:
# Training features: only the ingredient strings, as a NumPy array.
X_train = np.array(X_train["ingredients"])
X_train

array(['blackbeans russetpotatoes greenonions redenchiladasauce jalapenochilies frozencornkernels cheddarcheese vegetableoil',
       'freshcilantro shallot garliccloves chickenbroth oliveoil salt groundturkey lime purpleonion freshmint lemongrass freshginger freshlygroundpepper asianfishsauce',
       'baguette cucumber mayonaise salt choppedcilantrofresh pepper porkloinchops chilesauce purpleonion freshlimejuice',
       ...,
       'greenbellpepper broccoliflorets shrimp tomato sundriedtomatoes purpleonion driedoregano oliveoil garlic freshparsley freshbasil artichokehearts pennepasta',
       'lowsodiumtomatopaste italianseasoning tomatosauce leangroundbeef oliveoil shreddedmozzarellacheese frozenchoppedspinach refrigeratedpizzadough',
       'buttermilk allpurposeflour bacon frozenwholekernelcorn'],
      dtype=object)

In [85]:
# Convert the training cuisine labels to a NumPy array and display it.
y_train = np.array(y_train)
y_train

array(['mexican', 'vietnamese', 'vietnamese', ..., 'italian', 'italian',
       'southern_us'], dtype=object)

In [89]:
# Pipeline 1: hashed token features -> TF-IDF weighting -> one-vs-one linear SVMs.
# The model predicts on the same data it was fitted on, so `predicted` holds
# training-set predictions here.
classifier = Pipeline(steps=[
    ('vectorizer', HashingVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', OneVsOneClassifier(
        LinearSVC(
            penalty='l2',
            loss='squared_hinge',
            dual=True,
            tol=1e-05,
            C=500,
            multi_class='ovr',
            fit_intercept=True,
            intercept_scaling=1,
            class_weight=None,
            verbose=0,
            random_state=1,
            max_iter=100000,
        )
    )),
])
classifier.fit(X_train, y_train)
predicted = classifier.predict(X_train)
predicted

array(['mexican', 'vietnamese', 'vietnamese', ..., 'italian', 'italian',
       'southern_us'], dtype=object)

In [90]:
# Training-set scores, one per line: accuracy, weighted F1, weighted precision.
# (classification_report / confusion_matrix are imported if a per-class view is needed.)
print(
    accuracy_score(y_train, predicted),
    f1_score(y_train, predicted, average='weighted'),
    precision_score(y_train, predicted, average='weighted'),
    sep='\n',
)

0.9946258524780791
0.9946175943531963
0.9946141921090098


In [91]:
# Pipeline 2: raw token counts -> TF-IDF weighting -> one-vs-one linear SVMs.
# Fitted on the training split and used to predict the held-out split, so
# `predicted` now holds test-set predictions.
classifier = Pipeline(steps=[
    ('vectorizer', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', OneVsOneClassifier(
        LinearSVC(
            penalty='l2',
            loss='squared_hinge',
            dual=True,
            tol=1e-05,
            C=500,
            multi_class='ovr',
            fit_intercept=True,
            intercept_scaling=1,
            class_weight=None,
            verbose=0,
            random_state=1,
            max_iter=100000,
        )
    )),
])
classifier.fit(X_train, y_train)
predicted = classifier.predict(X_test)
predicted

array(['mexican', 'mexican', 'indian', ..., 'korean', 'chinese',
       'italian'], dtype=object)

In [92]:
# Held-out scores, one per line: accuracy, weighted F1, weighted precision and
# Hamming loss (for these single-label predictions the printed loss is 1 - accuracy).
print(
    accuracy_score(y_test, predicted),
    f1_score(y_test, predicted, average='weighted'),
    precision_score(y_test, predicted, average='weighted'),
    hamming_loss(y_test, predicted),
    sep='\n',
)

0.7367693274670019
0.7343482762383352
0.7346694614267667
0.2632306725329981


In [43]:
# Display the most recent predictions.
# NOTE(review): this cell's execution count (43) is lower than the cells above it,
# so it was not run in top-to-bottom order -- re-run after the cells above to refresh.
predicted

array(['mexican', 'mexican', 'indian', ..., 'korean', 'chinese',
       'italian'], dtype=object)