# Build Classification Models

In [14]:
import pandas as pd

# Read the cleaned cuisines table from the data directory next to this
# notebook's folder, then preview its first rows.
cuisines_df = pd.read_csv(filepath_or_buffer='../data/cleaned_cuisines.csv')
cuisines_df.head()

Unnamed: 0.1,Unnamed: 0,cuisine,almond,angelica,anise,anise_seed,apple,apple_brandy,apricot,armagnac,...,whiskey,white_bread,white_wine,whole_grain_wheat_flour,wine,wood,yam,yeast,yogurt,zucchini
0,0,indian,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,1,indian,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,2,indian,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,3,indian,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,4,indian,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0


In [15]:
# Target: the 'cuisine' column on its own.
cuisines_label_df = cuisines_df.loc[:, 'cuisine']

# Features: every remaining column except the label and the 'Unnamed: 0'
# column carried over from the CSV.
cuisines_feature_df = cuisines_df.drop(columns=['cuisine', 'Unnamed: 0'])

# Show the first rows of the labels, then of the features.
for preview in (cuisines_label_df.head(), cuisines_feature_df.head()):
    print(preview)

0    indian
1    indian
2    indian
3    indian
4    indian
Name: cuisine, dtype: object
   almond  angelica  anise  anise_seed  apple  apple_brandy  apricot  \
0       0         0      0           0      0             0        0   
1       1         0      0           0      0             0        0   
2       0         0      0           0      0             0        0   
3       0         0      0           0      0             0        0   
4       0         0      0           0      0             0        0   

   armagnac  artemisia  artichoke  ...  whiskey  white_bread  white_wine  \
0         0          0          0  ...        0            0           0   
1         0          0          0  ...        0            0           0   
2         0          0          0  ...        0            0           0   
3         0          0          0  ...        0            0           0   
4         0          0          0  ...        0            0           0   

   whole_grain_wheat_

In [16]:
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, precision_score, confusion_matrix
from sklearn.svm import SVC
import numpy as np

In [17]:
# Hold out 20% of the rows for evaluation.
# random_state pins the shuffle so the split is the same on every run;
# without it the reported accuracy and the row found at X_test.iloc[50]
# in later cells change each time the notebook is re-executed.
X_train, X_test, y_train, y_test = train_test_split(
    cuisines_feature_df,
    cuisines_label_df,
    test_size=0.2,
    random_state=42,
)

In [9]:
# Logistic regression configured for one-vs-rest with the liblinear solver.
lr = LogisticRegression(
    solver='liblinear',
    multi_class='ovr',
)

# Labels are flattened with np.ravel before fitting.
model = lr.fit(X_train, np.ravel(y_train))

# Score the fitted model on the held-out split and report it.
accuracy = model.score(X_test, y_test)
print("Accuracy is {}".format(accuracy))

Accuracy is 0.7972465581977471


In [18]:
# Look at one held-out recipe (position 50): list the columns whose value
# is non-zero, then show the cuisine label at the same position.
sample_row = X_test.iloc[50]
sample_ingredients = sample_row[sample_row != 0].keys()
print(f'ingredients: {sample_ingredients}')
print(f'cuisine: {y_test.iloc[50]}')

ingredients: Index(['cayenne', 'chickpea', 'coconut', 'coriander', 'cumin', 'fenugreek',
       'onion', 'pea', 'pepper', 'potato', 'tomato', 'turmeric',
       'vegetable_oil'],
      dtype='object')
cuisine: indian


In [26]:
# Select position 50 with a list indexer so the sample stays a one-row
# DataFrame. Converting to a bare NumPy array (.values.reshape(1, -1))
# drops the column names, and a model fitted on a DataFrame then warns
# that "X does not have valid feature names" on recent scikit-learn.
test = X_test.iloc[[50]]

# Per-class probabilities for that single sample, labelled by class name.
proba = model.predict_proba(test)
classes = model.classes_
resultdf = pd.DataFrame(data=proba, columns=classes)

# Transpose so each class is a row (single column 0 = the sample), then
# sort so the most probable class comes first.
topPrediction = resultdf.T.sort_values(by=[0], ascending=[False])
topPrediction.head()

Unnamed: 0,0
indian,0.998178
thai,0.001324
korean,0.000392
chinese,8.6e-05
japanese,1.9e-05


In [27]:
# Predict a label for every held-out row and print the per-class
# precision / recall / f1 summary against the true labels.
y_pred = model.predict(X_test)
report = classification_report(y_test, y_pred)
print(report)

              precision    recall  f1-score   support

     chinese       0.74      0.71      0.72       147
      indian       0.91      0.91      0.91       171
    japanese       0.76      0.78      0.77       166
      korean       0.85      0.78      0.81       149
        thai       0.81      0.89      0.85       166

    accuracy                           0.82       799
   macro avg       0.81      0.81      0.81       799
weighted avg       0.82      0.82      0.82       799

