In [2]:
%matplotlib inline

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import zipfile
from scipy import stats
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import CategoricalNB, MultinomialNB
from sklearn.model_selection import train_test_split

train_data = zipfile.ZipFile('train.json.zip','r')
train_data = pd.read_json(train_data.read('train.json'))
test_data = zipfile.ZipFile('test.json.zip','r')
test_data = pd.read_json(test_data.read('test.json'))

In [2]:
categories = {c for cs in train_data['ingredients'] for c in cs}

test_categories = {c for cs in test_data['ingredients'] for c in cs}

df = pd.DataFrame(dtype='u1', index=range(len(train_data['ingredients'])), columns=categories)
df.fillna(0, inplace=True)
df['cuisine'] = train_data['cuisine']

In [5]:
for i, row in train_data['ingredients'].iteritems():
    df.loc[i, row] = 1

dd = df.groupby(['cuisine']).sum()

col_sums = dd.sum()

In [150]:
def learn(learn, test=None):
    cols = learn.drop(['cuisine'], axis=1).columns

    if test is None:
        x_train, test_x, y_train, test_y = train_test_split(learn[cols],
                                                            learn['cuisine'],
                                                            random_state=42,
                                                            stratify=learn['cuisine'])
    else:
        x_train, test_x, y_train, test_y = learn[cols], test[cols], learn['cuisine'], None

    logit = LogisticRegression()
    logit.fit(x_train, y_train)

    y_pred = logit.predict(test_x)
    if test_y is None:
        return y_pred
    
    return logit.predict(x_train), len(y_pred[y_pred == test_y]) / len(test_y)

Unadulterated training data performance:

In [151]:
learn(df)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


(array(['mexican', 'irish', 'mexican', ..., 'mexican', 'mexican',
        'southern_us'], dtype=object),
 0.7731295253419147)

Can we do better than that?

In [6]:
recipe_complexity = pd.DataFrame({'sum': df[dd.columns].sum(axis=1)})

print(recipe_complexity)

recipe_complexity['cuisine'] = df['cuisine']

len(recipe_complexity[recipe_complexity['sum'] < 20])

        sum
0       9.0
1      11.0
2      12.0
3       4.0
4      20.0
...     ...
39769  12.0
39770   7.0
39771  12.0
39772  21.0
39773  12.0

[39774 rows x 1 columns]


38266

Is there a difference in how short recipes are distributed?

In [7]:
learn(df[recipe_complexity['sum'] < 6])

0.6194503171247357

I wonder if using ingredients as-is is a good idea. Maybe "pork chops" are the same, no matter the brand?

In [8]:
words = {w for ws in train_data['ingredients'] for w in ws for w in w.split()}
len(words) # much fewer traits for sure

3589

How about ingredients that seem to be generic?

In [35]:
worthy_cols = {}
cuisine_groups = {}
for c in col_sums.index:
    cc = dd[c][dd[c] > 0]
    if len(cc) < 2:
        if len(cc) == 1 and cc[0] > 5:
            worthy_cols[c] = 0, 0
        else:
            cuisine_groups[c] = ';'.join(cc.index)
        continue
    s, p = stats.chisquare(cc)
    
    if p > 0.95: # ignore ingredients that are distributed too uniformly across cuisines
        cuisine_groups[c] = ';'.join(cc.index)
        continue
    worthy_cols[c] = s, p
worthy_cols = pd.DataFrame(worthy_cols)

worthy_cols['cuisine'] = 0, 0

In [40]:
len(dd[cuisine_groups.keys()])

20

In [46]:
test_x = df[cuisine_groups.keys()]
cuisine_groups['cuisine'] = ''
test_groups = learn(df[cuisine_groups.keys()], test_x)

test_x = df[worthy_cols.columns].drop('cuisine', axis=1)
test_worthy = learn(df[worthy_cols.columns], test_x)

df['cuisine'][np.logical_and(test_worthy != df['cuisine'], test_groups == df['cuisine'])]

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


14           italian
99           italian
231          italian
238          italian
252      southern_us
            ...     
39333        italian
39398    southern_us
39525        italian
39625        mexican
39682    southern_us
Name: cuisine, Length: 745, dtype: object

In [58]:
df[test_worthy != df['cuisine']].groupby('cuisine').count()['stew'].sort_values()

cuisine
jamaican         87
korean           96
moroccan        101
brazilian       115
filipino        146
russian         160
indian          162
thai            170
vietnamese      195
irish           207
chinese         213
greek           227
british         248
japanese        269
cajun_creole    289
mexican         307
spanish         354
italian         502
southern_us     508
french          609
Name: stew, dtype: int64

In [66]:
df[np.logical_and(test_worthy == 'southern_us', test_worthy != df['cuisine'])].groupby('cuisine').count()['stew'].sort_values()

cuisine
vietnamese        4
moroccan          4
thai              9
korean           10
greek            16
indian           20
japanese         21
jamaican         23
brazilian        23
chinese          23
filipino         27
russian          29
spanish          35
irish            68
british          88
italian          98
mexican         101
french          122
cajun_creole    157
Name: stew, dtype: int64

In [53]:
df.groupby('cuisine').count()['stew'].sort_values()

cuisine
brazilian        467
russian          489
jamaican         526
irish            667
filipino         755
british          804
moroccan         821
vietnamese       825
korean           830
spanish          989
greek           1175
japanese        1423
thai            1539
cajun_creole    1546
french          2646
chinese         2673
indian          3003
southern_us     4320
mexican         6438
italian         7838
Name: stew, dtype: int64

In [59]:
(df[test_worthy != df['cuisine']].groupby('cuisine').count()['stew'] / df.groupby('cuisine').count()['stew'].sort_values()).sort_values()

cuisine
mexican         0.047686
indian          0.053946
italian         0.064047
chinese         0.079686
thai            0.110461
korean          0.115663
southern_us     0.117593
moroccan        0.123021
jamaican        0.165399
cajun_creole    0.186934
japanese        0.189037
greek           0.193191
filipino        0.193377
french          0.230159
vietnamese      0.236364
brazilian       0.246253
british         0.308458
irish           0.310345
russian         0.327198
spanish         0.357937
Name: stew, dtype: float64

ok, we know french, southern_us, mexican, italian get confused a lot, especially french (large error) and italian (large population).

Let's slice them off into separate classifiers.


In [146]:
def learn_french(train, test=None):
    cuisine = train['cuisine']
    train = train.drop('cuisine', axis = 1)

    if test is None:
        x_train, test_x, y_train, test_y = train_test_split(train,
                                                            cuisine,
                                                            random_state=42)
    else:
        x_train, test_x, y_train, test_y = train, test, cuisine, None

    y_train_generic = y_train.copy()
    select = np.logical_or(y_train == 'french', y_train == 'italian'), y_train == 'southern_us' #np.logical_or(y_train == 'southern_us', y_train == 'mexican')
    y_train_generic[np.logical_or(*select)] = 'french'
    #y_train_generic[select[0]] = 'french'

    logit_generic = LogisticRegression()
    logit_generic.fit(x_train, y_train_generic)

    print('Done Generic')

    logit_french = LogisticRegression()
    logit_french.fit(x_train[y_train_generic == 'french'], y_train[y_train_generic == 'french'])

    print('Done French')

    def predict(test_x):
        y_pred = logit_generic.predict(test_x)
        y_pred[y_pred == 'french'] = logit_french.predict(test_x[y_pred == 'french'])
        return y_pred

    y_pred = predict(test_x)

    if test_y is None:
        return y_pred

    return predict(train), len(y_pred[y_pred == test_y]) / len(test_y)

In [103]:
def learn_not_french(train, test=None):
    cuisine = train['cuisine']
    train = train.drop('cuisine', axis = 1)

    if test is None:
        x_train, test_x, y_train, test_y = train_test_split(train,
                                                            cuisine,
                                                            random_state=42)
    else:
        x_train, test_x, y_train, test_y = train, test, cuisine, None

    y_train_french = y_train.copy()
    select = np.logical_and(y_train != 'french', y_train != 'mexican'), np.logical_and(y_train != 'southern_us', y_train != 'italian')
    y_train_french[np.logical_and(*select)] = 'not_french'

    logit_french = LogisticRegression()
    logit_french.fit(x_train, y_train_french)

    logit_generic = LogisticRegression()
    logit_generic.fit(x_train[y_train_french == 'not_french'], y_train[y_train_french == 'not_french'])

    def predict(test_x):
        y_pred = logit_french.predict(test_x)
        y_pred[y_pred == 'not_french'] = logit_generic.predict(test_x[y_pred == 'not_french'])
        return y_pred

    y_pred = predict(test_x)

    if test_y is None:
        return y_pred
    
    return predict(train), len(y_pred[y_pred == test_y]) / len(test_y)

In [147]:
test_french, accuracy = learn_french(df[worthy_cols.columns])
accuracy

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
Done Generic
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
Done French


0.7698109412711183

In [137]:
(df[test_french != df['cuisine']].groupby('cuisine').count()['stew'] - df[test_worthy != df['cuisine']].groupby('cuisine').count()['stew']).sort_values()

cuisine
italian           1
brazilian         7
jamaican         15
korean           21
vietnamese       21
japanese         24
filipino         26
moroccan         28
russian          31
chinese          36
spanish          36
greek            45
indian           45
irish            49
cajun_creole     49
thai             53
mexican          62
british          64
french           95
southern_us     133
Name: stew, dtype: int64

In [15]:
df[worthy_cols.columns]

Unnamed: 0,enokitake,frozen corn kernels,part-skim mozzarella,soda,california chile,condensed cream of celery soup,white bread flour,chile paste,basil,extra large shrimp,...,ale,fried eggs,asparagus tips,sofrito,calvados,field peas,fenugreek seeds,short-grain rice,glace cherries,cuisine
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,greek
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,southern_us
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,filipino
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,indian
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,indian
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
39769,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,irish
39770,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,italian
39771,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,irish
39772,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,chinese


In [133]:
def learn_bayes(learn, test=None):
    cols = learn.drop(['cuisine'], axis=1).columns

    if test is None:
        x_train, test_x, y_train, test_y = train_test_split(learn[cols],
                                                            learn['cuisine'],
                                                            random_state=42)
    else:
        x_train, test_x, y_train, test_y = learn[cols], test[cols], learn['cuisine'], None

    var = x_train.std()
    var = var[var < 1e-15]
    if len(var):
        x_train = x_train.drop(var.index, axis=1)
        test_x = test_x.drop(var.index, axis=1)
    nb = MultinomialNB()
    nb.fit(x_train, y_train)

    y_pred = nb.predict(test_x)
    if test_y is None:
        return y_pred
    
    return nb.predict(x_train), len(y_pred[y_pred == test_y]) / len(test_y)

In [134]:
test_bayes, accuracy = learn_bayes(df)
accuracy

0.7352172164119066

In [20]:
u, s, vh = np.linalg.svd(dd[worthy_cols.columns].drop('cuisine', axis=1))

Can cross-correlation help eliminate "noise"? (reduce overfitting)

In [142]:
corr = dd[col_sums.index].transpose()[['french', 'italian', 'southern_us', 'cajun_creole']].corr()
#for c in col_sums.index:
#    corr.loc[c, c] = 0
corr

cuisine,french,italian,southern_us,cajun_creole
cuisine,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
french,1.0,0.822349,0.849405,0.697334
italian,0.822349,1.0,0.68426,0.712391
southern_us,0.849405,0.68426,1.0,0.710662
cajun_creole,0.697334,0.712391,0.710662,1.0


In [17]:
uncorr_cols = set(col_sums.index)
for c in col_sums.index:
    excess = corr[c] > 0.9
    for c in excess.index:
        if excess[c] and c in uncorr_cols:
            uncorr_cols.remove(c)
uncorr_cols.add('cuisine')

In [18]:
learn(df[uncorr_cols]) # oops, not a good idea after all

ValueError: at least one array or dtype is required

In [6]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.pipeline import Pipeline

def pipeline(train_x, train_y):
    pipe = Pipeline([('count', CountVectorizer(tokenizer=lambda x: x, lowercase=False)),
                     ('freq', TfidfTransformer()),
                     ('logit', LogisticRegression(solver='newton-cg'))])
    pipe.fit(train_x, train_y)
    return pipe

pipe = pipeline(train_data['ingredients'], train_data['cuisine'])
pipe.score(train_data['ingredients'], train_data['cuisine'])

0.8331321969125559

In [162]:
from sklearn.metrics.pairwise import cosine_distances
from sklearn.preprocessing import FunctionTransformer
from sklearn.cluster import DBSCAN

def dbscan(train_x):
    pipe = Pipeline([('count', CountVectorizer(tokenizer=lambda x: x, lowercase=False)),
                     ('cosine_distance', FunctionTransformer(cosine_distances)),
                     ('dbscan', DBSCAN(eps=0.3, min_samples=2))])
    return pipe.fit(train_x)

# dbscan(train_data['ingredients']) - this is insane