# My own model for Kaggle: What's cooking (SamLi)

In [1]:
import pandas as pd
import numpy as np

## 1: Read the data & try to find some features without using CountVectorizer firstly

In [2]:
cooking = pd.read_json("C:/Users/lijin/Desktop/ML-text-main/data/train.json")

In [3]:
cooking.head()

Unnamed: 0,id,cuisine,ingredients
0,10259,greek,"[romaine lettuce, black olives, grape tomatoes..."
1,25693,southern_us,"[plain flour, ground pepper, salt, tomatoes, g..."
2,20130,filipino,"[eggs, pepper, salt, mayonaise, cooking oil, g..."
3,22213,indian,"[water, vegetable oil, wheat, salt]"
4,13162,indian,"[black pepper, shallots, cornflour, cayenne pe..."


- Literally, we can find there are 3 types of column
  - for prediction, it's easily to find "id" column is useless
  - cuisine is the target output type
  - ingredients column is what we need
    - However, the question now is how to use the ingredients column to create some useful features?

In [4]:
#what about the length of the ingredients column
cooking.ingredients.apply(len) #即观察有多少种原材料

0         9
1        11
2        12
3         4
4        20
         ..
39769    12
39770     7
39771    12
39772    21
39773    12
Name: ingredients, Length: 39774, dtype: int64

In [5]:
#what about the ingredients_length?
cooking.ingredients.apply(lambda x: np.mean([len(item) for item in x])) #这里所求的是平均原材料所包含的英文字母个数的长度（包括空格）

0        12.000000
1        10.090909
2        10.333333
3         6.750000
4        10.100000
           ...    
39769    12.166667
39770    17.000000
39771     8.250000
39772    13.142857
39773    12.000000
Name: ingredients, Length: 39774, dtype: float64

In [6]:
#what about the ingredients_length?
cooking.ingredients.apply(lambda x: np.sum([len(item) for item in x])) #这里所求的是原材料所包含的英文字母个数的总长度

0        108
1        111
2        124
3         27
4        202
        ... 
39769    146
39770    119
39771     99
39772    276
39773    144
Name: ingredients, Length: 39774, dtype: int32

In [7]:
#当然，为了用CountVectorizer, 我们还应该将list类型的ingredients列中的值，转化为string类型
print(type(cooking.ingredients[1]))
print(type(cooking.ingredients.astype(str)[1]))

<class 'list'>
<class 'str'>


- 我们创建4列以包含我们上述所需要的四个新列
  - 这里使用定义函数的方法（为了使读入test_data时不再写四遍定义参数）

In [8]:
def make_features(df):
    df['num_ingredients'] = df.ingredients.apply(len) #长度
    df['ingredients_length_mean'] = df.ingredients.apply(lambda x: np.mean([len(item) for item in x]))
    df['ingredients_length_sum'] = df.ingredients.apply(lambda x: np.sum([len(item) for item in x])) 
    df['ingredients_str'] = df.ingredients.astype(str) #preprcessing of CountVectorizer()
    
    return df

In [9]:
#在读入文件的过程中，直接把四列创建完成
train = make_features(pd.read_json("C:/Users/lijin/Desktop/ML-text-main/data/train.json"))
train.head()

Unnamed: 0,id,cuisine,ingredients,num_ingredients,ingredients_length_mean,ingredients_length_sum,ingredients_str
0,10259,greek,"[romaine lettuce, black olives, grape tomatoes...",9,12.0,108,"['romaine lettuce', 'black olives', 'grape tom..."
1,25693,southern_us,"[plain flour, ground pepper, salt, tomatoes, g...",11,10.090909,111,"['plain flour', 'ground pepper', 'salt', 'toma..."
2,20130,filipino,"[eggs, pepper, salt, mayonaise, cooking oil, g...",12,10.333333,124,"['eggs', 'pepper', 'salt', 'mayonaise', 'cooki..."
3,22213,indian,"[water, vegetable oil, wheat, salt]",4,6.75,27,"['water', 'vegetable oil', 'wheat', 'salt']"
4,13162,indian,"[black pepper, shallots, cornflour, cayenne pe...",20,10.1,202,"['black pepper', 'shallots', 'cornflour', 'cay..."


In [10]:
y = train.cuisine
y

0              greek
1        southern_us
2           filipino
3             indian
4             indian
            ...     
39769          irish
39770        italian
39771          irish
39772        chinese
39773        mexican
Name: cuisine, Length: 39774, dtype: object

In [11]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression #为了提高准确率，这里直接上svm分类器

In [12]:
vect = CountVectorizer(token_pattern=r"'([a-z ]+)'") #token_pattern=... 表明CountVectorizer如何提取字符，用正则形式表达出来。
lg = LogisticRegression()

- 这里表明当出现如“romaine lettuce”类似这样的ingredients type时，提取“romaine lettuce”为一整体token，而非"romaine"一个,"lettuce"一个

In [13]:
#读入待预测数据
new = make_features(pd.read_json("C:/Users/lijin/Desktop/ML-text-main/data/test.json"))

In [14]:
new.head()

Unnamed: 0,id,ingredients,num_ingredients,ingredients_length_mean,ingredients_length_sum,ingredients_str
0,18009,"[baking powder, eggs, all-purpose flour, raisi...",6,9.333333,56,"['baking powder', 'eggs', 'all-purpose flour',..."
1,28583,"[sugar, egg yolks, corn starch, cream of tarta...",11,10.272727,113,"['sugar', 'egg yolks', 'corn starch', 'cream o..."
2,41580,"[sausage links, fennel bulb, fronds, olive oil...",6,9.666667,58,"['sausage links', 'fennel bulb', 'fronds', 'ol..."
3,29752,"[meat cuts, file powder, smoked sausage, okra,...",21,12.0,252,"['meat cuts', 'file powder', 'smoked sausage',..."
4,35687,"[ground black pepper, salt, sausage casings, l...",8,13.0,104,"['ground black pepper', 'salt', 'sausage casin..."


## 2: Create a pipeline

In [15]:
#这步后来我发现有点多余了，之前定义过一遍y了orz，写的不好
X = train.ingredients_str
y = train.cuisine

In [16]:
from sklearn.pipeline import make_pipeline

In [17]:
pipe = make_pipeline(vect, lg)

In [18]:
pipe.steps

[('countvectorizer', CountVectorizer(token_pattern="'([a-z ]+)'")),
 ('logisticregression', LogisticRegression())]

### 先用 cross_validation 测试一下

In [19]:
from sklearn.model_selection import cross_val_score
cross_val_score(pipe, X, y, cv=5, scoring='accuracy').mean()

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

0.7743250623833745

- lg表现还是不错的，牛！

## 3: 把我们第一步找的那四个变量加到pipeline里面

In [20]:
from sklearn.compose import make_column_transformer

In [21]:
# vectorize 1 column, passthrough 3 columns, and drop the remaining columns
ct = make_column_transformer(
    (vect, 'ingredients_str'), #这列要使用CountVectorizer()
    ('passthrough', ['num_ingredients','ingredients_length_mean','ingredients_length_sum']), #这些列直接保留就好
    remainder = 'drop' #其它列删除
    )

In [22]:
# create the FEATURE MATRIX from the DataFrame
X_dtm_manual = ct.fit_transform(train)
X_dtm_manual.shape

(39774, 6253)

### 在此基础上，尝试cross_validation

In [27]:
pipe = make_pipeline(ct, lg)

In [28]:
pipe.steps

[('columntransformer',
  ColumnTransformer(transformers=[('countvectorizer',
                                   CountVectorizer(token_pattern="'([a-z ]+)'"),
                                   'ingredients_str'),
                                  ('passthrough', 'passthrough',
                                   ['num_ingredients', 'ingredients_length_mean',
                                    'ingredients_length_sum'])])),
 ('logisticregression', LogisticRegression())]

- 纪念一下，我nm第一次写成pipe = (ct,svc) 半天愣是没发现哪里有错orz

In [29]:
import warnings
warnings.filterwarnings("ignore") #csdn说上述警告可以直接忽略，我也就直接忽略了，6
cross_val_score(pipe, train, y, cv=5, scoring='accuracy').mean() #注意，这里是train，而非X

0.4001614310569875

- 啊，发生甚么事了，准确度骤然下降

In [31]:
#尝试保留部分特征
ct = make_column_transformer(
    (vect, 'ingredients_str'), #这列要使用CountVectorizer()
    ('passthrough', ['num_ingredients','ingredients_length_mean']), #这些列直接保留就好
    remainder = 'drop' #其它列删除
    )

In [32]:
X_dtm_manual = ct.fit_transform(train)
X_dtm_manual.shape

(39774, 6252)

In [34]:
pipe = make_pipeline(ct, lg)
pipe.steps

[('columntransformer',
  ColumnTransformer(transformers=[('countvectorizer',
                                   CountVectorizer(token_pattern="'([a-z ]+)'"),
                                   'ingredients_str'),
                                  ('passthrough', 'passthrough',
                                   ['num_ingredients',
                                    'ingredients_length_mean'])])),
 ('logisticregression', LogisticRegression())]

In [35]:
import warnings
warnings.filterwarnings("ignore")
cross_val_score(pipe, train, y, cv=5, scoring='accuracy').mean() #注意，这里是train，而非X

0.6943732590617293

- 好些了，那我们就把'ingredients_length_sum'这个features删了，不要了

In [37]:
#再尝试保留部分特征
ct = make_column_transformer(
    (vect, 'ingredients_str'), #这列要使用CountVectorizer()
    ('passthrough', ['num_ingredients']), #这些列直接保留就好
    remainder = 'drop' #其它列删除
    )

In [38]:
pipe = make_pipeline(ct, lg)
pipe.steps

[('columntransformer',
  ColumnTransformer(transformers=[('countvectorizer',
                                   CountVectorizer(token_pattern="'([a-z ]+)'"),
                                   'ingredients_str'),
                                  ('passthrough', 'passthrough',
                                   ['num_ingredients'])])),
 ('logisticregression', LogisticRegression())]

In [39]:
X_dtm_manual = ct.fit_transform(train)
X_dtm_manual.shape

(39774, 6251)

In [40]:
import warnings
warnings.filterwarnings("ignore")
cross_val_score(pipe, train, y, cv=5, scoring='accuracy').mean() #注意，这里是train，而非X

0.7348521155664555

- 我放弃抵抗了，第一步找的3个其它特征全是“飞舞”，直接就用vect + lg就行了，也许在别的分类器下这3个特征还是有可能有用的。我估计美赛当时就是这么寄的

In [41]:
# vectorize 1 column, passthrough 3 columns, and drop the remaining columns
ct = make_column_transformer(
    (vect, 'ingredients_str'),
    remainder = 'drop'
    )

In [42]:
X_dtm_manual = ct.fit_transform(train)
X_dtm_manual.shape

(39774, 6250)

In [43]:
pipe = make_pipeline(ct, lg)
pipe.steps

[('columntransformer',
  ColumnTransformer(transformers=[('countvectorizer',
                                   CountVectorizer(token_pattern="'([a-z ]+)'"),
                                   'ingredients_str')])),
 ('logisticregression', LogisticRegression())]

In [44]:
import warnings
warnings.filterwarnings("ignore")
cross_val_score(pipe, train, y, cv=5, scoring='accuracy').mean() #注意，这里是train，而非X

0.7743250623833745

### GridSearchCV to find the best params

In [45]:
pipe.steps

[('columntransformer',
  ColumnTransformer(transformers=[('countvectorizer',
                                   CountVectorizer(token_pattern="'([a-z ]+)'"),
                                   'ingredients_str')])),
 ('logisticregression', LogisticRegression())]

In [48]:
from sklearn.model_selection import GridSearchCV

In [49]:
param_grid = {}
param_grid['columntransformer__countvectorizer__token_pattern'] = [r'\b\w\w+\b',r"'([a-z ]+)'"] 
param_grid['logisticregression__solver'] = ['liblinear','newton-cg','lbfgs','sag']
param_grid['logisticregression__C'] = [0.5,1]
param_grid

{'columntransformer__countvectorizer__token_pattern': ['\\b\\w\\w+\\b',
  "'([a-z ]+)'"],
 'logisticregression__solver': ['liblinear', 'newton-cg', 'lbfgs', 'sag'],
 'logisticregression__C': [0.5, 1]}

In [50]:
grid = GridSearchCV(pipe, param_grid, cv=5, scoring='accuracy')

In [51]:
%time grid.fit(train, y)

Wall time: 10min 17s


GridSearchCV(cv=5,
             estimator=Pipeline(steps=[('columntransformer',
                                        ColumnTransformer(transformers=[('countvectorizer',
                                                                         CountVectorizer(token_pattern="'([a-z "
                                                                                                       "]+)'"),
                                                                         'ingredients_str')])),
                                       ('logisticregression',
                                        LogisticRegression())]),
             param_grid={'columntransformer__countvectorizer__token_pattern': ['\\b\\w\\w+\\b',
                                                                               "'([a-z "
                                                                               "]+)'"],
                         'logisticregression__C': [0.5, 1],
                         'logisticregression__s

In [52]:
print(grid.best_score_)
print(grid.best_params_)

0.7859909722892805
{'columntransformer__countvectorizer__token_pattern': '\\b\\w\\w+\\b', 'logisticregression__C': 1, 'logisticregression__solver': 'liblinear'}


- 出乎意料的结果

### RandomizedSearchCV to find the best params
  - 因为C可以为大于0的浮点数，所以尝试使用RandomizedSearchCV

In [53]:
from sklearn.model_selection import RandomizedSearchCV

In [68]:
import scipy as sp
param_grid = {}
#这里就不调solver参数了，多加了个CountVectorizer()中的参数，min_df
param_grid['columntransformer__countvectorizer__token_pattern'] = [r"\b\w\w+\b", r"'([a-z ]+)'"]
param_grid['columntransformer__countvectorizer__min_df'] = [1, 2, 3]
param_grid['logisticregression__C'] = sp.stats.uniform(scale=1)
param_grid

{'columntransformer__countvectorizer__token_pattern': ['\\b\\w\\w+\\b',
  "'([a-z ]+)'"],
 'columntransformer__countvectorizer__min_df': [1, 2, 3],
 'logisticregression__C': <scipy.stats._distn_infrastructure.rv_continuous_frozen at 0x2c4c3cd3730>}

- scipy.stats.uniform(scale=1) 是 Scipy 中用于表示均匀分布的对象，它表示在指定的区间内所有数值出现的概率相等的概率分布。该函数的参数 scale 表示分布的区间长度，即分布的上限和下限之差。

In [69]:
np.random.seed(1)

In [70]:
rand = RandomizedSearchCV(pipe, param_grid, cv=5, scoring='accuracy', n_iter=5, random_state=1)

In [71]:
pipe.steps

[('columntransformer',
  ColumnTransformer(transformers=[('countvectorizer',
                                   CountVectorizer(token_pattern="'([a-z ]+)'"),
                                   'ingredients_str')])),
 ('logisticregression', LogisticRegression())]

In [72]:
%time rand.fit(train, y)

Wall time: 2min 42s


RandomizedSearchCV(cv=5,
                   estimator=Pipeline(steps=[('columntransformer',
                                              ColumnTransformer(transformers=[('countvectorizer',
                                                                               CountVectorizer(token_pattern="'([a-z "
                                                                                                             "]+)'"),
                                                                               'ingredients_str')])),
                                             ('logisticregression',
                                              LogisticRegression())]),
                   n_iter=5,
                   param_distributions={'columntransformer__countvectorizer__min_df': [1,
                                                                                       2,
                                                                                       3],
                                

In [73]:
results = pd.DataFrame(rand.cv_results_)
results[['mean_test_score', 'std_test_score', 'params']]

Unnamed: 0,mean_test_score,std_test_score,params
0,0.772867,0.004583,{'columntransformer__countvectorizer__min_df':...
1,0.77337,0.004474,{'columntransformer__countvectorizer__min_df':...
2,0.783125,0.004872,{'columntransformer__countvectorizer__min_df':...
3,0.782722,0.004799,{'columntransformer__countvectorizer__min_df':...
4,0.782974,0.004323,{'columntransformer__countvectorizer__min_df':...


In [74]:
print(rand.best_score_)
print(rand.best_params_)

0.7831247365627025
{'columntransformer__countvectorizer__min_df': 1, 'columntransformer__countvectorizer__token_pattern': '\\b\\w\\w+\\b', 'logisticregression__C': 0.3965807272960261}


- emmmm准确率不如gridsearch中的params，所以暂且就用gridsearch中的参数作为最终结果吧

## 4：Making new predictions!

In [75]:
X_new = new

In [76]:
grid.best_estimator_ 

Pipeline(steps=[('columntransformer',
                 ColumnTransformer(transformers=[('countvectorizer',
                                                  CountVectorizer(token_pattern='\\b\\w\\w+\\b'),
                                                  'ingredients_str')])),
                ('logisticregression',
                 LogisticRegression(C=1, solver='liblinear'))])

In [77]:
new_pred_class_rand = grid.predict(X_new)

In [78]:
pd.DataFrame({'id':new.id,'cuisine':new_pred_class_rand}).set_index('id').to_csv('sub_samli_01.csv')