In [1]:
import pandas as pd
import numpy as np

## 1) Try to create an Ensembling model
  - KNN + LogisticRegression 

In [2]:
def make_features(df):
    df['num_ingredients'] = df.ingredients.apply(len) #长度
    df['ingredients_length_mean'] = df.ingredients.apply(lambda x: np.mean([len(item) for item in x]))
    df['ingredients_length_sum'] = df.ingredients.apply(lambda x: np.sum([len(item) for item in x])) 
    df['ingredients_str'] = df.ingredients.astype(str)
    
    return df

In [3]:
train = make_features(pd.read_json("C:/Users/lijin/Desktop/ML-text-main/data/train.json"))
train.head()

Unnamed: 0,id,cuisine,ingredients,num_ingredients,ingredients_length_mean,ingredients_length_sum,ingredients_str
0,10259,greek,"[romaine lettuce, black olives, grape tomatoes...",9,12.0,108,"['romaine lettuce', 'black olives', 'grape tom..."
1,25693,southern_us,"[plain flour, ground pepper, salt, tomatoes, g...",11,10.090909,111,"['plain flour', 'ground pepper', 'salt', 'toma..."
2,20130,filipino,"[eggs, pepper, salt, mayonaise, cooking oil, g...",12,10.333333,124,"['eggs', 'pepper', 'salt', 'mayonaise', 'cooki..."
3,22213,indian,"[water, vegetable oil, wheat, salt]",4,6.75,27,"['water', 'vegetable oil', 'wheat', 'salt']"
4,13162,indian,"[black pepper, shallots, cornflour, cayenne pe...",20,10.1,202,"['black pepper', 'shallots', 'cornflour', 'cay..."


In [4]:
y = train.cuisine

In [5]:
new = make_features(pd.read_json("C:/Users/lijin/Desktop/ML-text-main/data/test.json"))
new.head()

Unnamed: 0,id,ingredients,num_ingredients,ingredients_length_mean,ingredients_length_sum,ingredients_str
0,18009,"[baking powder, eggs, all-purpose flour, raisi...",6,9.333333,56,"['baking powder', 'eggs', 'all-purpose flour',..."
1,28583,"[sugar, egg yolks, corn starch, cream of tarta...",11,10.272727,113,"['sugar', 'egg yolks', 'corn starch', 'cream o..."
2,41580,"[sausage links, fennel bulb, fronds, olive oil...",6,9.666667,58,"['sausage links', 'fennel bulb', 'fronds', 'ol..."
3,29752,"[meat cuts, file powder, smoked sausage, okra,...",21,12.0,252,"['meat cuts', 'file powder', 'smoked sausage',..."
4,35687,"[ground black pepper, salt, sausage casings, l...",8,13.0,104,"['ground black pepper', 'salt', 'sausage casin..."


- Use 3 columns for KNN as part 1 of ensembling model and (CountVectorizer + LogisticRegression) as part 2 of ensembling model

### model 1: KNN by using the self-created features

In [6]:
#引入通过ver1 找到的最好的参数
from sklearn.pipeline import make_pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.compose import make_column_transformer
from sklearn.feature_extraction.text import CountVectorizer

In [7]:
vect = CountVectorizer(token_pattern = r"\b\w\w+\b")

In [8]:
ct = make_column_transformer(
    (vect, 'ingredients_str'), #这列要使用CountVectorizer()
    remainder = 'drop' #其它列删除
    )

In [9]:
lg = LogisticRegression(C=1,solver='liblinear')

## 2) Try to create the Ensembling model (VotingClassifier)

### model 1: svm.svc;model 2:random forest;model 3:logistic_regression

In [10]:
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier

In [11]:
svc = SVC(kernel='rbf', C=1.0, gamma='auto',probability=True)
rfc = RandomForestClassifier(max_features=None)

In [12]:
from sklearn.ensemble import VotingClassifier
#vc = VotingClassifier([('clf1', svc),('clf2', rfc),('clf3', lg)])
vc = VotingClassifier([('clf1', svc),('clf2', rfc),('clf3', lg)],voting='soft',weights=(1,1,2))

In [13]:
pipe = make_pipeline(ct,vc)
pipe.steps

[('columntransformer',
  ColumnTransformer(transformers=[('countvectorizer',
                                   CountVectorizer(token_pattern='\\b\\w\\w+\\b'),
                                   'ingredients_str')])),
 ('votingclassifier',
  VotingClassifier(estimators=[('clf1', SVC(gamma='auto', probability=True)),
                               ('clf2',
                                RandomForestClassifier(max_features=None)),
                               ('clf3',
                                LogisticRegression(C=1, solver='liblinear'))],
                   voting='soft', weights=(1, 1, 2)))]

In [14]:
%time pipe.fit(train,y)

Wall time: 59min 38s


Pipeline(steps=[('columntransformer',
                 ColumnTransformer(transformers=[('countvectorizer',
                                                  CountVectorizer(token_pattern='\\b\\w\\w+\\b'),
                                                  'ingredients_str')])),
                ('votingclassifier',
                 VotingClassifier(estimators=[('clf1',
                                               SVC(gamma='auto',
                                                   probability=True)),
                                              ('clf2',
                                               RandomForestClassifier(max_features=None)),
                                              ('clf3',
                                               LogisticRegression(C=1,
                                                                  solver='liblinear'))],
                                  voting='soft', weights=(1, 1, 2)))])

In [19]:
X_new = new
X_new_predict = pipe.predict(X_new)

In [20]:
X_new_predict

array(['southern_us', 'southern_us', 'italian', ..., 'italian',
       'southern_us', 'mexican'], dtype=object)

In [21]:
pd.DataFrame({'id':X_new.id,'cuisine':X_new_predict}).set_index('id').to_csv('sub_samli_03_gridsearch.csv')

- 0.76297

In [18]:
print('a')

a
