# Исследование

Загрузим датасет

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, VotingRegressor, BaggingRegressor, VotingClassifier, BaggingClassifier
from sklearn.linear_model import Ridge, Lasso
from sklearn.metrics import accuracy_score, r2_score
from tqdm.notebook import tqdm_notebook
from sklearn.naive_bayes import CategoricalNB
tqdm_notebook.pandas()

Сделаем свой разделитель на выборки: тренировочную, тестовую и валидационную

In [2]:
class TrainValidationTest:
    def __init__(self):
        pass

    def fit(self, X, y):
        return self

    def transform(self, X, y):
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=21, stratify=y)

        X_train, X_valid, y_train, y_valid = train_test_split(X_train, y_train, test_size=0.2, random_state=21, stratify=y_train)
        return X_train, X_valid, X_test, y_train, y_valid, y_test

Автоматический вычислитель лучшей модели с помощью GridSearchCV и cross validation, к сожалению он не позволяет выбрать оптимальную модель, т.е. сильно долго работающие модели будут тоже учитываться в рейтинге

In [3]:
class ModelSelection:
    def __init__(self, grids, grid_dict):
        self.result = None
        self.grids = grids
        self.grid_dict = grid_dict

    def choose(self, X_train, y_train, X_valid, y_valid):
        out = pd.DataFrame({'model_num': self.grid_dict.keys(),
                                'model': self.grid_dict.values()})

        def select_params(row):
            ind = row['model_num']
            print(f'Estimator: {self.grid_dict[ind]}')

            self.grids[ind].fit(X_train, y_train)
            print(f'Best params: {self.grids[ind].best_params_}')
            y_predict = self.grids[ind].predict(X_valid)

            score = r2_score(y_valid, y_predict)
            print(f'Best training r2 score: {self.grids[ind].best_score_}')
            print(f'Validation set r2 score for best params: {score}')
            print('-'*55)

            return self.grids[ind].best_params_, score


        out[['params', 'valid_score']] = out.progress_apply(select_params,
                                                                axis=1,
                                                                result_type='expand')
        self.result = out
        best_clf = out.model[out.valid_score.idxmax()]

        print(f'Regressor with best validation set r2 score: {best_clf}')

    def best_results(self):
        return self.result.drop(columns='model_num')

Убираем все поля, которые не относится к ингредиентам

In [4]:
data = pd.read_csv('epi_r.csv')
data = data.drop(columns=['#cakeweek', '#wasteless', '22-minute meals', '3-ingredient recipes', '30 days of groceries', 'advance prep required', 'alaska', 'alabama', 'alcoholic', 'non-alcoholic', 'no meat, no problem', 'anniversary', 'anthony bourdain', 'appetizer', 'arizona', 'aspen', 'atlanta', 'australia', 'back to school', 'backyard bbq', 'bake', 'bastille day', 'birthday', 'blender', 'boil', 'bon appétit', 'bon app��tit', 'boston', 'breakfast', 'brooklyn', 'buffet', 'bulgaria', 'burrito', 'cake', 'california', 'cambridge', 'camping', 'canada', 'caraway', 'champagne', 'chicago', 'chile', 'chill', 'christmas', 'christmas eve', 'cinco de mayo', 'cocktail', 'cocktail party', 'house cocktail', 'colorado', 'columbus', 'connecticut', 'cook like a diner', 'cookbook critic', 'cookie', 'cookies', 'cornmeal', 'costa mesa', 'cr��me de cacao', 'cuba', 'cupcake', 'dairy', 'dairy free', 'dallas', 'date', 'deep-fry', 'denver', 'dessert', 'digestif', 'dip', 'diwali', 'dominican republic', 'double boiler', 'dorie greenspan', 'drink', 'drinks', 'easter', 'eau de vie', 'edible gift', 'egg nog', 'eggplant', 'egypt', 'emeril lagasse', 'england', 'engagement party', 'entertaining', 'epi + ushg', 'epi loves the microwave', 'fall', 'family reunion', 'fat free', 'father\'s day', 'flaming hot summer', 'florida', 'food processor', 'fourth of july', 'france', 'frankenrecipe', 'freeze/chill', 'freezer food', 'friendsgiving', 'frittata', 'fritter', 'frozen dessert', 'fry', 'fruit', 'game', 'georgia', 'germany', 'graduation', 'grill', 'grill/barbecue', 'oregon', 'organic', 'oscars', 'oyster', 'pacific palisades', 'paleo', 'pan-fry', 'pancake', 'party', 'pasadena', 'pasta', 'pasta maker', 'pastry', 'peanut free', 'pennsylvania', 'pernod', 'persian new year', 'peru', 'pescatarian', 'philippines', 'phyllo/puff pastry dough', 'picnic', 'pie', 'pittsburgh', 'poach', 'poker/game night', 'port', 'portland', 'pot pie', 'potato salad', 'potluck', 'poultry', 'poultry sausage', 'pressure cooker', 'providence', 'quick & easy', 'quinoa', 'ramadan', 'ramekin', 'raw', 'rhode island', 'roast', 'sage', 'salad', 'salad dressing', 'san francisco', 'sandwich', 'sandwich theory', 'seafood', 'seattle', 'self', 'shavuot', 'shower', 'side', 'simmer', 'skewer', 'slow cooker', 'smoker', 'smoothie', 'snapper', 'soup/stew', 'south carolina', 'soy free', 'spain', 'spring', 'spritzer', 'st. patrick\'s day', 'st. louis', 'steam', 'stew', 'stir-fry', 'stock', 'stuffing/dressing', 'summer', 'super bowl', 'suzanne goin', 'switzerland', 'taco', 'tailgating', 'tennessee', 'tested & improved', 'texas', 'thanksgiving', 'tree nut free', 'utah', 'valentine\'s day', 'vegetarian', 'vermont', 'vermouth', 'virginia', 'waffle', 'washington', 'washington, d.c.', 'wedding', 'weelicious', 'west virginia', 'westwood', 'wheat/gluten-free', 'windsor', 'winter', 'wisconsin', 'wok', 'yonkers', 'cookbooks', 'leftovers', 'snack', 'snack week', 'beverly hills', 'quick and healthy', 'mother\'s day', 'nebraska', 'low cholesterol'])
data = data.dropna()
data

Unnamed: 0,title,rating,calories,protein,fat,sodium,almond,amaretto,anchovy,anise,...,whiskey,white wine,whole wheat,wild rice,wine,yellow squash,yogurt,yuca,zucchini,turkey
0,"Lentil, Apple, and Turkey Wrap",2.500,426.0,30.0,7.0,559.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
1,Boudin Blanc Terrine with Red Onion Confit,4.375,403.0,18.0,23.0,1439.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,Potato and Fennel Soup Hodge,3.750,165.0,6.0,7.0,165.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,Spinach Noodle Casserole,3.125,547.0,20.0,32.0,452.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,The Best Blts,4.375,948.0,19.0,79.0,1042.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
20047,Parmesan Puffs,3.125,28.0,2.0,2.0,64.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
20048,Artichoke and Parmesan Risotto,4.375,671.0,22.0,28.0,583.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
20049,Turkey Cream Puff Pie,4.375,563.0,31.0,38.0,652.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
20050,Snapper on Angel Hair with Citrus Cream,4.375,631.0,45.0,24.0,517.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


Немного о датасете: здесь мы видим:
- названия рецептов
- рейтинг (вероятнто посчитан на какой-то платформе-соцсети, где пользователи обмениваются рецепты)
- количество Ккал в блюде
- количество протеина в блюде, по моему, миллиграммы
- жирность блюда
- количество содиума
- И много-много столбцов, названия которых это название ингредиента, т.е. в если в блюде есть этот ингредиент, в его столбце стоит единица


Делаем целевую переменную - рейтинг блюда и общую выборку

In [5]:
y = data.rating.round(2)
X = data.drop(columns=['rating', 'title', 'calories', 'protein', 'fat', 'sodium'])

In [6]:
y

0        2.50
1        4.38
2        3.75
4        3.12
5        4.38
         ... 
20047    3.12
20048    4.38
20049    4.38
20050    4.38
20051    4.38
Name: rating, Length: 15864, dtype: float64

In [7]:
X_train, X_valid, X_test, y_train, y_valid, y_test = TrainValidationTest().transform(X, y)

# Регрессия

Попробуем сделать регрессию с регуляризацией через параметр alpha, он же лямбда из формулы, и рассчитать значение с точностью до 2 знаков до запятой

In [9]:
ridge_params = {'alpha': range(30, 75)}

grid_ridge = GridSearchCV(estimator=Ridge(random_state=21), param_grid=ridge_params, cv=5, n_jobs=8)

# ---------------------------------------------------------------------------------------------------------------------------------------------------------

lasso_params = {'selection': ['cyclic', 'random'], 'alpha': range(15, 40), 'tol': [0.2, 0.002]}

grid_lasso = GridSearchCV(estimator=Lasso(random_state=21), param_grid=lasso_params, cv=5, n_jobs=8)

Заводим гриды и их обозначение в переменные

In [10]:
grids = [grid_ridge, grid_lasso]

grid_dict = {0: 'Ridge', 1: 'Lasso'}

In [11]:
select_mod = ModelSelection(grids, grid_dict)

Высчитываем лучшую модель

In [12]:
select_mod.choose(X_train, y_train, X_valid, y_valid)

  0%|          | 0/2 [00:00<?, ?it/s]

Estimator: Ridge
Best params: {'alpha': 30}
Best training r2 score: 0.10379502100778604
Validation set r2 score for best params: 0.10883405562473392
-------------------------------------------------------
Estimator: Lasso
Best params: {'alpha': 15, 'selection': 'cyclic', 'tol': 0.2}
Best training r2 score: -0.0006342399590497205
Validation set r2 score for best params: -1.4536590131619676e-07
-------------------------------------------------------
Regressor with best validation set r2 score: Ridge


Проверим RMSE на тестовой выборке

In [15]:
from sklearn.metrics import mean_squared_error, r2_score

ridge = Ridge(alpha=30, random_state=21).fit(X_train, y_train)
y_pred = ridge.predict(X_test)
print(f'RMSE на тестовой выборке: {mean_squared_error(y_test, y_pred, squared=False)}')
print(f'R2 score на тестовой выборке: {r2_score(y_test, y_pred)}')

RMSE на тестовой выборке: 1.2196295186901487
R2 score на тестовой выборке: 0.1011671868227576


Попробуем ансамблевые методы, зададим им параметры

In [16]:
vote_params = {'weights': [[1, 2], [1, 5], [3, 1], [5, 2]]}

grid_vote = GridSearchCV(estimator=VotingRegressor(estimators=[('rg', Ridge(alpha=30, random_state=21)), ('ls', Lasso(alpha=0, selection='random', tol=0.2))]), param_grid=vote_params, cv=5, n_jobs=12)

# ---------------------------------------------------------------------------------------------------------------------------------------------------------

bag_params = {'n_estimators': range(10, 30)}

grid_bag = GridSearchCV(estimator=BaggingRegressor(base_estimator=Ridge(alpha=30, random_state=21)), param_grid=bag_params, cv=5, n_jobs=12)

In [17]:
grids = [grid_bag, grid_vote]

grid_dict = {0: 'Bagging', 1: 'Vote'}

In [18]:
select_mod = ModelSelection(grids, grid_dict)

In [19]:
select_mod.choose(X_train, y_train, X_valid, y_valid)

  0%|          | 0/2 [00:00<?, ?it/s]

Estimator: Bagging
Best params: {'n_estimators': 20}
Best training r2 score: 0.10456328176647552
Validation set r2 score for best params: 0.10564005819957778
-------------------------------------------------------
Estimator: Vote


  estimator.fit(X, y)
  model = cd_fast.enet_coordinate_descent(


Best params: {'weights': [3, 1]}
Best training r2 score: 0.10185532072000483
Validation set r2 score for best params: 0.10741739200631295
-------------------------------------------------------
Regressor with best validation set r2 score: Vote


  model = cd_fast.enet_coordinate_descent(


Получили результаты, смотрим RMSE на тестовой выборке

In [21]:
bag = VotingRegressor(estimators=[('rg', Ridge(alpha=30, random_state=21)), ('ls', Lasso(alpha=0, selection='random', tol=0.2))], weights=[3, 1]).fit(X_train, y_train)
y_pred = bag.predict(X_test)
print(f'RMSE на тестовой выборке: {mean_squared_error(y_test, y_pred, squared=False)}')
print(f'R2 score на тестовой выборке: {r2_score(y_test, y_pred)}')

  estimator.fit(X, y)
  model = cd_fast.enet_coordinate_descent(


RMSE на тестовой выборке: 1.2208799943847632
R2 score на тестовой выборке: 0.09932311078680178


  model = cd_fast.enet_coordinate_descent(


Получилось оч плохо, процентный показатель R2 показывает плохие результаты, регрессия пока выглядит как плохой вариант

# Классификация

Попробуем классификацию, для этого округлим значения оценки до классов: (0, 1, 2, 3, 4, 5)

In [7]:
y = y.round()
X_train, X_valid, X_test, y_train, y_valid, y_test = TrainValidationTest().transform(X, y)

Поменяем метрику r2_score на accuracy score для классификации

In [7]:
class ModelSelection:
    def __init__(self, grids, grid_dict):
        self.result = None
        self.grids = grids
        self.grid_dict = grid_dict

    def choose(self, X_train, y_train, X_valid, y_valid):
        out = pd.DataFrame({'model_num': self.grid_dict.keys(),
                                'model': self.grid_dict.values()})

        def select_params(row):
            ind = row['model_num']
            print(f'Estimator: {self.grid_dict[ind]}')

            self.grids[ind].fit(X_train, y_train)
            print(f'Best params: {self.grids[ind].best_params_}')
            y_predict = self.grids[ind].predict(X_valid)

            score = accuracy_score(y_valid, y_predict)

            print(f'Best training accuracy: {self.grids[ind].best_score_}')
            print(f'Validation set accuracy score for best params: {score}')
            print('-'*55)

            return self.grids[ind].best_params_, score


        out[['params', 'valid_score']] = out.progress_apply(select_params,
                                                                axis=1,
                                                                result_type='expand')
        self.result = out
        best_clf = out.model[out.valid_score.idxmax()]

        print(f'Classifier with best validation set accuracy: {best_clf}')

    def best_results(self):
        return self.result.drop(columns='model_num')

Попробуем три алгоритма: деревья решений, случайный лес и градиентный бустинг

In [21]:
dec_tree_params = {'max_depth': range(1, 50), 'class_weight': ('balanced', None), 'criterion': ('entropy', 'gini')}

grid_tree = GridSearchCV(estimator=DecisionTreeClassifier(random_state=21), param_grid = dec_tree_params, cv=5, n_jobs=12)

# ---------------------------------------------------------------------------------------------------------------------------------------------------------

forest_params = {'n_estimators': [ 160, 170, 165, 195], 'max_depth': range(30, 50), 'class_weight': ('balanced', None), 'criterion': ('entropy', 'gini')}

gs_ran_for = GridSearchCV(estimator=RandomForestClassifier(random_state=21), param_grid = forest_params, cv=5, n_jobs=12)

# ---------------------------------------------------------------------------------------------------------------------------------------------------------

boosting_params = {'loss': ('log_loss', 'deviance'), 'n_estimators': [50, 100], 'criterion': ('friedman_mse', 'squared_error'), 'max_depth': [20, 22]}

gs_gr_boosting = GridSearchCV(estimator=GradientBoostingClassifier(random_state=21), param_grid=boosting_params, cv=5, n_jobs=12)

In [22]:
grids = [gs_ran_for, grid_tree, gs_gr_boosting]

grid_dict = {0: 'Random Forest', 1: 'Grid tree', 2: 'Gradient boosting'}

In [23]:
select_mod = ModelSelection(grids, grid_dict)

In [24]:
select_mod.choose(X_train, y_train, X_valid, y_valid)

  0%|          | 0/3 [00:00<?, ?it/s]

Estimator: Random Forest
Best params: {'class_weight': None, 'criterion': 'entropy', 'max_depth': 49, 'n_estimators': 170}
Best training accuracy: 0.7081362041072732
Validation set accuracy score for best params: 0.7073651043717999
-------------------------------------------------------
Estimator: Grid tree
Best params: {'class_weight': None, 'criterion': 'gini', 'max_depth': 12}
Best training accuracy: 0.6827228694156825
Validation set accuracy score for best params: 0.6809767625049232
-------------------------------------------------------
Estimator: Gradient boosting
Best params: {'criterion': 'friedman_mse', 'loss': 'log_loss', 'max_depth': 20, 'n_estimators': 100}
Best training accuracy: 0.6872535308627603
Validation set accuracy score for best params: 0.6857030326900354
-------------------------------------------------------
Classifier with best validation set accuracy: Random Forest


Проверим точность на тестовой выборке

In [25]:
ran_for = RandomForestClassifier(class_weight='balanced', random_state=21, criterion='gini', max_depth=40, n_estimators=195, n_jobs=12).fit(X_train, y_train)
y_pred = ran_for.predict(X_test)
accuracy_score(y_test, y_pred)

0.6993381657737158

Видим, что на тестовой выборке точность чутка ниже, чем на валидационной, это хорошо

Итак, точность довольно неплохая, но нет, смысла делать классификацию по такому количеству классов, так как не особо интуитивно понятно, будет ли блюдо прям совсем невкусное, если у него рейтинг 2, есть же ещё 0 и 1. Так что стоит уменьшить количество классов, сделаем это ниже

In [8]:
y = ['bad' if s in (0, 1) else 'so-so' if s in (2, 3) else 'great' for s in y]
y

['so-so',
 'great',
 'great',
 'so-so',
 'great',
 'great',
 'great',
 'great',
 'great',
 'great',
 'so-so',
 'great',
 'so-so',
 'great',
 'great',
 'great',
 'great',
 'great',
 'bad',
 'great',
 'so-so',
 'great',
 'great',
 'great',
 'so-so',
 'great',
 'great',
 'great',
 'so-so',
 'great',
 'great',
 'great',
 'great',
 'great',
 'great',
 'so-so',
 'great',
 'great',
 'great',
 'great',
 'great',
 'great',
 'great',
 'great',
 'great',
 'great',
 'bad',
 'great',
 'so-so',
 'great',
 'great',
 'bad',
 'great',
 'great',
 'great',
 'great',
 'great',
 'great',
 'so-so',
 'great',
 'great',
 'great',
 'great',
 'great',
 'great',
 'great',
 'great',
 'bad',
 'great',
 'great',
 'great',
 'great',
 'so-so',
 'bad',
 'great',
 'so-so',
 'so-so',
 'great',
 'great',
 'great',
 'so-so',
 'great',
 'great',
 'great',
 'great',
 'great',
 'great',
 'so-so',
 'great',
 'great',
 'great',
 'great',
 'so-so',
 'great',
 'great',
 'bad',
 'great',
 'bad',
 'great',
 'great',
 'great',
 'gr

Разделим заново основные датасеты

In [11]:
X_train, X_valid, X_test, y_train, y_valid, y_test = TrainValidationTest().transform(X, y)

Выберем лучшую модель

In [29]:
select_mod.choose(X_train, y_train, X_valid, y_valid)

  0%|          | 0/3 [00:00<?, ?it/s]

Estimator: Random Forest
Best params: {'class_weight': None, 'criterion': 'entropy', 'max_depth': 49, 'n_estimators': 195}
Best training accuracy: 0.8238767575486365
Validation set accuracy score for best params: 0.8251280031508468
-------------------------------------------------------
Estimator: Grid tree
Best params: {'class_weight': None, 'criterion': 'entropy', 'max_depth': 3}
Best training accuracy: 0.8124507085980117
Validation set accuracy score for best params: 0.8141000393855848
-------------------------------------------------------
Estimator: Gradient boosting
Best params: {'criterion': 'friedman_mse', 'loss': 'log_loss', 'max_depth': 22, 'n_estimators': 100}
Best training accuracy: 0.814322193197556
Validation set accuracy score for best params: 0.8200078771169752
-------------------------------------------------------
Classifier with best validation set accuracy: Random Forest


In [12]:
ran_for = RandomForestClassifier(class_weight='balanced', random_state=21, criterion='gini', max_depth=47, n_estimators=195, n_jobs=12).fit(X_train, y_train)
y_pred = ran_for.predict(X_test)
accuracy_score(y_test, y_pred)

0.823195713835487

Значение получилось выше чем в прошлый раз

Однако что будет плохо предсказать, что блюдо вкусное, когда оно на самом деле невкусное, или же что блюдо невкусное, когда оно на самом деле вкусное. Второй вариант явно, для этого поменяем accuracy на precision

In [13]:
from sklearn.metrics import precision_score

precision_score(y_test, y_pred, average='micro', labels=['great'])

0.8501211491865698

Вставим precision в наш автомтатический вычислятор

In [14]:
class ModelSelection:
    def __init__(self, grids, grid_dict):
        self.result = None
        self.grids = grids
        self.grid_dict = grid_dict

    def choose(self, X_train, y_train, X_valid, y_valid):
        out = pd.DataFrame({'model_num': self.grid_dict.keys(),
                                'model': self.grid_dict.values()})

        def select_params(row):
            ind = row['model_num']
            print(f'Estimator: {self.grid_dict[ind]}')

            self.grids[ind].fit(X_train, y_train)
            print(f'Best params: {self.grids[ind].best_params_}')
            y_predict = self.grids[ind].predict(X_valid)


            score = precision_score(y_valid, y_predict, average='micro', labels=['great'])
            print(f'Best training precision: {self.grids[ind].best_score_}')
            print(f'Validation set precision score for best params: {score}')
            print('-'*55)

            return self.grids[ind].best_params_, score


        out[['params', 'valid_score']] = out.progress_apply(select_params,
                                                                axis=1,
                                                                result_type='expand')
        self.result = out
        best_clf = out.model[out.valid_score.idxmax()]

        print(f'Classifier with best validation set accuracy: {best_clf}')

    def best_results(self):
        return self.result.drop(columns='model_num')

In [33]:
select_mod = ModelSelection(grids, grid_dict)
select_mod.choose(X_train, y_train, X_valid, y_valid)

  0%|          | 0/3 [00:00<?, ?it/s]

Estimator: Random Forest
Best params: {'class_weight': None, 'criterion': 'entropy', 'max_depth': 49, 'n_estimators': 195}
Best training precision: 0.8238767575486365
Validation set precision score for best params: 0.827150428047289
-------------------------------------------------------
Estimator: Grid tree
Best params: {'class_weight': None, 'criterion': 'entropy', 'max_depth': 3}
Best training precision: 0.8124507085980117
Validation set precision score for best params: 0.8171956609079952
-------------------------------------------------------
Estimator: Gradient boosting
Best params: {'criterion': 'friedman_mse', 'loss': 'log_loss', 'max_depth': 22, 'n_estimators': 100}
Best training precision: 0.814322193197556
Validation set precision score for best params: 0.8369839932603201
-------------------------------------------------------
Classifier with best validation set accuracy: Gradient boosting


In [15]:
ran_for = GradientBoostingClassifier(criterion='friedman_mse', loss='log_loss', max_depth=22, n_estimators=100).fit(X_train, y_train)
y_pred = ran_for.predict(X_test)
precision_score(y_test, y_pred, average='micro', labels=['great'])

0.8376733175515726

In [16]:
vote_params = {'weights': [[1, 2], [1, 5], [3, 1], [5, 2]]}

grid_vote = GridSearchCV(estimator=VotingClassifier(estimators=[('ran_for', RandomForestClassifier(class_weight=None, criterion='entropy', max_depth=49, n_estimators=195, random_state=21)), ('boosting', GradientBoostingClassifier(criterion='friedman_mse', loss='log_loss', max_depth=22, n_estimators=100))]), param_grid=vote_params, cv=5, n_jobs=12)

# ---------------------------------------------------------------------------------------------------------------------------------------------------------

bag_params = {'n_estimators': range(10, 30)}

grid_bag = GridSearchCV(estimator=BaggingClassifier(base_estimator=GradientBoostingClassifier(criterion='friedman_mse', loss='log_loss', max_depth=22, n_estimators=100)), param_grid=bag_params, cv=5, n_jobs=12)

In [17]:
grids = [grid_bag, grid_vote]

grid_dict = {0: 'Bagging', 1: 'Vote'}

In [18]:
select_mod = ModelSelection(grids, grid_dict)

In [None]:
select_mod.choose(X_train, y_train, X_valid, y_valid)

  0%|          | 0/2 [00:00<?, ?it/s]

Estimator: Bagging


In [39]:
vote = VotingClassifier(estimators=[('ran_for', RandomForestClassifier(class_weight='balanced', criterion='entropy', max_depth=45, n_estimators=160, random_state=21)), ('boosting', GradientBoostingClassifier(criterion='friedman_mse', loss='log_loss', max_depth=20, n_estimators=50, random_state=21))], weights=[3, 1]).fit(X_train, y_train)
y_pred = vote.predict(X_test)
precision_score(y_test, y_pred, average='micro', labels=['great'])

0.8480749219562955

Будем использовать эту модель, сделаем тренировку на всём объёме данных и сохраним её

In [40]:
vote = VotingClassifier(estimators=[('ran_for', RandomForestClassifier(class_weight='balanced', criterion='entropy', max_depth=45, n_estimators=160, random_state=21)), ('boosting', GradientBoostingClassifier(criterion='friedman_mse', loss='log_loss', max_depth=20, n_estimators=50, random_state=21))], weights=[3, 1]).fit(X, y)

In [41]:
import joblib

with open('vote_model.sav', 'wb') as f:
    joblib.dump(vote, f)