In [3]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, StratifiedKFold
# import missingno as msno

# 1 About dataset

This dataset is taken form kaggle (https://www.kaggle.com/competitions/spaceship-titanic)

train.csv - Personal records for about two-thirds (~8700) of the passengers, to be used as training data.
  * PassengerId - A unique Id for each passenger. Each Id takes the form gggg_pp where gggg indicates a group the passenger is travelling with and pp is their number within the group. People in a group are often family members, but not always.
  * HomePlanet - The planet the passenger departed from, typically their planet of permanent residence.
  * CryoSleep - Indicates whether the passenger elected to be put into suspended animation for the duration of the voyage. Passengers in cryosleep are confined to their cabins.
  * Cabin - The cabin number where the passenger is staying. Takes the form deck/num/side, where side can be either P for Port or S for Starboard.
  * Destination - The planet the passenger will be debarking to.
  * Age - The age of the passenger.
  * VIP - Whether the passenger has paid for special VIP service during the voyage.
RoomService, FoodCourt, ShoppingMall, Spa, VRDeck - Amount the passenger has billed at each of the Spaceship Titanic's many luxury amenities.
  * Name - The first and last names of the passenger.
  * Transported - Whether the passenger was transported to another dimension. This is the target, the column you are trying to predict.

In [4]:
data_raw = pd.read_csv('train.csv')
data_raw_test = pd.read_csv('test.csv')

data_raw

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Transported
0,0001_01,Europa,False,B/0/P,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,0.0,Maham Ofracculy,False
1,0002_01,Earth,False,F/0/S,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,549.0,44.0,Juanna Vines,True
2,0003_01,Europa,False,A/0/S,TRAPPIST-1e,58.0,True,43.0,3576.0,0.0,6715.0,49.0,Altark Susent,False
3,0003_02,Europa,False,A/0/S,TRAPPIST-1e,33.0,False,0.0,1283.0,371.0,3329.0,193.0,Solam Susent,False
4,0004_01,Earth,False,F/1/S,TRAPPIST-1e,16.0,False,303.0,70.0,151.0,565.0,2.0,Willy Santantines,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8688,9276_01,Europa,False,A/98/P,55 Cancri e,41.0,True,0.0,6819.0,0.0,1643.0,74.0,Gravior Noxnuther,False
8689,9278_01,Earth,True,G/1499/S,PSO J318.5-22,18.0,False,0.0,0.0,0.0,0.0,0.0,Kurta Mondalley,False
8690,9279_01,Earth,False,G/1500/S,TRAPPIST-1e,26.0,False,0.0,0.0,1872.0,1.0,0.0,Fayey Connon,True
8691,9280_01,Europa,False,E/608/S,55 Cancri e,32.0,False,0.0,1049.0,0.0,353.0,3235.0,Celeon Hontichre,False


In [5]:
#проверим пропуски
{key:data_raw[key].isna().sum() for key in data_raw.columns}

{'PassengerId': 0,
 'HomePlanet': 201,
 'CryoSleep': 217,
 'Cabin': 199,
 'Destination': 182,
 'Age': 179,
 'VIP': 203,
 'RoomService': 181,
 'FoodCourt': 183,
 'ShoppingMall': 208,
 'Spa': 183,
 'VRDeck': 188,
 'Name': 200,
 'Transported': 0}

In [88]:
data_pre = data_raw.copy()
data_pre_test = data_raw_test.copy()

#возраст заменим медианным - самое простое
median_age = data_pre['Age'].median()
median_age_test = data_pre_test['Age'].median()

data_pre['Age'].fillna(median_age, inplace=True)
data_pre_test['Age'].fillna(median_age_test, inplace=True)
data_pre

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Transported
0,0001_01,Europa,False,B/0/P,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,0.0,Maham Ofracculy,False
1,0002_01,Earth,False,F/0/S,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,549.0,44.0,Juanna Vines,True
2,0003_01,Europa,False,A/0/S,TRAPPIST-1e,58.0,True,43.0,3576.0,0.0,6715.0,49.0,Altark Susent,False
3,0003_02,Europa,False,A/0/S,TRAPPIST-1e,33.0,False,0.0,1283.0,371.0,3329.0,193.0,Solam Susent,False
4,0004_01,Earth,False,F/1/S,TRAPPIST-1e,16.0,False,303.0,70.0,151.0,565.0,2.0,Willy Santantines,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8688,9276_01,Europa,False,A/98/P,55 Cancri e,41.0,True,0.0,6819.0,0.0,1643.0,74.0,Gravior Noxnuther,False
8689,9278_01,Earth,True,G/1499/S,PSO J318.5-22,18.0,False,0.0,0.0,0.0,0.0,0.0,Kurta Mondalley,False
8690,9279_01,Earth,False,G/1500/S,TRAPPIST-1e,26.0,False,0.0,0.0,1872.0,1.0,0.0,Fayey Connon,True
8691,9280_01,Europa,False,E/608/S,55 Cancri e,32.0,False,0.0,1049.0,0.0,353.0,3235.0,Celeon Hontichre,False


In [34]:
# Planet will be filled in:
# if there is group - by planet of other members
# if it is solo traveler - most common planet in dataset
#
#
# groups_with_nan = data_pre[(data_pre['HomePlanet'].isna() == True) & ((data_pre['grid']) > 1)]['grp']
# groups_with_nan = list(groups_with_nan)
# for i in range(len(groups_with_nan)):
#     #check if all in group with nan planet
#     planet_goup_qnty = set(data_pre[data_pre['grp'] == groups_with_nan[i]]['HomePlanet'])
#     planet_goup_qnty = list({x for x in planet_goup_qnty if x==x})
#     if len(planet_goup_qnty) == 0:
#         planet_fillin = data_pre[:]['HomePlanet'].mode()[0]
#         data_pre.loc[data_pre['grp'] == groups_with_nan[i], 'HomePlanet'] = data_pre[data_pre['grp'] == groups_with_nan[i]]['HomePlanet'].replace(np.nan, planet_fillin)
#     else:
#         planet_fillin = data_pre[data_pre['grp'] == groups_with_nan[i]]['HomePlanet'].mode()[0]
#         data_pre.loc[data_pre['grp'] == groups_with_nan[i], 'HomePlanet'] = data_pre[data_pre['grp'] == groups_with_nan[i]]['HomePlanet'].replace(np.nan, planet_fillin)
#     print(data_pre.loc[data_pre['grp'] == groups_with_nan[i], 'HomePlanet'])
#
# planet_fillin_for_df = data_pre[:]['HomePlanet'].mode()[0]
# data_pre['HomePlanet'] = data_pre['HomePlanet'].replace(np.nan, planet_fillin_for_df)

In [35]:
def grouping_rule (data_pre,data_column):

    groups_with_nan = data_pre[(data_pre[data_column].isna() == True) & ((data_pre['grid']) > 1)]['grp']
    groups_with_nan = list(groups_with_nan)
    for i in range(len(groups_with_nan)):
        #check if all in group with nan planet
        planet_goup_qnty = set(data_pre[data_pre['grp'] == groups_with_nan[i]][data_column])
        planet_goup_qnty = list({x for x in planet_goup_qnty if x==x})
        if len(planet_goup_qnty) == 0:
            planet_fillin = data_pre[:][data_column].mode()[0]
            data_pre.loc[data_pre['grp'] == groups_with_nan[i], data_column] = data_pre[data_pre['grp'] == groups_with_nan[i]][data_column].replace(np.nan, planet_fillin)
        else:
            planet_fillin = data_pre[data_pre['grp'] == groups_with_nan[i]][data_column].mode()[0]
            data_pre.loc[data_pre['grp'] == groups_with_nan[i], data_column] = data_pre[data_pre['grp'] == groups_with_nan[i]][data_column].replace(np.nan, planet_fillin)
        print(data_pre.loc[data_pre['grp'] == groups_with_nan[i], data_column])

    planet_fillin_for_df = data_pre[:][data_column].mode()[0]
    data_pre[data_column] = data_pre[data_column].replace(np.nan, planet_fillin_for_df)

def ohe_data (data_pre, column):
    # one_hot_all = pd.DataFrame()
    for i in column:
        one_hot = pd.get_dummies(data_pre[i])
        # one_hot_all = one_hot_all.join(one_hot)
        data_pre = data_pre.join(one_hot)
        data_pre = data_pre.drop(i,axis = 1)
    return data_pre



In [89]:
data_pre[['grp','grid']] = data_pre['PassengerId'].str.split('_',expand=True)
data_pre['grp'] = data_pre['grp'].astype(int)
data_pre['grid'] = data_pre['grid'].astype(int)


grouping_rule (data_pre,data_column='HomePlanet')
grouping_rule (data_pre,data_column='Destination')
grouping_rule (data_pre,data_column='Cabin')
grouping_rule (data_pre,data_column='VIP')
grouping_rule (data_pre,data_column='CryoSleep')


# data_pre['RoomService'].fillna(data_pre['RoomService'].median(axis=0), inplace=True)
# data_pre['FoodCourt'].fillna(data_pre['FoodCourt'].median(axis=0), inplace=True)
# data_pre['ShoppingMall'].fillna(data_pre['ShoppingMall'].median(axis=0), inplace=True)
# data_pre['Spa'].fillna(data_pre['Spa'].median(axis=0), inplace=True)
# data_pre['VRDeck'].fillna(data_pre['VRDeck'].median(axis=0), inplace=True)

data_pre[['deck','num','side']] = data_pre['Cabin'].str.split('/',expand=True)
data_pre.drop(['PassengerId','Name','Cabin'], axis=1, inplace=True)

# data_pre = ohe_data(data_pre=data_pre, column=['deck','side','HomePlanet','Destination'])
#
# data_pre = data_pre.replace([True],1)
# data_pre = data_pre.replace([False],0)

{key:data_pre[key].isna().sum() for key in data_pre.columns}
#####################################################################################################################
data_pre_test[['grp','grid']] = data_pre_test['PassengerId'].str.split('_',expand=True)
data_pre_test['grp'] = data_pre_test['grp'].astype(int)
data_pre_test['grid'] = data_pre_test['grid'].astype(int)


grouping_rule (data_pre=data_pre_test,data_column='HomePlanet')
grouping_rule (data_pre=data_pre_test,data_column='Destination')
grouping_rule (data_pre=data_pre_test,data_column='Cabin')
grouping_rule (data_pre=data_pre_test,data_column='VIP')
grouping_rule (data_pre=data_pre_test,data_column='CryoSleep')


# data_pre['RoomService'].fillna(data_pre['RoomService'].median(axis=0), inplace=True)
# data_pre['FoodCourt'].fillna(data_pre['FoodCourt'].median(axis=0), inplace=True)
# data_pre['ShoppingMall'].fillna(data_pre['ShoppingMall'].median(axis=0), inplace=True)
# data_pre['Spa'].fillna(data_pre['Spa'].median(axis=0), inplace=True)
# data_pre['VRDeck'].fillna(data_pre['VRDeck'].median(axis=0), inplace=True)

data_pre_test[['deck','num','side']] = data_pre_test['Cabin'].str.split('/',expand=True)
data_pre_test.drop(['PassengerId','Name','Cabin'], axis=1, inplace=True)

# data_pre = ohe_data(data_pre=data_pre, column=['deck','side','HomePlanet','Destination'])
#
# data_pre = data_pre.replace([True],1)
# data_pre = data_pre.replace([False],0)

58    Mars
59    Mars
Name: HomePlanet, dtype: object
404    Mars
405    Mars
Name: HomePlanet, dtype: object
406    Earth
407    Earth
Name: HomePlanet, dtype: object
437    Mars
438    Mars
Name: HomePlanet, dtype: object
466    Europa
467    Europa
468    Europa
469    Europa
470    Europa
471    Europa
Name: HomePlanet, dtype: object
499    Europa
500    Europa
501    Europa
502    Europa
503    Europa
Name: HomePlanet, dtype: object
504    Mars
505    Mars
Name: HomePlanet, dtype: object
522    Earth
523    Earth
524    Earth
Name: HomePlanet, dtype: object
567    Earth
568    Earth
Name: HomePlanet, dtype: object
735    Europa
736    Europa
737    Europa
738    Europa
Name: HomePlanet, dtype: object
847    Mars
848    Mars
Name: HomePlanet, dtype: object
960    Earth
961    Earth
962    Earth
963    Earth
Name: HomePlanet, dtype: object
1752    Earth
1753    Earth
1754    Earth
1755    Earth
1756    Earth
1757    Earth
Name: HomePlanet, dtype: object
1915    Earth
1916    Earth
1

In [37]:
data_pre



Unnamed: 0,HomePlanet,CryoSleep,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Transported,grp,grid,deck,num,side
0,Europa,False,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,0.0,False,1,1,B,0,P
1,Earth,False,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,549.0,44.0,True,2,1,F,0,S
2,Europa,False,TRAPPIST-1e,58.0,True,43.0,3576.0,0.0,6715.0,49.0,False,3,1,A,0,S
3,Europa,False,TRAPPIST-1e,33.0,False,0.0,1283.0,371.0,3329.0,193.0,False,3,2,A,0,S
4,Earth,False,TRAPPIST-1e,16.0,False,303.0,70.0,151.0,565.0,2.0,True,4,1,F,1,S
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8688,Europa,False,55 Cancri e,41.0,True,0.0,6819.0,0.0,1643.0,74.0,False,9276,1,A,98,P
8689,Earth,True,PSO J318.5-22,18.0,False,0.0,0.0,0.0,0.0,0.0,False,9278,1,G,1499,S
8690,Earth,False,TRAPPIST-1e,26.0,False,0.0,0.0,1872.0,1.0,0.0,True,9279,1,G,1500,S
8691,Europa,False,55 Cancri e,32.0,False,0.0,1049.0,0.0,353.0,3235.0,False,9280,1,E,608,S


# 2 построение моделей
- оптимизируемся на разные метрики
- разные типы кросс-валидации
- разная предобработка категориальных признаков
- разный тип перебора гиперпараметров
- отбор признаков
- KNN, лин. модели, лин с регуляризацией, ансамбли (случайный лес, 3 вида бустингов(от 3 разных компаний))

In [38]:
y = data_pre['Transported']
X = data_pre.drop(['Transported'], axis=1)

In [39]:
X_train, X_holdout, y_train, y_holdout = train_test_split(X, y, test_size=0.3,
                                                          random_state=17)

In [44]:
import category_encoders as ce
from sklearn.metrics import (roc_auc_score, recall_score, f1_score, precision_score,
                             accuracy_score)
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV
from sklearn.neighbors import KNeighborsClassifier
from catboost import CatBoostClassifier
from catboost import Pool, cv
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier

import tensorflow as tf
import tensorflow.keras as keras
from tensorflow.keras import layers as L         # Уже готовые слои для моделей
from tensorflow.keras.models import Sequential   # Специальный класс для склеивания слоёв
from tensorflow.keras.models import Model        # Альтернативный класс для склейки слоёв
import tensorflow.keras.optimizers as opt        # Разные оптимизационные алгоритмы :3
from keras.wrappers.scikit_learn import KerasClassifier




import warnings
warnings.filterwarnings('ignore')


#преобразование переменных

categorical_features = ['deck','side','HomePlanet',
                        'Destination'
                        ]
# categorical_features = ["installs_interval", 'genre_id', 'country']
for col_cat in categorical_features:
    X_train[col_cat] = X_train[col_cat].astype("category")
    X_holdout[col_cat] = X_holdout[col_cat].astype("category")

numeric_features = [i for i in X_train.columns if i not in categorical_features]

numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler(with_std=True, with_mean=True))])

categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('ohe', ce.OneHotEncoder(use_cat_names=True))])

categorical_transformer_catboost = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent'))])

# categorical_transformer = OneHotEncoder(drop='if_binary')
# categorical_transformer = DataFrameOneHotEncoder(col_overrule_params={"in_app_purchase":{"drop":"first"}})
# categorical_transformer = ce.GLMMEncoder()
# categorical_transformer = ce.CatBoostEncoder()



preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)])
# Catboost preprocessor
preprocessor_catboost = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer_catboost, categorical_features)])

X_train_prep = preprocessor.fit_transform(X_train)
X_holdout_prep = preprocessor.transform(X_holdout)


#выбор модели

# Logistic regression

pipe_logistic = Pipeline([('scl', preprocessor),
                          ('clf', LogisticRegression(penalty="l2",
                                                     solver='liblinear'))
                          ])  # pipeline with all steps
param_dict_logistic = {'clf__C': np.linspace(0.01, 10, 1000)
                       }

#кросс-валидация, подбор гипер-параметров
skf = StratifiedKFold(n_splits=5)  # CV type
logistic_randomized_pipe = RandomizedSearchCV(estimator=pipe_logistic,
                                              param_distributions=param_dict_logistic,
                                              cv=skf, n_iter=30, n_jobs=-1)

#Random Forest

pipe_rndforest = Pipeline([('scl', preprocessor),
                           ('clf', RandomForestClassifier( random_state=13))
                           ])  # pipeline with all steps
param_dict_rndforest = {'clf__max_depth': np.arange(1,10),
                        'clf__min_samples_leaf': np.arange(1, 10),
                        'clf__n_estimators': [100, 200, 300]
                        }

#кросс-валидация, подбор гипер-параметров
# skf = StratifiedKFold(n_splits=5)  # CV type
rndforest_randomized_pipe = RandomizedSearchCV(estimator=pipe_rndforest,
                                               param_distributions=param_dict_rndforest,
                                               cv=skf, n_iter=30, n_jobs=-1)


#KNN

pipe_knn = Pipeline([('scl', preprocessor),
                     ('clf', KNeighborsClassifier())
                     ])  # pipeline with all steps
param_dist_knn = {'clf__n_neighbors': np.arange(1,20),
                  'clf__p': np.arange(1, 5)
                  }

# #кросс-валидация, подбор гипер-параметров
# skf = StratifiedKFold(n_splits=5)  # CV type
knn_randomized_pipe = GridSearchCV(estimator=pipe_knn,
                                   param_grid=param_dist_knn,
                                   cv=skf, n_jobs=-1)



#CatBoost

# pipe_catboost = Pipeline([('scl', preprocessor_catboost),
#                            ('clf', CatBoostClassifier(random_state=13, cat_features=categorical_features))
pipe_catboost = Pipeline([('scl', preprocessor),
                          ('clf', CatBoostClassifier(random_state=13))
                          ])  # pipeline with all steps
param_dict_catboost = {'max_depth': np.arange(1,10),
                       'n_estimators': [100, 200, 300],
                       'learning_rate': np.linspace(0.01, 0.3, 10),
                       'l2_leaf_reg': np.linspace(0.01, 0.5, 10),
                       'min_data_in_leaf': np.arange(1, 10)
                       }
param_dict_catboost = {"clf__" + key: value for key, value in param_dict_catboost.items()}

catboost_randomized_pipe = RandomizedSearchCV(estimator=pipe_catboost,
                                              param_distributions=param_dict_catboost,
                                              cv=skf, n_iter=30, n_jobs=-1)




#XGB

# pipe_catboost = Pipeline([('scl', preprocessor_catboost),
#                            ('clf', CatBoostClassifier(random_state=13, cat_features=categorical_features))
pipe_xgb = Pipeline([('scl', preprocessor),
                     ('clf', XGBClassifier(random_state=13))
                     ])  # pipeline with all steps
param_dict_xgb = {'max_depth': np.arange(1,10),
                  'n_estimators': [100, 200, 300],
                  'learning_rate': np.linspace(0.01, 0.3, 10),
                  'l2_leaf_reg': np.linspace(0.01, 0.5, 10),
                  'min_data_in_leaf': np.arange(1, 10)
                  }
param_dict_xgb = {"clf__" + key: value for key, value in param_dict_xgb.items()}

xgb_randomized_pipe = RandomizedSearchCV(estimator=pipe_xgb,
                                         param_distributions=param_dict_xgb,
                                         cv=skf, n_iter=30, n_jobs=-1)


#Neural


def get_new_model( ):


    ###########################################################
    # Ваш код!
    model = Sequential(name = 'Archibald')  # модели можно дать имя!

    # Добавляем в нашу модель первый слой из 25 нейронов
    model.add(L.Dense(25, input_dim = X_train_prep.shape[1], kernel_initializer='random_normal'))

    # Добавляем функцию активации на первый слой
    model.add(L.Activation('sigmoid'))

    # Добавляем ещё один слой из 25 нейронов
    model.add(L.Dense(25, kernel_initializer='random_normal'))
    model.add(L.Activation('sigmoid'))

    # На выходе мы должны получить вероятности того, что объект относится к разным классам
    # Сделать такое преобразование позволяет softmax как функция активации
    # На выход будет идти 4 вероятности по числу классов
    model.add(L.Dense(2, activation='softmax', kernel_initializer = 'random_normal'))

    ###########################################################

    # В качестве оптимизации будем использовать Adam
    # Это такой специальный градиентный спуск, обсудим его в следущий раз
    optimizer = opt.Adam(lr=1e-3)

    # Собираем модель
    model.compile(loss = 'categorical_crossentropy',
                  metrics=["accuracy"],
                  optimizer=optimizer)

    return model


clf_neuron = KerasClassifier(build_fn=get_new_model,
                             #                              **{'verbose':0,
                             #                                                         'validation_split':0.2,
                             #                                                         'epochs':300,
                             #                                                         'verbose':1}
                             )


param_dict_neural = {'batch_size':[10, 20, 40, 60, 80, 100],
                     'epochs':[10, 50, 100]}


pipe_neuron = Pipeline([('scl', preprocessor),
                        ('clf', clf_neuron)
                        ])  # pipeline with all steps

# grid = GridSearchCV(estimator=pipe_neural, param_grid=param_dict_neural, n_jobs=-1, cv=3)

param_dict_neural = {"clf__" + key: value for key, value in param_dict_neural.items()}

neuron_randomized_pipe = GridSearchCV(estimator=pipe_neuron,
                                      param_grid=param_dict_neural,
                                      cv=skf, n_jobs=-1)










In [45]:
neuron_randomized_pipe.fit(X_train, y_train)


exception calling callback for <Future at 0x256e8f02ac0 state=finished raised TerminatedWorkerError>
Traceback (most recent call last):
  File "E:\anaconda\lib\site-packages\joblib\externals\loky\_base.py", line 625, in _invoke_callbacks
    callback(self)
  File "E:\anaconda\lib\site-packages\joblib\parallel.py", line 359, in __call__
    self.parallel.dispatch_next()
  File "E:\anaconda\lib\site-packages\joblib\parallel.py", line 794, in dispatch_next
    if not self.dispatch_one_batch(self._original_iterator):
  File "E:\anaconda\lib\site-packages\joblib\parallel.py", line 861, in dispatch_one_batch
    self._dispatch(tasks)
  File "E:\anaconda\lib\site-packages\joblib\parallel.py", line 779, in _dispatch
    job = self._backend.apply_async(batch, callback=cb)
  File "E:\anaconda\lib\site-packages\joblib\_parallel_backends.py", line 531, in apply_async
    future = self._workers.submit(SafeFunction(func))
  File "E:\anaconda\lib\site-packages\joblib\externals\loky\reusable_executor.

KeyboardInterrupt: 

In [47]:
logistic_randomized_pipe.fit(X_train,y_train)
rndforest_randomized_pipe.fit(X_train,y_train)
knn_randomized_pipe.fit(X_train,y_train)
catboost_randomized_pipe.fit(X_train ,y_train)
xgb_randomized_pipe.fit(X_train ,y_train)
pipe_neuron.fit(X_train, y_train)

0:	learn: 0.6228991	total: 148ms	remaining: 44.3s
1:	learn: 0.5683521	total: 158ms	remaining: 23.5s
2:	learn: 0.5256346	total: 168ms	remaining: 16.6s
3:	learn: 0.5033887	total: 178ms	remaining: 13.2s
4:	learn: 0.4737976	total: 188ms	remaining: 11.1s
5:	learn: 0.4557570	total: 198ms	remaining: 9.71s
6:	learn: 0.4440677	total: 208ms	remaining: 8.69s
7:	learn: 0.4328975	total: 218ms	remaining: 7.94s
8:	learn: 0.4247929	total: 227ms	remaining: 7.35s
9:	learn: 0.4182292	total: 231ms	remaining: 6.71s
10:	learn: 0.4064162	total: 241ms	remaining: 6.34s
11:	learn: 0.4001938	total: 252ms	remaining: 6.04s
12:	learn: 0.3946788	total: 262ms	remaining: 5.78s
13:	learn: 0.3913897	total: 266ms	remaining: 5.44s
14:	learn: 0.3846684	total: 276ms	remaining: 5.24s
15:	learn: 0.3767472	total: 286ms	remaining: 5.08s
16:	learn: 0.3725360	total: 296ms	remaining: 4.93s
17:	learn: 0.3695357	total: 306ms	remaining: 4.79s
18:	learn: 0.3646980	total: 316ms	remaining: 4.67s
19:	learn: 0.3608456	total: 326ms	remaini

Pipeline(steps=[('scl',
                 ColumnTransformer(transformers=[('num',
                                                  Pipeline(steps=[('imputer',
                                                                   SimpleImputer(strategy='median')),
                                                                  ('scaler',
                                                                   StandardScaler())]),
                                                  ['CryoSleep', 'Age', 'VIP',
                                                   'RoomService', 'FoodCourt',
                                                   'ShoppingMall', 'Spa',
                                                   'VRDeck', 'grp', 'grid',
                                                   'num']),
                                                 ('cat',
                                                  Pipeline(steps=[('imputer',
                                                                   Simpl

In [78]:
# выводим результат

models_quality = pd.DataFrame( columns = ['Name', 'accuracy_score','recall_score','f1_score',
                                          'precision_score','roc_auc_score'])


models_names = [logistic_randomized_pipe, rndforest_randomized_pipe, knn_randomized_pipe,
                catboost_randomized_pipe, xgb_randomized_pipe,
                # pipe_neuron
                ]
models_string = ['logistic_randomized_pipe', 'rndforest_randomized_pipe', 'knn_randomized_pipe',
                 'catboost_randomized_pipe',
                 'xgb_randomized_pipe',
                 # 'pipe_neuron'
                 ]


for i in range(len(models_names)):
    model = models_names[i]
    # log_pred = np.round(model.predict(X_holdout), 0)
    # log_pred_proba = model.predict_proba(X_holdout)[:,1]
    log_pred = model.predict(X_holdout)
    log_pred_proba = model.predict_proba(X_holdout)[:,1]
    models_quality = models_quality.append({'Name':models_string[i],
                                            'accuracy_score':accuracy_score(y_holdout, log_pred),
                                            'recall_score':recall_score(y_holdout, log_pred),
                                            'f1_score':f1_score(y_holdout, log_pred),
                                            'precision_score':precision_score(y_holdout, log_pred)}, ignore_index=True)
                                            # 'roc_auc_score':roc_auc_score(y_holdout, log_pred_proba)},
                                           # ignore_index=True)


TypeError: Labels in y_true and y_pred should be of the same type. Got y_true=[False  True] and y_pred=['False' 'True']. Make sure that the predictions provided by the classifier coincides with the true labels.

In [84]:

print ('logistic_randomized_pipe:', accuracy_score(y_holdout, logistic_randomized_pipe.predict(X_holdout)))
print ('rndforest_randomized_pipe:', accuracy_score(y_holdout, rndforest_randomized_pipe.predict(X_holdout)))
print ('knn_randomized_pipe:', accuracy_score(y_holdout, knn_randomized_pipe.predict(X_holdout)))
# print ('catboost_randomized_pipe:', accuracy_score(y_holdout, catboost_randomized_pipe.predict(X_holdout)))
print ('xgb_randomized_pipe:', accuracy_score(y_holdout, xgb_randomized_pipe.predict(X_holdout)))

logistic_randomized_pipe: 0.7910276073619632
rndforest_randomized_pipe: 0.817101226993865
knn_randomized_pipe: 0.7967791411042945
xgb_randomized_pipe: 0.8140337423312883


In [101]:
prediction_values = pd.DataFrame()
prediction_values['1'] = logistic_randomized_pipe.predict(X_holdout)
prediction_values['2'] = rndforest_randomized_pipe.predict(X_holdout)
prediction_values['3'] = knn_randomized_pipe.predict(X_holdout)
prediction_values['4'] = xgb_randomized_pipe.predict(X_holdout)

prediction_values = prediction_values.replace([True],1)
prediction_values = prediction_values.replace([False],0)

prediction_values['sum'] = np.round(prediction_values.sum(axis=1)/4).astype(int)
prediction_values

Unnamed: 0,1,2,3,4,sum
0,1,1,1,1,1
1,0,0,0,0,0
2,1,1,1,1,1
3,1,1,1,1,1
4,1,1,1,1,1
...,...,...,...,...,...
2603,1,0,0,1,0
2604,1,0,1,1,1
2605,0,0,0,0,0
2606,0,0,0,0,0


In [102]:
print ('sum:', accuracy_score(y_holdout, prediction_values['sum']))

sum: 0.8086656441717791


In [80]:
accuracy_score(y_holdout, rndforest_randomized_pipe.predict(X_holdout))

0.817101226993865

In [114]:
prediction_test_res = pd.DataFrame()
prediction_test_res['Transported'] = rndforest_randomized_pipe.predict(data_pre_test)

In [115]:
prediction_test = pd.DataFrame()
prediction_test['PassengerId'] = data_raw_test['PassengerId']
# prediction_test.concat([prediction_test, prediction_test], axis=1)
prediction_test = prediction_test.join(prediction_test_res)
prediction_test

Unnamed: 0,PassengerId,Transported
0,0013_01,True
1,0018_01,False
2,0019_01,True
3,0021_01,True
4,0023_01,True
...,...,...
4272,9266_02,True
4273,9269_01,False
4274,9271_01,True
4275,9273_01,True


In [118]:
prediction_test.to_csv('result.csv',index=False, sep=',')