In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from xgboost import XGBClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from bayes_opt import BayesianOptimization

df = pd.read_csv('spaceship_titanic.csv')

df = df.drop(['Cabin'], axis=1)
df = df.drop(['Destination'], axis=1)
df = df.drop(['PassengerId'], axis=1)
df = df.drop(['Name'], axis=1)
df = df.drop(['VIP'], axis=1)
df = df.drop(['HomePlanet'], axis=1)
df = df.drop(['Age'], axis=1)

df.RoomService.fillna(df.RoomService.mean(), inplace = True)
df.FoodCourt.fillna(df.FoodCourt.mean(), inplace = True)
df.ShoppingMall.fillna(df.ShoppingMall.mean(), inplace = True)
df.Spa.fillna(df.Spa.mean(), inplace = True)
df.VRDeck.fillna(df.VRDeck.mean(), inplace = True)
df.CryoSleep.fillna(False, inplace = True)

df["Transported"].replace(True, 1, inplace=True)
df["Transported"].replace(False, 0, inplace=True)

df["CryoSleep"].replace(True, 1, inplace=True)
df["CryoSleep"].replace(False, 0, inplace=True)

for colname in df:
    df[colname] = (df[colname] - min(df[colname])) / (max(df[colname]) - min(df[colname]))

TargetFun = df["Transported"]

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df.RoomService.fillna(df.RoomService.mean(), inplace = True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df.FoodCourt.fillna(df.FoodCourt.mean(), inplace = True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on whi

In [2]:
Ytrain, Ytest, Xtrain, Xtest = train_test_split(TargetFun, df.drop(["Transported"], axis=1), test_size=0.3, train_size=0.7)

## Случайный лес

In [3]:
param_grid_rf = {
    'n_estimators': [10, 50, 100],
    'max_depth': [None, 5, 10, 20],
    'min_samples_split': [2, 5, 10]
}

rf = RandomForestClassifier()
grid_search_rf = GridSearchCV(estimator=rf, param_grid=param_grid_rf, scoring='accuracy', cv=5)
grid_search_rf.fit(Xtrain, Ytrain)

print("Лучшие параметры RandomForest по решетке:", grid_search_rf.best_params_)

best_rf = grid_search_rf.best_estimator_
y_pred_rf = best_rf.predict(Xtest)
accuracy_rf = accuracy_score(Ytest, y_pred_rf)
print(f"Точность RandomForest на тестовых данных (Grid Search): {accuracy_rf:.4f}\n")


Лучшие параметры RandomForest по решетке: {'max_depth': 10, 'min_samples_split': 2, 'n_estimators': 50}
Точность RandomForest на тестовых данных (Grid Search): 0.7899



In [4]:
param_dist_rf = {
    'n_estimators': [10, 50, 100, 200],
    'max_depth': [None, 5, 10, 20],
    'min_samples_split': [2, 5, 10]
}
random_search_rf = RandomizedSearchCV(estimator=rf, param_distributions=param_dist_rf, 
                                      n_iter=10, scoring='accuracy', cv=5)
random_search_rf.fit(Xtrain, Ytrain)

print("Лучшие параметры RandomForest по случайному поиску:", random_search_rf.best_params_)

best_rf_random = random_search_rf.best_estimator_
y_pred_rf_random = best_rf_random.predict(Xtest)
accuracy_rf_random = accuracy_score(Ytest, y_pred_rf_random)
print(f"Точность RandomForest на тестовых данных (Random Search): {accuracy_rf_random:.4f}\n")


Лучшие параметры RandomForest по случайному поиску: {'n_estimators': 200, 'min_samples_split': 10, 'max_depth': 10}
Точность RandomForest на тестовых данных (Random Search): 0.7903



In [5]:
def rf_evaluate(n_estimators, max_depth, min_samples_split):
    model = RandomForestClassifier(n_estimators=int(n_estimators), max_depth=int(max_depth), 
                                   min_samples_split=int(min_samples_split))
    model.fit(Xtrain, Ytrain)
    pred = model.predict(Xtest)
    return accuracy_score(Ytest, pred)

pbounds_rf = {
    'n_estimators': (10, 200),
    'max_depth': (1, 20),
    'min_samples_split': (2, 10),
}

optimizer_rf = BayesianOptimization(
    f=rf_evaluate,
    pbounds=pbounds_rf,
    random_state=42,
)

optimizer_rf.maximize(init_points=5, n_iter=15)

print("Лучшие параметры RandomForest по байесовскому подходу:", optimizer_rf.max)

|   iter    |  target   | max_depth | min_sa... | n_esti... |
-------------------------------------------------------------
| [39m1        [39m | [39m0.7922   [39m | [39m8.116    [39m | [39m9.606    [39m | [39m149.1    [39m |
| [39m2        [39m | [39m0.7872   [39m | [39m12.37    [39m | [39m3.248    [39m | [39m39.64    [39m |
| [39m3        [39m | [39m0.74     [39m | [39m2.104    [39m | [39m8.929    [39m | [39m124.2    [39m |
| [39m4        [39m | [39m0.7872   [39m | [39m14.45    [39m | [39m2.165    [39m | [39m194.3    [39m |
| [39m5        [39m | [39m0.7857   [39m | [39m16.82    [39m | [39m3.699    [39m | [39m44.55    [39m |
| [39m6        [39m | [39m0.7895   [39m | [39m10.27    [39m | [39m8.266    [39m | [39m167.1    [39m |
| [35m7        [39m | [35m0.7926   [39m | [35m9.53     [39m | [35m8.761    [39m | [35m149.0    [39m |
| [39m8        [39m | [39m0.7841   [39m | [39m19.34    [39m | [39m9.988    [39m | [

## XGBoost

In [6]:
param_grid_xgb = {
    'n_estimators': [50, 100, 200],
    'max_depth': [3, 5, 7],
    'learning_rate': [0.01, 0.1, 0.2],
}
xgb = XGBClassifier()
grid_search_xgb = GridSearchCV(estimator=xgb, param_grid=param_grid_xgb, scoring='accuracy', cv=5)
grid_search_xgb.fit(Xtrain, Ytrain)

print("Лучшие параметры XGBoost по решетке:", grid_search_xgb.best_params_)

best_xgb_grid = grid_search_xgb.best_estimator_
y_pred_xgb_grid = best_xgb_grid.predict(Xtest)
accuracy_xgb_grid = accuracy_score(Ytest, y_pred_xgb_grid)
print(f"Точность XGBoost на тестовых данных (Grid Search): {accuracy_xgb_grid:.4f}\n")

Лучшие параметры XGBoost по решетке: {'learning_rate': 0.2, 'max_depth': 3, 'n_estimators': 50}
Точность XGBoost на тестовых данных (Grid Search): 0.7910



In [7]:
param_dist_xgb = {
    'n_estimators': [50, 100, 200],
    'max_depth': [3, 5, 7],
    'learning_rate': [0.01, 0.1, 0.2],
}
random_search_xgb = RandomizedSearchCV(estimator=xgb, param_distributions=param_dist_xgb, 
                                       n_iter=10, scoring='accuracy', cv=5)
random_search_xgb.fit(Xtrain, Ytrain)

print("Лучшие параметры XGBoost по случайному поиску:", random_search_xgb.best_params_)

best_xgb_random = random_search_xgb.best_estimator_
y_pred_xgb_random = best_xgb_random.predict(Xtest)
accuracy_xgb_random = accuracy_score(Ytest, y_pred_xgb_random)
print(f"Точность XGBoost на тестовых данных (Random Search): {accuracy_xgb_random:.4f}\n")

Лучшие параметры XGBoost по случайному поиску: {'n_estimators': 50, 'max_depth': 3, 'learning_rate': 0.2}
Точность XGBoost на тестовых данных (Random Search): 0.7910



In [8]:
def xgb_evaluate(n_estimators, max_depth, learning_rate):
    model = XGBClassifier(n_estimators=int(n_estimators), max_depth=int(max_depth), learning_rate=learning_rate)
    model.fit(Xtrain, Ytrain)
    pred = model.predict(Xtest)
    return accuracy_score(Ytest, pred)

pbounds_xgb = {
    'n_estimators': (50, 200),
    'max_depth': (3, 10),
    'learning_rate': (0.01, 0.2),
}

optimizer_xgb = BayesianOptimization(
    f=xgb_evaluate,
    pbounds=pbounds_xgb,
    random_state=42,
)

optimizer_xgb.maximize(init_points=5, n_iter=15)

print("Лучшие параметры XGBoost по байесовскому подходу:", optimizer_xgb.max)

|   iter    |  target   | learni... | max_depth | n_esti... |
-------------------------------------------------------------
| [39m1        [39m | [39m0.7887   [39m | [39m0.08116  [39m | [39m9.655    [39m | [39m159.8    [39m |
| [35m2        [39m | [35m0.791    [39m | [35m0.1237   [39m | [35m4.092    [39m | [35m73.4     [39m |
| [39m3        [39m | [39m0.7899   [39m | [39m0.02104  [39m | [39m9.063    [39m | [39m140.2    [39m |
| [39m4        [39m | [39m0.7891   [39m | [39m0.1445   [39m | [39m3.144    [39m | [39m195.5    [39m |
| [35m5        [39m | [35m0.7926   [39m | [35m0.1682   [39m | [35m4.486    [39m | [35m77.27    [39m |
| [39m6        [39m | [39m0.7891   [39m | [39m0.1196   [39m | [39m7.602    [39m | [39m79.19    [39m |
| [39m7        [39m | [39m0.7903   [39m | [39m0.1481   [39m | [39m4.522    [39m | [39m77.31    [39m |
| [35m8        [39m | [35m0.7937   [39m | [35m0.09546  [39m | [35m4.523    [39m | [