In [14]:
import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.ensemble import HistGradientBoostingClassifier

In [15]:
original_train = pd.read_csv("train.csv").drop("Name", axis=1)
original_test = pd.read_csv("test.csv").drop("Name", axis=1)

train = original_train.copy()
test = original_test.copy()

le = LabelEncoder()
train["PassengerId"] = le.fit_transform(train["PassengerId"])
train["HomePlanet"] = le.fit_transform(train["HomePlanet"])
train["Cabin"] = le.fit_transform(train["Cabin"])
train["Destination"] = le.fit_transform(train["Destination"])

test["PassengerId"] = le.fit_transform(test["PassengerId"])
test["HomePlanet"] = le.fit_transform(test["HomePlanet"])
test["Cabin"] = le.fit_transform(test["Cabin"])
test["Destination"] = le.fit_transform(test["Destination"])
train.head()


Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Transported
0,0,1,False,149,2,39.0,False,0.0,0.0,0.0,0.0,0.0,False
1,1,0,False,2184,2,24.0,False,109.0,9.0,25.0,549.0,44.0,True
2,2,1,False,1,2,58.0,True,43.0,3576.0,0.0,6715.0,49.0,False
3,3,1,False,1,2,33.0,False,0.0,1283.0,371.0,3329.0,193.0,False
4,4,0,False,2186,2,16.0,False,303.0,70.0,151.0,565.0,2.0,True


In [5]:
from sklearn.experimental import enable_hist_gradient_boosting
from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn.model_selection import cross_val_score
from bayes_opt import BayesianOptimization


# Define the objective function to be maximized
def hgb_cv(learning_rate, max_depth, max_leaf_nodes, min_samples_leaf):
    # Define the HistGradientBoostingClassifier with the given hyperparameters
    clf = HistGradientBoostingClassifier(learning_rate=learning_rate,
                                          max_depth=int(max_depth),
                                          max_leaf_nodes=int(max_leaf_nodes),
                                          min_samples_leaf=int(min_samples_leaf),
                                          random_state=42)

    # Calculate cross-validation scores for the classifier
    scores = cross_val_score(clf, train.drop("Transported", axis=1), train["Transported"], cv=5, scoring='accuracy')

    # Return the mean cross-validation score
    return scores.mean()

# Define the hyperparameters and their respective ranges for Bayesian optimization
pbounds = {'learning_rate': (0.001, 0.1),
           'max_depth': (2, 10),
           'max_leaf_nodes': (2, 100),
           'min_samples_leaf': (1, 10)}

# Initialize the Bayesian optimizer with the objective function and hyperparameter ranges
optimizer = BayesianOptimization(f=hgb_cv, pbounds=pbounds, random_state=42)

# Run the Bayesian optimizer for 10 iterations
optimizer.maximize(init_points=5, n_iter=5)

# Print the optimal hyperparameters and corresponding score
print(optimizer.max)


|   iter    |  target   | learni... | max_depth | max_le... | min_sa... |
-------------------------------------------------------------------------
| [0m1        [0m | [0m0.7168   [0m | [0m0.03808  [0m | [0m9.606    [0m | [0m73.74    [0m | [0m6.388    [0m |
| [95m2        [0m | [95m0.7789   [0m | [95m0.01645  [0m | [95m3.248    [0m | [95m7.692    [0m | [95m8.796    [0m |
| [0m3        [0m | [0m0.7279   [0m | [0m0.06051  [0m | [0m7.665    [0m | [0m4.017    [0m | [0m9.729    [0m |
| [0m4        [0m | [0m0.6838   [0m | [0m0.08341  [0m | [0m3.699    [0m | [0m19.82    [0m | [0m2.651    [0m |
| [0m5        [0m | [0m0.7208   [0m | [0m0.03112  [0m | [0m6.198    [0m | [0m44.33    [0m | [0m3.621    [0m |
| [0m6        [0m | [0m0.7316   [0m | [0m0.07732  [0m | [0m2.853    [0m | [0m9.364    [0m | [0m7.554    [0m |
| [95m7        [0m | [95m0.7856   [0m | [95m0.01169  [0m | [95m4.06     [0m | [95m7.657    [0m | [95m9.

In [6]:

X = train.drop("Transported", axis=1)
y = train["Transported"]

X_train, X_test, y_train, y_test = train_test_split(X, y)



In [7]:
model = HistGradientBoostingClassifier(learning_rate=0.01169, max_depth=4.06, max_leaf_nodes=7.657, min_samples_leaf=9.266)
model.fit(X_train, y_train)

In [8]:
y_pred = model.predict(X_test)

In [9]:
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))

[[838 240]
 [253 843]]
              precision    recall  f1-score   support

       False       0.77      0.78      0.77      1078
        True       0.78      0.77      0.77      1096

    accuracy                           0.77      2174
   macro avg       0.77      0.77      0.77      2174
weighted avg       0.77      0.77      0.77      2174



In [10]:
y_pred = model.predict(test)

In [11]:
df_out = test["PassengerId"]
df_out = pd.DataFrame({
    "PassengerId": original_test["PassengerId"],
    "Transported": y_pred
    })
print(df_out.head())
df_out.to_csv("sample_submission.csv", index=False)

  PassengerId  Transported
0     0013_01        False
1     0018_01        False
2     0019_01         True
3     0021_01         True
4     0023_01        False
