In [None]:
#!pip install h2o
# Please note that h2o requires Java to be installed.

In [None]:
import h2o
from h2o.estimators.random_forest import H2ORandomForestEstimator
import pandas as pd
# Initialize H2O: start (or connect to) the H2O cluster.
# This must run before any H2OFrame is created in the cells below.
h2o.init()

In [None]:
# Read the training and test sets from CSV files (paths are relative to the
# notebook's working directory).
train_data, test_data = (
    pd.read_csv(csv_path) for csv_path in ("train.csv", "test.csv")
)


In [None]:
# Remove columns that are not used as model inputs
train_data = train_data.drop(columns=["PassengerId", "Name"])
test_data = test_data.drop(columns=["PassengerId", "Name"])

# Convert data to H2OFrame
train_data = h2o.H2OFrame(train_data)
test_data = h2o.H2OFrame(test_data)

# Define the features and target
target = "Transported"

# Force the response column to categorical (enum) so H2O trains a classifier
# regardless of how its parser happened to type the column on upload.
train_data[target] = train_data[target].asfactor()

# Every training column except the response is used as a predictor.
features = [column for column in train_data.columns if column != target]

In [None]:
# Hold out 20% of the training rows BEFORE fitting, so the validation score is
# measured on rows the evaluated model has never seen. (Scoring a model on a
# split of the same frame it was trained on gives an inflated accuracy.)
train_split, val_data = train_data.split_frame(ratios=[0.8], seed=42)

# Validate: fit a model on the 80% split only and score it on the 20% holdout
val_classifier = H2ORandomForestEstimator(seed=42)
val_classifier.train(x=features, y=target, training_frame=train_split)
val_pred = val_classifier.predict(val_data)

# The comparison yields a single 0/1 column; select it by position with .iloc
# instead of relying on deprecated integer-key fallback on a labelled Series.
matches = (val_pred["predict"] == val_data[target]).as_data_frame()
accuracy = matches.iloc[:, 0].mean()
print(f"Validation accuracy: {accuracy:.2f}")

# Create and train the final H2O Random Forest Classifier on all training rows
rf_classifier = H2ORandomForestEstimator(seed=42)
rf_classifier.train(x=features, y=target, training_frame=train_data)

# Predict the 'Transported' column for the test data
test_data["Transported"] = rf_classifier.predict(test_data)["predict"]

In [None]:
# Pull the predicted labels out of the H2O frame into pandas
predicted_transported = test_data["Transported"].as_data_frame()

# Load the sample submission, set its 'Transported' column to the predictions,
# and write the result without the pandas index.
submission = pd.read_csv("sample_submission.csv")
submission["Transported"] = predicted_transported
submission.to_csv("submission.csv", index=False)

# Shut down the H2O cluster now that the predictions have been saved
h2o.cluster().shutdown()