In [None]:
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
from xgboost import XGBClassifier, XGBRegressor

In [None]:
# Load the labelled training split from the Kaggle input directory
train_data = pd.read_csv("/kaggle/input/titanic/train.csv")
# Call head() -- the bare attribute would only display the bound-method repr
# instead of a preview of the first rows.
train_data.head()

In [None]:
# Read the test split from the Kaggle input directory and preview its first rows
test_csv_path = "/kaggle/input/titanic/test.csv"
test_data = pd.read_csv(test_csv_path)
test_data.head()

In [None]:
# Remove rows with missing target, separate target from predictors.
# New frames are derived instead of mutating `train_data` in place, so the raw
# frame keeps its "Survived" column and this cell can be re-run safely (the
# in-place version raised KeyError on a second run because the column was
# already gone).
labelled_rows = train_data.dropna(axis=0, subset=["Survived"])
y = labelled_rows["Survived"]
X_full = labelled_rows.drop(columns=["Survived"])

# Break off validation set from training data (80/20, fixed seed)
X_train, X_valid, y_train, y_valid = train_test_split(
    X_full, y,
    train_size=0.8, test_size=0.2,
    random_state=0,
)

# Select categorical columns with relatively low cardinality:
# object-typed columns with fewer than 10 distinct values in the training split
categorical_cols = [cname for cname in X_train.columns if
                    X_train[cname].nunique() < 10 and
                    X_train[cname].dtype == "object"]

# Select numerical columns (int64 / float64 only)
# NOTE(review): identifier-like numeric columns (presumably PassengerId) are
# selected as features here as well -- confirm that is intended.
numerical_cols = [cname for cname in X_train.columns if
                  X_train[cname].dtype in ["int64", "float64"]]

# Keep selected columns only; the same column list is applied to the
# training, validation and test frames so they stay aligned.
my_cols = categorical_cols + numerical_cols
X_train = X_train[my_cols].copy()
X_valid = X_valid[my_cols].copy()
X_test = test_data[my_cols].copy()

In [None]:
# Preprocessing for numerical data: fill missing values with the column mean
numerical_transformer = SimpleImputer(strategy="mean")

# Preprocessing for categorical data: fill missing values with the most
# frequent category, then one-hot encode (categories unseen during fit are
# ignored at transform time rather than raising)
categorical_transformer = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("onehot", OneHotEncoder(handle_unknown="ignore"))
])

# Bundle preprocessing for numerical and categorical data
preprocessor = ColumnTransformer(
    transformers=[
        ("num", numerical_transformer, numerical_cols),
        ("cat", categorical_transformer, categorical_cols)
    ])

# Define model.
# "Survived" is a binary label, so a classifier is used: its predict() yields
# class labels, whereas the regressor produced continuous scores.
model = XGBClassifier(n_estimators=1000,
                      learning_rate=0.05,
                      early_stopping_rounds=10)

# Bundle preprocessing and modeling code in a pipeline
my_pipeline = Pipeline(steps=[("preprocessor", preprocessor),
                              ("model", model)])

# The pipeline forwards eval_set to the model untouched, so the validation
# features must be pushed through the preprocessor by hand; otherwise the
# model would be evaluated on raw columns that do not match the transformed
# training matrix. The preprocessor is fitted on the training split only.
X_valid_prepared = preprocessor.fit(X_train).transform(X_valid)

# Preprocessing of training data, fit model.
# Fit parameters must carry the "<step name>__" prefix to reach the model
# step; un-prefixed keyword arguments are rejected by Pipeline.fit.
my_pipeline.fit(X_train, y_train,
                model__eval_set=[(X_valid_prepared, y_valid)],
                model__verbose=False)

predictions = my_pipeline.predict(X_test)

In [None]:
# Pair each test passenger id with its prediction and write the result as CSV
submission_path = 'submission.csv'
output = test_data[["PassengerId"]].assign(Survived=predictions)
output.to_csv(submission_path, index=False)
print("Your submission was successfully saved!")