In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from xgboost import XGBRegressor

In [2]:
import os
train_data = pd.read_csv("/kaggle/input/titanic/train.csv")
train_data.head

<bound method NDFrame.head of      PassengerId  Survived  Pclass  \
0              1         0       3   
1              2         1       1   
2              3         1       3   
3              4         1       1   
4              5         0       3   
..           ...       ...     ...   
886          887         0       2   
887          888         1       1   
888          889         0       3   
889          890         1       1   
890          891         0       3   

                                                  Name     Sex   Age  SibSp  \
0                              Braund, Mr. Owen Harris    male  22.0      1   
1    Cumings, Mrs. John Bradley (Florence Briggs Th...  female  38.0      1   
2                               Heikkinen, Miss. Laina  female  26.0      0   
3         Futrelle, Mrs. Jacques Heath (Lily May Peel)  female  35.0      1   
4                             Allen, Mr. William Henry    male  35.0      0   
..                                     

In [3]:
test_data = pd.read_csv("/kaggle/input/titanic/test.csv")
test_data.head()

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S


In [4]:
# Remove rows with missing target, separate target from predictors
train_data.dropna(axis=0, subset=["Survived"], inplace=True)
y = train_data.Survived
train_data.drop(["Survived"], axis=1, inplace=True)

# Break off validation set from training data
X_train, X_valid, y_train, y_valid = train_test_split(train_data, y,
                                                        train_size=0.8, test_size=0.2,
                                                        random_state=0)

# Select categorical columns with relatively low cardinality
categorical_cols = [cname for cname in X_train.columns if
                     X_train[cname].nunique() < 10 and
                     X_train[cname].dtype == "object"]

# Select numerical columns
numerical_cols = [cname for cname in X_train.columns if
                  X_train[cname].dtype in ["int64", "float64"]]

# Keep selected columns only
my_cols = categorical_cols + numerical_cols
X_train = X_train[my_cols].copy()
X_valid = X_valid[my_cols].copy()
X_test = test_data[my_cols].copy()

In [5]:
# Preprocessing for numerical data
numerical_transformer = SimpleImputer(strategy="mean")

# Preprocessing for categorical data
categorical_transformer = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("onehot", OneHotEncoder(handle_unknown="ignore"))
])

# Bundle preprocessing for numerical and categorical data
preprocessor = ColumnTransformer(
    transformers=[
        ("num", numerical_transformer, numerical_cols),
        ("cat", categorical_transformer, categorical_cols)
    ])


# Make a copy to avoid changing original data
X_train_eval = X_train.copy()
X_valid_eval=X_valid.copy()
X_test_eval = X_test.copy()
# Bundle preprocessing only in a temp pipeline
eval_set_pipe = Pipeline(steps = [('preprocessor', preprocessor)])
# fit transform X_valid.copy()
X_train_eval = eval_set_pipe.fit(X_train).transform (X_train_eval)
X_valid_eval = eval_set_pipe.fit(X_train).transform (X_valid_eval)
X_test_eval = eval_set_pipe.fit(X_train).transform (X_test_eval)

# Define model
model = XGBRegressor(n_estimators = 1000,
                     learning_rate = 0.05,
                     early_stopping_rounds = 10)


# Preprocessing of training data, fit model
model.fit(X_train_eval, y_train,
                eval_set=[(X_valid_eval, y_valid)],
                verbose = False)

predictions = model.predict(X_test_eval)

In [6]:
output = pd.DataFrame({'PassengerId': test_data.PassengerId, 'Survived': predictions})
output.to_csv('submission.csv', index=False)
print("Your submission was successfully saved!")

Your submission was successfully saved!
