In [1]:
import pandas as pd
import logging
logging.basicConfig(
    level=logging.INFO
)

# Load the Kaggle Titanic training and test sets, indexed by passenger id.
train_data = pd.read_csv('../input/titanic/train.csv', index_col='PassengerId')
X_predict = pd.read_csv('../input/titanic/test.csv', index_col='PassengerId')

# Separate the target from the features; drop() returns a new frame, so
# train_data itself keeps its 'Survived' column.
y = train_data['Survived']
X = train_data.drop(columns='Survived')
# The title is extracted from 'Name' later, inside the modelling pipeline
# (see NameTitleEncoder), rather than here.



In [2]:
# Preview the first five rows of the feature frame.
X.head(n=5)

Unnamed: 0_level_0,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
1,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
2,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
3,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
4,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
5,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [3]:
# Per-column overview of the training data: number of distinct values,
# number of missing values and the pandas dtype.
summary_columns = {
    'Unique': train_data.nunique(),
    'NaN': train_data.isna().sum(),
    'Dtypes': train_data.dtypes,
}
pd.DataFrame(summary_columns)

Unnamed: 0,Unique,NaN,Dtypes
Survived,2,0,int64
Pclass,3,0,int64
Name,891,0,object
Sex,2,0,object
Age,88,177,float64
SibSp,7,0,int64
Parch,7,0,int64
Ticket,681,0,object
Fare,248,0,float64
Cabin,147,687,object


# Plan for handling NaN and Categorical data.
### Numerical features (replace NaN with the median):
- Age
- Fare
### Categorical features (OneHot):
- Name: extract only the title (e.g. Mr, Mrs, Miss) and one-hot encode it
- Pclass
- Sex
- Embarked

In [4]:
from sklearn.pipeline import Pipeline
from sklearn.base import BaseEstimator
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split

# Hold out 20% of the labelled rows; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0,
)

# creating custom Class to handle Name column
class NameTitleEncoder(BaseEstimator):
    """Pipeline step that derives a ``Title`` column from the ``Name`` column.

    The title is the first run of non-whitespace characters that is directly
    followed by a dot, e.g. ``'Braund, Mr. Owen Harris'`` -> ``'Mr'``.
    Names without such a token yield NaN in ``Title``.

    The step is stateless: ``fit`` learns nothing from the data.
    """

    def __init__(self):
        pass

    def fit(self, X, y=None):
        """Do nothing and return ``self``; present so the class can be a pipeline step."""
        return self

    @staticmethod
    def transform(x_dataset: pd.DataFrame) -> pd.DataFrame:
        """Return a new frame equal to ``x_dataset`` plus a ``Title`` column.

        The input frame is NOT modified. Writing the column into the caller's
        frame would leak a ``Title`` column into every dataset passed through
        the pipeline (training and prediction frames alike) and can trigger
        pandas' chained-assignment warning when the input is a slice.

        Parameters
        ----------
        x_dataset : pd.DataFrame
            Frame containing a string column named ``'Name'``.

        Returns
        -------
        pd.DataFrame
            All original columns, with ``Title`` appended (or overwritten if
            it already exists).
        """
        # expand=False returns a Series for the single capture group.
        titles = x_dataset['Name'].str.extract(r'(\S+)\.', expand=False)
        return x_dataset.assign(Title=titles)


# Numerical features: missing values are replaced with the column median.
numerical_features = [
    'Age',
    'Fare',
]
numerical_transformer = SimpleImputer(strategy='median')

# Categorical features: missing values are replaced with the most frequent
# value, then one-hot encoded. Categories unseen during fit are ignored
# (encoded as all zeros) instead of raising at prediction time.
# 'Title' is produced by the NameTitleEncoder step of the pipeline below.
categorical_features = [
    'Title',
    'Pclass',
    'Sex',
    'Embarked',
]

categorical_transformer = Pipeline(steps=[
    ('simpleimputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
])

# Route each feature group to its transformer. No `remainder` is passed, so
# ColumnTransformer's default applies to the columns not listed above.
preprocessor = ColumnTransformer(
    transformers=[
        ('numeric', numerical_transformer, numerical_features),
        ('categorical', categorical_transformer, categorical_features)
    ]
)

# Define the model
model = XGBClassifier(n_estimators=1000, learning_rate=0.1, use_label_encoder=False)

# Bundle title extraction, preprocessing and the model in a single pipeline
clf = Pipeline(steps=[
    ('name_title_encoder', NameTitleEncoder()),
    ('preprocessor', preprocessor),
    ('model', model)
])

# Estimate generalisation first: fit on the training split only and report
# accuracy on the holdout split, which was otherwise created but never used.
clf.fit(X_train, y_train)
logging.info(f'Holdout accuracy: {clf.score(X_test, y_test):.4f}')

# Fit the final model on all labelled data for the submission
clf.fit(X, y)



Pipeline(steps=[('name_title_encoder', NameTitleEncoder()),
                ('preprocessor',
                 ColumnTransformer(transformers=[('numeric',
                                                  SimpleImputer(strategy='median'),
                                                  ['Age', 'Fare']),
                                                 ('categorical',
                                                  Pipeline(steps=[('simpleimputer',
                                                                   SimpleImputer(strategy='most_frequent')),
                                                                  ('onehot',
                                                                   OneHotEncoder(handle_unknown='ignore'))]),
                                                  ['Title', 'Pclass', 'Sex',
                                                   'Embarked'])])),
                ('m...
                               importance_type='gain',
                          

Predict the target values for `X_predict` and save them as a CSV file for submission.

In [5]:
from pathlib import Path

# Make sure the output directory exists (created with parents if needed).
path = '../output/titanic/'
Path(path).mkdir(parents=True, exist_ok=True)

# Predict survival for the passengers of the Kaggle test set.
survived = clf.predict(X_predict)

# One row per passenger: id taken from the frame index, plus the prediction.
output = pd.DataFrame({'PassengerId': X_predict.index, 'Survived': survived})

# Build the destination once so the file written and the log line agree.
submission_file = f'{path}titanic_xgboost.csv'
output.to_csv(submission_file, index=False)
logging.info(f'Submission saved to {submission_file}')

INFO:root:Submission saved to ../output/titanic/titanic_xgboost.csv


The submission scored only 0.73205. In the tutorial, a RandomForestClassifier scored 0.77511 (higher is better).
I can see where I could improve, but for now my goal was just to see how much I remembered from the XGBoost tutorial.