In [48]:
import numpy as np
import pandas as pd

In [49]:
# Read the Kaggle Titanic training data and print a per-column summary
# (dtype and non-null count) to see which columns contain missing values.
df = pd.read_csv('titanic_train.csv')
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


In [50]:
# Separate the target from the predictors: `y` is the 'Survived' label,
# `X` is every remaining column.
y = df['Survived']
X = df.drop(columns=['Survived'])
X.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 11 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Pclass       891 non-null    int64  
 2   Name         891 non-null    object 
 3   Sex          891 non-null    object 
 4   Age          714 non-null    float64
 5   SibSp        891 non-null    int64  
 6   Parch        891 non-null    int64  
 7   Ticket       891 non-null    object 
 8   Fare         891 non-null    float64
 9   Cabin        204 non-null    object 
 10  Embarked     889 non-null    object 
dtypes: float64(2), int64(4), object(5)
memory usage: 76.7+ KB


In [51]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows as a test set; the fixed seed keeps the split
# reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=31,
)

In [52]:
# Class balance of the target, expressed as fractions of all rows.
y.value_counts(normalize=True)

0    0.616162
1    0.383838
Name: Survived, dtype: float64

In [53]:
# Columns routed to the numeric preprocessing branch (impute, then scale).
num_features = [
    'Age',
    'SibSp',
    'Fare',
    'Parch',
]

# Columns routed to the categorical branch (impute, then one-hot encode).
cat_features = [
    'Sex',
    'Embarked',
    'Pclass',
]

In [54]:
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder


In [55]:
# Numeric branch: fill missing values, then standardise each column.
# The imputer is left at its default strategy here; the grid search further
# down overrides it through the 'num_imputer' step name.
num_pipeline = Pipeline(steps=[
    ('num_imputer', SimpleImputer()),
    ('scaler', StandardScaler()),
])

In [56]:
# Categorical branch: replace missing values with the column's most frequent
# value, then one-hot encode.
#
# handle_unknown='ignore' makes the encoder emit an all-zero row for a category
# it did not see during fit. With the default ('error'), a category that is
# missing from a cross-validation training fold, or that first appears in data
# passed to predict(), would raise a ValueError. Categories that were seen
# during fit are encoded exactly as before.
cat_pipeline = Pipeline(steps=[
    ('cat_imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
])

In [57]:
from sklearn.compose import ColumnTransformer

# Route each column group through its own pipeline. Columns named in neither
# list are not forwarded to the model (ColumnTransformer's default is
# remainder='drop').
preprocessor = ColumnTransformer(transformers=[
    ('num_pipeline', num_pipeline, num_features),
    ('cat_pipeline', cat_pipeline, cat_features),
])

In [58]:
from sklearn.tree import DecisionTreeClassifier

# Full model: preprocessing followed by a decision tree, so the same
# transformations are applied at fit time and at predict time.
#
# random_state is fixed because the tree breaks ties between equally good
# splits at random; without a seed, repeated runs can select different
# hyperparameters and give different scores. 31 matches the seed used for
# the train/test split.
tree_final_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('tree_clf', DecisionTreeClassifier(random_state=31)),
])

In [59]:
from sklearn.model_selection import GridSearchCV

# Hyperparameter values to evaluate.
# IMPORTANT: each key is the full path to the parameter, i.e. every step name
# along the way joined by two underscores, followed by the parameter name.
# Because preprocessing lives in the same pipeline, its parameters (here the
# numeric imputer's strategy) can be searched together with the tree's.
param_grid = [
    {
        'preprocessor__num_pipeline__num_imputer__strategy': ['mean', 'median'],
        'tree_clf__criterion': ['gini', 'entropy'],
        'tree_clf__max_depth': [3, 4, 5, 6, 7],
    }
]

# 5-fold cross-validated search ranked by accuracy; training-fold scores are
# recorded as well.
grid_search = GridSearchCV(
    estimator=tree_final_pipeline,
    param_grid=param_grid,
    scoring='accuracy',
    cv=5,
    return_train_score=True,
)

# Fit every candidate on the training split, running the full pipeline each time.
grid_search.fit(X_train, y_train)


GridSearchCV(cv=5, error_score=nan,
             estimator=Pipeline(memory=None,
                                steps=[('preprocessor',
                                        ColumnTransformer(n_jobs=None,
                                                          remainder='drop',
                                                          sparse_threshold=0.3,
                                                          transformer_weights=None,
                                                          transformers=[('num_pipeline',
                                                                         Pipeline(memory=None,
                                                                                  steps=[('num_imputer',
                                                                                          SimpleImputer(add_indicator=False,
                                                                                                        copy=True,
                       

In [60]:
# Hyperparameter combination selected by the search (scored on accuracy).
grid_search.best_params_

{'preprocessor__num_pipeline__num_imputer__strategy': 'median',
 'tree_clf__criterion': 'entropy',
 'tree_clf__max_depth': 6}

In [61]:


# Despite the name, this is the whole fitted pipeline (preprocessing + tree)
# configured with the best parameters found by the search.
tree_clf_best = grid_search.best_estimator_
tree_clf_best



Pipeline(memory=None,
         steps=[('preprocessor',
                 ColumnTransformer(n_jobs=None, remainder='drop',
                                   sparse_threshold=0.3,
                                   transformer_weights=None,
                                   transformers=[('num_pipeline',
                                                  Pipeline(memory=None,
                                                           steps=[('num_imputer',
                                                                   SimpleImputer(add_indicator=False,
                                                                                 copy=True,
                                                                                 fill_value=None,
                                                                                 missing_values=nan,
                                                                                 strategy='median',
                                                

In [62]:
# Predict on new data by calling predict on the fitted pipeline:
# the preprocessing steps are applied to the test set before the tree predicts.
y_pred = tree_clf_best.predict(X_test)

# Calculate accuracy, precision, recall and F1 score.
# Note: y_test is the ground truth for the testing set.
# NOTE(review): the original remark here said the test score is similar to the
# cross-validation score, but that score is never displayed in this notebook;
# compare against grid_search.best_score_ to confirm.
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score

print(f'Accuracy Score : {accuracy_score(y_test,y_pred)}')
print(f'Precision Score : {precision_score(y_test,y_pred)}')
print(f'Recall Score : {recall_score(y_test,y_pred)}')
print(f'F1 Score : {f1_score(y_test,y_pred)}')

Accuracy Score : 0.7653631284916201
Precision Score : 0.8958333333333334
Recall Score : 0.5375
F1 Score : 0.671875


In [65]:
# Score the Kaggle test file with the fitted pipeline; preprocessing is applied
# automatically as part of predict().
titanic_test = pd.read_csv('titanic_test.csv')
y_pred_titanic = tree_clf_best.predict(titanic_test)

# Pair each passenger id with its prediction for the Kaggle submission.
tree_full_pipeline_submit = pd.DataFrame(
    data={
        'PassengerId': titanic_test['PassengerId'],
        'Survived': y_pred_titanic,
    }
)

# Write the submission file without the index column.
tree_full_pipeline_submit.to_csv(
    'tree-full-pipeline-submit.csv',
    index=False,
)
print('csv saved! please submit the prediction csv to Kaggle.com')

csv saved! please submit the prediction csv to Kaggle.com
