# Kaggle Capstone Project (boosted decision tree ensemble)
Project located at https://www.kaggle.com/c/datasciencedojo-capstone/overview

In [1]:
import pandas as pd
# Read the training split from disk.
titanic = pd.read_csv(filepath_or_buffer='/Datasets/Titanic/train.csv')
# Only the training file carries the 'Survived' label, so it is converted to a
# categorical here instead of inside the shared cleaning function.
titanic['Survived'] = pd.Categorical(titanic['Survived'])
# Print column names, non-null counts and dtypes as a quick sanity check.
titanic.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype   
---  ------       --------------  -----   
 0   PassengerId  891 non-null    int64   
 1   Survived     891 non-null    category
 2   Pclass       891 non-null    int64   
 3   Name         891 non-null    object  
 4   Sex          891 non-null    object  
 5   Age          714 non-null    float64 
 6   SibSp        891 non-null    int64   
 7   Parch        891 non-null    int64   
 8   Ticket       891 non-null    object  
 9   Fare         891 non-null    float64 
 10  Cabin        204 non-null    object  
 11  Embarked     889 non-null    object  
dtypes: category(1), float64(2), int64(4), object(5)
memory usage: 77.7+ KB


In [2]:
# Function to clean both train and test data
def treatData(titanic):
    """Return a cleaned copy of a Titanic passenger frame.

    The same routine is applied to both the train and the test split.

    Steps, in order:
      1. Drop 'Name', 'Ticket', 'Cabin', 'Fare' and 'PassengerId'.
      2. Fill missing 'Age' values with the median age of this frame.
      3. Fill missing 'Embarked' values with the most frequent port of this frame.
      4. Convert 'Pclass', 'Embarked' and 'Sex' to categorical dtype.
      5. Print a `DataFrame.info()` summary of the result.

    Parameters
    ----------
    titanic : pandas.DataFrame
        Raw passenger data. Must contain the five dropped columns as well as
        'Age', 'Embarked', 'Pclass' and 'Sex'. Any other column (for example
        'Survived') is passed through unchanged. The caller's frame is not
        modified, because `drop` returns a new frame.

    Returns
    -------
    pandas.DataFrame
        The cleaned frame.

    Raises
    ------
    KeyError
        If one of the required columns is missing.
    """
    # `drop` returns a new frame, so every later assignment touches only the copy.
    titanic = titanic.drop(['Name', 'Ticket', 'Cabin', 'Fare', 'PassengerId'], axis=1)

    # Assign the filled column back rather than calling fillna(inplace=True) on
    # the selected column: the chained in-place form is deprecated and does
    # nothing once pandas Copy-on-Write is active, leaving the NaNs in place.
    titanic['Age'] = titanic['Age'].fillna(titanic['Age'].median())

    # value_counts() ignores NaN and sorts by frequency, so its first index
    # entry is the mode. Skip the fill when there is no non-null value at all
    # (e.g. an empty frame) instead of failing on index[0].
    embarked_counts = titanic['Embarked'].value_counts()
    if not embarked_counts.empty:
        titanic['Embarked'] = titanic['Embarked'].fillna(embarked_counts.index[0])

    titanic['Pclass'] = pd.Categorical(titanic['Pclass'])
    titanic['Embarked'] = pd.Categorical(titanic['Embarked'])
    titanic['Sex'] = pd.Categorical(titanic['Sex'])

    # Printed summary lets the reader confirm that no nulls remain.
    titanic.info()
    return titanic

In [3]:
# Load the test split and run it through the shared cleaning step.
# treatData works on a copy, so the raw frame stays available as `test_set`
# while the cleaned version is stored in `test_treated`.
test_set = pd.read_csv(filepath_or_buffer='/Datasets/Titanic/test.csv')
test_treated = treatData(titanic=test_set)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 6 columns):
 #   Column    Non-Null Count  Dtype   
---  ------    --------------  -----   
 0   Pclass    418 non-null    category
 1   Sex       418 non-null    category
 2   Age       418 non-null    float64 
 3   SibSp     418 non-null    int64   
 4   Parch     418 non-null    int64   
 5   Embarked  418 non-null    category
dtypes: category(3), float64(1), int64(2)
memory usage: 11.4 KB


In [4]:
# Clean the training split with the same routine used for the test split.
# treatData does not touch 'Survived', so the label column carries through.
titanic_treated = treatData(titanic=titanic)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype   
---  ------    --------------  -----   
 0   Survived  891 non-null    category
 1   Pclass    891 non-null    category
 2   Sex       891 non-null    category
 3   Age       891 non-null    float64 
 4   SibSp     891 non-null    int64   
 5   Parch     891 non-null    int64   
 6   Embarked  891 non-null    category
dtypes: category(4), float64(1), int64(2)
memory usage: 24.9 KB


In [5]:
# Model inputs for the boosting ensemble:
#   titanic_survived_Y - the 'Survived' label column
#   titanic_X          - every other column, passed through pd.get_dummies
titanic_survived_Y = titanic_treated.loc[:, 'Survived']
titanic_X = pd.get_dummies(titanic_treated.drop(columns=['Survived']))

In [6]:
from sklearn.ensemble import GradientBoostingClassifier
# Gradient-boosted tree ensemble trained with exponential loss.
# random_state is pinned so that Restart Kernel -> Run All rebuilds the same
# model; without it the estimator draws from the global NumPy random state.
titanic_gbt_clf = GradientBoostingClassifier(loss='exponential', learning_rate=0.1,
                                             n_estimators=100, subsample=1.0, min_samples_split=2,
                                             min_samples_leaf=1, max_depth=3,
                                             random_state=42)
# fit() returns the fitted estimator itself, so the name keeps pointing at the same object.
titanic_gbt_clf = titanic_gbt_clf.fit(titanic_X, titanic_survived_Y)

In [7]:
# Predicting 'Survived' using current model
# One-hot encode the cleaned test features and align them to the columns the
# model was fitted on. If a category level is absent from one of the splits,
# get_dummies yields a different column set; reindexing adds any missing
# training column as all-zero, drops extras, and fixes the column order.
test_X = pd.get_dummies(test_treated).reindex(columns=titanic_X.columns, fill_value=0)
titanic_gbt_pred = titanic_gbt_clf.predict(test_X)

In [8]:
# Export csv file to submit to Kaggle
# Pair each prediction with the passenger's id from the raw test frame
# (treatData dropped 'PassengerId' only from its cleaned copy).
# NOTE(review): assumes the competition expects the usual
# 'PassengerId,Survived' layout - confirm against its sample submission.
prediction = pd.DataFrame({'PassengerId': test_set['PassengerId'].to_numpy(),
                           'Survived': titanic_gbt_pred})
# index=False keeps the 0..n-1 row index out of the file. to_csv returns None
# when given a path, so it is called separately and `prediction` keeps the frame.
prediction.to_csv('prediction.csv', index=False)