In [27]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [28]:
# Read the Titanic train/test CSV files (relative paths) into two DataFrames.
train, test = (
    pd.read_csv(csv_path)
    for csv_path in ("titanic_train.csv", "titanic_test.csv")
)

In [29]:
# Preview the first five rows of the training frame.
train.head(n=5)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [None]:
# Print a structural summary of each frame.
# DataFrame.info() writes its report to stdout and returns None, so the calls are issued
# as separate statements; wrapping them in a tuple makes the cell echo a stray
# `(None, None)` after the two reports.
train.info()
test.info()

In [31]:
# Promote the "PassengerId" column to the row index of both frames.
train = train.set_index("PassengerId")
test = test.set_index("PassengerId")

In [32]:
# Size of the training frame as (n_rows, n_columns).
len(train.index), len(train.columns)

(891, 11)

In [33]:
# Training target (y_train): the "Survived" column of the training frame.
y_train = train.loc[:, "Survived"]

In [34]:
# Remove the target column "Survived" so the training frame holds features only.
train = train.drop(columns=["Survived"])

In [35]:
# (n_rows, n_columns) of the train frame followed by that of the test frame.
tuple(frame.shape for frame in (train, test))

((891, 10), (418, 10))

In [36]:
# Join train and test sets to form a new train_test set: the train rows come first,
# followed by the test rows, each keeping its existing index label.
# DataFrame.append was deprecated in pandas 1.4 and removed in pandas 2.0;
# pd.concat is the supported way to stack frames row-wise.
train_test = pd.concat([train, test])

In [37]:
# Discard the columns that are not used as features for training and prediction.
columns_to_drop = ["Name", "Age", "SibSp", "Ticket", "Cabin", "Parch", "Embarked"]
train_test = train_test.drop(columns=columns_to_drop)

In [38]:
# Preview the first five rows of the combined feature frame.
train_test.head(n=5)

Unnamed: 0_level_0,Pclass,Sex,Fare
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,3,male,7.25
2,1,female,71.2833
3,3,female,7.925
4,1,female,53.1
5,3,male,8.05


In [39]:
# One-hot encode the "Sex" column (pandas.get_dummies); the other columns pass through unchanged.
train_test_dummies = pd.get_dummies(data=train_test, columns=["Sex"])

In [40]:
# Preview the first five rows of the encoded feature frame.
train_test_dummies.head(n=5)

Unnamed: 0_level_0,Pclass,Fare,Sex_female,Sex_male
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,3,7.25,0,1
2,1,71.2833,1,0
3,3,7.925,1,0
4,1,53.1,1,0
5,3,8.05,0,1


In [41]:
# Dimension check: (n_rows, n_columns) of the encoded feature frame.
len(train_test_dummies.index), len(train_test_dummies.columns)

(1309, 4)

In [42]:
# Number of missing values in each column.
train_test_dummies.isna().sum(axis=0)

Pclass        0
Fare          1
Sex_female    0
Sex_male      0
dtype: int64

In [43]:
# Substitute 0.0 for every missing value in the feature frame.
train_test_dummies = train_test_dummies.fillna(0.0)

In [44]:
# generate feature sets (X)
# train_test_dummies holds the training rows first and the test rows after them, so the
# split point is the number of training rows. Deriving it from `train` instead of the
# hard-coded 891 keeps the split correct if the input files change size.
n_train = len(train)
feature_matrix = train_test_dummies.values
X_train = feature_matrix[:n_train]
X_test = feature_matrix[n_train:]

In [45]:
X_train.shape, X_test.shape

((891, 4), (418, 4))

In [46]:
# transform data
from sklearn.preprocessing import MinMaxScaler

# Learn the per-feature min/max from the training features only, then apply that same
# mapping to the test features. Calling fit_transform on X_test would re-fit the scaler
# on the test set, so train and test would be scaled with different ranges and the
# model would receive test features on a different scale from the one it was trained on.
scaler = MinMaxScaler()
X_train_scale = scaler.fit_transform(X_train)
X_test_scale = scaler.transform(X_test)

In [49]:
# Partition the scaled training features and their targets into a training subset and
# a validation subset (default split proportions, fixed seed for repeatability).
from sklearn.model_selection import train_test_split

split_parts = train_test_split(X_train_scale, y_train, random_state=0)
X_train_sub, X_validation_sub, y_train_sub, y_validation_sub = split_parts

In [17]:
# import machine learning algorithms
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import classification_report, confusion_matrix, roc_curve, auc

In [18]:
# Sweep the learning rate of a small Gradient Boosting model and report, for each
# value, the accuracy on the training subset and on the validation subset.

learning_rates = [0.05, 0.1, 0.25, 0.5, 0.75, 1]
# Hyper-parameters held fixed across the sweep; only the learning rate varies.
fixed_gb_params = dict(n_estimators=20, max_features=2, max_depth=2, random_state=0)
for learning_rate in learning_rates:
    gb = GradientBoostingClassifier(learning_rate=learning_rate, **fixed_gb_params)
    gb.fit(X_train_sub, y_train_sub)
    print("Learning rate: ", learning_rate)
    training_accuracy = gb.score(X_train_sub, y_train_sub)
    print("Accuracy score (training): {0:.3f}".format(training_accuracy))
    validation_accuracy = gb.score(X_validation_sub, y_validation_sub)
    print("Accuracy score (validation): {0:.3f}".format(validation_accuracy))
    print()

Learning rate:  0.05
Accuracy score (training): 0.789
Accuracy score (validation): 0.780

Learning rate:  0.1
Accuracy score (training): 0.792
Accuracy score (validation): 0.780

Learning rate:  0.25
Accuracy score (training): 0.808
Accuracy score (validation): 0.807

Learning rate:  0.5
Accuracy score (training): 0.829
Accuracy score (validation): 0.830

Learning rate:  0.75
Accuracy score (training): 0.811
Accuracy score (validation): 0.780

Learning rate:  1
Accuracy score (training): 0.831
Accuracy score (validation): 0.780



In [19]:
# Refit Gradient Boosting with learning_rate=0.5 and evaluate it on the validation
# subset: print the confusion matrix, then the per-class classification report.

gb = GradientBoostingClassifier(
    n_estimators=20,
    learning_rate=0.5,
    max_features=2,
    max_depth=2,
    random_state=0,
).fit(X_train_sub, y_train_sub)
predictions = gb.predict(X_validation_sub)

validation_confusion = confusion_matrix(y_validation_sub, predictions)
validation_report = classification_report(y_validation_sub, predictions)

print("Confusion Matrix:")
print(validation_confusion)
print()
print("Classification Report")
print(validation_report)

Confusion Matrix:
[[130   9]
 [ 29  55]]

Classification Report
             precision    recall  f1-score   support

          0       0.82      0.94      0.87       139
          1       0.86      0.65      0.74        84

avg / total       0.83      0.83      0.82       223



In [47]:
from xgboost import XGBClassifier

In [48]:
# Create an XGBoost classifier with its default hyper-parameters.
xgb = XGBClassifier()

In [51]:
# Train the XGBoost classifier on the training subset.
xgb.fit(X=X_train_sub, y=y_train_sub)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0,
              learning_rate=0.1, max_delta_step=0, max_depth=3,
              min_child_weight=1, missing=None, n_estimators=100, n_jobs=1,
              nthread=None, objective='binary:logistic', random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
              silent=None, subsample=1, verbosity=1)

In [53]:
# Score the fitted XGBoost classifier on the validation subset.
xgb.score(X=X_validation_sub, y=y_validation_sub)

0.8026905829596412