# Titanic Survival Prediction using Gradient Boosting  

In [1]:
# This notebook demonstrates how to use Gradient Boosting to predict survival on the Titanic dataset.

import pandas as pd
import numpy as np
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import train_test_split


# Load the dataset

In [2]:

train_data = pd.read_csv('train.csv')
test_data = pd.read_csv('test.csv')


In [4]:
train_data.shape


(891, 12)

In [5]:
test_data.shape

(418, 11)

In [7]:
# Check the first few rows of the training data
train_data.head()


Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [8]:
# Check the first few rows of the test data
test_data.head()

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S


# Preprocess the data

In [9]:

 
 # for indexing use the passenger id
train_data.set_index('PassengerId', inplace=True)

    

In [10]:
test_data.set_index('PassengerId', inplace=True)

In [11]:
y_train = train_data['Survived']


In [12]:
# Drop the 'Survived' column from the training data
X_train = train_data.drop(columns=['Survived'])


In [13]:
# append the test data to the training data
X_test = test_data.copy()


In [16]:
train_test = train_data._append(X_test)

 Combine the training and test data for preprocessing

In [17]:
print(train_test.shape)

(1309, 11)


In [18]:
train_test.head()

Unnamed: 0_level_0,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
1,0.0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
2,1.0,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
3,1.0,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
4,1.0,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
5,0.0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [21]:
train_test.value_counts("Survived")

Survived
0.0    549
1.0    342
Name: count, dtype: int64

In [22]:
# delete columns that are not used as features for training and prediction
columns_to_drop = ["Name", "Age", "SibSp", "Ticket", "Cabin", "Parch", "Embarked"]
train_test.drop(labels=columns_to_drop, axis=1, inplace=True)

In [24]:
# Convert categorical columns to numerical using one-hot encoding

dummmies__train_test = pd.get_dummies(train_test, columns=["Sex"], drop_first=True)
# drop_first=True will drop the first category to avoid multicollinearity

In [25]:
dummmies__train_test.shape

(1309, 4)

In [26]:
dummmies__train_test.head()

Unnamed: 0_level_0,Survived,Pclass,Fare,Sex_male
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,0.0,3,7.25,True
2,1.0,1,71.2833,False
3,1.0,3,7.925,False
4,1.0,1,53.1,False
5,0.0,3,8.05,True


In [29]:
dummmies__train_test.fillna(value=0.0, inplace=True)

In [31]:
# generate feature sets (X)
X_train = dummmies__train_test.values[0:891]
X_test = dummmies__train_test.values[891:]#replace the empty value in sex_

In [32]:
# to check the shape of the data
print(X_train.shape)
print(X_test.shape)


(891, 4)
(418, 4)


In [46]:
# transform data
from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler()
X_train_scale = scaler.fit_transform(X_train)
X_test_scale = scaler.transform(X_test)

 min-max scaling is used to scale the data to a range of [0, 1]. This is useful for algorithms that are sensitive to the scale of the data, such as gradient boosting.

In [51]:
# split training feature and target sets into training and validation subsets
from sklearn.model_selection import train_test_split

X_train_sub, X_validation_sub, y_train_sub, y_validation_sub = train_test_split(X_train_scale, y_train, random_state=42)


In [52]:
# import machine learning algorithms
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import classification_report, confusion_matrix, roc_curve, auc

In [53]:

learning_rates = [0.05, 0.1, 0.25, 0.5, 0.75, 1]
for learning_rate in learning_rates:
    gb = GradientBoostingClassifier(n_estimators=20, learning_rate = learning_rate, max_features=2, max_depth = 2, random_state = 0)
    gb.fit(X_train_sub, y_train_sub)
    print("Learning rate: ", learning_rate)
    print("Accuracy score (training): {0:.3f}".format(gb.score(X_train_sub, y_train_sub)))
    print("Accuracy score (validation): {0:.3f}".format(gb.score(X_validation_sub, y_validation_sub)))
    print()

Learning rate:  0.05
Accuracy score (training): 1.000
Accuracy score (validation): 1.000

Learning rate:  0.1
Accuracy score (training): 1.000
Accuracy score (validation): 1.000

Learning rate:  0.25
Accuracy score (training): 1.000
Accuracy score (validation): 1.000

Learning rate:  0.5
Accuracy score (training): 1.000
Accuracy score (validation): 1.000

Learning rate:  0.75
Accuracy score (training): 1.000
Accuracy score (validation): 1.000

Learning rate:  1
Accuracy score (training): 1.000
Accuracy score (validation): 1.000



In [54]:
# Output confusion matrix and classification report of Gradient Boosting algorithm on validation set

gb = GradientBoostingClassifier(n_estimators=20, learning_rate = 0.5, max_features=2, max_depth = 2, random_state = 0)
gb.fit(X_train_sub, y_train_sub)
predictions = gb.predict(X_validation_sub)

print("Confusion Matrix:")
print(confusion_matrix(y_validation_sub, predictions))
print()
print("Classification Report")
print(classification_report(y_validation_sub, predictions))

Confusion Matrix:
[[134   0]
 [  0  89]]

Classification Report
              precision    recall  f1-score   support

           0       1.00      1.00      1.00       134
           1       1.00      1.00      1.00        89

    accuracy                           1.00       223
   macro avg       1.00      1.00      1.00       223
weighted avg       1.00      1.00      1.00       223



Thanks for reading this notebook., as data is small so it is overfitting, but you can try with larger datasets and more complex models.
it was just for ur reading and understanding purpose.