In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Plotting
import seaborn as sns
from matplotlib import pyplot as plt
from matplotlib import style

# Sklearn Algorithms
from sklearn import linear_model
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import Perceptron
from sklearn.linear_model import SGDClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC, LinearSVC
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score

#Filter unnecessary warnings
# NOTE(review): with no category or module argument this silences every
# warning for the rest of the notebook, including deprecation warnings raised
# by the libraries used below.
import warnings
warnings.filterwarnings('ignore')

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory
import os
# Print the full path of every file found below the Kaggle input directory
input_paths = (
    os.path.join(root, name)
    for root, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Seed NumPy's global random generator so the random draws made later in the
# notebook are repeatable from run to run
SEED = 10
np.random.seed(SEED)

In [None]:
# Read the competition's training and test CSV files into DataFrames
TRAIN_CSV = "/kaggle/input/titanic/train.csv"
TEST_CSV = "/kaggle/input/titanic/test.csv"
titanic, titanic_test = (pd.read_csv(csv_path) for csv_path in (TRAIN_CSV, TEST_CSV))

In [None]:
# Report (rows, columns) for the training and test frames
train_shape = titanic.shape
test_shape = titanic_test.shape
print(f"Shape of training data : {train_shape}")
print(f"Shape of test data : {test_shape}")

# **Exploratory Data Analysis**

In [None]:
# Preview the first rows of the training data
titanic.head()

In [None]:
# Column names, non-null counts and dtypes of the training data
titanic.info()


The training-set has 891 training examples and 11 features + the target variable (survived). 2 of the features are floats, 5 are integers and 5 are objects.

In [None]:
# Summary statistics of the training data
titanic.describe()

In [None]:
# Number of missing values per column in the training data
titanic.isnull().sum()

Notice that the Age and Cabin features have a large number of missing values. The Embarked feature has only two missing values, which can easily be filled or dropped because there are so few of them.

In [None]:
#Features contributing to the survival rate
#Naturally everything except ‘PassengerId’, ‘Ticket’ and ‘Name’ would be correlated with the survival rate.
# Correlate the numeric columns only: the frame also holds text columns
# (Name, Sex, Ticket, ...), and DataFrame.corr() raises on those in
# pandas >= 2.0 instead of silently skipping them.
corr = titanic.select_dtypes(include="number").corr()
ax = sns.heatmap(
    corr, 
    vmin=-1, vmax=1, center=0,
    cmap=sns.diverging_palette(20, 220, n=200),
    square=True
)
ax.set_title("Correlation between numeric features")
# Rotate the column labels so they do not overlap
ax.set_xticklabels(
    ax.get_xticklabels(),
    rotation=45,
    horizontalalignment='right'
);

Among the numeric features, Fare has the highest positive correlation with Survived, while Pclass has the strongest negative correlation. Age and SibSp also show some negative correlation, while PassengerId shows no correlation at all.

In [None]:
#Embarked pclass and sex
# Survival rate by passenger class and sex, one panel per port of embarkation.
# The category orders are fixed up front so every panel uses the same x
# positions and hue colours; when pointplot is mapped without them, each facet
# infers its own order from the subset of rows it receives.
pclass_order = sorted(titanic['Pclass'].dropna().unique())
sex_order = sorted(titanic['Sex'].dropna().unique())
# `height` is the current name of the panel-height argument (formerly `size`)
embarked_grid = sns.FacetGrid(titanic, row='Embarked', height=4.5, aspect=1.6)
embarked_grid.map(sns.pointplot, 'Pclass', 'Survived', 'Sex',
                  order=pclass_order, hue_order=sex_order)
embarked_grid.add_legend();

Embarked seems to be correlated with survival, depending on the gender. Women on port S and Q have a higher chance of survival. Men have a high survival probability if they are on port C, but a low probability if they are on port Q or S.

In [None]:
# Mean survival per passenger class
sns.barplot(data=titanic, y='Survived', x='Pclass')

Class 1 passengers have a higher probability of survival than Class 3 passengers.

In [None]:
# Class balance check: number of passengers in each Survived category
sns.countplot(data=titanic, x='Survived')

About 340 passengers survived and about 550 did not, so the dataset is slightly imbalanced.

In [None]:
# Preview the first rows of the test data
titanic_test.head()

# **Data Preprocessing**

In [None]:
# Separate the target from the features and drop the columns that are not used
# for prediction. "Cabin" is dropped because of its large number of missing
# values.
y = titanic['Survived']
X = titanic.drop(columns=['Survived', 'Name', 'Ticket', 'Cabin', 'PassengerId'])
# errors='ignore' keeps this cell re-runnable once the columns are already gone
titanic_test = titanic_test.drop(columns=['Name', 'Ticket', 'Cabin', 'PassengerId'],
                                 errors='ignore')


# Dealing with missing Age values: fill them with random integers drawn from
# [mean - std, mean + std). Both statistics come from the training ages and are
# computed once, before anything is imputed, so the training and test frames
# are filled from the same distribution.
age_mean = X["Age"].mean()
age_std = X["Age"].std()
age_low, age_high = int(age_mean - age_std), int(age_mean + age_std)

data = [X, titanic_test]

for dataset in data:
    missing_age = dataset["Age"].isnull()
    # one random age per missing entry in this frame
    rand_age = np.random.randint(age_low, age_high, size=missing_age.sum())
    age_slice = dataset["Age"].copy()
    age_slice[missing_age] = rand_age
    # Cast this frame's own ages. Assigning X["Age"] here would replace the
    # test set's ages with index-aligned ages of unrelated training passengers.
    dataset["Age"] = age_slice.astype(int)

# Should now be zero
X["Age"].isnull().sum()

In [None]:
# Remaining missing values per feature in the training features
X.isnull().sum()

In [None]:
# Remaining missing values per feature in the test features
titanic_test.isnull().sum()

In [None]:
# Debug check: print the Python type of X
print(type(X))

In [None]:
# Summary of the Embarked column, used to pick the fill value for its missing entries
X['Embarked'].describe()

In [None]:
#Dealing with the missing Embarked value
# Fill with the most frequent port, read from the data rather than hard-coded
# so the cell stays correct if the input changes (it is 'S' for this dataset).
common_value = X['Embarked'].mode()[0]
X['Embarked'] = X['Embarked'].fillna(common_value)
X.isnull().sum()

In [None]:
# Replace the missing Fare in the test set with the test set's median fare
test_fare_median = titanic_test["Fare"].median()
titanic_test["Fare"] = titanic_test["Fare"].fillna(value=test_fare_median)
titanic_test.isna().sum()

In [None]:
# One-hot encode the categorical training features (first level of each dropped)
# because the sklearn RandomForest and XGBoost estimators need numeric input
X_encoded = pd.get_dummies(data=X, drop_first=True)


In [None]:
#Encode the titanic_test since we are using sklearn implementation of RandomForest and XGBoost
# The dummies are then aligned to the training columns: a category that is
# absent from the test set becomes an all-zero column, a category unseen in
# training is dropped, and the column order matches what the models were
# fitted on.
titanic_test_encoded = (
    pd.get_dummies(titanic_test, drop_first=True)
    .reindex(columns=X_encoded.columns, fill_value=0)
)

In [None]:
# Feature names after encoding the training data
X_encoded.columns

In [None]:
# Feature names after encoding the test data (compare with X_encoded.columns)
titanic_test_encoded.columns

In [None]:
# Hold out 20% of the labelled rows for validation (fixed seed for repeatability)
X_train, X_test, y_train, y_test = train_test_split(
    X_encoded,
    y,
    random_state=101,
    test_size=0.2,
)
X_train.head()

In [None]:
# Preview the first rows of the held-out split
X_test.head()

# **Models**

In [None]:
# Baseline comparison: fit each candidate on the training split and rank the
# models by accuracy on the held-out split (X_test / y_test). Accuracy on the
# rows a model was fitted on rewards memorisation (an unpruned tree scores close
# to 100%), so it is shown only as a reference column to expose overfitting.

def _fit_and_score(model):
    """Fit `model` on the training split; return (train, held-out) accuracy in percent."""
    model.fit(X_train, y_train)
    train_acc = round(model.score(X_train, y_train) * 100, 2)
    holdout_acc = round(model.score(X_test, y_test) * 100, 2)
    return train_acc, holdout_acc

#Stochastic Gradient Descent (SGD):
sgd = linear_model.SGDClassifier(max_iter=10, tol=None)
acc_sgd_train, acc_sgd = _fit_and_score(sgd)

#Random Forest:
random_forest = RandomForestClassifier(n_estimators=100)
acc_random_forest_train, acc_random_forest = _fit_and_score(random_forest)

#Extreme Gradient Boosting:
xgb = XGBClassifier( max_depth= 100, n_estimators= 500, learning_rate=0.29, random_state= 42, n_jobs=5)
acc_xgb_train, acc_xgb = _fit_and_score(xgb)

#K Nearest Neighbor:
knn = KNeighborsClassifier(n_neighbors = 3)
acc_knn_train, acc_knn = _fit_and_score(knn)

#Gaussian Naive Bayes:
gaussian = GaussianNB()
acc_gaussian_train, acc_gaussian = _fit_and_score(gaussian)

#Perceptron:
perceptron = Perceptron(max_iter=5)
acc_perceptron_train, acc_perceptron = _fit_and_score(perceptron)

#Linear Support Vector Machine:
linear_svc = LinearSVC()
acc_linear_svc_train, acc_linear_svc = _fit_and_score(linear_svc)

#Decision Tree
decision_tree = DecisionTreeClassifier()
acc_decision_tree_train, acc_decision_tree = _fit_and_score(decision_tree)

#Which is the best Model ?
# 'Score' is the held-out accuracy; 'Train score' is the in-sample accuracy.
results = pd.DataFrame({
    'Model': ['Support Vector Machines', 'KNN', 'XGB' ,
              'Random Forest', 'Naive Bayes', 'Perceptron', 
              'Stochastic Gradient Decent', 
              'Decision Tree'],
    'Train score': [acc_linear_svc_train, acc_knn_train, acc_xgb_train,
                    acc_random_forest_train, acc_gaussian_train, acc_perceptron_train,
                    acc_sgd_train, acc_decision_tree_train],
    'Score': [acc_linear_svc, acc_knn, acc_xgb, 
              acc_random_forest, acc_gaussian, acc_perceptron, 
              acc_sgd, acc_decision_tree]})
result_df = results.sort_values(by='Score', ascending=False)
result_df = result_df.set_index('Score')
result_df.head(9)

In [None]:
# 10-fold cross-validated accuracy of a random forest on the training split
from sklearn.model_selection import cross_val_score
rf = RandomForestClassifier(oob_score=True, n_estimators=100)
scores = cross_val_score(estimator=rf, X=X_train, y=y_train, scoring="accuracy", cv=10)
for label, value in (
    ("Scores:", scores),
    ("Mean:", scores.mean()),
    ("Standard Deviation:", scores.std()),
):
    print(label, value)

The RandomForest model has a mean cross-validated accuracy of about 80%, with a standard deviation of about 4 percentage points.

In [None]:
# 10-fold cross-validated accuracy of XGBoost on the training split
from sklearn.model_selection import cross_val_score
xgb = XGBClassifier(
    max_depth=100,
    n_estimators=500,
    learning_rate=0.29,
    random_state=0,
    n_jobs=5,
    eval_metric='logloss',
)
scores = cross_val_score(estimator=xgb, X=X_train, y=y_train, cv=10, scoring="accuracy")
fold_mean = scores.mean()
fold_std = scores.std()
print("Scores:", scores)
print("Mean:", fold_mean)
print("Standard Deviation:", fold_std)

XGBoost has an average accuracy of 79%

In [None]:
# Refit a random forest (with out-of-bag scoring enabled) and rank the
# features by the importance the fitted forest assigns to them
random_forest = RandomForestClassifier(oob_score=True, n_estimators=100)
random_forest.fit(X_train, y_train)
Y_prediction = random_forest.predict(X_test)
acc_random_forest = round(100 * random_forest.score(X_train, y_train), 2)

rounded_importance = np.round(random_forest.feature_importances_, 3)
importances = (
    pd.DataFrame({'feature': X_train.columns, 'importance': rounded_importance})
    .sort_values('importance', ascending=False)
    .set_index('feature')
)
importances.head(15)

In [None]:
# Out-of-bag accuracy of the forest, as a percentage. Scale first and round
# last: rounding before multiplying by 100 can print float noise such as
# 81.17999999999999.
print("oob score:", round(random_forest.oob_score_ * 100, 2), "%")

# **Hyperparameter Tuning RandomForest**

In [None]:
from sklearn.model_selection import GridSearchCV, cross_val_score

# Exhaustive search over the forest hyperparameters (GridSearchCV's default
# cross-validation and scoring are used).
param_grid = {
    "criterion": ["gini", "entropy"],
    "min_samples_leaf": [1, 5, 10, 25, 50, 70],
    "min_samples_split": [2, 4, 10, 12, 16, 18, 25, 35],
    "n_estimators": [100, 400, 700, 1000, 1500],
}
# max_features='sqrt' is what 'auto' meant for classifiers; 'auto' itself is
# rejected by scikit-learn >= 1.3.
rf = RandomForestClassifier(n_estimators=100, max_features='sqrt', oob_score=True, random_state=1, n_jobs=-1)
clf = GridSearchCV(estimator=rf, param_grid=param_grid, n_jobs=-1)
clf.fit(X_train, y_train)


In [None]:
# Best hyperparameter combination found by the random forest grid search
clf.best_params_

In [None]:
#Test New Parameters
# Random Forest refitted with the tuned hyperparameters.
# max_features='sqrt' is what 'auto' meant for classifiers; 'auto' itself is
# rejected by scikit-learn >= 1.3.
random_forest = RandomForestClassifier(criterion = "gini", 
                                       min_samples_leaf = 1, 
                                       min_samples_split = 10,   
                                       n_estimators=400, 
                                       max_features='sqrt', 
                                       oob_score=True, 
                                       random_state=1, 
                                       n_jobs=-1)

random_forest.fit(X_train, y_train)
Y_prediction = random_forest.predict(X_test)

# Scale first and round last so the printed percentage carries no float noise
print("oob score:", round(random_forest.oob_score_ * 100, 2), "%")

# **Model Evaluation RandomForest**

In [None]:
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import cross_val_predict

# Out-of-fold class predictions (3 folds) for the tuned forest, then the
# confusion matrix of those predictions against the true labels
predictions = cross_val_predict(estimator=random_forest, X=X_train, y=y_train, cv=3)
confusion_matrix(y_true=y_train, y_pred=predictions)

In [None]:
# Precision and recall of the out-of-fold random forest predictions
from sklearn.metrics import precision_score, recall_score

rf_precision = precision_score(y_train, predictions)
rf_recall = recall_score(y_train, predictions)
print("Precision:", rf_precision)
print("Recall:", rf_recall)

Precision: about 77% of the passengers the model predicts as survivors actually survived. Recall: the model correctly identifies about 66% of the passengers who actually survived.

In [None]:
#ROC AUC
from sklearn.metrics import precision_recall_curve
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import cross_val_predict

# Out-of-fold survival probabilities: every training row is scored by a forest
# that was not fitted on it. Calling predict_proba(X_train) on the forest that
# was fitted on X_train reports an inflated, in-sample AUC instead.
y_scores = cross_val_predict(random_forest, X_train, y_train, cv=3, method="predict_proba")
# keep the probability of the positive class (second column)
y_scores = y_scores[:,1]

r_a_score = roc_auc_score(y_train, y_scores)
print("ROC-AUC-Score:", r_a_score)

# **Hyperparameter Tuning XGBoost**

In [None]:
from sklearn.model_selection import GridSearchCV, cross_val_score

# Search space for the boosted-tree hyperparameters
param_grid = dict(
    learning_rate=[0.05, 0.10, 0.15, 0.20, 0.25, 0.30, 0.35],
    max_depth=[20, 50, 100, 150, 200],
    min_child_weight=[1, 3, 5, 7],
    gamma=[0.0, 0.1, 0.2, 0.3, 0.4],
)
# Base estimator: the grid overrides learning_rate and max_depth, while
# n_estimators and eval_metric stay fixed for every candidate
xgb = XGBClassifier(
    max_depth=100,
    n_estimators=500,
    learning_rate=0.29,
    random_state=0,
    n_jobs=5,
    eval_metric='logloss',
)
clf = GridSearchCV(xgb, param_grid, n_jobs=10)
clf.fit(X_train, y_train)

In [None]:
# Best hyperparameter combination found by the XGBoost grid search
clf.best_params_

In [None]:
#Test New Parameters
# XG Boost refitted with the tuned hyperparameters.
# n_estimators and eval_metric repeat the values the grid search held fixed:
# the tuned settings were selected for a 500-tree model, so leaving them out
# would fit a different model (library defaults) than the one that was tuned.
xgb = XGBClassifier(gamma = 0.0, 
                    learning_rate = 0.15,
                    max_depth = 20,
                    min_child_weight = 7 ,
                    n_estimators = 500,
                    eval_metric = 'logloss',
                    random_state=1, 
                    n_jobs=5)

xgb.fit(X_train, y_train)
Y_prediction = xgb.predict(X_test)

# Accuracy on the training split, in percent
acc_xgb = round(xgb.score(X_train, y_train) * 100, 2)

# **Model Evaluation XGBoost**

In [None]:
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import cross_val_predict

# Out-of-fold class predictions (3 folds) for the tuned XGBoost model, then
# the confusion matrix of those predictions against the true labels
predictions = cross_val_predict(estimator=xgb, X=X_train, y=y_train, cv=3)
confusion_matrix(y_true=y_train, y_pred=predictions)

In [None]:
# Precision and recall of the out-of-fold XGBoost predictions
from sklearn.metrics import precision_score, recall_score

xgb_precision = precision_score(y_train, predictions)
xgb_recall = recall_score(y_train, predictions)
print("Precision:", xgb_precision)
print("Recall:", xgb_recall)

In [None]:
#ROC AUC
from sklearn.metrics import precision_recall_curve
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import cross_val_predict

# Out-of-fold survival probabilities: every training row is scored by a model
# that was not fitted on it. Calling predict_proba(X_train) on the model that
# was fitted on X_train reports an inflated, in-sample AUC instead.
y_scores = cross_val_predict(xgb, X_train, y_train, cv=3, method="predict_proba")
# keep the probability of the positive class (second column)
y_scores = y_scores[:,1]

r_a_score = roc_auc_score(y_train, y_scores)
print("ROC-AUC-Score:", r_a_score)

# **Predictions on test.csv**

In [None]:
# Survival predictions of the random forest for the encoded competition test set
y_pred1 = random_forest.predict(X=titanic_test_encoded)
print(y_pred1)

In [None]:
#Prediction using XGBoost
#Actual prediction
# Predicted labels for the encoded competition test set; y_pred2 is what the
# submission cell writes out.
y_pred2 = xgb.predict(titanic_test_encoded)
print(y_pred2)

In [None]:
# PassengerId was dropped from titanic_test during preprocessing, so re-read
# the raw test file to pair each id with its XGBoost prediction
titanic_test_original = pd.read_csv("/kaggle/input/titanic/test.csv")
output = pd.DataFrame(
    {
        'PassengerId': titanic_test_original['PassengerId'],
        'Survived': y_pred2,
    }
)
output.to_csv('submission.csv', index=False)