# 1. Importing libraries and loading datasets

In [1]:
import numpy as np
import pandas as pd

# Modelling
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score

# Logistic Regression
from sklearn.linear_model import LogisticRegression

# Naive Bayes
from sklearn.naive_bayes import GaussianNB
from sklearn.naive_bayes import MultinomialNB
from sklearn.naive_bayes import ComplementNB
from sklearn.naive_bayes import BernoulliNB
from sklearn.naive_bayes import CategoricalNB

# KNeighbors
from sklearn.neighbors import KNeighborsClassifier

# Decision Tree
from sklearn.tree import DecisionTreeClassifier

# Random Forest
from sklearn.ensemble import RandomForestClassifier

In [2]:
train_data = pd.read_csv('../input/titanic/train.csv')
test_data = pd.read_csv('../input/titanic/test.csv')

# 2. Explore data

In [3]:
train_data

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0500,,S
...,...,...,...,...,...,...,...,...,...,...,...,...
886,887,0,2,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0000,,S
887,888,1,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0000,B42,S
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,W./C. 6607,23.4500,,S
889,890,1,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0000,C148,C


In [4]:
train_data.describe()

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare
count,891.0,891.0,891.0,714.0,891.0,891.0,891.0
mean,446.0,0.383838,2.308642,29.699118,0.523008,0.381594,32.204208
std,257.353842,0.486592,0.836071,14.526497,1.102743,0.806057,49.693429
min,1.0,0.0,1.0,0.42,0.0,0.0,0.0
25%,223.5,0.0,2.0,20.125,0.0,0.0,7.9104
50%,446.0,0.0,3.0,28.0,0.0,0.0,14.4542
75%,668.5,1.0,3.0,38.0,1.0,0.0,31.0
max,891.0,1.0,3.0,80.0,8.0,6.0,512.3292


In [5]:
print("Columns: \n{0} ".format(train_data.columns.tolist()))

Columns: 
['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp', 'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked'] 


# 3. Basic data check

## Missing values

In [6]:
missing_values = train_data.isna().any()
print('Columns which have missing values: \n{0}'.format(missing_values[missing_values == True].index.tolist()))

Columns which have missing values: 
['Age', 'Cabin', 'Embarked']


In [7]:
print("Percentage of missing values in `Age` column: {0:.2f}".format(100.*(train_data.Age.isna().sum()/len(train_data))))
print("Percentage of missing values in `Cabin` column: {0:.2f}".format(100.*(train_data.Cabin.isna().sum()/len(train_data))))
print("Percentage of missing values in `Embarked` column: {0:.2f}".format(100.*(train_data.Embarked.isna().sum()/len(train_data))))

Percentage of missing values in `Age` column: 19.87
Percentage of missing values in `Cabin` column: 77.10
Percentage of missing values in `Embarked` column: 0.22


## Check for duplicates

In [8]:
duplicates = train_data.duplicated().sum()
print('Duplicates in train data: {0}'.format(duplicates))

Duplicates in train data: 0


## Categorical variables

In [9]:
categorical = train_data.nunique().sort_values(ascending=True)
print('Categorical variables in train data: \n{0}'.format(categorical))

Categorical variables in train data: 
Survived         2
Sex              2
Pclass           3
Embarked         3
SibSp            7
Parch            7
Age             88
Cabin          147
Fare           248
Ticket         681
PassengerId    891
Name           891
dtype: int64


# 4. Data cleaning

In [10]:
def clean_data(data):
    # Too many missing values
    data.drop(['Cabin'], axis=1, inplace=True)
    
    # Probably will not provide some useful information
    data.drop(['Name', 'Ticket', 'Fare', 'Embarked'], axis=1, inplace=True)
    
    return data
    
train_data = clean_data(train_data)
test_data = clean_data(test_data)

In [11]:
train_data.tail()

Unnamed: 0,PassengerId,Survived,Pclass,Sex,Age,SibSp,Parch
886,887,0,2,male,27.0,0,0
887,888,1,1,female,19.0,0,0
888,889,0,3,female,,1,2
889,890,1,1,male,26.0,0,0
890,891,0,3,male,32.0,0,0


# 5. Feature engineering

Although I have eliminated most of the columns for simplicity, in the future I am planning to recover those columns. They may contain some useful information.  
For now encoding the `Sex` column and filling `Age` column is enough to run a model.

In [12]:
train_data['Sex'].replace({'male':0, 'female':1}, inplace=True)
test_data['Sex'].replace({'male':0, 'female':1}, inplace=True)

# Merge two data to get the average Age and fill the column
all_data = pd.concat([train_data, test_data])
average = all_data.Age.median()
print("Average Age: {0}".format(average))
train_data.fillna(value={'Age': average}, inplace=True)
test_data.fillna(value={'Age': average}, inplace=True)

Average Age: 28.0


In [13]:
train_data.tail()

Unnamed: 0,PassengerId,Survived,Pclass,Sex,Age,SibSp,Parch
886,887,0,2,0,27.0,0,0
887,888,1,1,1,19.0,0,0
888,889,0,3,1,28.0,1,2
889,890,1,1,0,26.0,0,0
890,891,0,3,0,32.0,0,0


# 6. Modelling

Try different models with different parameters to understand which models give better results.

In [14]:
# Set X and y
X = train_data.drop(['Survived', 'PassengerId'], axis=1)
y = train_data['Survived']
test_X = test_data.drop(['PassengerId'], axis=1)

In [15]:
# To store models created
best_models = {}

# Split data
train_X, val_X, train_y, val_y = train_test_split(X, y, random_state=1)

def print_best_parameters(hyperparameters, best_parameters):
    value = "Best parameters: "
    for key in hyperparameters:
        value += str(key) + ": " + str(best_parameters[key]) + ", "
    if hyperparameters:
        print(value[:-2])

def get_best_model(estimator, hyperparameters):
    cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
    grid_search = GridSearchCV(estimator=estimator, param_grid=hyperparameters, n_jobs=-1, cv=cv, scoring="accuracy")
    best_model = grid_search.fit(train_X, train_y)
    best_parameters = best_model.best_estimator_.get_params()
    print_best_parameters(hyperparameters, best_parameters)
    return best_model

def evaluate_model(model, name):
    print("Accuracy score:", accuracy_score(train_y, model.predict(train_X)))
    best_models[name] = model

In [16]:
print("Features: \n{0} ".format(X.columns.tolist()))

Features: 
['Pclass', 'Sex', 'Age', 'SibSp', 'Parch'] 


## [Logistic Regression](https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LogisticRegression.html)

Tune the logistic regression model by changing some of its parameters.

Logistic regression parameters:  

* **solver: {‘newton-cg’, ‘lbfgs’, ‘liblinear’, ‘sag’, ‘saga’}, default=’lbfgs’**  
    * Algorithm to use in the optimization problem. Default is ‘lbfgs’. To choose a solver, you might want to consider the following aspects:
        * For small datasets, ‘liblinear’ is a good choice, whereas ‘sag’ and ‘saga’ are faster for large ones;
        * For multiclass problems, only ‘newton-cg’, ‘sag’, ‘saga’ and ‘lbfgs’ handle multinomial loss;
        * ‘liblinear’ is limited to one-versus-rest schemes.

> **Warning**  
> The choice of the algorithm depends on the penalty chosen: Supported penalties by solver:  
> * ‘newton-cg’ - [‘l2’, ‘none’]  
> * ‘lbfgs’ - [‘l2’, ‘none’]  
> * ‘liblinear’ - [‘l1’, ‘l2’]  
> * ‘sag’ - [‘l2’, ‘none’]  
> * ‘saga’ - [‘elasticnet’, ‘l1’, ‘l2’, ‘none’]  

* **penalty: {‘l1’, ‘l2’, ‘elasticnet’, ‘none’}, default=’l2’**  
    * Specify the norm of the penalty:
        * 'none': no penalty is added;
        * 'l2': add a L2 penalty term and it is the default choice;
        * 'l1': add a L1 penalty term;
        * 'elasticnet': both L1 and L2 penalty terms are added.

> **Warning**  
> Some penalties may not work with some solvers. See the parameter solver below, to know the compatibility between the penalty and solver. 

* **C: float, default=1.0**  
    Inverse of regularization strength; must be a positive float. Like in support vector machines, smaller values specify stronger regularization.


In [17]:
# https://machinelearningmastery.com/hyperparameters-for-classification-machine-learning-algorithms/
hyperparameters = {
    'solver'  : ['newton-cg', 'lbfgs', 'liblinear'],
    'penalty' : ['l2'],
    'C'       : [100, 10, 1.0, 0.1, 0.01]
}
estimator = LogisticRegression(random_state=1)
best_model_logistic = get_best_model(estimator, hyperparameters)

Best parameters: solver: liblinear, penalty: l2, C: 0.1


In [18]:
evaluate_model(best_model_logistic.best_estimator_, 'logistic')

Accuracy score: 0.8083832335329342


## [Naive Bayes](https://scikit-learn.org/stable/modules/naive_bayes.html)


In [19]:
# Not sure which parameters to play with :)
hyperparameters = {}

### [Gaussian Naive Bayes](https://scikit-learn.org/stable/modules/naive_bayes.html#gaussian-naive-bayes)

In [20]:
evaluate_model(get_best_model(GaussianNB(), hyperparameters).best_estimator_, 'gaussian_nb')

Accuracy score: 0.7874251497005988


### [Multinomial Naive Bayes](https://scikit-learn.org/stable/modules/naive_bayes.html#multinomial-naive-bayes)

In [21]:
evaluate_model(get_best_model(MultinomialNB(), hyperparameters).best_estimator_, 'multinominal_nb')

Accuracy score: 0.7904191616766467


### [Complement Naive Bayes](https://scikit-learn.org/stable/modules/naive_bayes.html#complement-naive-bayes)

In [22]:
evaluate_model(get_best_model(ComplementNB(), hyperparameters).best_estimator_, 'complement_nb')

Accuracy score: 0.7889221556886228


### [Bernoulli Naive Bayes](https://scikit-learn.org/stable/modules/naive_bayes.html#bernoulli-naive-bayes)

In [23]:
evaluate_model(get_best_model(BernoulliNB(), hyperparameters).best_estimator_, 'bernoulli_nb')

Accuracy score: 0.7874251497005988


## [K-nearest neighbors](https://scikit-learn.org/stable/modules/generated/sklearn.neighbors.KNeighborsClassifier.html)

Tune k-nearest neighbors model by changing some of its parameters.

* **n_neighbors: int, default=5**  
    Number of neighbors to use by default for kneighbors queries.


* **weights: {‘uniform’, ‘distance’} or callable, default=’uniform’**  
    * Weight function used in prediction. Possible values:
        * ‘uniform’ : uniform weights. All points in each neighborhood are weighted equally.
        * ‘distance’ : weight points by the inverse of their distance. in this case, closer neighbors of a query point will have a greater influence than neighbors which are further away.
        * [callable] : a user-defined function which accepts an array of distances, and returns an array of the same shape containing the weights.


* **algorithm: {‘auto’, ‘ball_tree’, ‘kd_tree’, ‘brute’}, default=’auto’**  
    * Algorithm used to compute the nearest neighbors:  
        * ‘ball_tree’ will use BallTree
        * ‘kd_tree’ will use KDTree
        * ‘brute’ will use a brute-force search.
        * ‘auto’ will attempt to decide the most appropriate algorithm based on the values passed to fit method.
        
> Note: fitting on sparse input will override the setting of this parameter, using brute force.


* **leaf_size: int, default=30**  
    Leaf size passed to BallTree or KDTree. This can affect the speed of the construction and query, as well as the memory required to store the tree. The optimal value depends on the nature of the problem.
    
* **p: int, default=2**  
    Power parameter for the Minkowski metric. When p = 1, this is equivalent to using manhattan_distance (l1), and euclidean_distance (l2) for p = 2. For arbitrary p, minkowski_distance (l_p) is used.

* **n_neighbors: int, default=5**  
    Number of neighbors to use by default for kneighbors queries.

In [24]:
# https://medium.datadriveninvestor.com/k-nearest-neighbors-in-python-hyperparameters-tuning-716734bc557f
hyperparameters = {
    'n_neighbors' : list(range(1,5)),
    'weights'     : ['uniform', 'distance'],
    'algorithm'   : ['auto', 'ball_tree', 'kd_tree', 'brute'],
    'leaf_size'   : list(range(1,10)),
    'p'           : [1,2]
}
estimator = KNeighborsClassifier()
best_model_kneighbors = get_best_model(estimator, hyperparameters)

Best parameters: n_neighbors: 3, weights: uniform, algorithm: brute, leaf_size: 1, p: 1


In [25]:
evaluate_model(best_model_kneighbors.best_estimator_, 'kneighbors')

Accuracy score: 0.8802395209580839


## [Decision Tree Classifier](https://scikit-learn.org/stable/modules/generated/sklearn.tree.DecisionTreeClassifier.html)

Tune decision tree classifier model by changing some of its parameters.

* **criterion: {“gini”, “entropy”}, default=”gini”**  
    The function to measure the quality of a split. Supported criteria are “gini” for the Gini impurity and “entropy” for the information gain.

* **splitter: {“best”, “random”}, default=”best”**  
    The strategy used to choose the split at each node. Supported strategies are “best” to choose the best split and “random” to choose the best random split.

* **max_depth: int, default=None**  
    The maximum depth of the tree. If None, then nodes are expanded until all leaves are pure or until all leaves contain less than min_samples_split samples.
    

* **min_samples_split: int or float, default=2**  
    * The minimum number of samples required to split an internal node:
        * If int, then consider min_samples_split as the minimum number.
        * If float, then min_samples_split is a fraction and ceil(min_samples_split * n_samples) are the minimum number of samples for each split.


* **min_samples_leaf: int or float, default=1**  
   The minimum number of samples required to be at a leaf node. A split point at any depth will only be considered if it leaves at least min_samples_leaf training samples in each of the left and right branches. This may have the effect of smoothing the model, especially in regression.  
    * If int, then consider min_samples_leaf as the minimum number.
    * If float, then min_samples_leaf is a fraction and ceil(min_samples_leaf * n_samples) are the minimum number of samples for each node.

In [26]:
# https://towardsdatascience.com/how-to-tune-a-decision-tree-f03721801680
# https://www.kaggle.com/gauravduttakiit/hyperparameter-tuning-in-decision-trees
hyperparameters = {
    'criterion'         : ['gini', 'entropy'],
    'splitter'          : ['best', 'random'],
    'max_depth'         : [None, 1, 2, 3, 4, 5],
    'min_samples_split' : list(range(2,5)),
    'min_samples_leaf'  : list(range(1,5))
}
estimator = DecisionTreeClassifier(random_state=1)
best_model_decision_tree = get_best_model(estimator, hyperparameters)

Best parameters: criterion: gini, splitter: best, max_depth: 4, min_samples_split: 2, min_samples_leaf: 3


In [27]:
evaluate_model(best_model_decision_tree.best_estimator_, 'decision_tree')

Accuracy score: 0.8502994011976048


## [Random Forest Classifier](https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestClassifier.html)

* **n_estimators: int, default=100**  
    The number of trees in the forest.


* **max_features: {“auto”, “sqrt”, “log2”}, int or float, default=”auto”**  
    * The number of features to consider when looking for the best split:
        * If int, then consider max_features features at each split.
        * If float, then max_features is a fraction and round(max_features * n_features) features are considered at each split.
        * If “auto”, then max_features=sqrt(n_features).
        * If “sqrt”, then max_features=sqrt(n_features) (same as “auto”).
        * If “log2”, then max_features=log2(n_features).
        * If None, then max_features=n_features.

> Note: the search for a split does not stop until at least one valid partition of the node samples is found, even if it requires to effectively inspect more than max_features features.

* **criterion: {“gini”, “entropy”}, default=”gini”**  
    The function to measure the quality of a split. Supported criteria are “gini” for the Gini impurity and “entropy” for the information gain. Note: this parameter is tree-specific.

* **max_depth: int, default=None**  
    The maximum depth of the tree. If None, then nodes are expanded until all leaves are pure or until all leaves contain less than min_samples_split samples.
    
    
* **min_samples_split: int or float, default=2**  
    * The minimum number of samples required to split an internal node:
        * If int, then consider min_samples_split as the minimum number.
        * If float, then min_samples_split is a fraction and ceil(min_samples_split * n_samples) are the minimum number of samples for each split.


* **min_samples_leaf: int or float, default=1**  
    The minimum number of samples required to be at a leaf node. A split point at any depth will only be considered if it leaves at least min_samples_leaf training samples in each of the left and right branches. This may have the effect of smoothing the model, especially in regression.  
     * If int, then consider min_samples_leaf as the minimum number.
     * If float, then min_samples_leaf is a fraction and ceil(min_samples_leaf * n_samples) are the minimum number of samples for each node.

In [28]:
# https://towardsdatascience.com/hyperparameter-tuning-the-random-forest-in-python-using-scikit-learn-28d2aa77dd74
# https://www.analyticsvidhya.com/blog/2020/03/beginners-guide-random-forest-hyperparameter-tuning/
hyperparameters = {
    'n_estimators'      : list(range(10, 50, 10)),
    'max_features'      : ['auto', 'sqrt', 'log2'],
    'criterion'         : ['gini', 'entropy'],
    'max_depth'         : [None, 1, 2, 3, 4, 5],
    'min_samples_split' : list(range(2,5)),
    'min_samples_leaf'  : list(range(1,5))
}
estimator = RandomForestClassifier(random_state=1)
best_model_random_forest = get_best_model(estimator, hyperparameters)

Best parameters: n_estimators: 20, max_features: auto, criterion: gini, max_depth: 4, min_samples_split: 2, min_samples_leaf: 3


In [29]:
evaluate_model(best_model_random_forest.best_estimator_, 'random_forest')

Accuracy score: 0.8502994011976048


# WORK IN PROGRESS

# 7. Submission

In [30]:
# Get predictions for each model and create submission files
for model in best_models:
    predictions = best_models[model].predict(test_X)
    output = pd.DataFrame({'PassengerId': test_data.PassengerId, 'Survived': predictions})
    output.to_csv('submission_' + model + '.csv', index=False)