In [0]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

from sklearn import preprocessing
from sklearn.preprocessing import LabelBinarizer
from sklearn.impute import SimpleImputer

from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score, cross_val_predict
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, roc_auc_score

In [2]:
train = pd.read_csv('train.csv')
print(train.shape)
train.head()

(891, 12)


Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


First feature selection: columns 'PassengerId', 'Name', 'Ticket', 'Cabin' and 'Embarked' will not be used in this work.

For sure 'PassengerId' doesn't say anything about the data, and there are enough missing data in 'Cabin' that we better avoid it. 'Embarked' also doesn't seem to add any relevant information.

'Name' could be used in the sense that people who share a family name might have had similar fate. 'Ticket' also does group people. 


 

In [3]:
# The family sage had a single ticket for all the 11 members (number of family members = 1 + SibSp + Parch):
train[train["Name"].str.contains("Sage,")].head(20)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
159,160,0,3,"Sage, Master. Thomas Henry",male,,8,2,CA. 2343,69.55,,S
180,181,0,3,"Sage, Miss. Constance Gladys",female,,8,2,CA. 2343,69.55,,S
201,202,0,3,"Sage, Mr. Frederick",male,,8,2,CA. 2343,69.55,,S
324,325,0,3,"Sage, Mr. George John Jr",male,,8,2,CA. 2343,69.55,,S
792,793,0,3,"Sage, Miss. Stella Anna",female,,8,2,CA. 2343,69.55,,S
846,847,0,3,"Sage, Mr. Douglas Bullen",male,,8,2,CA. 2343,69.55,,S
863,864,0,3,"Sage, Miss. Dorothy Edith ""Dolly""",female,,8,2,CA. 2343,69.55,,S


In [0]:
# Data preprocessing.
# 1) Convert the categorical column 'Sex' into nuerical values 0 and 1
lb = LabelBinarizer()
train['Sex'] = lb.fit_transform(train['Sex'])
# 2) Fill the empty data in the 'Age' data with its average:
av_age = train.Age.mean()
train.fillna(value={'Age':av_age}, inplace=True)

In [0]:
# Feature engineering: 2 new features.
# 1) This feature gives the price of the ticket per person, 
# which can then be compared among passengers:
train["Fare_indiv"] = train["Fare"] / ((1 + train["SibSp"] + train["Parch"]) * 10)
# 2) The number of members of the family and the age are combined beautifully in this feature.
# The first one is an integer, and the age divided by the maximum age in the dataset is a number between 0 and 1, 
# therefore with a single real number I have information about both features. 
train["Age"] = (1 + train["SibSp"] + train["Parch"]) + train["Age"] / 80

In [0]:
# More sophisticated methods involve modelling different genders separately. 
# Here we stick to a simpler approach.
X = train[['Sex', 'Age', 'Fare_indiv']]
Y = train["Survived"]

In [7]:
# The maximum value for the fare is over 500 pounds!
# Here we also see that even though these 3 registers share the same ticket, 
# they do not belong to the same family, their surnames are different and 
# the values of SibSp and Parch do not match.
# This I don't fully understand, to be honest...
train[train["Fare"] == train["Fare"].values.max()].head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Fare_indiv
258,259,1,1,"Ward, Miss. Anna",0,1.4375,0,0,PC 17755,512.3292,,C,51.23292
679,680,1,1,"Cardeza, Mr. Thomas Drake Martinez",1,2.45,0,1,PC 17755,512.3292,B51 B53 B55,C,25.61646
737,738,1,1,"Lesurer, Mr. Gustave J",1,1.4375,0,0,PC 17755,512.3292,B101,C,51.23292


#Functions to compute different scores

In [0]:
def print_score(clf, X_train, y_train, X_test, y_test, train=True):
    '''
    print the accuracy score, classification report and confusion matrix of classifier
    '''
    if train:
        '''
        training performance
        '''
        acc = accuracy_score(y_train, clf.predict(X_train))
        cr = classification_report(y_train, clf.predict(X_train))
        cm = confusion_matrix(y_train, clf.predict(X_train))
        print("Train Result:\n")
        print("accuracy score: {0:.4f}\n".format(acc))
        #print("Classification Report: \n {}\n".format(cr))
        #print("Confusion Matrix: \n {}\n".format(cm))

        res = cross_val_score(clf, X_train, y_train, cv=10, scoring='accuracy')
        print("Average Accuracy: \t {0:.4f}".format(res.mean()))
        print("Accuracy SD: \t\t {0:.4f}".format(res.std()))

        return [acc, res.mean(), res.std()]
        
    elif train==False:
        '''
        test performance
        '''
        acc = accuracy_score(y_test, clf.predict(X_test))
        cr = classification_report(y_test, clf.predict(X_test))
        cm = confusion_matrix(y_test, clf.predict(X_test))
        print("Test Result:\n")        
        print("accuracy score: {0:.4f}\n".format(acc))
        #print("Classification Report: \n {}\n".format(cr))
        #print("Confusion Matrix: \n {}\n".format(cm))

        return acc

#Decision Tree

In [9]:
X_train, X_test, Y_train, Y_test = train_test_split (X,Y,test_size=0.1,random_state=42)
dt = DecisionTreeClassifier(random_state=42)
dt.fit(X_train,Y_train)

DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='gini',
                       max_depth=None, max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort='deprecated',
                       random_state=42, splitter='best')

In [10]:
res = print_score(dt, X_train, Y_train, X_test, Y_test, train=True)
print('\n------------------------------\n')
rr = print_score(dt, X_train, Y_train, X_test, Y_test, train=False)
res.append(rr)
print('\n------------------------------\n')
res.insert(0,'Decision Tree')
print(res)
results = []
results.append(res)

Train Result:

accuracy score: 0.9800

Average Accuracy: 	 0.7553
Accuracy SD: 		 0.0312

------------------------------

Test Result:

accuracy score: 0.7667


------------------------------

['Decision Tree', 0.9800249687890137, 0.7553086419753087, 0.031189275706037394, 0.7666666666666667]


#Random Forest

In [11]:
X_train, X_test, Y_train, Y_test = train_test_split (X,Y,test_size=0.1,random_state=42)
rf = RandomForestClassifier(n_estimators=100, random_state=42)
rf.fit(X_train,Y_train)

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=False, random_state=42, verbose=0,
                       warm_start=False)

In [12]:
res = print_score(rf, X_train, Y_train, X_test, Y_test, train=True)
print('\n------------------------------\n')
rr = print_score(rf, X_train, Y_train, X_test, Y_test, train=False)
res.append(rr)
print('\n------------------------------\n')
res.insert(0,'Random Forest')
print(res)
results.append(res)

Train Result:

accuracy score: 0.9800

Average Accuracy: 	 0.7977
Accuracy SD: 		 0.0447

------------------------------

Test Result:

accuracy score: 0.8111


------------------------------

['Random Forest', 0.9800249687890137, 0.7977314814814813, 0.04473451349384348, 0.8111111111111111]


#Random Forest + Grid Search

In [0]:
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV

In [0]:
rf_clf = RandomForestClassifier(n_estimators=100,random_state=42)

In [0]:
#params_grid = {"max_depth": [3, None],
#               "min_samples_split": [2, 3, 10],
#               "min_samples_leaf": [1, 3, 10],
#               "bootstrap": [True, False],
#               "criterion": ['gini', 'entropy']}
params_grid = {"max_depth": [3, 10, 50, 100, 150, 200, 250, 300, 350, 400, 450, 500, 550, 600],
               "n_estimators": [2, 3, 4, 5, 6, 7, 8, 9, 10, 12, 14, 16, 18, 20, 25, 30, 25, 40, 50, 100],
               "min_samples_split": [2, 3, 4, 5, 8, 10, 12, 15, 20, 30],
               "min_samples_leaf": [2, 3, 4, 5, 6, 7, 8, 9, 10, 12, 14, 16, 18, 20, 30],
               "bootstrap": [True, False],
               "criterion": ['gini', 'entropy']}

In [0]:
#gs = GridSearchCV (rf_clf, params_grid, n_jobs=-1, cv=5, verbose=1, scoring='accuracy')
gs = RandomizedSearchCV(rf_clf,params_grid,random_state=0,verbose=0,n_jobs=-1,n_iter=20,cv=10)

In [17]:
gs.fit(X_train,Y_train)

RandomizedSearchCV(cv=10, error_score=nan,
                   estimator=RandomForestClassifier(bootstrap=True,
                                                    ccp_alpha=0.0,
                                                    class_weight=None,
                                                    criterion='gini',
                                                    max_depth=None,
                                                    max_features='auto',
                                                    max_leaf_nodes=None,
                                                    max_samples=None,
                                                    min_impurity_decrease=0.0,
                                                    min_impurity_split=None,
                                                    min_samples_leaf=1,
                                                    min_samples_split=2,
                                                    min_weight_fraction_leaf=0.0,
              

In [18]:
gs.best_score_

0.812716049382716

In [0]:
optimized_parameters = gs.best_estimator_.get_params()

In [20]:
res = print_score(gs, X_train, Y_train, X_test, Y_test, train=True)
print('\n------------------------------\n')
rr = print_score(gs, X_train, Y_train, X_test, Y_test, train=False)
res.append(rr)
print('\n------------------------------\n')
res.insert(0,'Optimized Random Forest')
print(res)
results.append(res)

Train Result:

accuracy score: 0.9513

Average Accuracy: 	 0.7890
Accuracy SD: 		 0.0435

------------------------------

Test Result:

accuracy score: 0.8444


------------------------------

['Optimized Random Forest', 0.951310861423221, 0.7890123456790124, 0.043462927456628324, 0.8444444444444444]


#Extra Trees

In [0]:
from sklearn.ensemble import ExtraTreesClassifier

In [0]:
et = ExtraTreesClassifier (random_state=42, n_estimators=100)

In [23]:
et.fit(X_train,Y_train)

ExtraTreesClassifier(bootstrap=False, ccp_alpha=0.0, class_weight=None,
                     criterion='gini', max_depth=None, max_features='auto',
                     max_leaf_nodes=None, max_samples=None,
                     min_impurity_decrease=0.0, min_impurity_split=None,
                     min_samples_leaf=1, min_samples_split=2,
                     min_weight_fraction_leaf=0.0, n_estimators=100,
                     n_jobs=None, oob_score=False, random_state=42, verbose=0,
                     warm_start=False)

In [24]:
res = print_score(et, X_train, Y_train, X_test, Y_test, train=True)
print('\n------------------------------\n')
rr = print_score(et, X_train, Y_train, X_test, Y_test, train=False)
res.append(rr)
print('\n------------------------------\n')
res.insert(0,'Extra Trees')
print(res)
results.append(res)

Train Result:

accuracy score: 0.9800

Average Accuracy: 	 0.7802
Accuracy SD: 		 0.0428

------------------------------

Test Result:

accuracy score: 0.8000


------------------------------

['Extra Trees', 0.9800249687890137, 0.7802314814814815, 0.042828478252007945, 0.8]


In [25]:
print(results)

[['Decision Tree', 0.9800249687890137, 0.7553086419753087, 0.031189275706037394, 0.7666666666666667], ['Random Forest', 0.9800249687890137, 0.7977314814814813, 0.04473451349384348, 0.8111111111111111], ['Optimized Random Forest', 0.951310861423221, 0.7890123456790124, 0.043462927456628324, 0.8444444444444444], ['Extra Trees', 0.9800249687890137, 0.7802314814814815, 0.042828478252007945, 0.8]]


## AdaBoost

In [0]:
from sklearn.ensemble import AdaBoostClassifier

In [0]:
ada = AdaBoostClassifier()

In [28]:
ada.fit(X_train, Y_train)

AdaBoostClassifier(algorithm='SAMME.R', base_estimator=None, learning_rate=1.0,
                   n_estimators=50, random_state=None)

In [29]:
res = print_score(ada, X_train, Y_train, X_test, Y_test, train=True)
print('\n------------------------------\n')
rr = print_score(ada, X_train, Y_train, X_test, Y_test, train=False)
res.append(rr)
print('\n------------------------------\n')
res.insert(0,'AdaBoost')
print(res)
results.append(res)

Train Result:

accuracy score: 0.8402

Average Accuracy: 	 0.7877
Accuracy SD: 		 0.0582

------------------------------

Test Result:

accuracy score: 0.8444


------------------------------

['AdaBoost', 0.8401997503121099, 0.7877314814814815, 0.05819842203325183, 0.8444444444444444]


## AdaBoost with Random Forest

In [0]:
from sklearn.ensemble import RandomForestClassifier

In [0]:
ada_rf = AdaBoostClassifier(RandomForestClassifier())

In [32]:
ada_rf.fit(X_train, Y_train)

AdaBoostClassifier(algorithm='SAMME.R',
                   base_estimator=RandomForestClassifier(bootstrap=True,
                                                         ccp_alpha=0.0,
                                                         class_weight=None,
                                                         criterion='gini',
                                                         max_depth=None,
                                                         max_features='auto',
                                                         max_leaf_nodes=None,
                                                         max_samples=None,
                                                         min_impurity_decrease=0.0,
                                                         min_impurity_split=None,
                                                         min_samples_leaf=1,
                                                         min_samples_split=2,
                                       

In [33]:
res = print_score(ada_rf, X_train, Y_train, X_test, Y_test, train=True)
print('\n------------------------------\n')
rr = print_score(ada_rf, X_train, Y_train, X_test, Y_test, train=False)
res.append(rr)
print('\n------------------------------\n')
res.insert(0,'AdaBoost + Random Forest')
print(res)
results.append(res)

Train Result:

accuracy score: 0.9800

Average Accuracy: 	 0.7777
Accuracy SD: 		 0.0467

------------------------------

Test Result:

accuracy score: 0.8333


------------------------------

['AdaBoost + Random Forest', 0.9800249687890137, 0.7776851851851853, 0.04666005244132461, 0.8333333333333334]


##XGBoost

In [0]:
import xgboost as xgb

In [0]:
xgb_clf = xgb.XGBClassifier(max_depth=5, n_estimators=10000, learning_rate=0.3, n_jobs=-1)

In [36]:
xgb_clf.fit(X_train, Y_train)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0,
              learning_rate=0.3, max_delta_step=0, max_depth=5,
              min_child_weight=1, missing=None, n_estimators=10000, n_jobs=-1,
              nthread=None, objective='binary:logistic', random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
              silent=None, subsample=1, verbosity=1)

In [37]:
res = print_score(xgb_clf, X_train, Y_train, X_test, Y_test, train=True)
print('\n------------------------------\n')
rr = print_score(xgb_clf, X_train, Y_train, X_test, Y_test, train=False)
res.append(rr)
print('\n------------------------------\n')
res.insert(0,'XGBoost')
print(res)
results.append(res)

Train Result:

accuracy score: 0.9800

Average Accuracy: 	 0.7777
Accuracy SD: 		 0.0589

------------------------------

Test Result:

accuracy score: 0.8222


------------------------------

['XGBoost', 0.9800249687890137, 0.777746913580247, 0.05894200376604195, 0.8222222222222222]


#Print results

In [38]:
df_results = pd.DataFrame.from_records(results)
df_results.columns = ['Classifier', 'Train accuracy score', 'Mean accuracy score', 'Standard deviation', 'Test accuracy score']
df_results.head(10)

Unnamed: 0,Classifier,Train accuracy score,Mean accuracy score,Standard deviation,Test accuracy score
0,Decision Tree,0.980025,0.755309,0.031189,0.766667
1,Random Forest,0.980025,0.797731,0.044735,0.811111
2,Optimized Random Forest,0.951311,0.789012,0.043463,0.844444
3,Extra Trees,0.980025,0.780231,0.042828,0.8
4,AdaBoost,0.8402,0.787731,0.058198,0.844444
5,AdaBoost + Random Forest,0.980025,0.777685,0.04666,0.833333
6,XGBoost,0.980025,0.777747,0.058942,0.822222


## Kaggle test

In [0]:
test = pd.read_csv('test.csv')

# Data preprocessing.
# 1) Binarization of the Sex column:
lb = LabelBinarizer()
test['Sex'] = lb.fit_transform(test['Sex'])
# 2) fille the age aps with the averge (from the training set!):
test.fillna(value={'Age':av_age}, inplace=True)

# Feature engineering:
test["Fare_indiv"] = test["Fare"] / ((1 + test["SibSp"] + test["Parch"]) * 10)
test["Age"] = (1 + test["SibSp"] + test["Parch"]) + test["Age"] / 80

In [40]:
# There is 1 missing value for the fares:
test[pd.isna(test['Fare'])]

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Fare_indiv
152,1044,3,"Storey, Mr. Thomas",1,1.75625,0,0,3701,,,S,


In [41]:
# We'll assign the average value (from the train dataset!) of the ticket for the 3rd class:
train_pclass3 = train[train['Pclass'] == 3]
av_ticket = train['Fare_indiv'].mean()
print(av_ticket)
test.fillna(value={'Fare_indiv':av_ticket}, inplace=True)

1.9916375111917584


In [45]:
# Feature selection:
Xtest = test[['Sex', 'Age', 'Fare_indiv']]
Xtest.describe()

Unnamed: 0,Sex,Age,Fare_indiv
count,418.0,418.0,418.0
mean,0.636364,2.216645,2.179964
std,0.481622,1.515415,3.559713
min,0.0,1.15,0.0
25%,0.0,1.3625,0.76344
50%,1.0,1.45,0.86625
75%,1.0,2.596875,2.598281
max,1.0,11.371239,26.2375


In [0]:
# We use the random forest with the optimized hyperparateres as our model for prediction.
krf = RandomForestClassifier(**optimized_parameters)
krf.fit(X,Y)
Ytest = krf.predict(Xtest)
test['Survived'] = Ytest
test[['PassengerId', 'Survived']].to_csv("submission22.csv", index=False)

With this notebook I could score 0.77511 in Kaggle