In [1]:
import pandas as pd
import numpy as np
from pycaret.classification import * #Classification
train = pd.read_csv("data/train.csv")
test = pd.read_csv("data/test.csv")

train.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [2]:
alldata = pd.concat([train, test],axis=0)

# Missing value completion

### For aiming more high accuracy, using "title"(s.t. Sir. Mr. Ms. etc) from column Name and add as new feature. 


In [3]:
def get_title(name):
    if '.' in name:
        return name.split(',')[1].split('.')[0].strip()
    else:
        return 'Unknown'
    
def replace_titles(x):
    title = x['Title']
    if title in ['Capt', 'Col', 'Don', 'Jonkheer', 'Major', 'Rev', 'Sir']:
        return 'Mr'
    elif title in ['the Countess', 'Mme', 'Lady']:
        return 'Mrs'
    elif title in ['Mlle', 'Ms']:
        return 'Miss'
    elif title =='Dr':
        if x['Sex']=='male':
            return 'Mr'
        else:
            return 'Mrs'
    else:
        return title

In [4]:
alldata['Title'] = alldata['Name'].map(lambda x: get_title(x))
alldata['Title'] = alldata.apply(replace_titles, axis=1)

In [5]:
train = alldata.iloc[:train.shape[0],:]
test = alldata.iloc[train.shape[0]:,:]

# Create Model by PyCaret

In [6]:
from pycaret.classification import *
clf = setup(train, target = 'Survived',categorical_features=["Pclass","Sex","Embarked"], 
        numeric_features=["Age","SibSp","Parch","Fare"],ignore_features=["PassengerId","Name","Ticket","Cabin"])

Unnamed: 0,Description,Value
0,session_id,6901
1,Target,Survived
2,Target Type,Binary
3,Label Encoded,"0.0: 0, 1.0: 1"
4,Original Data,"(891, 13)"
5,Missing Values,True
6,Numeric Features,4
7,Categorical Features,4
8,Ordinal Features,False
9,High Cardinality Features,False


In [7]:
compare = compare_models()

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
lr,Logistic Regression,0.8332,0.8736,0.7515,0.8096,0.7744,0.6431,0.6494,0.286
ridge,Ridge Classifier,0.8284,0.0,0.739,0.8069,0.7673,0.6324,0.6378,0.002
lda,Linear Discriminant Analysis,0.8268,0.872,0.739,0.804,0.7657,0.6293,0.6349,0.003
lightgbm,Light Gradient Boosting Machine,0.8233,0.8761,0.718,0.8076,0.7536,0.6179,0.6263,0.043
gbc,Gradient Boosting Classifier,0.8204,0.8725,0.7015,0.8266,0.7461,0.6104,0.6263,0.011
catboost,CatBoost Classifier,0.8155,0.8725,0.6808,0.8203,0.7359,0.5975,0.6103,1.187
nb,Naive Bayes,0.8091,0.8502,0.697,0.7955,0.7356,0.5883,0.5978,0.002
ada,Ada Boost Classifier,0.8058,0.8546,0.743,0.7511,0.7444,0.5883,0.5909,0.01
xgboost,Extreme Gradient Boosting,0.8041,0.8761,0.7142,0.762,0.7329,0.5795,0.5843,0.073
rf,Random Forest Classifier,0.785,0.8606,0.71,0.7319,0.7146,0.5434,0.5493,0.028


In [8]:
lr = create_model('lr')
tuned_lr = tune_model(lr, optimize = 'Accuracy')

Unnamed: 0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,0.8571,0.9209,0.7917,0.8261,0.8085,0.6947,0.6951
1,0.8254,0.9071,0.9167,0.7097,0.8,0.6495,0.6663
2,0.7302,0.8126,0.64,0.6667,0.6531,0.4324,0.4327
3,0.8548,0.9079,0.7917,0.8261,0.8085,0.6917,0.6921
4,0.8065,0.8065,0.5833,0.875,0.7,0.5654,0.5908
5,0.8871,0.9079,0.7917,0.9048,0.8444,0.7565,0.7606
6,0.8065,0.8454,0.6667,0.8,0.7273,0.5792,0.585
7,0.8387,0.8958,0.7083,0.85,0.7727,0.6493,0.6558
8,0.8065,0.761,0.6667,0.8,0.7273,0.5792,0.585
9,0.9194,0.9649,0.9583,0.8519,0.902,0.8339,0.838


# Model interpretation

In [9]:
evaluate_model(tuned_lr)

interactive(children=(ToggleButtons(description='Plot Type:', icons=('',), options=(('Hyperparameters', 'param…

In [14]:
test_predictions = predict_model(tuned_lr, data=test)
test_predictions.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Title,Label,Score
0,892,,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q,Mr,0.0,0.9304
1,893,,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S,Mrs,1.0,0.5585
2,894,,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q,Mr,0.0,0.9006
3,895,,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S,Mr,0.0,0.9234
4,896,,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S,Mrs,1.0,0.6524


In [16]:
submission=pd.DataFrame({'PassengerId':test_predictions['PassengerId'],'Survived':test_predictions['Label']})

list = []
for i in range(test.shape[0]):
    list.append(submission.loc[i,"Survived"][0])
    
submission['Survived']=list

submission.to_csv("submission_LogisticRegression.csv",index=False)

# Make Blender models

### For aiming higher score, let's blend the models. In this time, I will blend the best 5 models and predict the score

In [22]:
blender_specific = blend_models(estimator_list = compare_models(n_select = 5), method = 'auto')

Unnamed: 0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,0.8413,0.0,0.7917,0.7917,0.7917,0.6635,0.6635
1,0.8413,0.0,0.875,0.75,0.8077,0.6739,0.6797
2,0.7302,0.0,0.64,0.6667,0.6531,0.4324,0.4327
3,0.8226,0.0,0.7083,0.8095,0.7556,0.6173,0.6207
4,0.7903,0.0,0.5833,0.8235,0.6829,0.533,0.5507
5,0.871,0.0,0.75,0.9,0.8182,0.7195,0.7266
6,0.8065,0.0,0.6667,0.8,0.7273,0.5792,0.585
7,0.8387,0.0,0.7083,0.85,0.7727,0.6493,0.6558
8,0.8065,0.0,0.6667,0.8,0.7273,0.5792,0.585
9,0.9355,0.0,0.9583,0.8846,0.92,0.8661,0.868


In [18]:
blender_final = finalize_model(blender_specific)
test_predictions = predict_model(blender_final, data=test)
test_predictions.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Title,Label
0,892,,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q,Mr,0.0
1,893,,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S,Mrs,1.0
2,894,,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q,Mr,0.0
3,895,,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S,Mr,0.0
4,896,,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S,Mrs,1.0


In [19]:
evaluate_model(blender_final)

interactive(children=(ToggleButtons(description='Plot Type:', icons=('',), options=(('Hyperparameters', 'param…

<Figure size 576x396 with 0 Axes>

In [21]:
submission=pd.DataFrame({'PassengerId':test_predictions['PassengerId'],'Survived':test_predictions['Label']})

list = []
for i in range(test.shape[0]):
    list.append(submission.loc[i,"Survived"][0])
    
submission['Survived']=list

submission.to_csv("submission_Blender.csv",index=False)