score = .77272

# Import, Load, EDA

In [None]:
# Install dependencies: PyCaret with its "full" set of optional extras.
# The leading `!` hands the line to the shell from the notebook, and `--pre`
# lets pip pick pre-release versions.
!pip install --pre pycaret[full]



In [None]:
import pandas as pd
from pycaret.classification import *

In [None]:
# Read the training and test CSV files from the working directory
train, test = (pd.read_csv(csv_path) for csv_path in ('train.csv', 'test.csv'))

We installed PyCaret, imported its classification module, and loaded the training and test data.

EDA

In [None]:
# Feature engineering applied to both the training and test sets
def engineer_features(df: pd.DataFrame, fare_bins=None) -> pd.DataFrame:
    """Add engineered passenger features to ``df`` and return it.

    The new columns are assigned directly on ``df``, so the frame passed in
    is modified in place and the same object is returned.

    Columns added:
        Title: the word preceding the first ``.`` in ``Name``; uncommon
            titles are collapsed to ``'Rare'`` and French/alternate forms
            are merged into ``'Miss'`` / ``'Mrs'``.
        AgeBand: ``Age`` cut into fixed right-closed intervals from 0 to 80;
            missing ages (and ages outside ``(0, 80]``) stay missing.
        FareBand: ``Fare`` split into quartiles of this frame, or cut on
            ``fare_bins`` when given.
        Deck: first character of ``Cabin`` (missing when ``Cabin`` is).
        FamilySize: ``SibSp + Parch + 1``.
        IsAlone: 1 when ``FamilySize`` is 1, otherwise 0.

    Args:
        df: Frame with ``Name``, ``Age``, ``Fare``, ``Cabin``, ``SibSp`` and
            ``Parch`` columns.
        fare_bins: Optional bins forwarded to ``pd.cut`` for ``FareBand``.
            Quartile edges are otherwise computed from ``df`` itself, so two
            frames processed separately get different ``FareBand``
            categories. To share categories, pass the first frame's
            ``df['FareBand'].cat.categories`` when processing the second;
            fares falling outside those intervals become missing.

    Returns:
        The same ``df`` object with the columns above added.
    """
    rare_titles = ['Lady', 'Countess', 'Capt', 'Col', 'Don', 'Dr', 'Major',
                   'Rev', 'Sir', 'Jonkheer', 'Dona']

    # Extract titles. Raw string: `\.` is a regex escape, not a Python one.
    df['Title'] = df['Name'].str.extract(r' ([A-Za-z]+)\.', expand=False)
    df['Title'] = df['Title'].replace(rare_titles, 'Rare')
    df['Title'] = df['Title'].replace(['Mlle', 'Ms'], 'Miss')
    df['Title'] = df['Title'].replace('Mme', 'Mrs')

    # Age bands
    df['AgeBand'] = pd.cut(df['Age'], bins=[0, 12, 20, 30, 40, 50, 60, 70, 80])

    # Fare ranges: per-frame quartiles by default, caller-supplied bins otherwise
    if fare_bins is None:
        df['FareBand'] = pd.qcut(df['Fare'], q=4)
    else:
        df['FareBand'] = pd.cut(df['Fare'], bins=fare_bins)

    # Deck from cabin
    df['Deck'] = df['Cabin'].str[0]

    # Family size grouping
    df['FamilySize'] = df['SibSp'] + df['Parch'] + 1
    df['IsAlone'] = (df['FamilySize'] == 1).astype(int)

    return df

train = engineer_features(train)
test = engineer_features(test)
# pd.qcut derives quartile edges from whichever frame it is given, so the two
# calls above yield different FareBand categories. Re-bin the test fares on the
# training intervals so both frames share one set of categories (test fares
# outside the training range become missing).
test['FareBand'] = pd.cut(test['Fare'], bins=train['FareBand'].cat.categories)

This feature engineering step extracts each passenger's title from their name, merges rare and equivalent titles into a few groups, and creates categorical features from age, fare, cabin deck, and family size to support our analysis of Titanic survival rates.

In [None]:
# Read and prepare data
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')

train = engineer_features(train)
test = engineer_features(test)
# pd.qcut derives quartile edges from whichever frame it is given, so the two
# calls above yield different FareBand categories. Re-bin the test fares on the
# training intervals so both frames share one set of categories (test fares
# outside the training range become missing).
test['FareBand'] = pd.cut(test['Fare'], bins=train['FareBand'].cat.categories)

In [None]:
# EDA: preview the first rows of the engineered training frame
train.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Title,FamilySize,IsAlone
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S,Mr,2,0
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C,Mrs,2,0
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S,Miss,1,1
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S,Mrs,2,0
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S,Mr,1,1


In [None]:
# Count the distinct values in each column
train.nunique()

Unnamed: 0,0
PassengerId,891
Survived,2
Pclass,3
Name,891
Sex,2
Age,88
SibSp,7
Parch,7
Ticket,681
Fare,248


In [None]:
# Summarize column dtypes and non-null counts
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 15 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          891 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
 12  Title        891 non-null    object 
 13  FamilySize   891 non-null    int64  
 14  IsAlone      891 non-null    int64  
dtypes: float64(2), int64(7), object(6)
memory usage: 104.5+ KB


# Pycaret

In [None]:
# Configure the PyCaret classification experiment
setup_options = dict(
    data=train,
    target='Survived',
    # Column roles: continuous inputs, categorical inputs, and columns to leave out
    numeric_features=['Age', 'Fare', 'FamilySize'],
    categorical_features=['Sex', 'Embarked', 'Title', 'Pclass', 'IsAlone', 'Deck', 'AgeBand', 'FareBand'],
    ignore_features=['PassengerId', 'Name', 'Ticket', 'Cabin', 'SibSp', 'Parch'],
    session_id=123,
    # Preprocessing switches
    normalize=True,
    fix_imbalance=True,
    # Cross-validation scheme
    fold_strategy='stratifiedkfold',
    fold=10,
    use_gpu=True,
)
clf = setup(**setup_options)

[LightGBM] [Info] Number of positive: 1, number of negative: 1
[LightGBM] [Info] This is the GPU trainer!!
[LightGBM] [Info] Total Bins 0
[LightGBM] [Info] Number of data points in the train set: 2, number of used features: 0
[LightGBM] [Info] Number of positive: 1, number of negative: 1
[LightGBM] [Info] Number of positive: 1, number of negative: 1
[LightGBM] [Info] This is the GPU trainer!!
[LightGBM] [Info] Total Bins 0
[LightGBM] [Info] Number of data points in the train set: 2, number of used features: 0
[LightGBM] [Info] Number of positive: 1, number of negative: 1
[LightGBM] [Info] Number of positive: 1, number of negative: 1
[LightGBM] [Info] This is the GPU trainer!!
[LightGBM] [Info] Total Bins 0
[LightGBM] [Info] Number of data points in the train set: 2, number of used features: 0
[LightGBM] [Info] Number of positive: 1, number of negative: 1
[LightGBM] [Info] Number of positive: 1, number of negative: 1
[LightGBM] [Info] This is the GPU trainer!!
[LightGBM] [Info] Total Bi

Unnamed: 0,Description,Value
0,Session id,123
1,Target,Survived
2,Target type,Binary
3,Original data shape,"(891, 18)"
4,Transformed data shape,"(1036, 36)"
5,Transformed train set shape,"(768, 36)"
6,Transformed test set shape,"(268, 36)"
7,Ignore features,6
8,Numeric features,3
9,Categorical features,8


[LightGBM] [Info] Number of positive: 1, number of negative: 1
[LightGBM] [Info] This is the GPU trainer!!
[LightGBM] [Info] Total Bins 0
[LightGBM] [Info] Number of data points in the train set: 2, number of used features: 0
[LightGBM] [Info] Number of positive: 1, number of negative: 1
[LightGBM] [Info] Number of positive: 1, number of negative: 1
[LightGBM] [Info] This is the GPU trainer!!
[LightGBM] [Info] Total Bins 0
[LightGBM] [Info] Number of data points in the train set: 2, number of used features: 0
[LightGBM] [Info] Number of positive: 1, number of negative: 1


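Using the engineered features described earlier (titles, age and fare bands, deck, and family size), this step initializes the PyCaret experiment: `Survived` is the target, each feature is declared as numeric or categorical, and the raw columns we no longer need are ignored. We also enable normalization and class-imbalance correction, and evaluate with 10-fold stratified cross-validation.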

In [None]:
# Compare the available models with cross-validation and keep the top three
# (n_select=3) as a list.
# NOTE(review): the cells visible here never read best_models again; the tuning
# cell re-creates its models by ID instead.
best_models = compare_models(n_select=3)

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
gbc,Gradient Boosting Classifier,0.8329,0.8549,0.7447,0.8058,0.7729,0.6413,0.6435,0.683
ada,Ada Boost Classifier,0.8233,0.8461,0.7908,0.7594,0.7726,0.6284,0.6313,0.402
lr,Logistic Regression,0.8072,0.8499,0.7739,0.7406,0.755,0.5965,0.5989,0.297
rf,Random Forest Classifier,0.8026,0.8405,0.7277,0.7562,0.7391,0.5807,0.5835,0.644
ridge,Ridge Classifier,0.8024,0.8529,0.7571,0.7414,0.7456,0.5845,0.5887,0.276
lda,Linear Discriminant Analysis,0.8008,0.8529,0.7571,0.7376,0.7439,0.5813,0.5853,0.281
knn,K Neighbors Classifier,0.7834,0.827,0.7444,0.7127,0.7236,0.5463,0.5521,0.251
et,Extra Trees Classifier,0.7737,0.8101,0.69,0.7197,0.701,0.5192,0.5227,0.59
dt,Decision Tree Classifier,0.7736,0.7547,0.7154,0.7034,0.7076,0.5231,0.525,0.255
svm,SVM - Linear Kernel,0.7094,0.7388,0.7022,0.6246,0.6557,0.4103,0.4145,0.261


Processing:   0%|          | 0/71 [00:00<?, ?it/s]

This tells us that our top three models are the Gradient Boosting Classifier, the Ada Boost Classifier, and Logistic Regression. We focus on accuracy, the fraction of passengers whose survival is predicted correctly. These are the models we carry forward for the rest of the analysis.

In [None]:
# Tune the top 3 models (GBC, Ada Boost, and Logistic Regression).
# Each model is created and then tuned for AUC over 50 search iterations,
# in the order gbc -> ada -> lr.
tuned_gbc, tuned_ada, tuned_lr = [
    tune_model(create_model(model_id), optimize='AUC', n_iter=50)
    for model_id in ('gbc', 'ada', 'lr')
]

Unnamed: 0_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0.873,0.8531,0.8333,0.8333,0.8333,0.7308,0.7308
1,0.8571,0.82,0.7917,0.8261,0.8085,0.6947,0.6951
2,0.8571,0.9316,0.75,0.8571,0.8,0.6897,0.6934
3,0.8548,0.8885,0.7391,0.85,0.7907,0.6804,0.6843
4,0.7903,0.7747,0.6667,0.7619,0.7111,0.5477,0.5507
5,0.8065,0.8289,0.6667,0.8,0.7273,0.5792,0.585
6,0.871,0.864,0.8333,0.8333,0.8333,0.7281,0.7281
7,0.7903,0.8158,0.75,0.72,0.7347,0.5615,0.5618
8,0.7742,0.8739,0.625,0.75,0.6818,0.509,0.5141
9,0.8548,0.898,0.7917,0.8261,0.8085,0.6917,0.6921


Processing:   0%|          | 0/4 [00:00<?, ?it/s]

Unnamed: 0_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0.8254,0.8456,0.7083,0.8095,0.7556,0.6207,0.624
1,0.8095,0.828,0.75,0.75,0.75,0.5962,0.5962
2,0.9206,0.9605,0.9583,0.8519,0.902,0.8357,0.8398
3,0.8065,0.8919,0.6522,0.7895,0.7143,0.5699,0.5759
4,0.7903,0.7873,0.7083,0.7391,0.7234,0.5547,0.555
5,0.7581,0.8098,0.7083,0.68,0.6939,0.494,0.4943
6,0.9032,0.8969,0.875,0.875,0.875,0.7961,0.7961
7,0.7258,0.8268,0.7083,0.6296,0.6667,0.4352,0.4373
8,0.7581,0.8405,0.5833,0.7368,0.6512,0.4698,0.4773
9,0.8387,0.9282,0.875,0.75,0.8077,0.6702,0.6761


Processing:   0%|          | 0/7 [00:00<?, ?it/s]

Fitting 10 folds for each of 50 candidates, totalling 500 fits


Unnamed: 0_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0.8571,0.8499,0.8333,0.8,0.8163,0.6995,0.6999
1,0.8413,0.7986,0.7917,0.7917,0.7917,0.6635,0.6635
2,0.8571,0.9017,0.9167,0.7586,0.8302,0.7088,0.7182
3,0.8226,0.8144,0.7826,0.75,0.766,0.6232,0.6236
4,0.7742,0.8114,0.625,0.75,0.6818,0.509,0.5141
5,0.7581,0.8202,0.7083,0.68,0.6939,0.494,0.4943
6,0.8871,0.869,0.875,0.84,0.8571,0.7639,0.7643
7,0.7742,0.7785,0.75,0.6923,0.72,0.5313,0.5325
8,0.8065,0.869,0.7083,0.7727,0.7391,0.5857,0.5871
9,0.8548,0.9485,0.9167,0.7586,0.8302,0.7054,0.715


Processing:   0%|          | 0/4 [00:00<?, ?it/s]

Processing:   0%|          | 0/7 [00:00<?, ?it/s]

Fitting 10 folds for each of 50 candidates, totalling 500 fits


Unnamed: 0_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0.8571,0.8584,0.7917,0.8261,0.8085,0.6947,0.6951
1,0.7778,0.7874,0.6667,0.7273,0.6957,0.5212,0.5224
2,0.8889,0.9578,0.9167,0.8148,0.8627,0.77,0.7737
3,0.8548,0.8946,0.7826,0.8182,0.8,0.6862,0.6866
4,0.7419,0.8037,0.625,0.6818,0.6522,0.4477,0.4487
5,0.7581,0.7818,0.7083,0.68,0.6939,0.494,0.4943
6,0.871,0.8832,0.875,0.8077,0.84,0.7322,0.7338
7,0.7581,0.813,0.7083,0.68,0.6939,0.494,0.4943
8,0.7742,0.8558,0.625,0.75,0.6818,0.509,0.5141
9,0.8226,0.9342,0.8333,0.7407,0.7843,0.6345,0.6377


Unnamed: 0_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0.8095,0.8339,0.7917,0.7308,0.76,0.6025,0.6039
1,0.8254,0.8088,0.75,0.7826,0.766,0.6268,0.6272
2,0.873,0.9402,0.9167,0.7857,0.8462,0.7391,0.7455
3,0.8387,0.8941,0.7391,0.8095,0.7727,0.6481,0.6497
4,0.7581,0.8202,0.6667,0.6957,0.6809,0.4862,0.4865
5,0.7742,0.7971,0.7083,0.7083,0.7083,0.5241,0.5241
6,0.8871,0.8717,0.875,0.84,0.8571,0.7639,0.7643
7,0.7097,0.7577,0.75,0.6,0.6667,0.4151,0.4232
8,0.7903,0.8432,0.7083,0.7391,0.7234,0.5547,0.555
9,0.8065,0.932,0.8333,0.7143,0.7692,0.6043,0.6096


Processing:   0%|          | 0/4 [00:00<?, ?it/s]

Unnamed: 0_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0.7937,0.8467,0.7917,0.7037,0.7451,0.5728,0.5756
1,0.8254,0.797,0.75,0.7826,0.766,0.6268,0.6272
2,0.873,0.953,0.9583,0.7667,0.8519,0.7431,0.7573
3,0.8226,0.8974,0.7391,0.7727,0.7556,0.6164,0.6168
4,0.7581,0.8476,0.6667,0.6957,0.6809,0.4862,0.4865
5,0.7581,0.8015,0.7083,0.68,0.6939,0.494,0.4943
6,0.8871,0.8761,0.875,0.84,0.8571,0.7639,0.7643
7,0.6935,0.7928,0.7083,0.5862,0.6415,0.378,0.3832
8,0.7903,0.8498,0.7083,0.7391,0.7234,0.5547,0.555
9,0.8548,0.9485,0.9167,0.7586,0.8302,0.7054,0.715


Processing:   0%|          | 0/7 [00:00<?, ?it/s]

Fitting 10 folds for each of 50 candidates, totalling 500 fits


In [None]:
# Combine the three tuned estimators into one blended model
tuned_estimators = [tuned_gbc, tuned_ada, tuned_lr]
blended = blend_models(tuned_estimators)

Unnamed: 0_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0.8413,0.8499,0.7917,0.7917,0.7917,0.6635,0.6635
1,0.8095,0.7927,0.75,0.75,0.75,0.5962,0.5962
2,0.873,0.9541,0.9583,0.7667,0.8519,0.7431,0.7573
3,0.8387,0.8952,0.7826,0.7826,0.7826,0.6544,0.6544
4,0.7742,0.8333,0.6667,0.7273,0.6957,0.5167,0.5179
5,0.7581,0.7982,0.7083,0.68,0.6939,0.494,0.4943
6,0.8871,0.8794,0.875,0.84,0.8571,0.7639,0.7643
7,0.6935,0.8081,0.7083,0.5862,0.6415,0.378,0.3832
8,0.7903,0.8465,0.6667,0.7619,0.7111,0.5477,0.5507
9,0.8387,0.9485,0.9167,0.7333,0.8148,0.6751,0.6883


Processing:   0%|          | 0/6 [00:00<?, ?it/s]

This combines the three tuned models into a single voting ensemble, so each prediction draws on all three models rather than on any one of them.

In [None]:
# Make predictions
# Refit the blended pipeline on all of the labelled data, including the
# hold-out split that setup() reserved, before scoring the unseen test set.
final_model = finalize_model(blended)
predictions = predict_model(final_model, data=test)

This generates survival predictions for the passengers in the test set.

In [None]:
# Build the two-column submission frame and write it out without the index
passenger_ids = test['PassengerId']
survival_labels = predictions['prediction_label']
submission = pd.DataFrame({'PassengerId': passenger_ids, 'Survived': survival_labels})
submission.to_csv('submission.csv', index=False)