In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.preprocessing import OneHotEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import VotingClassifier
from xgboost import XGBClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import cross_val_score


# Load the data
train_data = pd.read_csv('train.csv')
test_data = pd.read_csv('test.csv')

Data Overview

In [2]:
print(train_data.columns)
print(test_data.columns)
print(len(train_data), 'records for the training dataset.')
print(len(test_data), 'records for the testing dataset. ')
print(train_data.isnull().sum())
print('-----------')
print('-----------')
print(test_data.isnull().sum())
for f in test_data.columns:
    print(train_data[[f, 'Survived']].groupby(f).mean().sort_values(by='Survived', ascending=False))

Index(['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp',
       'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked'],
      dtype='object')
Index(['PassengerId', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp', 'Parch',
       'Ticket', 'Fare', 'Cabin', 'Embarked'],
      dtype='object')
891 records for the training dataset.
418 records for the testing dataset. 
PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64
-----------
-----------
PassengerId      0
Pclass           0
Name             0
Sex              0
Age             86
SibSp            0
Parch            0
Ticket           0
Fare             1
Cabin          327
Embarked         0
dtype: int64
             Survived
PassengerId          
446               1.0
320               1.0
335               1.0
331               1.0
330        

In [3]:
train_data.describe()

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare
count,891.0,891.0,891.0,714.0,891.0,891.0,891.0
mean,446.0,0.383838,2.308642,29.699118,0.523008,0.381594,32.204208
std,257.353842,0.486592,0.836071,14.526497,1.102743,0.806057,49.693429
min,1.0,0.0,1.0,0.42,0.0,0.0,0.0
25%,223.5,0.0,2.0,20.125,0.0,0.0,7.9104
50%,446.0,0.0,3.0,28.0,0.0,0.0,14.4542
75%,668.5,1.0,3.0,38.0,1.0,0.0,31.0
max,891.0,1.0,3.0,80.0,8.0,6.0,512.3292


Data Preprocessing

In [4]:
def get_title(var):
    title = var.split(',')[1]
    title2 = title.split('.')[0]
    return title2[1:]

In [5]:
def get_ftype(var):
  if var<5:
      return 'small'
  elif var<10:
      return 'medium'
  else:
      return 'large'

In [6]:
# Preprocess the data
def preprocess_data(data):
    data['Title'] = data.apply(lambda x: get_title(x.Name), axis = 1)
    data['FamSize'] = data.apply(lambda x: x.Parch+x.SibSp,axis=1)
    data['FamType'] = data.apply(lambda x: get_ftype(x.FamSize),axis=1)
    data['Age'].fillna(data['Age'].median(), inplace=True)
    scaler = StandardScaler()
    data["AgeBucket"] = data["Age"] // 15 * 15
    data['AgeBucket'] = scaler.fit_transform(data[['AgeBucket']])
    data['Age'] = scaler.fit_transform(data[['Age']])
    data['Embarked'].fillna(data['Embarked'].mode()[0], inplace=True)
    data['FamSize'] = scaler.fit_transform(data[['FamSize']])
    data['Fare'].fillna(data['Fare'].median(), inplace=True)
    data["Fare2"] = data["Fare"] // 1000 * 1000 
    data['Fare2'] = scaler.fit_transform(data[['Fare2']])
    data['Fare'] = scaler.fit_transform(data[['Fare']])
    # Encode categorical columns
    label_encoder = OneHotEncoder(sparse_output=False)
    data['Sex'] = label_encoder.fit_transform(data['Sex'].values.reshape(-1,1))
    data['Title'] = label_encoder.fit_transform(data['Title'].values.reshape(-1,1))
    data['FamType'] = label_encoder.fit_transform(data['FamType'].values.reshape(-1,1))
    data['Embarked'] = label_encoder.fit_transform(data['Embarked'].values.reshape(-1,1))
    return data

tr_data = preprocess_data(train_data)
te_data = preprocess_data(test_data) #'transform' would be more suitable

In [8]:
tdata = tr_data.copy()
corr_matrix = tdata.corr(numeric_only=True)
corr_matrix["Survived"].sort_values(ascending=False)

Survived       1.000000
Fare           0.257307
Embarked       0.085221
Parch          0.081629
Fare2          0.042470
Sex            0.011329
FamSize        0.011329
AgeBucket      0.008185
PassengerId   -0.005007
FamType       -0.026456
Age           -0.026456
Title         -0.026456
SibSp         -0.035322
Pclass        -0.338481
Name: Survived, dtype: float64

Selecting Features 

In [7]:
features = ['Pclass','Title','Age','Sex','Fare','Fare2','AgeBucket','FamSize','FamType','Embarked']
target = 'Survived'

X = tr_data[features]
y = tr_data[target]

Model Training and Comparisons

In [8]:
# Split the data into train and validation sets
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)
accuracy = []
table = []
# Train the model
emodel = ExtraTreesClassifier(n_estimators=100, random_state=42)
xmodel = XGBClassifier(random_state=42)
fmodel = RandomForestClassifier(n_estimators=100, random_state=42)
lmodel = LogisticRegression(random_state=42)
svm_clf = SVC(gamma="auto",random_state=42)
voting_clf = VotingClassifier(
 estimators=[('rf', fmodel), ('lr', lmodel), ('svc', svm_clf),('et',emodel)],
 voting='hard')

models = {"ExtraTreesClassifier":emodel,"XGBClassifier":xmodel,"RandomForestClassifier":fmodel,"LogisticRegression":lmodel,"SVC":svm_clf,"VotingClassifier":voting_clf}
for model in models:
    print(model)
    models[model].fit(X_train, y_train)
    y_pred = models[model].predict(X_val)
    print("Accuracy: ",accuracy_score(y_val, y_pred))
    print(classification_report(y_val,y_pred))

ExtraTreesClassifier
Accuracy:  0.7932960893854749
              precision    recall  f1-score   support

           0       0.84      0.80      0.82       105
           1       0.73      0.78      0.76        74

    accuracy                           0.79       179
   macro avg       0.79      0.79      0.79       179
weighted avg       0.80      0.79      0.79       179

XGBClassifier
Accuracy:  0.7932960893854749
              precision    recall  f1-score   support

           0       0.83      0.81      0.82       105
           1       0.74      0.77      0.75        74

    accuracy                           0.79       179
   macro avg       0.79      0.79      0.79       179
weighted avg       0.79      0.79      0.79       179

RandomForestClassifier
Accuracy:  0.8268156424581006
              precision    recall  f1-score   support

           0       0.84      0.88      0.86       105
           1       0.81      0.76      0.78        74

    accuracy                      

Model Optimizations

In [9]:
LR = LogisticRegression(random_state=42,max_iter=3000)
penalty = ['l1', 'l2']

# Create regularization hyperparameter space
C = np.logspace(-2, 2, 100)


lr_param_grid = [{'penalty': ['l2'], 'C': C},
                 {'penalty': ['l1'], 'C': C, 'solver':['saga']}]

clf = GridSearchCV(LogisticRegression(), lr_param_grid)

gsLR = GridSearchCV(LR,param_grid = lr_param_grid, cv=5, scoring="accuracy")

gsLR.fit(X_train,y_train)

LR_best = gsLR.best_estimator_

print('Best Penalty:', LR_best.get_params()['penalty'])
print('Best C:', LR_best.get_params()['C'])

Best Penalty: l1
Best C: 0.11233240329780277


In [10]:
param_grid = [
 {'C': np.logspace(-2, 2, 10), 'gamma': np.logspace(-2, 2, 10)}
 ]
svm_clf = SVC(random_state=42)
grid_search = GridSearchCV(svm_clf, param_grid, cv=5,
 scoring='accuracy',
return_train_score=True)
grid_search.fit(X_train, y_train)
svc = grid_search.best_estimator_
grid_search.best_estimator_

In [19]:
param_grid = [
 {'class_weight': ['balanced',None],'n_estimators': [50, 100, 150], 'max_features': [2, 4, 6, 8]},
 {'bootstrap':[False], 'n_estimators': [50, 100, 150], 'max_features': [2, 3, 4]},
 ]
forest_reg = RandomForestClassifier()
grid_search = GridSearchCV(forest_reg, param_grid, cv=5,
 scoring='accuracy',
return_train_score=True)
grid_search.fit(X_train, y_train)
forest = grid_search.best_estimator_
forest

Model Comparisons II

In [28]:
models = {"ExtraTreesClassifier":emodel,"XGBClassifier":xmodel,"RandomForestClassifier":forest,"LogisticRegression":LR_best,"SVC":svc,"VotingClassifier":voting_clf}
for model in models:
    print(model)
    score = cross_val_score(models[model], X_train, y_train, cv=10)
    print("Mean Accuracy: ",score.mean())
    y_pred = models[model].predict(X_val)
    print(classification_report(y_val,y_pred))

ExtraTreesClassifier
Mean Accuracy:  0.7711071987480438
              precision    recall  f1-score   support

           0       0.84      0.80      0.82       105
           1       0.73      0.78      0.76        74

    accuracy                           0.79       179
   macro avg       0.79      0.79      0.79       179
weighted avg       0.80      0.79      0.79       179

XGBClassifier
Mean Accuracy:  0.7977699530516432
              precision    recall  f1-score   support

           0       0.83      0.81      0.82       105
           1       0.74      0.77      0.75        74

    accuracy                           0.79       179
   macro avg       0.79      0.79      0.79       179
weighted avg       0.79      0.79      0.79       179

RandomForestClassifier
Mean Accuracy:  0.7907472613458529
              precision    recall  f1-score   support

           0       0.83      0.83      0.83       105
           1       0.76      0.76      0.76        74

    accuracy       

Submission

In [26]:
model = voting_clf
model.fit(X_train, y_train)
X_test = te_data[features]
predictions = model.predict(X_test)

# Prepare submission file
submission = pd.DataFrame({'PassengerId': test_data['PassengerId'], 'Survived': predictions})
submission.to_csv('submission.csv', index=False)