In [1]:
import pandas as pd
import numpy as np
import sklearn

# Show all columns when displaying .head() or .describe().
# The fully qualified option name is required: the bare 'max_columns' pattern is
# ambiguous on pandas versions that also define 'styler.render.max_columns', and
# set_option raises OptionError when a pattern matches more than one option.
pd.set_option('display.max_columns', None)

In [2]:
# Load the training set, then split the "Survived" label column off from the features.
X_train = pd.read_csv("train.csv")
y_train = X_train.pop("Survived")  # pop removes the column from X_train and returns it

# The test set is loaded as-is.
X_test = pd.read_csv("test.csv")

In [3]:
# Summarise column dtypes and non-null counts to see which features have missing values
X_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 11 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Pclass       891 non-null    int64  
 2   Name         891 non-null    object 
 3   Sex          891 non-null    object 
 4   Age          714 non-null    float64
 5   SibSp        891 non-null    int64  
 6   Parch        891 non-null    int64  
 7   Ticket       891 non-null    object 
 8   Fare         891 non-null    float64
 9   Cabin        204 non-null    object 
 10  Embarked     889 non-null    object 
dtypes: float64(2), int64(4), object(5)
memory usage: 76.7+ KB


The "Cabin" feature contains a lot of missing values (only 204 of 891 entries are non-null), so it is better to simply drop that feature rather than impute it. Leaving out the "Name" and "Ticket" features is most likely not going to affect our predictions too adversely (although there could certainly be a relationship between them and survival rate), and processing names and ticket numbers for useful information would prove quite difficult, so we ignore them for now.

In [4]:
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer

# Numeric features: fill missing values with the column mean.
num_pipeline = Pipeline(steps=[("Imputer", SimpleImputer(strategy='mean'))])

# Categorical features: fill missing values with the most frequent category, then
# one-hot encode. handle_unknown='ignore' makes a category that was not seen during
# fit encode as all zeros instead of raising at transform time (e.g. inside a
# cross-validation fold or on the test set).
# NOTE: this step is a OneHotEncoder; the "LabelEncoder" step name is kept unchanged
# so that existing parameter paths through the pipeline keep working.
cat_pipeline = Pipeline(steps=[
    ("Imputer", SimpleImputer(strategy='most_frequent')),
    ("LabelEncoder", OneHotEncoder(handle_unknown='ignore')),
])

# Route each column group through its own pipeline and join the results.
# Columns not listed here are dropped (ColumnTransformer's default remainder='drop').
preprocess = ColumnTransformer(transformers=[
    ("Numeric", num_pipeline, ["Pclass", "Age", "SibSp", "Parch", "Fare"]),
    ("Categorical", cat_pipeline, ["Sex", "Embarked"]),
])

In [5]:
from sklearn.ensemble import RandomForestClassifier

# Random forest with 250 trees; the fixed random_state keeps results repeatable across runs.
model = RandomForestClassifier(
    n_estimators=250,
    random_state=0,
)

After experimenting with several models (KNN, SVM, random forest), a random forest classifier seems to perform best on this data set (with a cross-validation accuracy of about 81%). So, we train a classifier with 250 trees.

In [6]:
# Chain preprocessing and the classifier so both are fitted together on the training data.
final_model = Pipeline([
    ("Preprocessor", preprocess),
    ("Model", model),
])

final_model.fit(X_train, y_train)

Pipeline(steps=[('Preprocessor',
                 ColumnTransformer(transformers=[('Numeric',
                                                  Pipeline(steps=[('Imputer',
                                                                   SimpleImputer())]),
                                                  ['Pclass', 'Age', 'SibSp',
                                                   'Parch', 'Fare']),
                                                 ('Categorical',
                                                  Pipeline(steps=[('Imputer',
                                                                   SimpleImputer(strategy='most_frequent')),
                                                                  ('LabelEncoder',
                                                                   OneHotEncoder())]),
                                                  ['Sex', 'Embarked'])])),
                ('Model',
                 RandomForestClassifier(n_estimators=250, random_state=

In [7]:
from sklearn.model_selection import cross_val_score

# The dataset is rather small, so we can't afford to split it into separate training
# and validation sets; score the model with 5-fold cross-validation on the training
# set instead.
accuracy = cross_val_score(estimator=final_model, X=X_train, y=y_train, cv=5)
print(accuracy.mean())

0.8103571652752495


In [8]:
# Predict on the test data. Writing the Kaggle submission CSV is currently disabled
# (the block below is commented out).
predictions = final_model.predict(X_test)
print(predictions)
# submission = pd.DataFrame({"PassengerId": X_test.PassengerId,
#                            "Survived": predictions})
# submission.to_csv("submission.csv", index=False)

[0 0 0 1 1 0 0 0 1 0 0 0 1 0 1 1 0 1 0 1 1 1 1 1 1 0 1 1 1 0 0 0 1 0 1 1 0
 0 0 1 0 0 0 1 1 0 0 0 1 1 0 0 1 1 0 0 0 0 0 1 0 0 0 1 1 1 1 0 0 1 1 0 0 0
 1 0 0 1 0 1 1 0 0 0 0 0 1 0 1 1 0 0 1 0 0 0 1 0 0 0 1 0 0 0 1 0 0 0 0 0 0
 1 1 0 1 0 0 1 0 1 1 0 1 0 0 1 0 1 0 0 1 0 0 0 0 0 0 0 0 0 0 1 0 0 1 0 0 0
 0 0 1 0 0 1 0 0 1 1 1 1 1 1 1 0 0 0 0 0 1 0 0 0 0 0 0 1 1 1 1 1 0 0 1 0 1
 0 1 0 0 0 0 0 1 0 1 0 1 0 0 0 1 1 0 1 0 0 0 0 1 0 0 0 0 0 1 0 1 0 1 0 1 0
 1 1 1 0 0 1 0 0 0 1 0 0 1 0 1 1 1 1 1 1 0 0 0 0 1 0 1 0 1 0 0 0 0 0 0 0 1
 0 0 0 1 1 0 0 0 0 0 0 0 0 1 1 0 1 0 0 0 0 0 1 1 0 1 0 0 0 0 0 0 0 0 0 0 0
 1 0 0 0 0 0 0 0 1 1 1 1 0 0 0 1 0 0 1 1 0 0 0 0 0 0 0 1 1 0 1 0 0 0 1 1 0
 0 0 0 0 0 0 0 0 0 0 1 0 1 0 1 0 1 1 0 0 0 1 0 1 0 0 1 0 1 1 1 1 0 0 0 1 1
 0 1 0 0 1 1 0 0 0 0 0 0 1 0 1 1 0 0 0 0 0 1 0 0 0 1 0 1 0 0 1 0 1 0 0 0 0
 0 1 1 1 1 0 0 1 0 0 1]
