In [52]:
# Re-Train Classification Model for Titanic
# calculate Cross-Validation Precision, Recall, F-Score
# create ML pipeline
# check for missing values
# handle categorical data

# Loading Data
import numpy as np
import pandas as pd

# Read Training & Testing Data
X_full = pd.read_csv("./titanic/train.csv")
X_test = pd.read_csv("./titanic/test.csv")


# X_full has 891 rows, 12 columns
# NOTE: too many missing values in Cabin to get rid of all observations; 
# better to not use cabin
X_full.shape

X_test.head()

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S


In [2]:
# Sum all missing values (Training Data) (Age, n = 177; Cabin, n = 687; Embarked, n = 2)
# NOTE: NO missing target "Survived"

X_full.isnull().sum()

# Sum all missing values (Testing Data) (Age, n = 86; Cabin, n = 327; Fare, n = 1)
#X_test.isnull().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

In [None]:
# Remove rows with missing "target" (no rows needed to be removed)

# Separate target from predictors
y = X_full.Survived
X_full.drop(['Survived'], axis=1, inplace=True)

#X_full.shape  (11 rows instead of 12)

In [7]:
X_full.shape

(891, 11)

In [8]:
import pandas as pd
from sklearn.model_selection import train_test_split

# Break off validation set from training data
# Divide "full data" into training and validation subsets
X_train_full, X_valid_full, y_train, y_valid = train_test_split(X_full, y, train_size=0.8, test_size=0.2, random_state=0)



In [14]:
#X_train_full.head()
#X_valid_full.head()
#y_train.head()
#y_valid.head()

495    0
648    0
278    0
31     1
255    1
Name: Survived, dtype: int64

In [10]:
# Missing Values
# First, Need to drop Cabin column, drop rows with missing values in 'Age' OR (extension to imputation of Age)
# Then, Select Categorical columns with relatively low cardinality
# Then, Select Numerical Columns
X_train_full.head()

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
140,141,3,"Boulos, Mrs. Joseph (Sultana)",female,,0,2,2678,15.2458,,C
439,440,2,"Kvillner, Mr. Johan Henrik Johannesson",male,31.0,0,0,C.A. 18723,10.5,,S
817,818,2,"Mallet, Mr. Albert",male,31.0,1,1,S.C./PARIS 2079,37.0042,,C
378,379,3,"Betros, Mr. Tannous",male,20.0,0,0,2648,4.0125,,C
491,492,3,"Windelov, Mr. Einar",male,21.0,0,0,SOTON/OQ 3101317,7.25,,S


In [None]:
# Drop Cabin column with missing values
missing_col = ['Cabin']

# Drop Cabin from training and validation data
reduced_X_train_full = X_train_full.drop(missing_col, axis=1)
reduced_X_valid_full = X_valid_full.drop(missing_col, axis=1)
reduced_X_test = X_test.drop(missing_col, axis=1)

In [23]:
# Prepare to Impute 'Age'

age_missing = ['Age']

['Age']


In [24]:
# Create an Extension to Imputation on 'Age'

# Make copy to avoid changing original data (when imputing)
X_train_plus = reduced_X_train_full.copy()
X_valid_plus = reduced_X_valid_full.copy()
X_test_plus = reduced_X_test.copy()

In [28]:
# Make new columns indicating what will be imputated (Extension to Imputation, see above)
for col in age_missing:
    X_train_plus[col + '_was_missing'] = X_train_plus[col].isnull()
    X_valid_plus[col + '_was_missing'] = X_valid_plus[col].isnull()
    X_test_plus[col + '_was_missing'] = X_test_plus[col].isnull()    # not needed for X_test
    

In [40]:
X_train_plus.head()
X_valid_plus.head()
X_test_plus.head() # actually no need to create one for X_test

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Embarked,Age_was_missing
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,Q,False
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,S,False
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,Q,False
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,S,False
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,S,False


In [46]:
## PREPARING FOR PIPELINES ##

# Cardinality means the number of unique values in a column
# Select categorical columns with relatively low cardinality (convenient but arbitrary)

categorical_cols = [cname for cname in X_train_plus.columns if X_train_plus[cname].nunique() < 10 and X_train_plus[cname].dtype == "object"]

# Select numerical columns
numerical_cols = [cname for cname in X_train_plus.columns if X_train_plus[cname].dtype in ['int64', 'float64']]

# Keep selected columns only
my_cols = categorical_cols + numerical_cols
X_train_pipe = X_train_plus[my_cols].copy()
X_valid_pipe = X_valid_plus[my_cols].copy()

In [48]:
X_valid_pipe.head()

Unnamed: 0,Sex,Embarked,PassengerId,Pclass,Age,SibSp,Parch,Fare
495,male,C,496,3,,0,0,14.4583
648,male,S,649,3,,0,0,7.55
278,male,Q,279,3,7.0,4,1,29.125
31,female,C,32,1,,1,0,146.5208
255,female,C,256,3,29.0,0,2,15.2458


In [49]:
# Imputation + Pipeline
# imputes missing values in Numerical data (Age)
# imputes missing values in Categorical Data, w/ one-hot encoding (Embark)

# Define Preprocessing Steps

from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder

# Preprocessing for numerical data
numerical_transformer = SimpleImputer(strategy="constant")

# Preprocessing for categorical data
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

# Bundle Preprocessing for numerical and categorical data
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_cols),
        ('cat', categorical_transformer, categorical_cols)
    ])

In [50]:
# NOTE: Trying with original first submission model, but with missing data imputation and pipeline
# Define the Model

from sklearn.ensemble import RandomForestClassifier

model = RandomForestClassifier(n_estimators=100, max_depth=5, random_state=1)

In [54]:
# Creating and Evaluating the Pipeline
# NOTE: Using cross_val_score to evaluate Classification Model

from sklearn.model_selection import cross_val_score

#cross_val_score(sgd_clf, X_train1, y_train_5f, cv=3, scoring='accuracy')

# Bundle preprocessing and modeling code in a pipeline
my_pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                              ('model', model)
                             ])

# Preprocessing of training data, fit model
my_pipeline.fit(X_train_pipe, y_train)

# Preprocessing of validation data, get predictions
preds = my_pipeline.predict(X_test)

output = pd.DataFrame({'PassengerId': X_test.PassengerId, 'Survived': preds})

#output.to_csv('new_rf_submission_titanic.csv', index=False)
print("Submission outputted and saved")

# Evaluate the model
#score = cross_val_score(model, X_train_pipe, y_train, cv=3, scoring='accuracy')
#print("Cross Validation Score:", score)

Submission outputted and saved




In [60]:
############################################
#### NOT GREAT - predict all 0 or 1    #####
############################################

# Try building SGDClassifier
# Pick a classifier - Stochastic Gradient Descent (SGD)

from sklearn.linear_model import SGDClassifier

sgd_model = SGDClassifier(random_state=42)

In [61]:
############################################
#### NOT GREAT - predict all 0 or 1    #####
############################################


# Bundle preprocessing and (sgd) modeling code in a pipeline
sgd_pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                               ('model', sgd_model)
                              ])

# preprocessing of training data, fit model
sgd_pipeline.fit(X_train_pipe, y_train)

# preprocessing of validation data, get predictions
sgd_preds = sgd_pipeline.predict(X_test)

sgd_output = pd.DataFrame({'PassengerId': X_test.PassengerId, 'Survived': sgd_preds})
#sgd_output.to_csv('new_sgd_submission_titanic.csv', index=False)
#print("Submission SGD output and saved")

Submission SGD output and saved


