Titanic Dataset Model
=====================

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path

import re

from sklearn.model_selection import train_test_split, StratifiedShuffleSplit, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.base import TransformerMixin, BaseEstimator
from sklearn.linear_model import LinearRegression
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, log_loss
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis, QuadraticDiscriminantAnalysis

In [37]:
# Turn off pandas' chained-assignment check for the whole session
# (SettingWithCopyWarning is no longer emitted); pandas' default mode is 'warn'.
pd.set_option("mode.chained_assignment", None)

Import the Data
---------------

In [2]:
# The CSV files are expected in a 'Data' folder inside the current working directory.
data_path = Path.cwd() / 'Data'

# Read the three CSV files in a fixed order: train, test, gender_submission.
df_train, df_test, df_gender_sub = (
    pd.read_csv(data_path / csv_name)
    for csv_name in ('train.csv', 'test.csv', 'gender_submission.csv')
)

# Train/test frames and their display names, kept in matching order.
list_of_df_names = ["Train", "Test"]
list_of_df = [df_train, df_test]

Build the Preprocessing Pipeline and Fit the Model
--------------------------------------------------

In [85]:
class EmptyValuesRowRemover:
    """Drop the rows of a DataFrame that have missing values in selected columns.

    Parameters
    ----------
    columns : list, optional
        Column labels to check for missing values. When omitted or empty,
        every column of the frame passed to ``Remover`` is checked.

    Attributes
    ----------
    columns : list
        Private copy of the labels given at construction (``[]`` if none).
    column_list : list
        Labels that were checked by the most recent ``Remover`` call.

    Raises
    ------
    AssertionError
        If ``columns`` is given and is not a list.
    """

    def __init__(self, columns=None):
        # ``None`` replaces the former ``[]`` default: a mutable default is a
        # single object shared by every instance, so mutating one instance's
        # ``columns`` silently changed the default for all later instances.
        if columns is None:
            columns = []
        # Validate before storing anything on the instance.
        assert isinstance(columns, list), "This class expects a list of columns to be passed at initiation."
        # Copy so later changes to the caller's list do not leak in.
        self.columns = list(columns)

    def Remover(self, X):
        """Return ``X`` without the rows that are null in any checked column.

        Parameters
        ----------
        X : pandas.DataFrame
            Frame to filter. It is not modified.

        Returns
        -------
        pandas.DataFrame
            The rows of ``X`` whose checked columns are all non-null, with the
            original index labels preserved.

        Raises
        ------
        KeyError
            If a configured column is not present in ``X``.
        """
        # An empty configuration means "check every column of X".
        self.column_list = list(self.columns) if self.columns else list(X)
        return X.dropna(subset=self.column_list)

In [80]:
class HonorificExtractor(BaseEstimator, TransformerMixin):
    """Replace a name column with a coarse ``Title`` category.

    The first honorific found in each name (e.g. "Mrs", "Dr", "Master") is
    looked up in ``title_dictionary`` and stored in a new ``Title`` column;
    the name column itself is dropped.

    Parameters
    ----------
    column : str
        Label of the column holding the names.
    """

    # dictionary to map to generate the new feature vector
    title_dictionary = {
        "capt":"Officer", 
        "col":"Officer", 
        "major":"Officer", 
        "dr":"Officer",
        "jonkheer":"Royalty",
        "rev":"Officer",
        "countess":"Royalty",
        "dona":"Royalty",
        "don":"Royalty",
        "mr":"Mr",
        "mme":"Mrs",
        "ms":"Mrs",
        "mrs":"Mrs",
        "miss":"Miss",
        "mlle":"Miss",
        "master":"Master",
        "nan":"Mr"
    }

    # Key of ``title_dictionary`` used when no honorific can be extracted.
    _missing_title = "nan"

    # Compiled once for the class instead of on every call.
    # The word boundaries are the fix: without them "Mr" matched the start of
    # "Mrs", "Don" the start of "Dona", and fragments of surnames were read as
    # titles ("Drew" -> "Dr", "Williams" -> "Ms").
    _title_regex = re.compile(
        r'\b(?:Mrs|Mr|Dona|Don|Major|Capt|Jonkheer|Rev|Col|Dr|Countess|Mme|Ms|Miss|Mlle|Master)\b',
        re.IGNORECASE
    )

    def __init__(self, column):
        self.column = column

    def get_title(self, string):
        """Return the lower-cased first honorific found in ``string``.

        Returns ``"nan"`` when ``string`` is not a ``str`` (e.g. a missing
        value) or contains none of the known honorifics as a whole word.
        """
        if not isinstance(string, str):
            return self._missing_title
        match = self._title_regex.search(string)
        if match is None:
            return self._missing_title
        return match.group().lower()

    def fit(self, X, y=None):
        """Nothing is learned from the data; return ``self`` unchanged."""
        return self

    def transform(self, X):
        """Return a copy of ``X`` with ``Title`` added and the name column dropped.

        Parameters
        ----------
        X : pandas.DataFrame
            Frame containing ``self.column``. It is not modified.

        Returns
        -------
        pandas.DataFrame
            New frame with a ``Title`` column holding the mapped category and
            without ``self.column``.
        """
        titles = X.loc[:, self.column].apply(self.get_title).map(self.title_dictionary)
        # ``assign`` builds a new frame, so the caller's DataFrame is never
        # written to (the original added ``Title`` to the input in place).
        return X.assign(Title=titles).drop(self.column, axis=1)

In [81]:
# Feature-engineering stage that runs before the column-wise preprocessing.
# Its single step swaps the "Name" column for a derived "Title" column.
PrePreprocessingPipe = Pipeline(steps=[
    ("he", HonorificExtractor(column="Name")),
])

In [82]:
# Columns routed through the numeric branch of the preprocessing.
numeric_features = ['Age', 'Fare', 'SibSp']

# Numeric branch: fill missing values with the column median, then standardise.
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
])

In [83]:
# The ColumnTransformer below needs a categorical branch, but no visible cell
# defines `categorical_features` or `categorical_transformer`, so this cell
# raised NameError on a fresh kernel (Restart & Run All). They are defined here.
# NOTE(review): the column choice is a reconstruction. 'Title' is produced by
# HonorificExtractor; 'Sex' and 'Embarked' assume the standard Kaggle Titanic
# columns. Confirm against the intended feature set.
categorical_features = ['Sex', 'Embarked', 'Title']
categorical_transformer = Pipeline(
    steps=[
        # Fill missing categories with the most common value of the column.
        ('imputer', SimpleImputer(strategy='most_frequent')),
        # Categories that appear only at prediction time are encoded as all
        # zeros instead of raising an error.
        ('onehot', OneHotEncoder(handle_unknown='ignore'))
    ]
)

# Full preprocessing: derive Title from Name, then transform the numeric and
# categorical columns separately. Columns not listed in either branch are
# dropped (ColumnTransformer's default `remainder`).
PreprocessingPipeline = Pipeline(
    steps=[
        ("pp", PrePreprocessingPipe), 
        ("ct", ColumnTransformer(
            transformers=[
                ('num', numeric_transformer, numeric_features),
                ('cat', categorical_transformer, categorical_features)
            ]
        ))
    ]
)


In [91]:
# Final estimator: the shared preprocessing followed by logistic regression.
Model = Pipeline(
    steps=[
        ('pp', PreprocessingPipeline),
        ('classifier', LogisticRegression(solver='lbfgs'))
    ]
)

# Keep only the rows whose Age is known, in both frames.
age_row_filter = EmptyValuesRowRemover(columns=["Age"])
df_train = age_row_filter.Remover(df_train)
# Fix: the test frame was previously built from df_train (copy-paste slip), so
# the model was "tested" on its own training rows and the submission contained
# training passengers instead of test passengers.
# NOTE(review): this drops test passengers with a missing Age, so they get no
# prediction in the submission. The numeric branch already median-imputes Age;
# confirm whether the test set should be filtered at all.
df_test = age_row_filter.Remover(df_test)

# Features are every column except the target; the target is 'Survived'.
X_train = df_train.drop('Survived', axis='columns')
X_test = df_test
y_train = df_train.loc[:,'Survived']

Model.fit(X_train, y_train.values.ravel())
y_predict = Model.predict(X_test)

# One row per scored test passenger: its id and the predicted label.
df_results = pd.DataFrame({"PassengerId": X_test.PassengerId, "Survived": y_predict})
df_results.to_csv("submission_NEW.csv", index=False)

In [88]:
# Display the (rows, columns) shape of df_train as a quick sanity check.
df_train.shape

(714, 12)