In [1]:
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer,TfidfTransformer
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.model_selection import train_test_split, cross_val_score, cross_val_predict
from sklearn.metrics import accuracy_score, precision_score, confusion_matrix, classification_report
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn import svm, linear_model
from sklearn.model_selection import cross_val_score, GridSearchCV
from sklearn.neighbors import KNeighborsClassifier, KNeighborsRegressor
from sklearn.preprocessing import StandardScaler,PolynomialFeatures, LabelEncoder,FunctionTransformer
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
import statsmodels.api as sm
import numpy as np
from scipy import stats
%matplotlib inline
titanic = pd.read_csv('train.csv')
testdf = pd.read_csv('test.csv')

  from pandas.core import datetools


In [2]:
titanic['Cabin'] =titanic['Cabin'].fillna(value='?')
titanic['Embarked'] =titanic['Embarked'].fillna(value='S')
titanic['Age'] =titanic['Age'].fillna(value='?')
testdf['Cabin'] =testdf['Cabin'].fillna(value='?')
testdf['Embarked'] =testdf['Embarked'].fillna(value='S')
testdf['Age'] =testdf['Age'].fillna(value='?')
testdf.loc[152,'Fare']=13.6755 # we fill in the missing fare from the test data here. 

In [3]:
def titlepull(title_string):
    title_list = title_string.split()
    for possible_title in title_list:
        if possible_title[-1] == '.':
            return possible_title
def cabinpull(cabin_string):
    if cabin_string[0]=='?':
        return '?'
    else:
        return cabin_string[0]
    

The most important thing to understand is that before you do a pipline, you must convert categorical variables into the pandas category object. To do this, you must check the possible values present in both training and test data, and ensure that the category contains all possibilities. Not doing this will lead to estimators working on the training data, but it will fail on the test data. 

In [4]:
titanic['Deck']=titanic.Cabin.apply(cabinpull)
titanic['Title']= titanic.Name.apply(titlepull)
testdf['Deck']=testdf.Cabin.apply(cabinpull)
testdf['Title']= testdf.Name.apply(titlepull)

In [5]:
titanic.Deck.value_counts()

?    687
C     59
B     47
D     33
E     32
A     15
F     13
G      4
T      1
Name: Deck, dtype: int64

In [6]:
testdf.Deck.value_counts()

?    327
C     35
B     18
D     13
E      9
F      8
A      7
G      1
Name: Deck, dtype: int64

The missing deck information is a real problem: even though from EDA I know that cabin matters, it is simply missing on too much of the data to be safely imputed. 

In [7]:
decks_list = ['?','C','B','D','E','F','A','G','T']

In [8]:
titanic.Title.value_counts()

Mr.          517
Miss.        182
Mrs.         125
Master.       40
Dr.            7
Rev.           6
Major.         2
Col.           2
Mlle.          2
Sir.           1
Countess.      1
Lady.          1
Capt.          1
Don.           1
Ms.            1
Jonkheer.      1
Mme.           1
Name: Title, dtype: int64

In [9]:
testdf.Title.value_counts()

Mr.        240
Miss.       78
Mrs.        72
Master.     21
Col.         2
Rev.         2
Dona.        1
Ms.          1
Dr.          1
Name: Title, dtype: int64

There are decisions to be made at this step that are work making after having done eda. Specifically, I'm leaving the decks as dummies, rather than transforming the decks into a numeric value based on counting from the bottom or top deck. 
In the case of the titles, I'm going to use Royalty, and Officer to collect all of the weird titles. 

In [10]:
name_mappings = {
    'Dona.':'Royalty',
    'Countess.':'Royalty',
    'Sir.':'Royalty',
    'Jonkheer.':'Royalty',
    'Don.':'Royalty',
    'Lady.':'Royalty',
    'Ms.':'Miss.',
    'Col.':'Officer',
    'Capt.':'Officer',
    'Major.':'Officer',
    'Col.':'Officer',
    'Mlle.':'Miss.',
    'Mme.':'Mrs.',
    'Mrs.':'Mrs.',
    'Miss.':'Miss',
    'Mr.':'Mr.',
    'Master.':'Master.',
    'Dr.':'Dr.',
    'Rev.':'Rev.'
    
    
}


In [11]:
titanic['Title'].map(name_mappings, )

titanic['Title']= titanic['Title'].map(name_mappings)
testdf['Title']= testdf['Title'].map(name_mappings)


In [12]:
titanic.Title.value_counts()

Mr.        517
Miss       182
Mrs.       126
Master.     40
Dr.          7
Rev.         6
Officer      5
Royalty      5
Miss.        3
Name: Title, dtype: int64

In [13]:
titanic[titanic['Title'].isnull()]

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Deck,Title


In [14]:
testdf[testdf['Title'].isnull()]

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Deck,Title


In [15]:
validtitles = list(set(name_mappings.values()))

So, to explain this code further, I use the dictionary I've made for the map function and return all of the values. With all of the values, I turn them into a set, which ignores duplicates, then I transform that into a list to be fed into the category.

In [16]:
titanic.Title = titanic['Title'].astype("category", categories=validtitles, ordered=False)

In [17]:
testdf.Title = testdf['Title'].astype("category", categories=validtitles, ordered=False)

In [18]:
titanic.Deck = titanic['Deck'].astype("category", categories=decks_list, ordered=False)

In [19]:
testdf.Deck = testdf['Deck'].astype("category", categories=decks_list, ordered=False)

If I wanted to do my 'Dead_Wife' features, I would need to do that outside of a pipline. This is because pipelines cannot 'remember' things from previous runs. This is also a great indication that my 'dead spouse' feature is fun little hack for this dataset, but is unlikely to be replicated in most environments. 

In [20]:
def titlepull(title_string):
    title_list = title_string.split()
    for possible_title in title_list:
        if possible_title[-1] == '.':
            return possible_title
def cabinpull(cabin_string):
    if cabin_string[0]=='?':
        return '?'
    else:
        return cabin_string[0]
    
def tit_deck(df):
    deck_dummies = pd.get_dummies(df.Deck,prefix='Deck')
    return deck_dummies

tit_deck_tf = FunctionTransformer(tit_deck,validate=False)

def tit_class(df):
    pclass_dummies=   pd.get_dummies(df.Pclass,prefix='plcass')
    return pclass_dummies

tit_class_tf = FunctionTransformer(tit_class, validate=False)

def tit_title(df):
    title_dummies= pd.get_dummies(df.Title,prefix='Title')
    return title_dummies

tit_title_tf  = FunctionTransformer(tit_title, validate=False)



def tit_features(df):
    df['IsFemale'] = (df.Sex=='female').astype(int)
    return df[['IsFemale','Fare','Pclass']]

tit_features_tf  = FunctionTransformer(tit_features, validate=False)

def tit_family(df):
    df['FamilyCount']= (df['SibSp'])+(df['Parch'])
    fam_dummies=  pd.get_dummies(df.FamilyCount,prefix='family')
    return fam_dummies
               
tit_family_tf  = FunctionTransformer(tit_family, validate=False)


In [21]:
#age_transformers = [('family',tit_family_tf),('class',tit_class_tf),('deck',tit_deck_tf),('features',tit_features_tf),('title',tit_title_tf),('pclass',tit_pclass_tf)]

In [22]:
age_transformers =[('features',tit_features_tf),('title',tit_title_tf),('deck',tit_deck_tf),('class',tit_class_tf)]

To Explain: Any variable that is dummied, must be objected before. If you do not do this (As I didn't for family, parents, children), when a train test split occur for cross validation, it will find a value of family that is missing from the test split, and it will fail. 

In [23]:
def numeric_features(df):
    return df.select_dtypes(exclude= ['object'])
numeric_features_tf = FunctionTransformer(numeric_features, validate=False)

In [24]:
fu_age_impute = FeatureUnion(age_transformers)

So, by breaking this down into a pipeline, we can turn features on and off by re-running only a few cells. 

In [25]:
knr = KNeighborsRegressor(n_jobs=-1)
lr = LinearRegression()
ss = StandardScaler()

In [26]:
age_impute_pipe_kn =  Pipeline(
    [
        ('fu_age',fu_age_impute),
        ('ss',ss),
        ('knr',knr)
    ]

)

In [27]:
age_impute_pipe_lr =  Pipeline(
    [
        ('fu',fu_age_impute),
        ('ss',ss),
        ('lr',lr)
    ]

)

In [28]:
titanic_ages =titanic[(titanic['Age']!='?')]
titanic_no_ages = titanic[(titanic['Age']=='?')]

test_ages =testdf[(testdf['Age']!='?')]
test_no_ages = testdf[(testdf['Age']=='?')]


In [29]:
age_params = {
    
}

In [30]:
gs_age_kn = GridSearchCV(age_impute_pipe_kn, param_grid=age_params, cv=5)
gs_age_lr = GridSearchCV(age_impute_pipe_lr, param_grid=age_params, cv=5)

In [31]:
X_train = titanic_ages.drop('Age',axis=1)
y_train= titanic_ages['Age']


In [32]:
X_test1 =titanic_no_ages.drop('Age', axis=1)
X_test2 = test_no_ages.drop('Age', axis=1)

In [33]:
gs_age_lr.fit(X_train,y_train)
print(gs_age_lr.best_score_)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


-2.32130031494e+22


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


In [34]:
gs_age_kn.fit(X_train,y_train)
print(gs_age_kn.best_score_)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is tryin

0.264809838666


In [35]:
age_impute_pipe_kn.fit(X_train,y_train)

Pipeline(steps=[('fu_age', FeatureUnion(n_jobs=1,
       transformer_list=[('features', FunctionTransformer(accept_sparse=False,
          func=<function tit_features at 0x7fd7106abc80>, inv_kw_args=None,
          inverse_func=None, kw_args=None, pass_y=False, validate=False)), ('title', FunctionTransformer...kowski',
          metric_params=None, n_jobs=-1, n_neighbors=5, p=2,
          weights='uniform'))])

In [36]:
age_impute_pipe_kn.score(X_train,y_train)


0.4894247266532521

In [37]:
age_impute_pipe_kn.predict(X_test1)

array([29.2, 26.2, 35.8, 32.5, 21.2, 31.4, 42.4, 25.2, 21.2, 31.4, 25.8,
       31.2, 25.2, 24.2, 38.7, 46.4, 7.4, 31.4, 25.8, 20.6, 25.8, 25.8,
       31.4, 24.8, 6.8, 25.8, 49.5, 22.8, 33.0, 29.1, 31.2, 5.8, 41.6,
       52.2, 5.4, 10.6, 41.8, 41.2, 28.8, 49.5, 25.2, 26.4, 49.5, 31.4,
       5.6, 20.0, 12.5, 11.9, 29.1, 40.8, 49.5, 25.2, 44.8, 25.2, 34.2,
       55.0, 46.4, 38.7, 25.2, 30.6, 25.9, 25.8, 30.0, 26.4, 6.0, 40.8,
       31.4, 28.8, 38.7, 32.5, 21.2, 21.2, 31.2, 35.8, 25.2, 40.8, 31.4,
       42.7, 5.6, 31.4, 28.0, 34.2, 35.8, 31.4, 29.1, 49.5, 28.8, 23.8,
       23.8, 25.8, 51.0, 49.5, 25.8, 34.2, 42.7, 29.1, 41.2, 34.2, 5.6,
       23.8, 29.3, 31.1, 19.6, 46.4, 25.8, 30.6, 32.5, 21.2, 39.4, 21.2,
       33.2, 29.3, 37.6, 23.8, 42.8, 49.5, 25.8, 22.0, 21.2, 25.2, 32.8,
       31.2, 25.8, 25.2, 10.2, 32.5, 31.4, 42.6, 28.4, 11.9, 49.5, 42.7,
       46.8, 29.5, 26.4, 29.0, 31.4, 21.8, 31.4, 24.8, 40.8, 34.2, 22.0,
       26.4, 24.2, 7.4, 40.5, 31.2, 24.2, 34.2, 31.4, 31.4,

In [38]:
X_test1.shape

(177, 14)

In [39]:
X_train.shape

(714, 14)

In [40]:
X_test1.columns

Index(['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'SibSp', 'Parch',
       'Ticket', 'Fare', 'Cabin', 'Embarked', 'Deck', 'Title', 'IsFemale'],
      dtype='object')

In [41]:
X_train.columns

Index(['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'SibSp', 'Parch',
       'Ticket', 'Fare', 'Cabin', 'Embarked', 'Deck', 'Title', 'IsFemale'],
      dtype='object')