In [13]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

# Pipeline at a glance:
load data -> cleaning/preprocessing -> feature eng -> format data for modelling -> fit model -> evaluate model -> generate submission file

## Step 1: Load data

In [14]:
df_train = pd.read_csv('../input/train.csv')
df_test = pd.read_csv('../input/test.csv')

# for convenience
datasets = [df_train, df_test]

## Step 2: Basic cleaning and preprocessing

In [15]:
# Fill in missing values
#mAge = pd.concat((df_train['Age'], df_test['Age']), axis=0).mean()
medFare = pd.concat((df_train['Fare'], df_test['Fare']), axis=0).median()
for df in datasets:
    #df['Age'] = df['Age'].fillna(mAge)
    df['Fare'] = df['Fare'].fillna(medFare)
    df['Embarked'] = df['Embarked'].fillna('S')

## Step 3: Feature engineering

In [16]:
# Functions that returns the title from a name. All the name in the dataset has the format "Surname, Title. Name"
def get_title(name):
    if '.' in name:
        return name.split(',')[1].split('.')[0].strip()
    else:
        return 'Unknown'
    
train_titles = []
test_titles = []

for name in df_train['Name']:
    title = get_title(name)
    train_titles.append(title)
    
for name in df_test['Name']:
    title = get_title(name)
    test_titles.append(title)


df_train['Title'] = train_titles
df_test['Title'] = test_titles
     
df_train.loc[(df_train['Title'] != 'Mr') & (df_train['Title'] != 'Mrs') & (df_train['Title'] != 'Miss') & (df_train['Title'] != 'Master'), 'Title'] = 'Misc'
df_test.loc[(df_test['Title'] != 'Mr') & (df_test['Title'] != 'Mrs') & (df_test['Title'] != 'Miss') & (df_test['Title'] != 'Master'), 'Title'] = 'Misc'

# See value counts of each title
pd.value_counts(df_train['Title'])

Mr        517
Miss      182
Mrs       125
Master     40
Misc       27
Name: Title, dtype: int64

In [19]:
# so i guess grouped is of length train+test. which is i think where things are getting screwed up

grouped = (pd.concat((df_train, df_test), ignore_index=True)).groupby(['Title'])  

# opted for median value to iron out any weirdness title selection might've given
grouped['Age'].median()

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  This is separate from the ipykernel package so we can avoid doing imports until


Title
Master     4.0
Misc      44.5
Miss      22.0
Mr        29.0
Mrs       35.5
Name: Age, dtype: float64

In [20]:
#df_test - without NaN changes
df_test['Age']

0      34.5
1      47.0
2      62.0
3      27.0
4      22.0
5      14.0
6      30.0
7      26.0
8      18.0
9      21.0
10      NaN
11     46.0
12     23.0
13     63.0
14     47.0
15     24.0
16     35.0
17     21.0
18     27.0
19     45.0
20     55.0
21      9.0
22      NaN
23     21.0
24     48.0
25     50.0
26     22.0
27     22.5
28     41.0
29      NaN
       ... 
388    21.0
389     6.0
390    23.0
391    51.0
392    13.0
393    47.0
394    29.0
395    18.0
396    24.0
397    48.0
398    22.0
399    31.0
400    30.0
401    38.0
402    22.0
403    17.0
404    43.0
405    20.0
406    23.0
407    50.0
408     NaN
409     3.0
410     NaN
411    37.0
412    28.0
413     NaN
414    39.0
415    38.5
416     NaN
417     NaN
Name: Age, Length: 418, dtype: float64

In [21]:
# This is where it gets funky

# this works great
df_train['Age'] = df_train['Age'].fillna(grouped.Age.apply(lambda x: x.fillna(x.median())))

# this doesn't - is replacing with ages from train. see index: 10, 22, 29 in test following this function
# compare them to the matching indices in train, ages are the same
df_test['Age'] = df_test['Age'].fillna(grouped.Age.apply(lambda x: x.fillna(x.median())))
df_test['Age']

0      34.5
1      47.0
2      62.0
3      27.0
4      22.0
5      14.0
6      30.0
7      26.0
8      18.0
9      21.0
10      4.0
11     46.0
12     23.0
13     63.0
14     47.0
15     24.0
16     35.0
17     21.0
18     27.0
19     45.0
20     55.0
21      9.0
22     15.0
23     21.0
24     48.0
25     50.0
26     22.0
27     22.5
28     41.0
29     29.0
       ... 
388    21.0
389     6.0
390    23.0
391    51.0
392    13.0
393    47.0
394    29.0
395    18.0
396    24.0
397    48.0
398    22.0
399    31.0
400    30.0
401    38.0
402    22.0
403    17.0
404    43.0
405    20.0
406    23.0
407    50.0
408    21.0
409     3.0
410    29.0
411    37.0
412    28.0
413    29.0
414    39.0
415    38.5
416    34.0
417    18.0
Name: Age, Length: 418, dtype: float64

In [23]:
df_train['Age']

0      22.0
1      38.0
2      26.0
3      35.0
4      35.0
5      29.0
6      54.0
7       2.0
8      27.0
9      14.0
10      4.0
11     58.0
12     20.0
13     39.0
14     14.0
15     55.0
16      2.0
17     29.0
18     31.0
19     35.5
20     35.0
21     34.0
22     15.0
23     28.0
24      8.0
25     38.0
26     29.0
27     19.0
28     22.0
29     29.0
       ... 
861    21.0
862    48.0
863    22.0
864    24.0
865    42.0
866    27.0
867    31.0
868    29.0
869     4.0
870    26.0
871    47.0
872    33.0
873    47.0
874    28.0
875    15.0
876    20.0
877    19.0
878    29.0
879    56.0
880    25.0
881    33.0
882    22.0
883    28.0
884    25.0
885    39.0
886    27.0
887    19.0
888    22.0
889    26.0
890    32.0
Name: Age, Length: 891, dtype: float64

In [24]:
# find fare bins based on frequency rather than value
garbage, fare_bins = pd.qcut(df_train['Fare'].append(df_test['Fare']), 4, retbins=True)
# loops through both train and test set for convenience
for df in datasets:
    # bin age
    df['Age_binned'] = pd.cut(df['Age'], [0,16,32,48,64,200], labels = [0,1,2,3,4], retbins=False)
    
    # bin fare
    df['Fare_binned'] = pd.cut(df['Fare'], fare_bins, labels = [0,1,2,3], include_lowest=True, retbins=False)
    
    # family features
    df['Family_size'] = df_train['SibSp'] + df_train['Parch']
    df['Is_Alone'] = (df['Family_size'] == 0).astype(int)

In [25]:
# drop unneeded rows
for df in datasets:
    df.drop(['Age', 'SibSp', 'Parch', 'Ticket', 'Fare', 'Cabin', 'Name', 'Fare_binned', 'Embarked', 'Is_Alone'], axis=1, inplace=True)
    
# need to keep passengerID for submission
# now dropping embarked and is_alone, since they're probably useless

## Step 4: Format data for modelling

In [26]:
# label encode sex since binary
le = LabelEncoder()
le.fit(df_train['Sex'])
df_train.loc[:,'Sex'] = le.transform(df_train['Sex'])
df_test.loc[:,'Sex'] = le.transform(df_test['Sex'])

# one hot encode title
df_train = pd.get_dummies(df_train, columns=['Title'])
df_test = pd.get_dummies(df_test, columns=['Title'])

In [27]:
# Split data into X and y, and select features to use
X = df_train.drop(['Survived', 'PassengerId'], axis=1)
y = df_train['Survived']
X_test = df_test.drop(['PassengerId'], axis=1)

In [28]:
# Split data X and y into train and val sets
X_train, X_val, y_train, y_val = train_test_split(
      X, y, test_size=0.2, random_state=42)

In [29]:
X_train

Unnamed: 0,Pclass,Sex,Age_binned,Family_size,Title_Master,Title_Misc,Title_Miss,Title_Mr,Title_Mrs
331,1,1,2,0,0,0,0,1,0
733,2,1,1,0,0,0,0,1,0
382,3,1,1,0,0,0,0,1,0
704,3,1,1,1,0,0,0,1,0
813,3,0,0,6,0,0,1,0,0
118,1,1,1,1,0,0,0,1,0
536,1,1,2,0,0,1,0,0,0
361,2,1,1,1,0,0,0,1,0
29,3,1,1,0,0,0,0,1,0
55,1,1,1,0,0,0,0,1,0


In [30]:
df_train.isnull().sum()

PassengerId     0
Survived        0
Pclass          0
Sex             0
Age_binned      0
Family_size     0
Title_Master    0
Title_Misc      0
Title_Miss      0
Title_Mr        0
Title_Mrs       0
dtype: int64

## Step 5: Train (or fit) the model

In [31]:
# Fit logistic regression
lr = LogisticRegression()
lr.fit(X_train, y_train)



LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='warn',
          tol=0.0001, verbose=0, warm_start=False)

In [32]:
# fit rf
rf = RandomForestClassifier(random_state=12)
rf.fit(X_train, y_train)



RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=None,
            oob_score=False, random_state=12, verbose=0, warm_start=False)

## Step 6: Evaluate the model

In [33]:
# define a scoring function
def acc(y: np.array, y_pred: np.array) -> float:
    return np.sum(y_pred==y)/len(y)

In [34]:
# Use model to predict on train and val sets
y_pred_trn_lr = lr.predict(X_train)
y_pred_val_lr = lr.predict(X_val)

y_pred_trn_rf = rf.predict(X_train)
y_pred_val_rf = rf.predict(X_val)

In [35]:
print('LR train set accuracy', acc(y_train, y_pred_trn_lr))
print('LR val set accuracy', acc(y_val, y_pred_val_lr))
print('RF train set accuracy', acc(y_train, y_pred_trn_rf))
print('RF val set accuracy', acc(y_val, y_pred_val_rf))

LR train set accuracy 0.8314606741573034
LR val set accuracy 0.8100558659217877
RF train set accuracy 0.8525280898876404
RF val set accuracy 0.8044692737430168


## Step 7: Generate Submission

In [36]:
# fit on whole dataset
lr.fit(X, y)

# Predict for test set
y_pred_test = lr.predict(X_test)

# Create a Kaggle submission
sub = pd.DataFrame({'PassengerId': df_test['PassengerId'],
                    'Survived': y_pred_test})

sub.to_csv('week_4_baseline.csv', index=False)

