In [7]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

# Pipeline at a glance:
load data -> cleaning/preprocessing -> feature eng -> format data for modelling -> fit model -> evaluate model -> generate submission file

## Step 1: Load data

In [8]:
df_train = pd.read_csv('../input/train.csv')
df_test = pd.read_csv('../input/test.csv')

# for convenience
datasets = [df_train, df_test]

## Step 2: Basic cleaning and preprocessing

In [9]:
# Fill in missing values
#mAge = pd.concat((df_train['Age'], df_test['Age']), axis=0).mean()
medFare = pd.concat((df_train['Fare'], df_test['Fare']), axis=0).median()
for df in datasets:
    #df['Age'] = df['Age'].fillna(mAge)
    df['Fare'] = df['Fare'].fillna(medFare)
    df['Embarked'] = df['Embarked'].fillna('S')

## Step 3: Feature engineering

In [10]:
# a blessed kernel: https://www.kaggle.com/manuelatadvice/feature-engineering-titles
# Functions that returns the title from a name. All the name in the dataset has the format "Surname, Title. Name"
def get_title(name):
    if '.' in name:
        return name.split(',')[1].split('.')[0].strip()
    else:
        return 'Unknown'
    
train_titles = []
test_titles = []

for name in df_train['Name']:
    title = get_title(name)
    train_titles.append(title)
    
for name in df_test['Name']:
    title = get_title(name)
    test_titles.append(title)


df_train['Title'] = train_titles
df_test['Title'] = test_titles
     
df_train.loc[(df_train['Title'] != 'Mr') & (df_train['Title'] != 'Mrs') & (df_train['Title'] != 'Miss') & (df_train['Title'] != 'Master'), 'Title'] = 'Misc'
df_test.loc[(df_test['Title'] != 'Mr') & (df_test['Title'] != 'Mrs') & (df_test['Title'] != 'Miss') & (df_test['Title'] != 'Master'), 'Title'] = 'Misc'

#hahahahahahahaha this was very bad of me and i am glad i didnt stick w it 
'''
df_train.loc[df_train['Title'] == 'Dr', 'Title'] = 'Misc'
df_train.loc[df_train['Title'] == 'Rev', 'Title'] = 'Misc'
df_train.loc[df_train['Title'] == 'Mlle', 'Title'] = 'Misc'
df_train.loc[df_train['Title'] == 'Col', 'Title'] = 'Misc'
df_train.loc[df_train['Title'] == 'Major', 'Title'] = 'Misc'
df_train.loc[df_train['Title'] == 'the Countess', 'Title'] = 'Misc'
df_train.loc[df_train['Title'] == 'Sir', 'Title'] = 'Misc'
df_train.loc[df_train['Title'] == 'Capt', 'Title'] = 'Misc'
df_train.loc[df_train['Title'] == 'Don', 'Title'] = 'Misc'
df_train.loc[df_train['Title'] == 'Lady', 'Title'] = 'Misc'
df_train.loc[df_train['Title'] == 'Ms', 'Title'] = 'Misc'
df_train.loc[df_train['Title'] == 'Jonkheer', 'Title'] = 'Misc'
df_train.loc[df_train['Title'] == 'Mme', 'Title'] = 'Misc'
'''

pd.value_counts(df_train['Title'])


Mr        517
Miss      182
Mrs       125
Master     40
Misc       27
Name: Title, dtype: int64

In [15]:
thing = (pd.concat((df_train, df_test), axis=0)).groupby(['Title']).mean()
print(thing)

              Age       Fare     Parch  PassengerId    Pclass     SibSp  \
Title                                                                     
Mr      32.252151  24.782523  0.159841   659.554822  2.381770  0.282695   
Mrs     36.994118  50.560723  0.832487   685.949239  1.939086  0.659898   
Miss    21.774238  42.684873  0.503846   616.753846  2.346154  0.665385   
Master   5.482642  36.008195  1.377049   658.852459  2.655738  2.049180   
Misc    42.656250  45.574756  0.117647   659.823529  1.382353  0.323529   

        Survived  
Title             
Mr      0.156673  
Mrs     0.792000  
Miss    0.697802  
Master  0.575000  
Misc    0.444444  


of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  """Entry point for launching an IPython kernel.


In [26]:
'''
def selectAge(title):
    if title == 'Mr':
        return 32.252
    elif title == 'Mrs':
        return 36.994
    elif title == 'Miss':
        return 21.774
    elif title == 'Master':
        return 5.483
    else:
        return 42.656
'''

for df in datasets:
    df['Age'].fillna(df.groupby(["Title"])["Age"].transform(np.mean), inplace=True)



DataError: No numeric types to aggregate

In [4]:
# find fare bins based on frequency rather than value
garbage, fare_bins = pd.qcut(df_train['Fare'].append(df_test['Fare']), 4, retbins=True)
print(fare_bins)
# loops through both train and test set for convenience
for df in datasets:
    # bin age
    df['Age_binned'] = pd.cut(df['Age'], [0,16,32,48,64,200], labels = [0,1,2,3,4], retbins=False)
    
    # bin fare
    df['Fare_binned'] = pd.cut(df['Fare'], fare_bins, labels = [0,1,2,3], include_lowest=True, retbins=False)
    
    # family features
    df['Family_size'] = df_train['SibSp'] + df_train['Parch']
    df['Is_Alone'] = (df['Family_size'] == 0).astype(int)

[  0.       7.8958  14.4542  31.275  512.3292]


In [None]:
# drop unneeded rows
for df in datasets:
    df.drop(['Age', 'SibSp', 'Parch', 'Ticket', 'Fare', 'Cabin', 'Name'], axis=1, inplace=True)
    
# need to keep passengerID for submission

## Step 4: Format data for modelling

In [None]:
# one-hot encode categoricals using pd.get_dummies
df_train = pd.get_dummies(df_train, columns=['Embarked', 'Sex'])
df_test = pd.get_dummies(df_test, columns=['Embarked', 'Sex'])

In [None]:
# Split data into X and y, and select features to use
X = df_train.drop(['Survived', 'PassengerId'], axis=1)
y = df_train['Survived']
X_test = df_test.drop(['PassengerId'], axis=1)

In [None]:
# Split data X and y into train and val sets
X_train, X_val, y_train, y_val = train_test_split(
      X, y, test_size=0.2, random_state=42)

In [None]:
df_train.isnull().sum()

## Step 5: Train (or fit) the model

In [None]:
# Fit logistic regression
lr = LogisticRegression()
lr.fit(X_train, y_train)

In [None]:
# fit rf
rf = RandomForestClassifier(random_state=12)
rf.fit(X_train, y_train)

## Step 6: Evaluate the model

In [None]:
# define a scoring function
def acc(y: np.array, y_pred: np.array) -> float:
    return np.sum(y_pred==y)/len(y)

In [None]:
# Use model to predict on train and val sets
y_pred_trn_lr = lr.predict(X_train)
y_pred_val_lr = lr.predict(X_val)

y_pred_trn_rf = rf.predict(X_train)
y_pred_val_rf = rf.predict(X_val)

In [None]:
print('LR train set accuracy', acc(y_train, y_pred_trn_lr))
print('LR val set accuracy', acc(y_val, y_pred_val_lr))
print('RF train set accuracy', acc(y_train, y_pred_trn_rf))
print('RF val set accuracy', acc(y_val, y_pred_val_rf))

## Step 7: Generate Submission

In [None]:
# Predict for test set

y_pred_test = rf.predict(X_test)

# Create a Kaggle submission 
sub = pd.DataFrame({'PassengerId': df_test['PassengerId'],
                    'Survived': y_pred_test})

sub.to_csv('week_3_baseline.csv', index=False)