In [179]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

# Pipeline at a glance:
load data -> cleaning/preprocessing -> feature eng -> format data for modelling -> fit model -> evaluate model -> generate submission file

## Step 1: Load data

In [180]:
df_tr = pd.read_csv('../input/train.csv')
df_t = pd.read_csv('../input/test.csv')

# all data
df = (pd.concat((df_tr, df_t), ignore_index=True))


of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  """


## Step 2: Basic cleaning and preprocessing

In [181]:
# Fill in missing values
medFare = pd.concat((df_tr['Fare'], df_t['Fare']), axis=0).median()

df['Fare'] = df['Fare'].fillna(medFare)
df['Embarked'] = df['Embarked'].fillna('S')

## Step 3: Feature engineering

In [182]:
# Functions that returns the title from a name. All the name in the dataset has the format "Surname, Title. Name"
def get_title(name):
    if '.' in name:
        return name.split(',')[1].split('.')[0].strip()
    else:
        return 'Unknown'
    
all_titles = []

for name in df['Name']:
    title = get_title(name)
    all_titles.append(title)

df['Title'] = all_titles
     
df.loc[(df['Title'] != 'Mr') & (df['Title'] != 'Mrs') & (df['Title'] != 'Miss') & (df['Title'] != 'Master'), 'Title'] = 'Misc'

# See value counts of each title
pd.value_counts(df['Title'])

Mr        757
Miss      260
Mrs       197
Master     61
Misc       34
Name: Title, dtype: int64

In [183]:
# so i guess grouped is of length train+test. which is i think where things are getting screwed up
grouped = df.groupby(['Title'])  

# opted for median value to iron out any weirdness title selection might've given
grouped['Age'].median()

Title
Master     4.0
Misc      44.5
Miss      22.0
Mr        29.0
Mrs       35.5
Name: Age, dtype: float64

In [184]:
# fill those gaps
df['Age'] = df['Age'].fillna(grouped.Age.apply(lambda x: x.fillna(x.median())))

In [185]:
# find fare bins based on frequency rather than value
garbage, fare_bins = pd.qcut(df['Fare'], 4, retbins=True)
# bin age
df['Age_binned'] = pd.cut(df['Age'], [0,16,32,48,64,200], labels = [0,1,2,3,4], retbins=False)
    
# bin fare
df['Fare_binned'] = pd.cut(df['Fare'], fare_bins, labels = [0,1,2,3], include_lowest=True, retbins=False)
    
# family features
df['Family_size'] = df['SibSp'] + df['Parch']
df['Is_Alone'] = (df['Family_size'] == 0).astype(int)

In [186]:
# drop unneeded rows
df.drop(['Age', 'SibSp', 'Parch', 'Ticket', 'Fare', 'Cabin', 'Name', 'Fare_binned', 'Embarked', 'Is_Alone'], axis=1, inplace=True)
# need to keep passengerID for submission
# now dropping embarked and is_alone, since they're probably useless

## Step 4: Format data for modelling

In [187]:
# label encode sex since binary
le = LabelEncoder()
le.fit(df['Sex'])
df.loc[:,'Sex'] = le.transform(df['Sex'])

# one hot encode title
df = pd.get_dummies(df, columns=['Title'])

In [188]:
# split back into train and test sets
df_train = df[(df['Survived']==1.0) | (df['Survived']==0.0)]
df_test = df.loc[pd.isna(df.Survived)]
df_test.reset_index(drop=True)

Unnamed: 0,PassengerId,Pclass,Sex,Survived,Age_binned,Family_size,Title_Master,Title_Misc,Title_Miss,Title_Mr,Title_Mrs
0,892,3,1,,2,0,0,0,0,1,0
1,893,3,0,,2,1,0,0,0,0,1
2,894,2,1,,3,0,0,0,0,1,0
3,895,3,1,,1,0,0,0,0,1,0
4,896,3,0,,1,2,0,0,0,0,1
5,897,3,1,,0,0,0,0,0,1,0
6,898,3,0,,1,0,0,0,1,0,0
7,899,2,1,,1,2,0,0,0,1,0
8,900,3,0,,1,0,0,0,0,0,1
9,901,3,1,,1,2,0,0,0,1,0


In [189]:
# Split data into X and y, and select features to use
X = df_train.drop(['Survived', 'PassengerId'], axis=1)
y = df_train['Survived']
X_test = df_test.drop(['PassengerId', 'Survived'], axis=1)

In [190]:
# Split data X and y into train and val sets
X_train, X_val, y_train, y_val = train_test_split(
      X, y, test_size=0.2, random_state=42)

## Step 5: Train (or fit) the model

In [191]:
# Fit logistic regression
lr = LogisticRegression()
lr.fit(X_train, y_train)



LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='warn',
          tol=0.0001, verbose=0, warm_start=False)

In [192]:
# fit rf
rf = RandomForestClassifier(random_state=12)
rf.fit(X_train, y_train)



RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=None,
            oob_score=False, random_state=12, verbose=0, warm_start=False)

## Step 6: Evaluate the model

In [193]:
# define a scoring function
def acc(y: np.array, y_pred: np.array) -> float:
    return np.sum(y_pred==y)/len(y)

In [194]:
# Use model to predict on train and val sets
y_pred_trn_lr = lr.predict(X_train)
y_pred_val_lr = lr.predict(X_val)

y_pred_trn_rf = rf.predict(X_train)
y_pred_val_rf = rf.predict(X_val)

In [195]:
print('LR train set accuracy', acc(y_train, y_pred_trn_lr))
print('LR val set accuracy', acc(y_val, y_pred_val_lr))
print('RF train set accuracy', acc(y_train, y_pred_trn_rf))
print('RF val set accuracy', acc(y_val, y_pred_val_rf))

LR train set accuracy 0.8314606741573034
LR val set accuracy 0.8100558659217877
RF train set accuracy 0.8525280898876404
RF val set accuracy 0.8044692737430168


## Step 7: Generate Submission

In [196]:
# fit on whole dataset
lr.fit(X, y)

# Predict for test set
y_pred_test = lr.predict(X_test)

# Create a Kaggle submission
sub = pd.DataFrame({'PassengerId': df_test['PassengerId'],
                    'Survived': y_pred_test})
sub.Survived = sub.Survived.astype('int64')

sub.to_csv('kaelanweek3.csv', index=False)



In [197]:
sub

Unnamed: 0,PassengerId,Survived
891,892,0
892,893,1
893,894,0
894,895,0
895,896,1
896,897,0
897,898,1
898,899,0
899,900,1
900,901,0
