In [1]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
import numpy as np


In [None]:
def encode_cabin(row):
    if row["Cabin"] == None:
        return 0
    else:
        return 'abcdefghijklmnopqrstuvwxyz'.index(str(row["Cabin"])[0].lower())


def extract_cabin_number(row):
    num = ''
    
    for i in str(row["Cabin"]):
        if i.isnumeric():
            num += i
            
    if num == '':
        return 0
            
    return int(num)
    

def encode_title(row):
    if "master" in row["Name"].lower():
        return 1
    elif "mr." in row["Name"].lower():
        return 2
    elif "mrs." in row["Name"].lower():
        return 3
    elif "miss." in row["Name"].lower():
        return 3
    else:
        return 0



def train(data, encode_cabin, extract_cabin_number, encode_title):
    
    '''
    function for training random forest classifier model
    
    data: features and label of train data
    '''
    
    data["Age"].fillna(value=data["Age"].mean(), inplace=True)
    data["sex_encoded"] = data.apply(lambda r: 0 if r["Sex"] == 'male' else 1, axis=1)
    data["Embarked"].fillna(value='S', inplace=True)
    data["embarked_encoded"] = data.apply(lambda r: 'SCQ'.index(r["Embarked"]), axis=1)
    data["title_encoded"] = data.apply(encode_title, axis=1)
    data["cabin_letter"] = data.apply(encode_cabin, axis=1)
    data["cabin_number"] = data.apply(extract_cabin_number, axis=1)
    
    
    del data['Name']
    del data['Ticket']
    del data['Sex']
    del data['PassengerId']
    del data['Cabin']
    del data['Embarked']
    
    label = data['Survived']
    del data['Survived']
    train = data.copy()
    
    model = RandomForestClassifier(n_estimators=10, random_state=42)
    
    model.fit(np.array(train), np.array(label))
    
    return(model)

def predict(data, model):
    
        '''
    function for training random forest classifier model
    data:data to be predicted from only containing features
    trained: fitted sklearn model
    '''
    
    data["Age"].fillna(value=data["Age"].mean(), inplace=True)
    data["sex_encoded"] = data.apply(lambda r: 0 if r["Sex"] == 'male' else 1, axis=1)
    data["Embarked"].fillna(value='S', inplace=True)
    data["embarked_encoded"] = data.apply(lambda r: 'SCQ'.index(r["Embarked"]), axis=1)
    data["title_encoded"] = data.apply(encode_title, axis=1)
    data["cabin_letter"] = data.apply(encode_cabin, axis=1)
    data["cabin_number"] = data.apply(extract_cabin_number, axis=1)
    
    
    del data['Name']
    del data['Ticket']
    del data['Sex']
    del data['PassengerId']
    del data['Cabin']
    del data['Embarked']
    
    
    predictions = model.predict(np.array(data))
    
    return(predictions)
    
    

In [2]:
data = pd.read_csv('../../data/train.csv')

In [3]:
data.shape

(891, 12)

In [4]:
data.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


# Feature Exploration + Feature Engineering

In [5]:
data["Age"].fillna(value=data["Age"].mean(), inplace=True)

In [6]:
data["sex_encoded"] = data.apply(lambda r: 0 if r["Sex"] == 'male' else 1, axis=1)

In [7]:
data["Embarked"].fillna(value='S', inplace=True)

In [8]:
data["embarked_encoded"] = data.apply(lambda r: 'SCQ'.index(r["Embarked"]), axis=1)

In [9]:
def encode_cabin(row):
    if row["Cabin"] == None:
        return 0
    else:
        return 'abcdefghijklmnopqrstuvwxyz'.index(str(row["Cabin"])[0].lower())

data["cabin_letter"] = data.apply(encode_cabin, axis=1)

In [10]:
def extract_cabin_number(row):
    num = ''
    
    for i in str(row["Cabin"]):
        if i.isnumeric():
            num += i
            
    if num == '':
        return 0
            
    return int(num)
    
data["cabin_number"] = data.apply(extract_cabin_number, axis=1)

In [11]:
def encode_title(row):
    if "master" in row["Name"].lower():
        return 1
    elif "mr." in row["Name"].lower():
        return 2
    elif "mrs." in row["Name"].lower():
        return 3
    elif "miss." in row["Name"].lower():
        return 3
    else:
        return 0

data["title_encoded"] = data.apply(encode_title, axis=1)
del data['Name']
del data['Ticket']
del data['Sex']
del data['PassengerId']
del data['Cabin']
del data['Embarked']

In [12]:
data.head()

Unnamed: 0,Survived,Pclass,Age,SibSp,Parch,Fare,sex_encoded,embarked_encoded,cabin_letter,cabin_number,title_encoded
0,0,3,22.0,1,0,7.25,0,0,13,0,2
1,1,1,38.0,1,0,71.2833,1,1,2,85,3
2,1,3,26.0,0,0,7.925,1,0,13,0,3
3,1,1,35.0,1,0,53.1,1,0,2,123,3
4,0,3,35.0,0,0,8.05,0,0,13,0,2


In [13]:
label = data['Survived']
del data['Survived']
train = data.copy()

In [14]:
model = RandomForestClassifier(n_estimators=10, random_state=42)

In [15]:
model.fit(np.array(train), np.array(label))

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=None,
            oob_score=False, random_state=42, verbose=0, warm_start=False)

In [16]:
preds = model.predict(np.array(train))

In [17]:
from sklearn.metrics import f1_score


it works

In [18]:
f1_score(label, preds)

0.9662261380323055

In [None]:
train function
predict function