# Titanic Functions
This is a separate functions file for the Titanic analysis; the aim of separating these functions out was to keep the focus of the main notebook on the analysis and the ML models.

In [1]:
import re
import numpy as np
import pandas as pd

In [2]:
def tester(string):
    """
    This takes any string and returns a message of success, purely to test of the function file is accessible.
    """
    ret = string + " This was a succesful test"
    return ret

def engineer_title(df):
    """
    Add a 'Title' column extracted from the 'Name' column.

    A name is only searched when it contains one of the known title
    markers; any other name is labelled "No Match". The dataframe is
    modified in place and also returned.
    """
    markers = (' Mr.', 'Capt.', ' Col.', ' Don.', ' Dr.', ' Lady.', ' Sir.',
               ' Mrs.', ' Miss.', ' Major.', ' Master.', ' Mlle.', ' Mme.',
               ' Rev.', ' Countess.', ' Jonkheer.', ' Ms.')
    # Compiled once up front instead of on every row.
    title_pattern = re.compile(r'(Mr\.|Capt\.|Col\.|Don\.|Dr\.|Lady\.|Sir\.|Mrs\.|Miss\.|Major\.|Master\.|Mlle\.|Mme\.|Rev\.|Countess\.|Jonkheer\.|Ms\.)')

    def _title_of(name):
        # Every marker contains a pattern alternative, so the search cannot
        # come back empty once a marker has been found in the name.
        if not any(marker in name for marker in markers):
            return "No Match"
        return title_pattern.search(name).group()

    df['Title'] = [_title_of(name) for name in df['Name']]
    return df


def engineer_age_cat(df):
    """
    Add an 'AgeCat' column combining an age band with the passenger's sex.

    Bands: "child" for Age < 12, "teenager" for 12 <= Age < 19, and
    "adult" otherwise (a missing Age therefore also lands in "adult").
    The label is "<band> <Sex>", e.g. "child male".

    The work is done positionally on whole columns, so the dataframe may
    carry repeated index labels (for example after concatenating the train
    and test sets) without the per-row label lookup failing.

    The dataframe is modified in place and also returned.
    """
    age = df['Age']
    # Comparisons against NaN are False, so missing ages fall to "adult".
    band = np.where(age < 12, "child",
                    np.where(age < 19, "teenager", "adult"))
    df['AgeCat'] = [f"{label} {sex}" for label, sex in zip(band, df['Sex'])]
    return df
            
def engineer_family_members(df):
    """
    Add a 'FamilyMembers' column: Parch + SibSp plus one for the passenger.

    The dataframe is modified in place and also returned.
    """
    relatives_aboard = df['Parch'] + df['SibSp']
    df['FamilyMembers'] = relatives_aboard + 1
    return df

def engineer_family_classification(df):
    """
    Add a 'FamilyClassification' column derived from 'FamilyMembers'.

    Labels: "Single" when FamilyMembers == 1, "LargeFamily" when it is
    greater than 3, and "SmallFamily" otherwise.

    The work is done positionally on the whole column, so the dataframe may
    carry repeated index labels without the per-row label lookup failing.

    The dataframe is modified in place and also returned.
    """
    size = df['FamilyMembers']
    labels = np.where(size == 1, "Single",
                      np.where(size > 3, "LargeFamily", "SmallFamily"))
    # Hand pandas plain Python strings, as the original list-based code did.
    df['FamilyClassification'] = labels.tolist()
    return df
    

In [3]:
def fill_age(df):
    """
    Fill missing 'Age' values using a fixed age per (Sex, Title) pair.

    Requires the 'Title' column (see engineer_title). Only rows whose Age
    is missing are changed. The dataframe is modified in place and also
    returned.
    """
    # One table instead of two parallel lists, so a rule and its value
    # cannot drift apart. The title spelling must match what engineer_title
    # emits ('Jonkheer.'), otherwise the rule can never fire.
    age_by_sex_and_title = {
        ('female', 'Countess.'): 33.0,
        ('female', 'Dr.'): 49.0,
        ('female', 'Lady.'): 48.0,
        ('female', 'Miss.'): 21.773972602739725,
        ('female', 'Mlle.'): 24.0,
        ('female', 'Mme.'): 24.0,
        ('female', 'Mrs.'): 35.898148148148145,
        ('female', 'Ms.'): 28.0,
        ('male', 'Capt.'): 70.0,
        ('male', 'Col.'): 58.0,
        ('male', 'Don.'): 40.0,
        ('male', 'Dr.'): 40.6,
        ('male', 'Jonkheer.'): 38.0,
        ('male', 'Major.'): 48.5,
        ('male', 'Master.'): 4.574166666666667,
        ('male', 'Mr.'): 32.368090452261306,
        ('male', 'Rev.'): 43.166666666666664,
        ('male', 'Sir.'): 49.0,
    }
    conditions = [(df['Sex'] == sex) & (df['Title'] == title)
                  for sex, title in age_by_sex_and_title]
    values = list(age_by_sex_and_title.values())

    # NOTE(review): a missing Age whose (Sex, Title) pair is not in the
    # table still receives np.select's default of 0, as before - confirm
    # whether that fallback is wanted.
    imputed = np.select(conditions, values, default=0)
    df['Age'] = np.where(df['Age'].isnull(), imputed, df['Age'])
    return df

def fill_cabin_location(df):
    """
    Derive 'CabinLocation' from 'Cabin' and predict it where Cabin is missing.

    The location is the first character of the cabin string, with 'T'
    folded into 'A' and then 'A', 'B' and 'C' merged into 'ABC'. Rows with
    no cabin are marked "0" and their location is predicted by a random
    forest trained on Pclass and Fare of the rows with a known location.

    The input dataframe has its missing 'Cabin' values set to '0' and gains
    the 'CabinLocation' column. The returned dataframe is a new one: the
    rows with a known location first, followed by the predicted rows, each
    keeping its original index label. The forest is not seeded, so the
    predictions can differ between runs.
    """
    # Assign results back instead of chaining inplace=True on a column
    # selection, which does not update the frame under copy-on-write.
    df['Cabin'] = df['Cabin'].fillna('0')
    location = df['Cabin'].astype(str).str[0]
    location = location.replace('T', 'A').replace(['A', 'B', 'C'], 'ABC')
    df['CabinLocation'] = location

    # Split into rows with a known location and rows to predict.
    known = df['CabinLocation'] != "0"
    df_train = df[known].copy()
    df_pred = df[~known].copy()
    if df_pred.empty:
        # Nothing to impute; the model cannot predict on zero rows.
        return df_train

    # Encode once on the full frame so both subsets share the same dummy
    # columns, even when a passenger class occurs in only one of them.
    features = pd.get_dummies(df[['Pclass', 'Fare']], columns=['Pclass'])
    X = features[known]
    y = df_train['CabinLocation']

    # As before, the forest is fitted on a random 75% of the known rows;
    # the held-out quarter is not used.
    from sklearn.model_selection import train_test_split
    X_train, _, y_train, _ = train_test_split(X, y, test_size=0.25)
    from sklearn.ensemble import RandomForestClassifier
    rf_model = RandomForestClassifier(n_estimators=5, max_features='sqrt')
    rf_model.fit(X_train, y_train)

    df_pred['CabinLocation'] = rf_model.predict(features[~known])

    # DataFrame.append was removed in pandas 2.0; concat is the replacement.
    return pd.concat([df_train, df_pred])

def fill_embarked(df):
    """
    Fill missing 'Embarked' values with the column's most frequent value.

    If the column has no non-missing values there is no mode to use and the
    column is left unchanged. The dataframe is modified in place and also
    returned.
    """
    modes = df['Embarked'].mode()
    if not modes.empty:
        # Assign back instead of fillna(inplace=True) on a column selection,
        # which does not update the frame under copy-on-write.
        df['Embarked'] = df['Embarked'].fillna(modes.iloc[0])
    return df

def fill_fare(df):
    """
    Fill missing 'Fare' values with the mean of the known fares.

    The dataframe is modified in place and also returned.
    """
    # Assign back instead of fillna(inplace=True) on a column selection,
    # which does not update the frame under copy-on-write.
    df['Fare'] = df['Fare'].fillna(df['Fare'].mean())
    return df