Import libraries

In [1]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns


Read Data

In [2]:
def read_data(filepath: str) -> pd.DataFrame:
    try:
        df = pd.read_csv(filepath)
        assert(df.empty != True)
        return df
    except FileNotFoundError as e:
        print(f"File not found at {filepath}\n{e}")
        return None;

global separate 
separate = '\n********************************\n'

def print_header(message: str) -> None:
    """
    Print a header
    'message' should include \n if needed
    :param message: message to be printed
    :return: Nothing, purely for o/p
    """
    print(
        '\n',
        message,
        separate
    )

Explore data

In [104]:
def percent_missing(df: pd.DataFrame) -> None:
    '''
    print the percentage of missing values
    per column in the provided dataset
    
    :param df: pandas dataframe 
    :return: None
    '''
    print(round(\
        df.isnull().sum().sort_values(ascending=False) / len(df) * 100\
        , 1))
    return


def visualize_numerical_data(df: pd.DataFrame, numerical_columns: list) -> None:
    '''
    visualize the numerical features passed as
    'numerical_columns' in the dataset 'df'
    histograms / correlation heatmap
    
    :param df: pandas dataframe
    :param numerical_columns: list of numerical interest features
    :return: 
    '''
    for feature in numerical_columns:
        plt.figure(figsize=(4,4))
        plt.hist(df[feature])
        plt.title(f'{feature}')
        plt.show()
    
    plt.figure(figsize=(4,4))
    sns.heatmap(df[numerical_columns].corr())
    
    return

def visualize_categorical_data(df: pd.DataFrame, categorical_columns: list) -> None:
    '''
    visualize the categorical features passed as
    'categorical_columns' in the dataset 'df'
    barplots 
    
    :param df: pandas dataframe 
    :param categorical_columns: list of categorical interest features 
    :return: None
    '''
    for feature in categorical_columns:
        plt.figure(figsize=(4,4))
        sns.barplot(df[feature].value_counts()).set_title(f'{feature}')
        plt.show()
    
    return 


In [117]:
def explore_data(df: pd.DataFrame) -> None:
    """
    Explore the titanic dataset itself,
    and see about getting some insight
    into missing values / correlations between
    features
    :param df: titanic training dataset, expected
    :return: nothing, purely exploratory
    """

    #print .info() to give a sense of where missing values are, 
    #the number of samples in the dataset, datatypes, etc
    print_header(".info")
    df.info()

    #print .describe() to give us some cool stat data from the
    #numerical columns in the dataset
    print_header(".describe()")
    df.describe()
    
    #print the percentage of missing values from each column
    print_header('% Missing Values / Features')
    percent_missing(df)
    #77 percent of cabin information missing, worth filling?
    #20 percent of ages missing, need to fill
    #barely any embarks missing, can fill with the mode/median
    
    print_header('Mean Feature / Survived ')
    numerical_columns = ['Age', 'SibSp', 'Parch', 'Fare']
    print(pd.pivot_table(df, index='Survived', values=numerical_columns))
    #Those who survived had more than double average fare 
    #Those who perished were slightly older 
    
    #See average survivability per Pclass
    print_header('% Survived / Pclass')
    print(round(
              df[ ['Pclass', 'Survived'] ]\
                           .groupby('Pclass')\
                           .mean() * 100, 1))
    
    #see average survivability per Pclass + Sex
    print_header('% Survived / Pclass + Sex')
    print(round(
              df[ ['Pclass', 'Sex', 'Survived'] ]\
                           .groupby( ['Pclass', 'Sex'] )\
                           .mean() * 100, 0) )
    #1st Class Females -> 97 percent survived
    #2nd Class Females -> 92 percent survived
    #3rd Class Females -> 50 percent survived
    ##
    #1st Class Males -> 37 percent survived
    #2nd Class Males -> 16 percent survived
    #3rd Class Males -> 14 percent survived
    ##
    ##so females had wayyy higher chance of surviving than males at any level
    ##higher pclass = higher chance

    print_header('Passenger Port Embarkation Counts')
    print(
        "Passengers Embarked from S :: ", len( df[ (df.Embarked == 'S')] ), '\n',
        "Passengers Embarked from C :: ", len( df[ (df.Embarked == 'C')] ), '\n',
        "Passengers Embarked from Q :: ", len( df[ (df.Embarked == 'Q')] ) )

    print_header('% Survived / Pclass + Sex + Embarked')
    print(round(
              df[ ['Pclass', 'Survived', 'Embarked', 'Sex'] ]\
                    .groupby( ['Pclass', 'Embarked', 'Sex'] )\
                    .mean() * 100, 1) )   
    #seems as if males who embarked from Q had terrible chances of survival compared to others
    
    #looked at so far :: Pclass / Sex / Embarked
    #need to look at :: Age, Fare, SibSp, Parch

In [19]:
def explore_fare(df: pd.DataFrame) -> None:
    """
    Investigate the fare feature within the titanic dataset,
    and the correlations between the fare and areas of interest
    such as average ages, place of embarkation, and passenger class
    :param df: titanic training dataset, expected
    :return: Nothing, purely exploratory
    """
    print_header("Exploring Fare Column")
    #lets print the .describe() for some insight
    print_header("Fare .describe()")
    print(df['Fare'].describe())
    #average = 32
    #min = 0
    #so lets investigate the 0 cases

    print_header('Number of 0-Fare Passengers')
    print(len( df[ df.Fare == 0 ] ))
    #only 15 cases, lets look at the data

    #print(separate, df[ df.Fare == 0 ])
    #all males
    #all got on at Port S
    #traveling alone, no siblings or parents (Sibsp / Parch both 0)
    print_header('Average Pclass / Age / Survive of 0-Fare Passengers')
    print(df[ df.Fare == 0 ][ ['Pclass', 'Age', 'Survived'] ]
                    .mean())         
    #Pclass = 1.9
    #Age = 35
    #Survived = .06
    #So on average 2nd Class middle-aged men, and expired 
    #probably the crew, moving on
    
    #lets look at cheap fares
    print_header('Number of Passengers on Sub-$9 Fare')
    sub_9_fare = df['Fare'] < 9.0
    print(len( df[ (sub_9_fare) & (exclude_crew) ] ))
    #296 people

    print_header('Average Numerics for Sub-$9 Passengers')
    print(round(
              df[ (sub_9_fare) & (exclude_crew)][ ['Pclass', 'Age', 'SibSp', 'Parch', 'Survived'] ]
                    .mean(), 1))
    #entirely 3rd class
    #middle-aged
    #almost all died
    #lets see the number of children

    print_header('Number of children ( < 14 )')
    print(len( df[ (sub_9_fare) & (exclude_crew) & (df.Age < 14)] ))
    #2

    print_header('Number of elders ( > 50 )')
    print(len( df[ (sub_9_fare) & (exclude_crew) & (df.Age > 50)] ))
    #9
    
    #so out of 296 sub-9 dollar fares, 285 are between 14 and 50
    #how many are missing ages

    print_header('Number of missing ages + Sub-$9 Fare')
    print(df[ (sub_9_fare) & (exclude_crew) ]['Age'].isnull().sum())
    #91 missing ages...

    print_header('Average Age + Count  / Pclass + Sex + Port (Sub-$9 Fare)')
    print(round(
        df[ (sub_9_fare) & (exclude_crew)][ ['Pclass', 'Age', 'Sex', 'Embarked']]
                .groupby( ['Pclass', 'Sex', 'Embarked'] )
                .agg( ['mean', 'count']), 1))
    #can fill 91 missing ages / 177 based on this information


Data Cleaning

In [254]:
def fix_ticket(df: pd.DataFrame) -> pd.DataFrame:
    '''
    fix the ticket column such that it only contains 
    numbers, no special characters 
    :param df: pandas dataframe
    :return: new dataframe containing adjusted ticket column
    '''
    df = df.drop('Ticket', axis=1)
    #df['Ticket'] = df['Ticket'].str.extract(r'(\d+)')
    
    return df

def fix_cabin(df: pd.DataFrame) -> pd.DataFrame:
    '''
    fix the cabin column such that it only contains
    a single letter indicating deck, or 'N' indicating no cabin
    :param df: pandas dataframe
    :return: new dataframe containing adjusted cabin column
    '''
    df['Cabin'] = df['Cabin'].str.extract(r'([A-Za-z])')
    df['Cabin'] = df['Cabin'].fillna('n')
    
    return df

def fill_master(df: pd.DataFrame) -> pd.DataFrame:
    '''
    Fill the missing age values where 'Title'
    is 'Master'
    :param df: pandas dataframe
    :return: same dataframe with filled <null> values for 'Master' titles
    '''
    #define conditions
    missing_age = df['Age'].isna()
    name_includes_master = df['Title'] == 'Master'
    
    #calculate the mean age
    master_mean = round(
        df[name_includes_master]['Age'].mean(), 0)

    #fill the instances with the mean
    df.loc[ name_includes_master & missing_age, 'Age'] = master_mean
    
    return df

def fill_sub9(df: pd.DataFrame) -> pd.DataFrame:
    '''
    Fill the msising age values where 'Fare'
    is less than $9. Split on male/female
    :param df: pandas dataframe
    :return: same dataframe with filled <null> values for 'Sub-$9 Fare' Passengers
    '''
    #define conditions
    missing_age = df['Age'].isna()
    sub_9_fare = df['Fare'] < 9
    exclude_crew = df['Fare'] != 0
    males_only = df['Sex'] == 'male'

    #Calculate the mean age for males
    sub_9_male_mean = round(
        df[ (exclude_crew) & (males_only) & (sub_9_fare) ]['Age'].mean()\
        ,0)
    
    #fill the instances with the mean 
    df.loc[ (missing_age) & (exclude_crew) & (males_only) & (sub_9_fare), 'Age'] = sub_9_male_mean


    #define female conditions
    females_only = df['Sex'] == 'female'

    #calculate the mean age for females
    sub_9_female_mean = round(
        df[ (exclude_crew) & (females_only) & (sub_9_fare)]['Age'].mean()\
        ,0)
    
    #fill the instances with the mean
    df.loc[ (missing_age) & (exclude_crew) & (females_only) & (sub_9_fare), 'Age'] = sub_9_female_mean
    
    return df

def fill_crew(df: pd.DataFrame) -> pd.DataFrame:
    '''
    Fill the missing age values where 'Fare'
    is equal to 0, indicating a crew member
    :param df: pandas dataframe
    :return: same dataframe with filled <null> values for crew members
    '''
    #define conditions
    crew_only = df['Fare'] == 0
    missing_age = df['Age'].isna()
    
    #calculate average age
    crew_mean = round(
        df[ (crew_only) ]['Age'].mean()\
    ,0)
    
    #fill missing values
    df.loc[ (missing_age) & (crew_only), 'Age'] = crew_mean
    
    return df

In [252]:
def clean_data(df: pd.DataFrame) -> pd.DataFrame:
    """
    Clean Data
        - fill missing values
        - Drop redundant / insignificant features
        -
    :param df: uncleaned titanic training set, expected
    :return: cleaned titanic training set
    """
    #Drop Passenger ID
    #   - no use
    df = df.drop('PassengerId', axis=1)
    
    #Fix Cabin Column
    #   -Fill <null> with 'n'
    #   -Keep only Deck 
    df = fix_cabin(df=df)
    
    
    #Fix Ticket Column
    #   - only maintain the numerical part 
    #   - i will fix this later when i realize i need the ticket column for family groups and stuff
    df = fix_ticket(df=df)

    #Fill Missing Embarked
    #   -barely missing any data
    #   -filling with mode bc overhead of doing anything else is not worth the information it provides
    embarked_mode = df['Embarked'].mode()[0]
    df.loc[:, 'Embarked'] = df['Embarked'].fillna(embarked_mode)

    #Fill missing ages for 'Master' titles
    #   - 'Master' has much younger mean age
    #
    df = fill_master(df=df)

    #Fill missing ages for Sub-$9 Fare Passengers
    #   - contains 91 / 177 total <null> ages
    df = fill_sub9(df=df)
    
    #Fill crew
    #   - 
    df = fill_crew(df=df)
    
    #Fill remaining
    df['Age'] = df.groupby( ['Pclass', 'Title', 'Sex'] )['Age']\
                     .transform(lambda x: x.fillna(x.median() ))
    
    return df


Data Engineering

In [208]:
def extract_title(df: pd.DataFrame) -> pd.DataFrame:
    '''
    extract new 'Title' column given df 
    :param df: pandas dataframe
    :return: new dataframe containing new 'Title' column
    '''
    df['Title'] = df['Name'].str.extract(r',\s*([A-Za-z]+)')
    title_transform = {'Dr': 'Esteemed',
                       'Mr': 'Mr',
                       'Mme': 'Mr',
                       'Ms': 'Miss',
                       'Mrs': 'Mrs',
                       'Miss': 'Miss',
                       'Mlle': 'Miss',
                       'Master': 'Master',
                       'Jonkheer': 'Crew',
                       'Major': 'Crew',
                       'Col': 'Crew',
                       'Capt': 'Crew', 
                       'Don': 'Esteemed',
                       'Lady': 'Esteemed',
                       'Sir': 'Esteemed',
                       'the': 'Esteemed',
                       'Rev': 'Esteemed'}
    df['Title'] = df['Title'].map(title_transform)    

    return df

def add_familyflag(df: pd.DataFrame) -> pd.DataFrame:
    '''
    get family size given df
    :param df: pandas dataframe
    :return: new dataframe containing new 'FamilySize' column
    '''
    df['FamilySize'] = df['SibSp'] + df['Parch'] + 1
    df['LargeFamily'] = df['FamilySize'].apply(lambda x: 1 if x > 4 else 0)
    df = df.drop( ['FamilySize', 'SibSp', 'Parch'], axis=1)
    
    return df

In [239]:
def data_engineer(df: pd.DataFrame) -> pd.DataFrame:
    
    #Extract the useful part of each Name
    #   - 'Title'
    #   - 'Drop the 'Name' column
    df = extract_title(df=df)  
    df = df.drop(columns='Name', axis=1)
    
    #Get FamilySize
    df = add_familyflag(df=df)
            
    return df

Base Model / Model creation


In [210]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

def all_die_model(df: pd.DataFrame) -> None:
    X = pd.DataFrame({'constant': np.ones(df.shape[0])} )
    y = df['Survived']
    
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    
    all_die = LogisticRegression()
    
    all_die.fit(X_train, y_train)
    
    predictions = all_die.predict(X_test)
    
    #Test accuracy
    accuracy = accuracy_score(y_test, predictions)
    print(f"Accuracy: {accuracy:.2f}")

def women_live_model(df: pd.DataFrame) -> None:
    X = pd.DataFrame({'male': (df['Sex'] == 'male').astype(int)})
    y = df['Survived']
    
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    
    women_live = LogisticRegression()
    
    women_live.fit(X_train, y_train)
    
    predictions = women_live.predict(X_test)
    
    accuracy = accuracy_score(y_test, predictions)
    print(f'Accuracy: {accuracy:.2f}')

Putting it all together

In [263]:
def main(argc: int, argv: str) -> None:
    #define filepath and read the training data
    train_filepath = r'.venv/data/train.csv'
    test_filepath = r'.venv/data/test.csv'
    
    train_data = read_data(train_filepath)
    test_data = read_data(test_filepath)
    
    assert(train_data is not None) 
    assert(test_data is not None)

    train_data['is_train'] = 1
    test_data['is_train'] = 0
    test_data['Survived'] = np.nan

    combined_data = pd.concat( [train_data, test_data], ignore_index=True)

    numerical_columns = ['Age', 'Fare', 'SibSp', 'Parch']
    categorical_columns = ['Survived', 'Pclass', 'Sex', 'Embarked', 'Ticket', 'Cabin']

#    visualize_numerical_data(df=combined_data, numerical_columns=numerical_columns)
#    visualize_categorical_data(df=combined_data, categorical_columns=categorical_columns)

    #explore the training dataset
    explore_data(combined_data)
    
    combined_data = data_engineer(df=combined_data)
    combined_cleaned = clean_data(df=combined_data)
    percent_missing(combined_cleaned)
    
    train_cleaned = combined_cleaned[ combined_cleaned['is_train'] == 1]
    test_cleaned = combined_cleaned[ combined_cleaned['is_train'] == 0]
    
    train_cleaned = train_cleaned.drop('is_train', axis=1)
    test_cleaned = test_cleaned.drop('is_train', axis=1)


    return 
main(0, '')


 .info 
********************************

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1309 entries, 0 to 1308
Data columns (total 13 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  1309 non-null   int64  
 1   Survived     891 non-null    float64
 2   Pclass       1309 non-null   int64  
 3   Name         1309 non-null   object 
 4   Sex          1309 non-null   object 
 5   Age          1046 non-null   float64
 6   SibSp        1309 non-null   int64  
 7   Parch        1309 non-null   int64  
 8   Ticket       1309 non-null   object 
 9   Fare         1308 non-null   float64
 10  Cabin        295 non-null    object 
 11  Embarked     1307 non-null   object 
 12  is_train     1309 non-null   int64  
dtypes: float64(3), int64(5), object(5)
memory usage: 133.1+ KB

 .describe() 
********************************


 % Missing Values / Features 
********************************

Cabin          77.5
Survived       31.9
Ag