Import libraries

In [1]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns


Read Data

In [15]:
def read_data(filepath: str) -> "pd.DataFrame | None":
    """
    Load a CSV file into a DataFrame.
    :param filepath: path of the CSV file to read
    :return: the parsed DataFrame, or None when no file exists at 'filepath'
             (a message is printed in that case)
    :raises ValueError: when the file parses to a frame without rows or columns;
             pandas itself raises EmptyDataError (a ValueError subclass)
             for a file with no content at all
    """
    #keep the try body down to the one call that can raise FileNotFoundError
    try:
        df = pd.read_csv(filepath)
    except FileNotFoundError as e:
        print(f"File not found at {filepath}\n{e}")
        return None

    #explicit check instead of assert: asserts are stripped under python -O
    if df.empty:
        raise ValueError(f"No data found in {filepath}")
    return df

#Banner line printed around section headers and some tables.
#A module-level assignment is already global, so no 'global' statement is needed.
separate = '\n********************************\n'

def print_header(message: str) -> None:
    """
    Show 'message' framed by the module-level 'separate' banner,
    one banner before it and one after.
    Any line break wanted inside the header must already be part of 'message'.
    :param message: text to show between the two banners
    :return: Nothing, output goes to stdout only
    """
    banner = separate
    print(banner, message, banner)

Explore data

In [19]:
def visualize_data(df: pd.DataFrame) -> None:
    """
    Draw the exploratory plots and print the mean age per passenger class.
    Lives apart from explore_data so the text output stays free of graphs.
    :param df: titanic dataset, expected
    :return: Nothing, display graphs only
    """
    ages = df['Age']
    survival_colors = {0: 'r', 1: 'g'}

    #age histogram in 10 bins
    sns.histplot(ages, stat='count', bins=10)
    #original note: mostly aged 20-30s

    #age density per sex, female as a red line, male as a filled blue curve
    plt.figure()
    female_ages = ages[df.Sex == 'female']
    male_ages = ages[df.Sex == 'male']
    sns.kdeplot(female_ages, color='red')
    sns.kdeplot(male_ages, color='blue', fill=True)
    #original note: more young (10 yr old) females than males

    #age distribution split by the Survived flag
    plt.figure()
    sns.violinplot(data=df, hue='Survived', y='Age', palette=survival_colors)
    #original note: fatter around the younger ages, meaning they had a
    #slightly higher chance of surviving

    #mean age per passenger class, one decimal
    mean_age_by_class = df[['Pclass', 'Age']].groupby(['Pclass']).mean()
    print(separate, round(mean_age_by_class, 1))

    #port of embarkation split by the Survived flag
    plt.figure()
    sns.violinplot(data=df, hue='Survived', y='Embarked', palette=survival_colors)

def explore_data(df: pd.DataFrame) -> None:
    """
    Walk through the titanic dataset and print summary tables:
    column info, share of missing values per column, survival rates
    grouped by class / sex / port, and a closer look at the males
    who embarked at port Q
    :param df: titanic training dataset, expected
    :return: nothing, purely exploratory
    """
    print_header("Exploring Data")

    #column dtypes and non-null counts
    print_header(".info")
    df.info()

    #share of missing values per column, largest first
    print_header('% Missing Values / Features')
    missing_counts = df.isnull().sum().sort_values(ascending=False)
    print(round(missing_counts / len(df) * 100, 1))
    #original notes:
    #   77 percent of cabin information missing, not worth the effort to fill
    #   20 percent of ages missing, need to fill
    #   barely any embarks missing, can fill with the mode (most occurring)

    #survival rate per passenger class
    print_header('% Survived / Pclass')
    class_survival = df[['Pclass', 'Survived']].groupby('Pclass').mean()
    print(round(class_survival * 100, 1))

    #survival rate per passenger class and sex
    print_header('% Survived / Pclass + Sex')
    class_sex_survival = df[['Pclass', 'Sex', 'Survived']].groupby(['Pclass', 'Sex']).mean()
    print(round(class_sex_survival * 100, 0))
    #original notes:
    #   1st / 2nd / 3rd class females -> 97 / 92 / 50 percent survived
    #   1st / 2nd / 3rd class males   -> 37 / 16 / 14 percent survived
    #   females had a far higher chance of surviving than males at any level
    #   higher pclass = higher chance

    #head count per port of embarkation
    print_header('Passenger Port Embarkation Counts')
    from_s = len(df[df.Embarked == 'S'])
    from_c = len(df[df.Embarked == 'C'])
    from_q = len(df[df.Embarked == 'Q'])
    print(
        "Passengers Embarked from S :: ", from_s, '\n',
        "Passengers Embarked from C :: ", from_c, '\n',
        "Passengers Embarked from Q :: ", from_q)

    #survival rate per class, port and sex
    print_header('% Survived / Pclass + Sex + Embarked')
    port_survival = df[['Pclass', 'Survived', 'Embarked', 'Sex']]\
        .groupby(['Pclass', 'Embarked', 'Sex'])\
        .mean()
    print(round(port_survival * 100, 1))
    #original note: males who embarked from Q seem to have had terrible
    #chances of survival compared to others, so look into them

    print_header('Embarked from Q + Male')
    q_males = df[(df.Embarked == 'Q') & (df.Sex == 'male')]
    print(q_males[['Pclass', 'Fare']].describe())
    #original notes:
    #   average Pclass == 3, which explains the anomaly
    #   min Pclass is 1 however, and every male from Q in Pclass 1 died,
    #   so count those passengers per class next (crew maybe?)

    #number of males from port Q in each passenger class
    per_class_headers = (
        ('Number of Passengers /  Pclass 1 + Port Q', 1),
        ('Number of Passengers /  Pclass 2 + Port Q', 2),
        ('Number of Passengers /  Pclass 3 + Port Q', 3),
    )
    for title, pclass in per_class_headers:
        print_header(title)
        print(len(q_males[q_males.Pclass == pclass]))
    #original notes:
    #   only 1 1st class passenger got on at Q, and he died
    #   only one 2nd class passenger as well, and he died too
    #   only 3 / 39 males from Pclass = 3 and Embarked == Q lived



In [26]:
def explore_fare(df: pd.DataFrame) -> None:
    """
    Dig into the Fare column of the titanic dataset and print how
    cheap-fare groups relate to class, age, sex, survival and port,
    with an eye on which missing ages could be filled from them
    :param df: titanic training dataset, expected
    :return: Nothing, purely exploratory
    """
    print_header("Exploring Fare Column")

    #summary statistics first
    print_header("Fare .describe()")
    print(df['Fare'].describe())
    #original notes: average = 32, min = 0, so investigate the 0 cases

    print_header('Number of 0-Fare Passengers')
    zero_fare = df[df.Fare == 0]
    print(len(zero_fare))
    #original notes: only 15 cases, all males, all got on at Port S,
    #traveling alone (SibSp / Parch both 0)

    print_header('Average Pclass / Age / Survive of 0-Fare Passengers')
    print(zero_fare[['Pclass', 'Age', 'Survived']].mean())
    #original notes: Pclass = 1.9, Age = 35, Survived = .06
    #on average 2nd class middle-aged men who died, probably the crew

    #cheap fares, first cut at 8
    print_header('Number of Sub-$8 Fare Passengers')
    under_8 = df['Fare'] < 8.0
    print(len(df[under_8]))
    #original notes: 241 samples is far too many, go down to 7
    #and exclude the crew too

    #tighter cut at 7.1, zero fares left out
    print_header('Number of Sub-$7 Fare Passengers')
    under_7 = df['Fare'] < 7.1
    paid_fare = df['Fare'] != 0
    cheap = df[under_7 & paid_fare]
    print(len(cheap))
    #original note: 28 samples, workable

    #sex distribution inside the cheap group
    print_header('Percent Male')
    cheap_males = cheap[cheap.Sex == 'male']
    print(round(len(cheap_males) / len(cheap) * 100, 1))

    print_header('Percent Female')
    cheap_females = cheap[cheap.Sex == 'female']
    print(round(len(cheap_females) / len(cheap) * 100, 1))
    #original note: 95 percent male

    #cheap fare + mostly male, check which classes they travelled in
    print_header('Average Pclass / Sub-$7 Fare')
    cheap_by_class = cheap[['Pclass', 'Survived']].groupby(['Pclass']).mean()
    print(round(cheap_by_class * 100, 1))

    #Pclass 1 shows up in that table, count it
    print_header('Pclass 1 Sub-$7 Fare Total')
    print(len(cheap[cheap.Pclass == 1]))
    #original notes: only 1 instance, an outlier; 3 missing ages here

    print_header('Average Age of Sub-$7 Riders')
    print(round(cheap['Age'].mean(), 0))
    #original notes: middle-aged men, 31 years old
    #useful for filling missing ages, but only 3 rows, so try to generalize

    #wider cut at 9, zero fares still left out
    print_header('Number of Passengers on Sub-$9 Fare')
    under_9 = df['Fare'] < 9.0
    budget = df[under_9 & paid_fare]
    print(len(budget))
    #original note: 296 people, a bigger group

    print_header('Average Numerics for Sub-$9 Passengers')
    budget_means = budget[['Pclass', 'Age', 'SibSp', 'Parch', 'Survived']].mean()
    print(round(budget_means, 1))
    #original notes: entirely 3rd class, middle-aged, almost all died

    print_header('Number of children ( < 14 )')
    print(len(budget[budget.Age < 14]))
    #original note: 2

    print_header('Number of elders ( > 50 )')
    print(len(budget[budget.Age > 50]))
    #original notes: 9, so out of 296 sub-9 dollar fares
    #285 are between 14 and 50

    print_header('Number of missing ages + Sub-$9 Fare')
    print(budget['Age'].isnull().sum())
    #original note: 91 missing ages

    #mean age and row count per class, sex and port inside the sub-9 group
    print_header('Average Age + Count  / Pclass + Sex + Port (Sub-$9 Fare)')
    age_profile = budget[['Pclass', 'Age', 'Sex', 'Embarked']]\
        .groupby(['Pclass', 'Sex', 'Embarked'])\
        .agg(['mean', 'count'])
    print(round(age_profile, 1))
    #original note: can fill 91 missing ages / 177 based on this information


Data Cleaning

In [107]:
def clean_data(df: pd.DataFrame) -> pd.DataFrame:
    """
    Clean Data
        - drop the 'Cabin' column
        - fill missing 'Embarked' values with the column mode
        - fill missing 'Age' values of passengers whose name contains
          'Master' with the rounded mean age of that same group
    Works on the frame returned by drop(), so the caller's frame is left as is.
    :param df: uncleaned titanic training set, expected
    :return: cleaned titanic training set
    """
    #Drop Cabin Column
    #   -too many missing values
    #   -filling would be inaccurate / meaningless
    df = df.drop(columns='Cabin')

    #Fill Missing Embarked
    #   -barely missing any data, so the mode is good enough
    #   -fillna on the column only touches the missing cells; assigning to
    #    df.loc['Embarked'] would instead append a whole new row with that label
    #   -mode() is empty when every value is missing, nothing to fill with then
    embarked_modes = df['Embarked'].mode()
    if not embarked_modes.empty:
        df['Embarked'] = df['Embarked'].fillna(embarked_modes.iloc[0])

    #Fill Missing Ages
    #   -na=False keeps the mask strictly boolean even when a name is missing
    name_includes_master = df['Name'].str.contains('Master', na=False)
    missing_age = df['Age'].isna()

    #calculate the mean age of the 'Master' group (NaN when it has no known ages)
    master_mean = round(df.loc[name_includes_master, 'Age'].mean(), 0)

    #fill the instances with the mean
    df.loc[name_includes_master & missing_age, 'Age'] = master_mean

    #sanity check: show any 'Master' rows that still lack an age
    missing_age = df['Age'].isna()
    print(df[name_includes_master & missing_age])

    return df

    #print(df[ (df['Age'].isna()) & sub_9_fare & exclude_crew])




In [108]:
def main(argc: int, argv: str) -> int:
    """
    Notebook entry point: load the training CSV and run the cleaning step.
    :param argc: not used by the body
    :param argv: not used by the body
    :return: 0 on success, 1 when the training data could not be loaded
    """
    #define filepath and read the training data
    filepath = r'.venv/data/train.csv'
    train_data = read_data(filepath)

    #explicit check rather than assert, which is skipped under python -O
    if train_data is None:
        return 1

    #exploration steps, left disabled
    #explore_data(train_data)
    #explore_fare(train_data)
    #findings so far:
    #   drop cabin
    #   Higher PClass -> higher chance
    #   Male + Sub-1st class -> low chance

    clean_data(train_data)
    return 0
main(0, '')

Empty DataFrame
Columns: [PassengerId, Survived, Pclass, Name, Sex, Age, SibSp, Parch, Ticket, Fare, Embarked]
Index: []


0