### Univariate Distribution of Features

#### Load and Examine Data

In [None]:
# Import libraries and modules
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_style('whitegrid')

In [None]:
# Load the training data and preview its first five rows.
# The column display limit is lifted first so that pandas
# does not elide any columns when rendering the preview.
pd.set_option('display.max_columns', None)
df = pd.read_csv("../data/train.csv")
df.head(5)

In [None]:
# df.shape is a (rows, columns) pair, so it can be unpacked straight into the message.
print("This data set frame has {} rows and {} columns".format(*df.shape))

In [None]:
# Check the percent of values in each column that are missing.
# We see that there are eight features with 20% or more missing values,
# and two features with over 50% missing values.
# The two columns which encode our target (efs and efs_time)
# have no missing values.

(
    pd.DataFrame(df.isna().sum() / df.shape[0] * 100)
    .reset_index()
    .rename(columns={"index": "Feature", 0: "Percent Missing"})
    .sort_values(by="Percent Missing", ascending=False)
)

In [None]:
# For now, every row and column is kept, including those with missing data.
# The ID column is dropped because it does not contain information
# useful for modeling.
# The numerical codes in the efs column are replaced with text labels,
# which are easier to interpret.

df = df.drop(columns="ID")
df["efs"] = df["efs"].replace({0: "Censored", 1: "Event"})

In [None]:
# Check which datatypes exist in the data.
# We have dtype 'O' (for 'object', the generic dtype pandas uses by
# default for strings and other Python objects; this is not the
# dedicated pandas 'category' dtype)
# as well as integer and floating-point datatypes.
# We will confirm that the datatype for each column makes sense
# when we examine the columns individually.
df.dtypes.unique()

In [None]:
# Basic descriptive statistics for the numerical (float64 / int64) columns.
# It appears that many of these features take just a few
# integer values.

df_numeric = df.select_dtypes(include=['float64', 'int64'])
df_numeric.describe()

#### Define Helper Functions to Summarize and Plot Features

In [None]:
# Bar plot helper for discrete features.
# May be used for either categorical features
# or integer features which only take a
# few distinct values.

def plot_discrete_feature(feat_name, df=df):
    """Draw a bar plot of the number of cases for each value of a feature.

    Parameters
    ----------
    feat_name : str
        Name of the column to plot.
    df : pandas.DataFrame, optional
        Data frame containing the column. The default is the notebook-level
        ``df`` as it was bound when this cell was executed; re-run the cell
        (or pass ``df`` explicitly) after reassigning ``df``.
    """
    fig, ax = plt.subplots(figsize=(5, 3))
    # Pass the frame by keyword (as the other plotting helpers in this
    # notebook do) and target the axes created above explicitly instead of
    # relying on matplotlib's implicit "current axes".
    sns.countplot(data=df, x=feat_name, ax=ax)
    ax.set_xlabel(feat_name)
    ax.set_ylabel("Number of cases ")
    ax.set_title("Number of cases by {}".format(feat_name))
    plt.show()

In [None]:
# Percentage table helper for discrete features.
# Note that this ignores any cases where the
# value of the feature is missing.

def get_percentages(feat_name, df=df):
    """Return the percentage of cases that take on each level of a feature.

    Missing values are excluded from both the per-level counts and the
    denominator, so the percentages describe non-missing cases only.

    Parameters
    ----------
    feat_name : str
        Name of the discrete column to summarize.
    df : pandas.DataFrame, optional
        Data frame containing the column. The default is the notebook-level
        ``df`` as it was bound when this cell was executed.

    Returns
    -------
    pandas.DataFrame
        Two columns: ``feat_name`` (the level) and ``"percent"`` (rounded to
        two decimals), sorted by ``"percent"`` in descending order.
    """
    column = df[feat_name]
    percentages = column.value_counts() / column.count() * 100
    # Name the values and the index explicitly rather than relying on the
    # labels value_counts() assigns, which differ between pandas versions.
    table = percentages.rename("percent").rename_axis(feat_name).reset_index()
    # Round only the percentages so that numeric feature levels are
    # reported unchanged.
    table["percent"] = table["percent"].round(2)
    return table.sort_values(by="percent", ascending=False)

In [None]:
# Distribution plot helper for numeric features.
# Boxplots are often useful for detecting outliers,
# while histograms give more insight into the
# overall shape of a distribution.

def plot_numeric_feature(feat_name, data = df, bins=10):
    """Draw a box plot and a histogram of a numeric feature side by side.

    Parameters
    ----------
    feat_name : str
        Name of the numeric column to plot.
    data : pandas.DataFrame, optional
        Data frame containing the column. The default is the notebook-level
        ``df`` as it was bound when this cell was executed.
    bins : int, optional
        Passed to ``sns.histplot`` as its ``bins`` argument (default 10).
    """
    fig, ax = plt.subplots(1, 2, figsize=(10, 3))
    fig.suptitle("Distribution of {}".format(feat_name))
    # Plot from the `data` argument; previously the global `df` was used
    # here, so a frame passed by the caller was silently ignored.
    sns.boxplot(data=data, y=feat_name, ax=ax[0])
    sns.histplot(data=data, x=feat_name, ax=ax[1], kde=True, bins=bins)
    plt.subplots_adjust(wspace=0.4)
    ax[1].set_ylabel("Number of cases")
    plt.show()

In [None]:
# Although this notebook is mostly about univariate analysis,
# it also includes a helper which plots efs_time (survival time)
# broken down by efs (event/censored).
# Together these two variables can be viewed as jointly encoding
# our true modeling objective (survival time).

def plot_efs_vs_efs_time(bins=10):
    """Plot efs_time split by efs as a box plot and a stacked histogram.

    Reads the notebook-level ``df`` at call time.

    Parameters
    ----------
    bins : int, optional
        Passed to ``sns.histplot`` as its ``bins`` argument (default 10).
    """
    fig, (box_ax, hist_ax) = plt.subplots(1, 2, figsize=(12, 3))
    fig.suptitle("Distribution of efs_time by efs")
    fig.subplots_adjust(wspace=0.3)

    # Left panel: box plot per efs group, with its legend replaced by an
    # empty, frameless one (presumably because the histogram's legend
    # already identifies the groups).
    sns.boxplot(data=df, y="efs_time", hue="efs", ax=box_ax)
    box_ax.legend([], [], frameon=False)

    # Right panel: stacked histogram of efs_time by efs group.
    sns.histplot(data=df, x="efs_time", hue="efs", ax=hist_ax, multiple='stack', bins=bins)
    hist_ax.set_ylabel("Number of cases")

    plt.show()