The Titanic had three cabin classes: first class was the most
expensive, second class was in the middle, and third class, or
steerage, was the least expensive and in the lower decks. It is
well documented that most passengers who survived were
female and in first-class cabins. We also know that gender and
class played an important role in the selection process for getting
on the lifeboats. That selection process prioritized women
and children over men. Because this background is so well
known, this dataset is suitable as a didactic example to investigate
model bias.

Follow through from Author *KC Tung*

In [1]:
# import Libraries
import tensorflow as tf
from tensorflow.keras import layers
import tensorflow_model_analysis as tfma

from sklearn.model_selection import train_test_split
from google.protobuf import text_format
import pandas as pd

In [2]:
train_file_path = r'C:\Users\DELL\.keras\datasets\train.csv'
test_file_path = r'C:\Users\DELL\.keras\datasets\eval.csv'

titanic_df = pd.read_csv(train_file_path, header='infer')
test_df = pd.read_csv(test_file_path, header='infer')

titanic_df.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
survived,627.0,0.38756,0.487582,0.0,0.0,0.0,1.0,1.0
age,627.0,29.631308,12.511818,0.75,23.0,28.0,35.0,80.0
n_siblings_spouses,627.0,0.545455,1.15109,0.0,0.0,0.0,1.0,8.0
parch,627.0,0.379585,0.792999,0.0,0.0,0.0,0.0,5.0
fare,627.0,34.385399,54.59773,0.0,7.8958,15.0458,31.3875,512.3292


In [3]:
# read CSV data with TF
LABEL_COLUMN = 'survived'
LABELS = [0, 1]

train_ds = tf.data.experimental.make_csv_dataset(
    train_file_path, batch_size=3, label_name=LABEL_COLUMN, na_value='?',
    num_epochs=1, ignore_errors=True
)

test_ds = tf.data.experimental.make_csv_dataset(
    test_file_path, batch_size=3, label_name=LABEL_COLUMN, na_value='?',
    num_epochs=1, ignore_errors=True
)

In [4]:
# designate columns with Feature-columns
feature_columns = []

# numeric columns
for header in ['age', 'n_siblings_spouses', 'parch', 'fare']:
    feature_columns.append(tf.feature_column.numeric_column(header))


# binning the AGE column based on the quartiles in DESC above
age = tf.feature_column.numeric_column('age')
age_buckets = tf.feature_column.bucketized_column(age, boundaries=[23, 28, 35])

# checking the Unique values in the Categorical values
h = {}
for col in titanic_df:
    if col in ['sex', 'class', 'deck', 'embark_town', 'alone']:
        print(col, ': ', titanic_df[col].unique())
        h[col] = titanic_df[col].unique()


# how to enocde CAT columns & OHE it
sex_type = tf.feature_column.categorical_column_with_vocabulary_list(
    key='Type', vocabulary_list=['male', 'female']
)
sex_type_one_hot = tf.feature_column.indicator_column(sex_type)



# a better way to encode categorical values
sex_type = tf.feature_column.categorical_column_with_vocabulary_list(
    'Type', h.get('sex').tolist()
)
sex_type_one_hot = tf.feature_column.indicator_column(sex_type)


class_type = tf.feature_column.categorical_column_with_vocabulary_list(
    'Type', h.get('class').tolist()
)
class_type_one_hot = tf.feature_column.indicator_column(class_type)

deck_type = tf.feature_column.categorical_column_with_vocabulary_list(
    'Type', h.get('deck').tolist()
)
deck_type_one_hot = tf.feature_column.indicator_column(deck_type)

embark_town_type = tf.feature_column.categorical_column_with_vocabulary_list(
    'Type', h.get('embark_town').tolist()
)
embark_town_type_one_hot = tf.feature_column.indicator_column(embark_town_type)

alone_type = tf.feature_column.categorical_column_with_vocabulary_list(
    'Type', h.get('alone').tolist()
)
alone_type_one_hot = tf.feature_column.indicator_column(alone_type)


# DECK column has 8 unique value more than others, so reduce to 3 dimenstions
deck = tf.feature_column.categorical_column_with_vocabulary_list(
    'deck', titanic_df.deck.unique()
)
deck_embedding = tf.feature_column.embedding_column(deck, dimension=3)

# another way to reduce dimension
class_hashed = tf.feature_column.categorical_column_with_hash_bucket(
    'class', hash_bucket_size=4
)

# how to create a HASH Bucket
cross_type_feature = tf.feature_column.crossed_column(['sex', 'class'], hash_bucket_size=5)
feature_columns = [] # list to hold features to use

# append numeric columns
for header in ['age', 'n_siblings_spouses', 'parch', 'fare']:
    feature_columns.append(tf.feature_column.numeric_column(header))

age = tf.feature_column.numeric_column('age')
age_buckets = tf.feature_column.bucketized_column(age, boundaries=[23, 28, 35])
feature_columns.append(age_buckets)

# append categorical columns
indicator_column_names = ['sex', 'class', 'deck', 'embark_town', 'alone']
for col_name in indicator_column_names:
    categorical_column = tf.feature_column.categorical_column_with_vocabulary_list(
        col_name, titanic_df[col_name].unique()
    )
    indicator_column = tf.feature_column.indicator_column(categorical_column)
    feature_columns.append(indicator_column)

# append embedding columns
deck = tf.feature_column.categorical_column_with_vocabulary_list(
    'deck', titanic_df.deck.unique()
)
deck_embedding = tf.feature_column.embedding_column(deck, dimension=3)
feature_columns.append(deck_embedding)

# append crossed columns
feature_columns.append(tf.feature_column.indicator_column(cross_type_feature))


# read the CSV files
train_df = pd.read_csv(r'C:\Users\DELL\.keras\datasets\train.csv')
test_df = pd.read_csv(r'C:\Users\DELL\.keras\datasets\eval.csv')


# now create a Feature Layer which would serve as the first input layer
feature_layer = layers.DenseFeatures(feature_columns)

val_df, test_df = train_test_split(test_df, test_size=0.4)
batch_size = 32
labels = train_df.pop('survived')
working_ds = tf.data.Dataset.from_tensor_slices((dict(train_df), labels))
working_ds = working_ds.shuffle(buffer_size=len(train_df))
train_ds = working_ds.batch(batch_size)

# functino to convert Pandas DF to Tensor slices
def pandas_to_dataset(df, shuffle=True, batch_size=32):
    df = df.copy()
    labels = df.pop('survived')
    ds = tf.data.Dataset.from_tensor_slices((dict(df), labels))
    if shuffle:
        ds = ds.shuffle(batch_size=len(df))
    ds = ds.batch(batch_size)
    return ds

val_ds = pandas_to_dataset(val_df, shuffle=False, batch_size=batch_size)
test_ds = pandas_to_dataset(test_df, shuffle=False, batch_size=batch_size)

sex :  ['male' 'female']
class :  ['Third' 'First' 'Second']
deck :  ['unknown' 'C' 'G' 'A' 'B' 'D' 'F' 'E']
embark_town :  ['Southampton' 'Cherbourg' 'Queenstown' 'unknown']
alone :  ['n' 'y']


In [5]:
model = tf.keras.Sequential([
    feature_layer,
    layers.Dense(128, activation='relu'),
    layers.Dense(128, activation='relu'),
    layers.Dropout(.1),
    layers.Dense(1)
])
model.compile(
    optimizer='adam', loss=tf.keras.losses.BinaryCrossentropy(from_logits=True),
    metrics=['accuracy']
)
model.fit(train_ds, validation_data=val_ds, epochs=10)

Epoch 1/10
Consider rewriting this model with the Functional API.


Consider rewriting this model with the Functional API.


Consider rewriting this model with the Functional API.


Consider rewriting this model with the Functional API.


Consider rewriting this model with the Functional API.


Consider rewriting this model with the Functional API.


Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x295acbdc898>

# Continue from here for Fairness Indicator Tutorial

In [6]:
prediction_raw = model.predict(test_ds)
prediction_raw[:5]

Consider rewriting this model with the Functional API.


Consider rewriting this model with the Functional API.


array([[ 1.2949723 ],
       [ 1.0698959 ],
       [-0.84793216],
       [-0.31190944],
       [-1.2318875 ]], dtype=float32)

In [7]:
# convert Predictions to Python list and append to TEST_DF
prediction_list = prediction_raw.squeeze().tolist()
test_df['predicted'] = prediction_list # adding Predicted Column to Dataframe

# put the PREDICTED column as the first column to compare with Survived
cols = list(test_df.columns)
cols = [cols[-1]] + cols[:-1]
test_df = test_df[cols]

test_df.head()

Unnamed: 0,predicted,survived,sex,age,n_siblings_spouses,parch,fare,class,deck,embark_town,alone
205,1.294972,0,female,18.0,0,1,14.4542,Third,unknown,Cherbourg,n
78,1.069896,1,female,22.0,0,0,7.75,Third,unknown,Queenstown,y
229,-0.847932,0,male,28.0,0,0,7.225,Third,unknown,Cherbourg,y
223,-0.311909,0,male,18.0,0,0,11.5,Second,unknown,Southampton,y
0,-1.231887,0,male,35.0,0,0,8.05,Third,unknown,Southampton,y


In [8]:
# define the Evaluation configuration
eval_config = text_format.Parse("""
    model_specs {
        prediction_key: 'predicted',
        label_key: 'survived'
    }
    metrics_specs {
        metrics {class_name: 'AUC'}
        metrics {
            class_name: 'FairnessIndicators'
            config: '{"thresholds": [0.1, 0.50, 0.90]}'
        }
        metrics {class_name: 'ExampleCount'}
    }

    slicing_specs {
        feature_keys: ['sex', 'class']
    }

    slicing_specs {}
""", tfma.EvalConfig())

# specify path to output Model
OUTPUT_PATH = r'C:\Users\DELL\Desktop\Learning projects'

eval_result = tfma.analyze_raw_data(
    data=test_df, eval_config=eval_config, output_path=OUTPUT_PATH
)




Instructions for updating:
Use eager execution and: 
`tf.data.TFRecordDataset(path)`


Instructions for updating:
Use eager execution and: 
`tf.data.TFRecordDataset(path)`


In [9]:
!jupyter nbextension enable tensorflow_model_analysis --user

Enabling notebook extension tensorflow_model_analysis...
      - Validating: problems found:
        - require?  X tensorflow_model_analysis


In [11]:
# rendering the Fairness Indicators

tfma.addons.fairness.view.widget_view.render_fairness_indicator(eval_result)
# this Widget correctly displayed on Jupyter notebook but won't on GitHub. 

FairnessIndicatorViewer(slicingMetrics=[{'sliceValue': 'Overall', 'slice': 'Overall', 'metrics': {'example_cou…

Different parameters where clicked and analyzed.