In [1]:
import pandas as pd
import numpy as np
import tensorflow as tf

In [2]:
tf.__version__

'2.0.0'

Download the files(train and eval) from the storage and save it

In [3]:
TRAIN_DATA_URL="https://storage.googleapis.com/tf-datasets/titanic/train.csv"
TEST_DATA_URL="https://storage.googleapis.com/tf-datasets/titanic/eval.csv"

train_file_path = tf.keras.utils.get_file("train.csv", TRAIN_DATA_URL)
test_file_path = tf.keras.utils.get_file("eval.csv", TEST_DATA_URL)

Downloading data from https://storage.googleapis.com/tf-datasets/titanic/train.csv
Downloading data from https://storage.googleapis.com/tf-datasets/titanic/eval.csv


In [5]:
train_file_path, test_file_path

('C:\\Users\\rohit\\.keras\\datasets\\train.csv',
 'C:\\Users\\rohit\\.keras\\datasets\\eval.csv')

--

You can load this using pandas, and pass the NumPy arrays to TensorFlow. If you need to scale up to a large set of files, or need a loader that integrates with TensorFlow and tf.data then use the tf.data.experimental.make_csv_dataset function:

In [87]:
# define the label column
LABEL_COLUMN = 'survived'

In [88]:
def get_dataset(file_path, **kwargs):
    '''this will get the csv from the filepath and creates the dataset'''
    dataset = tf.data.experimental.make_csv_dataset(file_path, batch_size=5, label_name=LABEL_COLUMN, na_value='?', 
                                                   num_epochs=1, ignore_errors=True, **kwargs)
    return dataset

In [89]:
raw_train_data = get_dataset(train_file_path)

In [90]:
raw_test_data = get_dataset(test_file_path)

Each item in the dataset is a batch, represented as a tuple of (many examples, many labels). The data from the examples is organized in column-based tensors (rather than row-based tensors), each with as many elements as the batch size (5 in this case).

In [91]:
def show_batch(dataset):
    for batch, label in dataset.take(1):
        for key, value in batch.items():
            print(f"{key}:{value.numpy()}")
#         print(label)

In [92]:
show_batch(raw_train_data)

sex:[b'male' b'female' b'female' b'male' b'female']
age:[28. 27. 39. 34. 27.]
n_siblings_spouses:[1 1 1 1 0]
parch:[0 0 1 0 2]
fare:[16.1    13.8583 83.1583 26.     11.1333]
class:[b'Third' b'Second' b'First' b'Second' b'Third']
deck:[b'unknown' b'unknown' b'E' b'unknown' b'unknown']
embark_town:[b'Southampton' b'Cherbourg' b'Cherbourg' b'Southampton' b'Southampton']
alone:[b'n' b'n' b'n' b'n' b'n']


As you can see, the columns in the CSV are named. The dataset constructor will pick these names up automatically. If the file you are working with does not contain the column names in the first line, pass them in a list of strings to the column_names argument in the make_csv_dataset function.

In [32]:
CSV_COLUMNS = ['survived', 'sex', 'age', 'n_siblings_spouses', 'parch', 'fare', 'class', 'deck', 'embark_town', 'alone']

temp_dataset = get_dataset(train_file_path, column_names=CSV_COLUMNS)

show_batch(temp_dataset)

sex:[b'male' b'male' b'female' b'female' b'male']
age:[ 1. 28. 35. 28. 22.]
n_siblings_spouses:[1 0 1 1 0]
parch:[2 0 1 0 0]
fare:[20.575 52.    20.25  24.     7.125]
class:[b'Third' b'First' b'Third' b'Second' b'Third']
deck:[b'unknown' b'A' b'unknown' b'unknown' b'unknown']
embark_town:[b'Southampton' b'Southampton' b'Southampton' b'Cherbourg' b'Southampton']
alone:[b'n' b'y' b'n' b'n' b'y']


This example is going to use all the available columns. If you need to omit some columns from the dataset, create a list of just the columns you plan to use, and pass it into the (optional) select_columns argument of the constructor.

In [33]:
SELECT_COLUMNS = ['survived', 'age', 'n_siblings_spouses', 'class', 'deck', 'alone']

temp_dataset = get_dataset(train_file_path, select_columns=SELECT_COLUMNS)

show_batch(temp_dataset)

age:[45. 28. 15.  4. 28.]
n_siblings_spouses:[0 0 1 4 0]
class:[b'First' b'Third' b'Third' b'Third' b'Third']
deck:[b'unknown' b'F' b'unknown' b'unknown' b'unknown']
alone:[b'y' b'y' b'n' b'n' b'y']


data

In [34]:
SELECT_COLUMNS = ['survived', 'age', 'n_siblings_spouses', 'parch', 'fare']
DEFAULTS = [0, 0.0, 0.0, 0.0, 0.0]
temp_dataset = get_dataset(train_file_path, 
                           select_columns=SELECT_COLUMNS,
                           column_defaults = DEFAULTS)

show_batch(temp_dataset)


age:[57.  4. 35. 21. 28.]
n_siblings_spouses:[0. 1. 0. 0. 0.]
parch:[0. 1. 0. 1. 0.]
fare:[10.5    23.      8.05   77.2875  8.05  ]


In [35]:
example_batch, labels_batch = next(iter(temp_dataset)) 

In [36]:
example_batch

OrderedDict([('age',
              <tf.Tensor: id=723, shape=(5,), dtype=float32, numpy=array([28., 21., 37., 30., 28.], dtype=float32)>),
             ('n_siblings_spouses',
              <tf.Tensor: id=725, shape=(5,), dtype=float32, numpy=array([0., 2., 0., 0., 1.], dtype=float32)>),
             ('parch',
              <tf.Tensor: id=726, shape=(5,), dtype=float32, numpy=array([0., 2., 0., 0., 0.], dtype=float32)>),
             ('fare',
              <tf.Tensor: id=724, shape=(5,), dtype=float32, numpy=array([  7.8958, 262.375 ,   9.5875,  31.    ,  15.5   ], dtype=float32)>)])

In [37]:
labels_batch

<tf.Tensor: id=727, shape=(5,), dtype=int32, numpy=array([0, 1, 0, 1, 1])>

In [94]:
def pack(features, labels):
    return tf.stack(list(features.values()), axis=-1), labels

In [95]:
packed_dataset = temp_dataset.map(pack)

In [96]:
for features, labels in packed_dataset.take(1):
    print(features.numpy())
    print()
    print(labels.numpy())

[[35.      0.      0.     26.55  ]
 [50.      0.      0.     28.7125]
 [42.      1.      0.     27.    ]
 [51.      0.      0.      7.0542]
 [28.      0.      0.      7.8958]]

[1 0 0 0 0]


more general preprocessor that selects a list of numeric features and packs them into a single column:

##### use raw_train(this will have numeric and categorical values), packnumericFeatures(this will pack all the numeric cols), normlize- use numeric features


##### use categorical_column_with_vocablist to find the categorical values


#### Generate (clamp all the numeric cols) and generate the numeric colums,(packed_train/test_data, this is used for training)

#### Using 'numeric' defined in the above step, generate the tf.feature_column.numeric_feature (numeric columns)

#### Similarily define categorical_values as key:value pair and use tf.feature_column.categorical_column_with_vocabulary_list to generate categorical columns

##### Use the above generated categorical and numerical columns to define the preprocessing layer

In [41]:
class PackNumericFeatures():
    def __init__(self, names):
        self.names = names
        
    def __call__(self, features, labels):
        numeric_features = [features.pop(name) for name in self.names]
        numeric_features = [tf.cast(feat, tf.float32) for feat in numeric_features]
        numeric_features = tf.stack(numeric_features, axis=-1)
        features['numeric'] = numeric_features
        return features, labels

In [42]:
NUMERIC_FEATURES = ['age','n_siblings_spouses','parch', 'fare']

packed_train_data = raw_train_data.map(
    PackNumericFeatures(NUMERIC_FEATURES))

packed_test_data = raw_test_data.map(
    PackNumericFeatures(NUMERIC_FEATURES))

In [44]:
show_batch(packed_train_data)

sex:[b'female' b'male' b'male' b'male' b'male']
class:[b'Second' b'First' b'Third' b'Third' b'First']
deck:[b'unknown' b'B' b'E' b'unknown' b'E']
embark_town:[b'Southampton' b'Cherbourg' b'Southampton' b'Southampton' b'Southampton']
alone:[b'n' b'y' b'y' b'n' b'n']
numeric:[[ 4.      1.      1.     23.    ]
 [24.      0.      0.     79.2   ]
 [32.      0.      0.      8.05  ]
 [28.      3.      1.     25.4667]
 [27.      1.      0.     53.1   ]]


In [97]:
example_batch, labels_batch = next(iter(packed_train_data))
example_batch

OrderedDict([('sex',
              <tf.Tensor: id=15925, shape=(5,), dtype=string, numpy=array([b'male', b'male', b'male', b'male', b'male'], dtype=object)>),
             ('class',
              <tf.Tensor: id=15921, shape=(5,), dtype=string, numpy=array([b'Second', b'Third', b'Second', b'Third', b'Third'], dtype=object)>),
             ('deck',
              <tf.Tensor: id=15922, shape=(5,), dtype=string, numpy=
              array([b'unknown', b'unknown', b'unknown', b'unknown', b'unknown'],
                    dtype=object)>),
             ('embark_town',
              <tf.Tensor: id=15923, shape=(5,), dtype=string, numpy=
              array([b'Southampton', b'Southampton', b'Southampton', b'Southampton',
                     b'Southampton'], dtype=object)>),
             ('alone',
              <tf.Tensor: id=15920, shape=(5,), dtype=string, numpy=array([b'n', b'n', b'y', b'n', b'y'], dtype=object)>),
             ('numeric',
              <tf.Tensor: id=15924, shape=(5, 4), dtyp

Data Normalization

In [47]:
import pandas as pd
desc = pd.read_csv(train_file_path)[NUMERIC_FEATURES].describe()

In [48]:
desc

Unnamed: 0,age,n_siblings_spouses,parch,fare
count,627.0,627.0,627.0,627.0
mean,29.631308,0.545455,0.379585,34.385399
std,12.511818,1.15109,0.792999,54.59773
min,0.75,0.0,0.0,0.0
25%,23.0,0.0,0.0,7.8958
50%,28.0,0.0,0.0,15.0458
75%,35.0,1.0,0.0,31.3875
max,80.0,8.0,5.0,512.3292


In [49]:
MEAN = np.array(desc.T['mean'])
STD = np.array(desc.T['std'])

In [50]:
def normalize_numeric_data(data, mean, std):
    return (data-mean)/std

In [51]:
import functools

In [71]:
normalize = functools.partial(normalize_numeric_data, mean=MEAN, std=STD)

numeric_columns = tf.feature_column.numeric_column('numeric', normalizer_fn=normalize, shape=[len(NUMERIC_FEATURES)])
numeric_columns = [numeric_columns]
numeric_columns

[NumericColumn(key='numeric', shape=(4,), default_value=None, dtype=tf.float32, normalizer_fn=functools.partial(<function normalize_numeric_data at 0x0000019125DC1708>, mean=array([29.63130781,  0.54545455,  0.37958533, 34.38539856]), std=array([12.51181763,  1.1510896 ,  0.79299921, 54.5977305 ])))]

In [72]:
example_batch['numeric']

<tf.Tensor: id=817, shape=(5, 4), dtype=float32, numpy=
array([[31.    ,  0.    ,  0.    ,  7.8542],
       [20.    ,  0.    ,  0.    ,  7.8542],
       [28.    ,  0.    ,  0.    ,  6.95  ],
       [30.    ,  0.    ,  0.    ,  7.2292],
       [30.    ,  0.    ,  0.    ,  7.225 ]], dtype=float32)>

In [73]:
numeric_layer = tf.keras.layers.DenseFeatures(numeric_columns)

In [74]:
numeric_layer(example_batch).numpy()

array([[ 0.10939197, -0.47385937, -0.4786705 , -0.4859396 ],
       [-0.7697768 , -0.47385937, -0.4786705 , -0.4859396 ],
       [-0.13038135, -0.47385937, -0.4786705 , -0.5025007 ],
       [ 0.02946753, -0.47385937, -0.4786705 , -0.49738696],
       [ 0.02946753, -0.47385937, -0.4786705 , -0.49746388]],
      dtype=float32)

Categorical Data

In [59]:
CATEGORIES={
    'sex': ['male', 'female'],
    'class' : ['First', 'Second', 'Third'],
    'deck' : ['A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J'],
    'embark_town' : ['Cherbourg', 'Southhampton', 'Queenstown'],
    'alone' : ['y', 'n']
}

In [101]:
categorical_columns=[]
for feature, vocab in CATEGORIES.items():
    cat_col=tf.feature_column.categorical_column_with_vocabulary_list(key=feature, vocabulary_list=vocab)
#     print(cat_col)
    categorical_columns.append(tf.feature_column.indicator_column(cat_col))

In [70]:
categorical_columns

[IndicatorColumn(categorical_column=VocabularyListCategoricalColumn(key='sex', vocabulary_list=('male', 'female'), dtype=tf.string, default_value=-1, num_oov_buckets=0)),
 IndicatorColumn(categorical_column=VocabularyListCategoricalColumn(key='class', vocabulary_list=('First', 'Second', 'Third'), dtype=tf.string, default_value=-1, num_oov_buckets=0)),
 IndicatorColumn(categorical_column=VocabularyListCategoricalColumn(key='deck', vocabulary_list=('A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J'), dtype=tf.string, default_value=-1, num_oov_buckets=0)),
 IndicatorColumn(categorical_column=VocabularyListCategoricalColumn(key='embark_town', vocabulary_list=('Cherbourg', 'Southhampton', 'Queenstown'), dtype=tf.string, default_value=-1, num_oov_buckets=0)),
 IndicatorColumn(categorical_column=VocabularyListCategoricalColumn(key='alone', vocabulary_list=('y', 'n'), dtype=tf.string, default_value=-1, num_oov_buckets=0))]

In [69]:
categorical_layer = tf.keras.layers.DenseFeatures(categorical_columns)
print(categorical_layer(example_batch).numpy()[0])

[1. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1.]


In [102]:
numeric_columns

[NumericColumn(key='numeric', shape=(4,), default_value=None, dtype=tf.float32, normalizer_fn=functools.partial(<function normalize_numeric_data at 0x0000019125DC1708>, mean=array([29.63130781,  0.54545455,  0.37958533, 34.38539856]), std=array([12.51181763,  1.1510896 ,  0.79299921, 54.5977305 ])))]

In [103]:
categorical_columns

[IndicatorColumn(categorical_column=VocabularyListCategoricalColumn(key='sex', vocabulary_list=('male', 'female'), dtype=tf.string, default_value=-1, num_oov_buckets=0)),
 IndicatorColumn(categorical_column=VocabularyListCategoricalColumn(key='class', vocabulary_list=('First', 'Second', 'Third'), dtype=tf.string, default_value=-1, num_oov_buckets=0)),
 IndicatorColumn(categorical_column=VocabularyListCategoricalColumn(key='deck', vocabulary_list=('A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J'), dtype=tf.string, default_value=-1, num_oov_buckets=0)),
 IndicatorColumn(categorical_column=VocabularyListCategoricalColumn(key='embark_town', vocabulary_list=('Cherbourg', 'Southhampton', 'Queenstown'), dtype=tf.string, default_value=-1, num_oov_buckets=0)),
 IndicatorColumn(categorical_column=VocabularyListCategoricalColumn(key='alone', vocabulary_list=('y', 'n'), dtype=tf.string, default_value=-1, num_oov_buckets=0))]

Building the model

In [78]:
preprocessing_layer = tf.keras.layers.DenseFeatures(categorical_columns + numeric_columns)

In [79]:
print(preprocessing_layer(example_batch).numpy()[0])

[ 1.          0.          0.          0.          1.          0.
  0.          0.          0.          0.          0.          0.
  0.          0.          0.          0.          0.          0.
  0.10939197 -0.47385937 -0.4786705  -0.4859396   0.          1.        ]


In [99]:
packed_train_data

<MapDataset shapes: (OrderedDict([(sex, (None,)), (class, (None,)), (deck, (None,)), (embark_town, (None,)), (alone, (None,)), (numeric, (None, 4))]), (None,)), types: (OrderedDict([(sex, tf.string), (class, tf.string), (deck, tf.string), (embark_town, tf.string), (alone, tf.string), (numeric, tf.float32)]), tf.int32)>

Model

In [81]:
model = tf.keras.Sequential([
    preprocessing_layer, 
    tf.keras.layers.Dense(128, activation='relu'),
    tf.keras.layers.Dense(128, activation='relu'),
    tf.keras.layers.Dense(1)
])

model.compile(loss=tf.keras.losses.BinaryCrossentropy(from_logits=True), optimizer = 'adam', metrics=['accuracy'])

In [82]:
train_data = packed_train_data.shuffle(500)
test_data = packed_test_data

In [84]:
model.fit(train_data, epochs=20)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<tensorflow.python.keras.callbacks.History at 0x19127574bc8>

In [85]:
test_loss, test_accuracy = model.evaluate(test_data)

print('\n\nTest Loss {}, Test Accuracy {}'.format(test_loss, test_accuracy))



Test Loss 0.5182937042049642, Test Accuracy 0.8371211886405945


In [86]:
predictions = model.predict(test_data)

# Show some results
for prediction, survived in zip(predictions[:10], list(test_data)[0][1][:10]):
  prediction = tf.sigmoid(prediction).numpy()
  print("Predicted survival: {:.2%}".format(prediction[0]),
        " | Actual outcome: ",
        ("SURVIVED" if bool(survived) else "DIED"))

Predicted survival: 49.79%  | Actual outcome:  SURVIVED
Predicted survival: 5.73%  | Actual outcome:  DIED
Predicted survival: 98.47%  | Actual outcome:  SURVIVED
Predicted survival: 4.68%  | Actual outcome:  SURVIVED
Predicted survival: 5.69%  | Actual outcome:  DIED
