In [144]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found under the Kaggle input directory.
for folder, _subfolders, files in os.walk('/kaggle/input'):
    for file_name in files:
        file_path = os.path.join(folder, file_name)
        print(file_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/titanic/train.csv
/kaggle/input/titanic/test.csv
/kaggle/input/titanic/gender_submission.csv


# Import dependencies

In [145]:
import tensorflow as tf
import tensorflow_decision_forests as tfdf

# Record the installed TensorFlow Decision Forests version in the cell output.
print(tfdf.__version__)

1.5.0


# Load dataset

In [146]:
# Read the training data and the serving (test) data from the Kaggle input directory.
train_df = pd.read_csv("/kaggle/input/titanic/train.csv")
serving_df = pd.read_csv("/kaggle/input/titanic/test.csv")
print(train_df.shape)
print(serving_df.shape)
# Keep the preview as the cell's last expression: a bare expression in the
# middle of a cell is evaluated and then discarded, so it was never displayed.
train_df.head(10)


(891, 12)
(418, 11)


# Prepare dataset

In [147]:
def preprocess(df):
    """Return a cleaned copy of a passenger frame; the input is not modified.

    In the returned copy:

    * ``Name``: each space-separated token has commas, parentheses, square
      brackets, periods and single/double quotes stripped from both ends.
    * ``Ticket_number``: the part of ``Ticket`` after its last space (the
      whole value when it contains no space).
    * ``Ticket_item``: the parts of ``Ticket`` before the last space joined
      with ``_``, or ``"NONE"`` when ``Ticket`` contains no space.

    Args:
        df: DataFrame with string columns ``Name`` and ``Ticket``.

    Returns:
        A new DataFrame with ``Name`` rewritten and the two ticket columns
        appended.
    """
    strip_chars = ",()[].\"'"

    def normalize_name(raw_name):
        # Trim punctuation from both ends of every token, keep single spaces.
        return " ".join(token.strip(strip_chars) for token in raw_name.split(" "))

    def ticket_number(raw_ticket):
        # Only the segment after the last space matters.
        return raw_ticket.rsplit(" ", 1)[-1]

    def ticket_item(raw_ticket):
        # Everything except the final segment forms the ticket prefix.
        *prefix_parts, _number = raw_ticket.split(" ")
        return "_".join(prefix_parts) if prefix_parts else "NONE"

    cleaned = df.copy()
    cleaned["Name"] = cleaned["Name"].apply(normalize_name)
    tickets = cleaned["Ticket"]
    cleaned["Ticket_number"] = tickets.apply(ticket_number)
    cleaned["Ticket_item"] = tickets.apply(ticket_item)
    return cleaned
# Apply the same cleaning to the training frame and the serving frame.
preprocessed_training_def, preprocessed_serving_df = (
    preprocess(split) for split in (train_df, serving_df)
)
preprocessed_training_def.head(10)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Ticket_number,Ticket_item
0,1,0,3,Braund Mr Owen Harris,male,22.0,1,0,A/5 21171,7.25,,S,21171,A/5
1,2,1,1,Cumings Mrs John Bradley Florence Briggs Thayer,female,38.0,1,0,PC 17599,71.2833,C85,C,17599,PC
2,3,1,3,Heikkinen Miss Laina,female,26.0,0,0,STON/O2. 3101282,7.925,,S,3101282,STON/O2.
3,4,1,1,Futrelle Mrs Jacques Heath Lily May Peel,female,35.0,1,0,113803,53.1,C123,S,113803,NONE
4,5,0,3,Allen Mr William Henry,male,35.0,0,0,373450,8.05,,S,373450,NONE
5,6,0,3,Moran Mr James,male,,0,0,330877,8.4583,,Q,330877,NONE
6,7,0,1,McCarthy Mr Timothy J,male,54.0,0,0,17463,51.8625,E46,S,17463,NONE
7,8,0,3,Palsson Master Gosta Leonard,male,2.0,3,1,349909,21.075,,S,349909,NONE
8,9,1,3,Johnson Mrs Oscar W Elisabeth Vilhelmina Berg,female,27.0,0,2,347742,11.1333,,S,347742,NONE
9,10,1,2,Nasser Mrs Nicholas Adele Achem,female,14.0,1,0,237736,30.0708,,C,237736,NONE


In [148]:
# Model inputs: every column except the raw ticket string (preprocess() already
# split it into Ticket_number / Ticket_item), the passenger id and the
# "Survived" column used as the label.
non_feature_columns = ("Ticket", "PassengerId", "Survived")
input_features = preprocessed_training_def.columns.tolist()
for column_name in non_feature_columns:
    # list.remove raises ValueError if an expected column is missing.
    input_features.remove(column_name)

print(f"input featurs {input_features}")

input featurs ['Pclass', 'Name', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Cabin', 'Embarked', 'Ticket_number', 'Ticket_item']


# Convert Pandas dataframe to TensorFlow Dataset

In [149]:
def Tokenize_names(features, labels=None):
    """Replace the "Name" feature with the result of ``tf.strings.split``.

    Args:
        features: Mapping from feature name to value; its "Name" entry is
            overwritten in place.
        labels: Returned unchanged; defaults to ``None``.

    Returns:
        The ``(features, labels)`` pair.
    """
    name_tokens = tf.strings.split(features["Name"])
    features["Name"] = name_tokens
    return features, labels
# Convert both cleaned frames to TensorFlow datasets and tokenize the names;
# only the training dataset is given the "Survived" label.
to_tf_dataset = tfdf.keras.pd_dataframe_to_tf_dataset
train_ds = to_tf_dataset(preprocessed_training_def, label="Survived").map(Tokenize_names)
serving_ds = to_tf_dataset(preprocessed_serving_df).map(Tokenize_names)

# Train a model with default parameters


In [150]:
# Baseline: gradient boosted trees with no hyper-parameters overridden,
# restricted to the columns listed in input_features.
model = tfdf.keras.GradientBoostedTreesModel(
    features=[tfdf.keras.FeatureUsage(name=feature_name) for feature_name in input_features],
    exclude_non_specified_features=True,
    random_seed=3116,
    verbose=0,
)
model.fit(train_ds)

# Report the evaluation exposed by the trained model's inspector.
self_evaluation = model.make_inspector().evaluation()
print(f"Accuracy : {self_evaluation.accuracy} Loss : {self_evaluation.loss} ")

[INFO 23-12-28 04:57:03.7033 UTC kernel.cc:1243] Loading model from path /tmp/tmpz11cmdd3/model/ with prefix 8368636617024c4b
[INFO 23-12-28 04:57:03.7107 UTC abstract_model.cc:1311] Engine "GradientBoostedTreesQuickScorerExtended" built
[INFO 23-12-28 04:57:03.7107 UTC kernel.cc:1075] Use fast generic engine


Accuracy : 0.8125 Loss : 0.8233121633529663 


# Train model with improved default parameters

In [None]:
# Gradient boosted trees with hand-picked hyper-parameters.
model2 = tfdf.keras.GradientBoostedTreesModel(
    # Inputs: only use the features listed in "features".
    features=[tfdf.keras.FeatureUsage(name=feature_name) for feature_name in input_features],
    exclude_non_specified_features=True,

    # Boosting schedule.
    num_trees=2000,
    shrinkage=0.05,

    # Split search.
    min_examples=1,
    categorical_algorithm="RANDOM",
    split_axis="SPARSE_OBLIQUE",
    sparse_oblique_normalization="MIN_MAX",
    sparse_oblique_num_projections_exponent=2.0,

    # Reproducibility and logging.
    random_seed=1234,
    verbose=0,  # Very few logs

    # Options left disabled in this run:
    #   compute_permutation_variable_importance=True  (a bit slower, but
    #       great to understand the model)
    #   hyperparameter_template="benchmark_rank1@v1"
    #   tuner=tuner
    #   max_depth=4
    #   num_candidate_attributes_ratio=0.2
    #   validation_ratio=0.0
)
model2.fit(train_ds)

# Overwrites the notebook-level self_evaluation with model2's evaluation.
self_evaluation = model2.make_inspector().evaluation()
print(f"Accuracy: {self_evaluation.accuracy} Loss:{self_evaluation.loss}")



In [None]:
# Show the summary of the trained model2.
model2.summary()

# Make predictions

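Debugging scratch cell: the commented-out code below inspects the shapes of the raw predictions and of the serving data.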

In [None]:
# #train_ds = tfdf.keras.pd_dataframe_to_tf_dataset(preprocessed_training_def,label="Survived").map(Tokenize_names)
# #serving_ds = tfdf.keras.pd_dataframe_to_tf_dataset(preprocessed_training_def).map(Tokenize_names)
# serving_ds = tfdf.keras.pd_dataframe_to_tf_dataset(preprocessed_serving_df).map(Tokenize_names)
# proba_survived = model2.predict(serving_ds, verbose=0)
# print(f"proba survived shape : {proba_survived.shape}")
# proba_survived = proba_survived[:,0]
# print(f"proba survived shape : {proba_survived.shape}")
# print(f"serving df shape : {serving_df.shape}")
# print(f"serving df shape : {serving_ds.shape}")

In [None]:
def prediction_to_kaggle_format(model, treshold = 0.5, dataset=None, passenger_ids=None):
    """Turn a model's predicted probabilities into a Kaggle submission frame.

    Args:
        model: Object exposing ``predict(dataset, verbose=0)`` that returns a
            2-D array; column 0 is used as the survival probability.
        treshold: Probabilities greater than or equal to this value become 1,
            all others 0. (Spelling kept so existing callers keep working.)
        dataset: Dataset to predict on. Defaults to the notebook-level
            ``serving_ds``.
        passenger_ids: Ids for the rows of ``dataset``, in the same order.
            Defaults to ``serving_df["PassengerId"]``.

    Returns:
        A DataFrame with the columns ``PassengerId`` and ``Survived``.
    """
    # Fall back to the notebook globals only when the caller gives nothing,
    # so the function can also be used with any other dataset / id list.
    if dataset is None:
        dataset = serving_ds
    if passenger_ids is None:
        passenger_ids = serving_df["PassengerId"]

    # Plain array: avoids index alignment between a Series and the predictions.
    passenger_ids = np.asarray(passenger_ids)
    proba_survived = model.predict(dataset, verbose=0)[:, 0]
    return pd.DataFrame({
        "PassengerId": passenger_ids,
        "Survived": (proba_survived >= treshold).astype(int),
    })

def make_submission(kaggle_predictions, path="/kaggle/working/submission.csv"):
    """Write a submission frame to a CSV file, without the index column.

    Args:
        kaggle_predictions: DataFrame to export.
        path: Destination file. Defaults to the Kaggle working-directory
            submission file, so existing one-argument calls are unchanged.
    """
    kaggle_predictions.to_csv(path, index=False)
    print(f"Submission exported to {path}")
    
# NOTE(review): this submits predictions from `model` (the model trained
# without overridden hyper-parameters), not from `model2` — confirm that is
# the intended choice.
kaggle_predictions = prediction_to_kaggle_format(model)
make_submission(kaggle_predictions)

In [None]:
!head /kaggle/working/submission.csv

# Training a model with hyperparameter tuning

In [None]:
# Random search over gradient-boosted-trees hyper-parameters.
tuner = tfdf.tuner.RandomSearch(num_trials=1000)
tuner.choice("min_examples", [2, 5, 7, 10])
tuner.choice("categorical_algorithm", ["CART", "RANDOM"])

# max_depth is registered under the "LOCAL" growing strategy ...
local_search_space = tuner.choice("growing_strategy", ["LOCAL"])
local_search_space.choice("max_depth", [3, 4, 5, 6, 8])

# ... and max_num_nodes under "BEST_FIRST_GLOBAL", which is merged into the
# same "growing_strategy" choice.
global_search_space = tuner.choice("growing_strategy", ["BEST_FIRST_GLOBAL"], merge=True)
global_search_space.choice("max_num_nodes", [16, 32, 64, 128, 256])

tuner.choice("shrinkage", [0.02, 0.05, 0.10, 0.15])
tuner.choice("num_candidate_attributes_ratio", [0.2, 0.5, 0.9, 1.0])

# The sparse-oblique options are registered under the "SPARSE_OBLIQUE" value
# of split_axis only.
tuner.choice("split_axis", ["AXIS_ALIGNED"])
oblique_space = tuner.choice("split_axis", ["SPARSE_OBLIQUE"], merge=True)
oblique_space.choice("sparse_oblique_normalization",["NONE", "STANDARD_DEVIATION", "MIN_MAX"])

oblique_space.choice("sparse_oblique_weights", ["BINARY", "CONTINUOUS"])
oblique_space.choice("sparse_oblique_num_projections_exponent", [1.0, 1.5])


# Restrict the tuned model to the same inputs as the other models in this
# notebook. Without `features`, all available columns of train_ds (including
# PassengerId and the raw Ticket string) are used as model inputs.
tuned_model = tfdf.keras.GradientBoostedTreesModel(
    tuner=tuner,
    features=[tfdf.keras.FeatureUsage(name=n) for n in input_features],
    exclude_non_specified_features=True,
)
tuned_model.fit(train_ds, verbose=0)

tuned_self_evaluation = tuned_model.make_inspector().evaluation()
print(f"Accuracy {tuned_self_evaluation.accuracy}, Loss {tuned_self_evaluation.loss} ")

# Making an ensemble

In [None]:
# Average the predicted probabilities of several models that differ only in
# their random seed, then threshold the mean at 0.5.
NUM_ENSEMBLE_MODELS = 100

# Running sum of the members' probabilities; seeded by the first member.
# (It used to start at 0, which made the `is None` check below unreachable.)
predictions = None
num_predictions = 0

for i in range(NUM_ENSEMBLE_MODELS):
    print(f"i: {i}")
    # NOTE: this rebinds the notebook-level name `model`; after the loop it
    # refers to the last ensemble member, not the model trained earlier.
    model = tfdf.keras.GradientBoostedTreesModel(
        verbose = 0,
        features = [tfdf.keras.FeatureUsage(name = n) for n in input_features],
        exclude_non_specified_features = True,
        random_seed=i,
        honest=True
    )
    model.fit(train_ds)
    sub_predictions = model.predict(serving_ds, verbose=0)[:,0]
    if predictions is None:
        # Copy so the in-place updates below never touch the model's output.
        predictions = sub_predictions.copy()
    else:
        predictions += sub_predictions
    num_predictions += 1

predictions /= num_predictions
kaggle_predictions = pd.DataFrame({
    "PassengerId":serving_df["PassengerId"],
    "Survived":(predictions >= 0.5).astype(int)
})
make_submission(kaggle_predictions)