In [188]:
import tensorflow as tf
import os
import pandas as pd
import matplotlib.pyplot as plt

In [189]:
# Load the labelled training table from the local "data" directory.
train_csv_path = os.path.join("data", "train.csv")
data = pd.read_csv(train_csv_path)

In [190]:
# Remove the columns that are not used as model features.
# Rebinding `data` (instead of inplace=True) keeps the step readable as
# "input frame -> output frame".
unused_columns = ["PassengerId", "Name", "Ticket", "Cabin", "Embarked"]
data = data.drop(columns=unused_columns)

In [191]:
# Inspect dtypes and non-null counts of the remaining columns; in the recorded
# output below, Age is the only column with missing values.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Survived  891 non-null    int64  
 1   Pclass    891 non-null    int64  
 2   Sex       891 non-null    object 
 3   Age       714 non-null    float64
 4   SibSp     891 non-null    int64  
 5   Parch     891 non-null    int64  
 6   Fare      891 non-null    float64
dtypes: float64(2), int64(4), object(1)
memory usage: 48.9+ KB


In [192]:
# Merge the Parch and SibSp counts into a single "Family" feature and discard
# the two source columns in the same step.
merged_columns = ["Parch", "SibSp"]
data = data.assign(Family=data["Parch"] + data["SibSp"]).drop(columns=merged_columns)

In [193]:
# Preview the first rows to confirm Parch/SibSp were replaced by Family.
data.head()

Unnamed: 0,Survived,Pclass,Sex,Age,Fare,Family
0,0,3,male,22.0,7.25,1
1,1,1,female,38.0,71.2833,1
2,1,3,female,26.0,7.925,0
3,1,1,female,35.0,53.1,1
4,0,3,male,35.0,8.05,0


In [194]:
# Fill the missing ages with the median of the observed training ages.
train_age_median = data["Age"].median()
data["Age"] = data["Age"].fillna(train_age_median)

In [195]:
# Separate the target column from the predictors.
label_column = "Survived"
y_data = data[label_column]
X_data = data.drop(columns=[label_column])

In [196]:
# Check the feature frame: in the recorded output every column is fully
# populated and Sex is still a string (object) column at this point.
X_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 5 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   Pclass  891 non-null    int64  
 1   Sex     891 non-null    object 
 2   Age     891 non-null    float64
 3   Fare    891 non-null    float64
 4   Family  891 non-null    int64  
dtypes: float64(2), int64(2), object(1)
memory usage: 34.9+ KB


In [197]:
# Encode Sex numerically so the whole feature frame can become one numeric array.
# Caution: Series.map yields NaN for values missing from the mapping, so
# re-running this cell on the already-encoded 0/1 column turns it into NaN.
sex_codes = {"male": 0, "female": 1}
X_data["Sex"] = X_data["Sex"].map(sex_codes)

In [198]:
# Wrap the features and labels as (features, label) pairs in batches of 32.
# The frame is passed as a plain array, so features are identified by column
# position only; the column names are not carried into the dataset.
feature_matrix = X_data.values
label_vector = y_data.values
dataset = tf.data.Dataset.from_tensor_slices((feature_matrix, label_vector)).batch(32)

In [199]:
# Number of batches, not rows: the dataset was batched with size 32 above.
len(dataset)

28

In [200]:
# Split sizes are expressed in batches (roughly 70% / 10% / 20%).
# The extra batch added to the validation and test sizes compensates for int()
# rounding down; in the recorded run the sizes add up to all 28 batches.
num_batches = len(dataset)
train_size = int(num_batches * 0.7)
val_size = 1 + int(num_batches * 0.1)
test_size = 1 + int(num_batches * 0.2)
train_size, val_size, test_size

(19, 3, 6)

In [201]:
# Slice the batched dataset sequentially: first the training batches, then the
# validation batches, then the test batches. No shuffling is applied, so the
# split follows the row order of the CSV.
# NOTE(review): test_data is not referenced by any later cell shown here.
after_train = dataset.skip(train_size)
train_data = dataset.take(train_size)
val_data = after_train.take(val_size)
test_data = after_train.skip(val_size).take(test_size)

In [202]:
from tensorflow_decision_forests.keras import GradientBoostedTreesModel
import tensorflow_decision_forests as tfdf
from tensorflow.keras.optimizers import Adam

In [203]:
# Gradient boosted trees with the library's default hyper-parameters.
model = GradientBoostedTreesModel()
# The trees are trained by the Yggdrasil learner (see the training log), not by
# gradient descent, so a Keras optimizer has no effect here and is not passed.
# compile() only attaches the Keras loss and metric that fit() reports for the
# validation data (val_loss / val_accuracy in `history.history`).
model.compile(loss=tf.keras.losses.BinaryCrossentropy(), metrics=["accuracy"])

Use /tmp/tmpxxjxqunb as temporary training directory


W0000 00:00:1753172251.091502  168958 gradient_boosted_trees.cc:1873] "goss_alpha" set but "sampling_method" not equal to "GOSS".
W0000 00:00:1753172251.091574  168958 gradient_boosted_trees.cc:1883] "goss_beta" set but "sampling_method" not equal to "GOSS".
W0000 00:00:1753172251.091577  168958 gradient_boosted_trees.cc:1897] "selective_gradient_boosting_ratio" set but "sampling_method" not equal to "SELGB".


In [204]:
# Train on the training batches. The validation batches are read by the learner
# too, and the recorded log shows it stopping early on validation loss.
history = model.fit(train_data, validation_data=val_data)

Reading training dataset...
Training dataset read in 0:00:00.152006. Found 608 examples.
Reading validation dataset...
Num validation examples: tf.Tensor(96, shape=(), dtype=int32)
Validation dataset read in 0:00:00.103305. Found 96 examples.
Training model...


I0000 00:00:1753172251.368351  168958 kernel.cc:782] Start Yggdrasil model training
I0000 00:00:1753172251.368397  168958 kernel.cc:783] Collect training examples
I0000 00:00:1753172251.368406  168958 kernel.cc:795] Dataspec guide:
column_guides {
  column_name_pattern: "^__LABEL$"
  type: CATEGORICAL
  categorial {
    min_vocab_frequency: 0
    max_vocab_count: -1
  }
}
default_column_guide {
  categorial {
    max_vocab_count: 2000
  }
  discretized_numerical {
    maximum_num_bins: 255
  }
}
ignore_columns_without_guides: false
detect_numerical_as_discretized_numerical: false

I0000 00:00:1753172251.368495  168958 kernel.cc:401] Number of batches: 19
I0000 00:00:1753172251.368503  168958 kernel.cc:402] Number of examples: 608
I0000 00:00:1753172251.368533  168958 kernel.cc:802] Training dataset:
Number of records: 608
Number of columns: 6

Number of columns by type:
	NUMERICAL: 5 (83.3333%)
	CATEGORICAL: 1 (16.6667%)

Columns:

NUMERICAL: 5 (83.3333%)
	1: "data:0.0" NUMERICAL mean:

Model trained in 0:00:00.318190
Compiling model...
Model compiled.


I0000 00:00:1753172251.671823  181782 early_stopping.cc:54] Early stop of the training because the validation loss does not decrease anymore. Best valid-loss: 0.865071
I0000 00:00:1753172251.671938  181782 gradient_boosted_trees.cc:1669] Create final snapshot of the model at iteration 76
I0000 00:00:1753172251.674075  181782 gradient_boosted_trees.cc:279] Truncates the model to 47 tree(s) i.e. 47  iteration(s).
I0000 00:00:1753172251.674193  181782 gradient_boosted_trees.cc:341] Final model num-trees:47 valid-loss:0.865071 valid-accuracy:0.822917
I0000 00:00:1753172251.674601  181782 kernel.cc:926] Export model in log directory: /tmp/tmpxxjxqunb with prefix 3ccecdcbf8a24878
I0000 00:00:1753172251.676042  181782 kernel.cc:944] Save model in resources
I0000 00:00:1753172251.677544  168958 abstract_model.cc:921] Model self evaluation:
Task: CLASSIFICATION
Label: __LABEL
Loss (BINOMIAL_LOG_LIKELIHOOD): 0.865071

Accuracy: 0.822917  CI95[W][0 1]
ErrorRate: : 0.177083


Confusion Table:
trut

In [205]:
# Keras metrics recorded by fit(); the recorded run holds a single entry each
# for val_loss and val_accuracy.
history.history

{'val_loss': [0.4325355291366577], 'val_accuracy': [0.8229166865348816]}

In [206]:
# Load the unlabelled test table that the submission is generated from.
test_csv_path = os.path.join("data", "test.csv")
test_dataset = pd.read_csv(test_csv_path)

In [207]:
# Drop the same non-feature columns as for the training frame.
# PassengerId is removed here too; the notebook re-reads the CSV later to
# recover it for the submission file.
unused_test_columns = ["PassengerId", "Name", "Ticket", "Cabin", "Embarked"]
test_dataset = test_dataset.drop(columns=unused_test_columns)

In [208]:
# Build the same "Family" feature as in training (Parch + SibSp) and drop the
# two source columns.
merged_test_columns = ["Parch", "SibSp"]
test_dataset = test_dataset.assign(
    Family=test_dataset["Parch"] + test_dataset["SibSp"]
).drop(columns=merged_test_columns)

In [209]:
# Impute missing test ages with the TRAINING median, so the test features go
# through exactly the same transformation the model was fitted on (statistics
# used for preprocessing should come from the training data only).
# `data["Age"]` was already median-filled in an earlier cell; filling with the
# median leaves the median unchanged, so this is still the training median.
train_age_median = data["Age"].median()
test_dataset["Age"] = test_dataset["Age"].fillna(train_age_median)

In [210]:
# Apply the same numeric Sex encoding that was used for the training features.
# Caution: Series.map yields NaN for values missing from the mapping, so
# re-running this cell on the already-encoded 0/1 column turns it into NaN.
test_sex_codes = {"male": 0, "female": 1}
test_dataset["Sex"] = test_dataset["Sex"].map(test_sex_codes)

In [211]:
# Preview the prepared test features; the columns should match X_data in both
# name and order, because the model identifies features by position.
test_dataset.head()

Unnamed: 0,Pclass,Sex,Age,Fare,Family
0,3,0,34.5,7.8292,0
1,3,1,47.0,7.0,1
2,2,0,62.0,9.6875,0
3,3,0,27.0,8.6625,0
4,3,1,22.0,12.2875,2


In [212]:
# The model was fitted on positional arrays (X_data.values), so the columns of
# test_dataset must appear in exactly the same order as in X_data.
# NOTE(review): test_dataset is passed as a DataFrame while training used a
# batched tf.data.Dataset built from an array — confirm Keras converts it the
# same way.
predictions = model.predict(test_dataset)



In [214]:
# Convert the model outputs into hard 0/1 labels by thresholding at 0.5.
# (The bare `predictions` statement that used to precede this line was a no-op:
# a notebook cell only displays its last expression.)
decision_threshold = 0.5
predictions_binary = (predictions > decision_threshold).astype(int)

In [215]:
# Re-read the raw test CSV to recover PassengerId, which was dropped from
# test_dataset during preprocessing.
raw_test_csv_path = os.path.join("data", "test.csv")
test_dataset_copy = pd.read_csv(raw_test_csv_path)

In [216]:
# Pair each PassengerId with its predicted label; predictions_binary is
# flattened to one dimension so it can serve as a single column.
submission_columns = {
    "PassengerId": test_dataset_copy["PassengerId"],
    "Survived": predictions_binary.flatten(),
}
submission = pd.DataFrame(submission_columns)

In [217]:
# Write the submission file without the DataFrame index column.
submission_path = os.path.join("data", "submission2.csv")
submission.to_csv(submission_path, index=False)