### Importing the libraries needed for this project

In [55]:
# libraries 
import pandas as pd
import numpy as np
import matplotlib
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, Input ,BatchNormalization,Conv1D,MaxPooling1D ,Flatten
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.metrics import AUC
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau
import tensorflow as tf
import tensorflow_decision_forests as tfdf

### Reading the Train and Test CSV files

In [56]:
# Load the training and test CSV files from the working directory.
train_df, test_df = (pd.read_csv(csv_path) for csv_path in ('train.csv', 'test.csv'))

In [57]:
# Preview the first rows of the training set to check that the columns loaded as expected
train_df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


### Stripping punctuation (`.,()[]` and quotes) from the Name column and deriving two new columns from the Ticket column

In [58]:
def prepering_data(df):
    """Return a cleaned copy of ``df`` with ticket-derived columns added.

    The input frame is left untouched. In the returned frame:

    * ``Name`` has the characters ``, ( ) [ ] . " '`` stripped from both ends
      of every space-separated word.
    * ``Ticket_number`` holds the last space-separated piece of ``Ticket``.
    * ``Ticket_item`` holds everything before that last piece, with spaces
      replaced by underscores, or ``"NONE"`` when the ticket contains no space.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain string-valued ``Name`` and ``Ticket`` columns.

    Returns
    -------
    pandas.DataFrame
        A new frame; the two new columns are appended after the existing ones.
    """
    edge_chars = ",()[].\"'"

    def normalize_name(full_name):
        # Strip punctuation from the edges of each word, then re-join with single spaces.
        words = full_name.split(" ")
        return " ".join(word.strip(edge_chars) for word in words)

    def ticket_number(ticket):
        # Everything after the last space (the whole string when there is no space).
        return ticket.rsplit(" ", 1)[-1]

    def ticket_lable(ticket):
        # rpartition yields an empty separator when the ticket has no space at all.
        prefix, separator, _ = ticket.rpartition(" ")
        if not separator:
            return "NONE"
        return prefix.replace(" ", "_")

    # assign() builds a new frame, so the caller's DataFrame is never modified.
    return df.assign(
        Name=df["Name"].apply(normalize_name),
        Ticket_number=df["Ticket"].apply(ticket_number),
        Ticket_item=df["Ticket"].apply(ticket_lable),
    )

# Run the same name/ticket clean-up on both splits so they end up with matching derived columns.
preprocessed_train_df, preprocessed_test_df = (
    prepering_data(split_df) for split_df in (train_df, test_df)
)

In [59]:
# Check the cleaned Name column and the two new columns (Ticket_number and Ticket_item)
preprocessed_train_df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Ticket_number,Ticket_item
0,1,0,3,Braund Mr Owen Harris,male,22.0,1,0,A/5 21171,7.25,,S,21171,A/5
1,2,1,1,Cumings Mrs John Bradley Florence Briggs Thayer,female,38.0,1,0,PC 17599,71.2833,C85,C,17599,PC
2,3,1,3,Heikkinen Miss Laina,female,26.0,0,0,STON/O2. 3101282,7.925,,S,3101282,STON/O2.
3,4,1,1,Futrelle Mrs Jacques Heath Lily May Peel,female,35.0,1,0,113803,53.1,C123,S,113803,NONE
4,5,0,3,Allen Mr William Henry,male,35.0,0,0,373450,8.05,,S,373450,NONE


### Dropping PassengerId and Ticket, since they do not give us any helpful information, and building the feature list for our model

In [None]:
# The list holds references to the two preprocessed frames, so the in-place
# drop below changes the same objects that later cells read.
combine = [preprocessed_train_df, preprocessed_test_df]

# Keep the target column as its own Series.
Y = preprocessed_train_df['Survived']

for dataset in combine:
    # errors='ignore' makes this cell safe to re-run: on a second execution the
    # columns are already gone and a plain drop would raise KeyError.
    dataset.drop(columns=['PassengerId', 'Ticket'], inplace=True, errors='ignore')

# Every remaining training column except the label is a model input feature.
features = [col for col in preprocessed_train_df.columns if col != 'Survived']

In [61]:
# Show the feature column names that will be passed to the model
print(features)

['Pclass', 'Name', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Cabin', 'Embarked', 'Ticket_number', 'Ticket_item']


In [62]:
# Dataset map function: turns the 'Name' feature into a list of word tokens
def tokenize_names(features, labels=None):
    """Replace the ``"Name"`` feature with its whitespace-separated tokens.

    Example: ``"Braund Mr Owen Harris"`` becomes
    ``["Braund", "Mr", "Owen", "Harris"]``.

    Args:
        features: Mapping from feature name to value; must contain ``"Name"``.
            The mapping is updated in place and also returned.
        labels: Optional labels, passed through unchanged.

    Returns:
        A ``(features, labels)`` tuple.
    """
    name_tokens = tf.strings.split(features["Name"])
    features["Name"] = name_tokens
    return features, labels

# Turn the preprocessed pandas frames into TensorFlow datasets and tokenize
# the names in each. The training set is built with "Survived" as its label;
# the serving set is built without a label.
train_ds = tfdf.keras.pd_dataframe_to_tf_dataset(
    preprocessed_train_df,
    label="Survived",
).map(tokenize_names)
serving_ds = tfdf.keras.pd_dataframe_to_tf_dataset(
    preprocessed_test_df,
).map(tokenize_names)

## Model 

In [64]:
# Number of independently seeded models whose predictions are averaged.
# Lower this for a quick run; every model is trained from scratch.
NUM_MODELS = 200

# Running sum of the per-model predictions and the number of models in that sum
predictions = None
num_predictions = 0

# Train the same model configuration once per seed and average the outputs.
# Only the random seed differs between runs, which smooths out the variance
# that a single seed would introduce into the final prediction.
for i in range(NUM_MODELS):
    model = tfdf.keras.GradientBoostedTreesModel(
        verbose=0,      # suppress training logs
        features=[tfdf.keras.FeatureUsage(name=n) for n in features],
        exclude_non_specified_features=True,    # use only the columns listed in `features`
        random_seed=i,      # a different seed for every run
        # Honest trees use different training examples to infer the tree
        # structure and the leaf values; this is a regularization option.
        honest=True,
    )
    model.fit(train_ds)

    # Predict on the serving set; the output is 2-D and column 0 is kept
    # (presumably the predicted survival probability - confirm against the TF-DF docs)
    sub_predictions = model.predict(serving_ds, verbose=0)[:, 0]

    # Accumulate the sum across runs
    if predictions is None:
        predictions = sub_predictions
    else:
        predictions += sub_predictions
    num_predictions += 1

# Turn the sum into a mean over all trained models
predictions /= num_predictions

# Prepare the final Kaggle submission frame: threshold the averaged score at 0.5.
# PassengerId comes from the original test frame because it was dropped from the
# preprocessed copy; this assumes the serving dataset keeps the test frame's row order.
kaggle_predictions = pd.DataFrame({
        "PassengerId": test_df["PassengerId"],
        "Survived": (predictions >= 0.5).astype(int)
    })

W0000 00:00:1743111904.450256 18509509 gradient_boosted_trees.cc:1873] "goss_alpha" set but "sampling_method" not equal to "GOSS".
W0000 00:00:1743111904.450284 18509509 gradient_boosted_trees.cc:1883] "goss_beta" set but "sampling_method" not equal to "GOSS".
W0000 00:00:1743111904.450286 18509509 gradient_boosted_trees.cc:1897] "selective_gradient_boosting_ratio" set but "sampling_method" not equal to "SELGB".
I0000 00:00:1743111904.640663 18509509 kernel.cc:782] Start Yggdrasil model training
I0000 00:00:1743111904.640682 18509509 kernel.cc:783] Collect training examples
I0000 00:00:1743111904.640686 18509509 kernel.cc:795] Dataspec guide:
column_guides {
  column_name_pattern: "^__LABEL$"
  type: CATEGORICAL
  categorial {
    min_vocab_frequency: 0
    max_vocab_count: -1
  }
}
column_guides {
  column_name_pattern: "^Pclass$"
}
column_guides {
  column_name_pattern: "^Name$"
}
column_guides {
  column_name_pattern: "^Sex$"
}
column_guides {
  column_name_pattern: "^Age$"
}
column

In [66]:
# Write the submission file; index=False keeps the DataFrame index out of the CSV
kaggle_predictions.to_csv('submission.csv', index=False)