## **Gradient Boosting with large neural networks**

In [9]:
import numpy as np
import tensorflow as tf
from tensorflow import keras
from keras.models import Sequential
from keras.layers import Dense
from sklearn.base import BaseEstimator, ClassifierMixin
from sklearn.ensemble import AdaBoostClassifier
from sklearn.model_selection import train_test_split
from sklearn.datasets import make_classification
from sklearn.metrics import accuracy_score
import pandas as pd

## **Preprocessing**

In [11]:
df = pd.read_csv("./titanic/train.csv")
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [12]:
y_df = df['Survived']
y = y_df.values
X_df = df[['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked']]
X_df.head()

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,3,male,22.0,1,0,7.25,S
1,1,female,38.0,1,0,71.2833,C
2,3,female,26.0,0,0,7.925,S
3,1,female,35.0,1,0,53.1,S
4,3,male,35.0,0,0,8.05,S


### **Encoding**

In [13]:
from sklearn.preprocessing import LabelEncoder, OneHotEncoder

In [14]:
def one_hot_encode_feature(df, feature):
    if df[feature].isna().any():
        one_hot_encoder = OneHotEncoder(sparse_output=False)
        one_hot_encoded = one_hot_encoder.fit_transform(df[[feature]])[:, :-1] # Get rid of last row
        nan_entries = np.where((one_hot_encoded == [0, 0, 0]).all(axis=1))
        one_hot_encoded[nan_entries] = [np.nan, np.nan, np.nan]
        one_hot_encoded = pd.DataFrame(one_hot_encoded, columns=one_hot_encoder.get_feature_names_out([feature])[:-1])
    else:
        one_hot_encoder = OneHotEncoder(sparse_output=False)
        one_hot_encoded = one_hot_encoder.fit_transform(df[[feature]])
        one_hot_encoded = pd.DataFrame(one_hot_encoded, columns=one_hot_encoder.get_feature_names_out([feature]))
    return pd.concat([df.drop(feature, axis=1), one_hot_encoded], axis=1)

In [15]:
X_df = one_hot_encode_feature(X_df, 'Embarked')
X_df = one_hot_encode_feature(X_df, 'Pclass')

In [16]:
label_enc = LabelEncoder()
X_df['Sex'] = label_enc.fit_transform(X_df['Sex'])

### **Taking care of missing data**

In [17]:
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
imputer = IterativeImputer(max_iter=10, random_state=0)
X_df_imputed = pd.DataFrame(imputer.fit_transform(X_df), columns=X_df.columns)
X_df_imputed.head()

Unnamed: 0,Sex,Age,SibSp,Parch,Fare,Embarked_C,Embarked_Q,Embarked_S,Pclass_1,Pclass_2,Pclass_3
0,1.0,22.0,1.0,0.0,7.25,0.0,0.0,1.0,0.0,0.0,1.0
1,0.0,38.0,1.0,0.0,71.2833,1.0,0.0,0.0,1.0,0.0,0.0
2,0.0,26.0,0.0,0.0,7.925,0.0,0.0,1.0,0.0,0.0,1.0
3,0.0,35.0,1.0,0.0,53.1,0.0,0.0,1.0,1.0,0.0,0.0
4,1.0,35.0,0.0,0.0,8.05,0.0,0.0,1.0,0.0,0.0,1.0


In [18]:
X_df_imputed.isna().any()

Sex           False
Age           False
SibSp         False
Parch         False
Fare          False
Embarked_C    False
Embarked_Q    False
Embarked_S    False
Pclass_1      False
Pclass_2      False
Pclass_3      False
dtype: bool

In [19]:
X = X_df_imputed.values

### **Splitting dataset**

In [20]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

## **Model training**

### **Boosted Neural Network**

In [21]:
X.shape

(891, 11)

In [22]:
# Custom wrapper for Keras model to handle sample weights to work with the sklearn AdaBoost
class KerasSampleWeightWrapper(BaseEstimator, ClassifierMixin):
    def __init__(self, build_fn, epochs=10, batch_size=10, verbose=1):
        self.build_fn = build_fn
        self.epochs = epochs
        self.batch_size = batch_size
        self.verbose = verbose
        self.model_ = None
        self.classes_ = []
        self.n_classes_ = 0

    def fit(self, X, y, sample_weight=None):
        self.classes_ = np.unique(y)
        self.n_classes_ = len(self.classes_)
        self.model_ = self.build_fn()
        sample_weight = sample_weight.reshape(sample_weight.shape[0], -1)[:, 0]
        self.model_.fit(X, y, sample_weight=sample_weight.reshape(-1, 1), epochs=self.epochs, batch_size=self.batch_size, verbose=self.verbose)
        self.model_.evaluate(X, y)
        return self

    def predict(self, X):
        pred_probs = self.model_.predict(X).reshape(-1)
        #return pred_probs
        return (pred_probs > 0.5).astype('int32')

    def predict_proba(self, X):
        pred_probs = self.model_.predict(X, verbose = 0)
        return np.hstack([1 - pred_probs, pred_probs])

In [23]:
def create_keras_model():
    model = Sequential()
    model.add(Dense(30, input_dim=11, activation='relu', kernel_regularizer=keras.regularizers.L2(1e-5)))
    model.add(Dense(15, activation='relu', kernel_regularizer=keras.regularizers.L2(1e-5)))
    model.add(Dense(5, activation='relu', kernel_regularizer=keras.regularizers.L2(1e-5)))
    model.add(Dense(1, activation='linear'))

    model.compile(loss=tf.keras.losses.BinaryCrossentropy(from_logits=True), optimizer=keras.optimizers.Adam(learning_rate=5e-3), metrics=['accuracy'])
    return model

keras_model = KerasSampleWeightWrapper(create_keras_model, verbose= 0, epochs=50, batch_size=10)

In [24]:
ada_model = AdaBoostClassifier(estimator=keras_model, n_estimators=50, learning_rate=1, algorithm='SAMME')

# Train the AdaBoost model
ada_model.fit(X_train, y_train)



In [25]:
y_pred = ada_model.predict(X_train)

accuracy = accuracy_score(y_train, y_pred)
print(f'Train Accuracy: {accuracy * 100:.2f}%')

Train Accuracy: 80.90%


In [26]:
y_pred = ada_model.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy * 100:.2f}%')

Accuracy: 79.89%


### **XgBoost**

In [27]:
from xgboost import XGBClassifier

In [63]:
clf = XGBClassifier(n_estimators=500, max_depth=10, learning_rate=0.01, objective='binary:logistic', )
clf.fit(X_train, y_train)

In [64]:
clf.predict(X_test[0:5])

array([0, 0, 0, 1, 1])

In [65]:
print(clf.score(X_train, y_train))
print(clf.score(X_test, y_test))

0.9325842696629213
0.8156424581005587


## **XGBoost Wins! Way too efficient**