In [1]:
import numpy as np
import pandas as pd
from sklearn.tree import DecisionTreeRegressor
from numpy.typing import NDArray
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score
import wandb

In [213]:
class GBBC:
    def __init__(self,depth:int,n_estimators:int,lr:float = 0.1):
        self.depth=depth
        self.n_estimators=n_estimators
        self.lr=lr
        self.treeList = []

    def __sigmoid(self,x: NDArray[np.float64])-> NDArray[np.float64]:     
        return 1/(1+np.exp(-x))
    
    def fit(self,X: NDArray[np.float64],y: NDArray[np.float64]):
        log_odds = np.full(y.shape[0], np.log((y == 1).sum()/(y == 0).sum())) #calculate first approximation of prediction
        self.log_odds=log_odds[0] 
        for _ in range(self.n_estimators):
            prob = self.__sigmoid(log_odds) #calculate probability             
            residue =  y - prob #calculate residues(dL/dy^) to train next model on them 

            #train next model of ensemble
            tree = DecisionTreeRegressor(max_depth=3, random_state=0)
            tree.fit(X, residue)
            self.treeList.append(tree)
                    
            leaf_indices = tree.apply(X)    
            unique_leaves = np.unique(leaf_indices)

            #calculating gammas to update predictions
            gammas = np.zeros(unique_leaves.shape[0])
            for i,leaf in enumerate(unique_leaves):
                indeces = np.where(leaf_indices == leaf)[0]
                gammas[i]=np.sum(residue[indeces])/np.sum(prob[indeces]*(1-prob[indeces]))
            
            leaf_to_gamma = {leaf: gamma for leaf, gamma in zip(unique_leaves, gammas)}

            log_odds += self.lr * np.array([leaf_to_gamma[leaf] for leaf in leaf_indices]) # update predictions
            #log_odds+=self.lr*gammas[leaf_indices]
    
    def predict(self, x: NDArray[np.float64]):
            prediction=np.full(x.shape[0], self.log_odds)

            for i in self.treeList:
                prediction += self.lr*i.predict(x)

            return self.__sigmoid(prediction) # transtlate prediction to (0,1)
     

In [170]:
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')
data = pd.concat([train, test], sort=False).reset_index(drop=True)
data.tail()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
1304,1305,,3,"Spector, Mr. Woolf",male,,0,0,A.5. 3236,8.05,,S
1305,1306,,1,"Oliva y Ocana, Dona. Fermina",female,39.0,0,0,PC 17758,108.9,C105,C
1306,1307,,3,"Saether, Mr. Simon Sivertsen",male,38.5,0,0,SOTON/O.Q. 3101262,7.25,,S
1307,1308,,3,"Ware, Mr. Frederick",male,,0,0,359309,8.05,,S
1308,1309,,3,"Peter, Master. Michael J",male,,1,1,2668,22.3583,,C


In [171]:
data['Age'].fillna(data.groupby(['Sex', 'Pclass'])['Age'].transform('median'), inplace=True)
data['Embarked'].fillna(data['Embarked'].mode()[0], inplace=True)
data['Fare'].fillna(data['Fare'].median(), inplace=True)

data['Sex'] = data['Sex'].map({'male': 0, 'female': 1})

data['Title'] = data['Name'].str.extract(' ([A-Za-z]+)\.', expand=False)
data['Title'] = data['Title'].replace(['Lady', 'Countess','Capt', 'Col', 'Don', 'Dr', 
                                       'Major', 'Rev', 'Sir', 'Jonkheer', 'Dona'], 'Rare')
data['Title'] = data['Title'].map({"Master":0, "Miss":1, "Ms":1, "Mme":1, "Mlle":1, 
                                   "Mrs":1, "Mr":2, "Rare":3})
data['Title'].fillna(3, inplace=True)


data['FamilySize'] = data['SibSp'] + data['Parch'] + 1


data['IsAlone'] = 1
data.loc[data['FamilySize'] > 1, 'IsAlone'] = 0


data = pd.get_dummies(data, columns=['Embarked'], drop_first=True)


data.drop(['PassengerId', 'Name', 'Ticket', 'Cabin'], axis=1, inplace=True)

data['Fare'] = np.log1p(data['Fare'])

scaler = StandardScaler()
num_features = ['Age', 'Fare']
data[num_features] = scaler.fit_transform(data[num_features])


train_processed_t = data[:len(train)]
y_train= train_processed_t['Survived']
train_processed_t=train_processed_t.drop('Survived',axis=1)
#train_processed_t['Survived'] = y_train
test_processed_t = data[len(train):]

train_processed_t.tail()

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Title,FamilySize,IsAlone,Embarked_Q,Embarked_S
886,2,0,-0.171147,0,0,-0.351432,3,1,1,0,1
887,1,1,-0.776601,0,0,0.469818,1,1,1,0,1
888,3,1,-0.549555,1,2,0.224602,1,4,0,0,1
889,1,0,-0.246828,0,0,0.469818,2,1,1,0,0
890,3,0,0.207262,0,0,-0.836997,2,1,1,1,0


In [172]:
test_processed_t=test_processed_t.drop('Survived',axis=1)

In [202]:
gb = GBBC(3,1000,lr=0.01)
test_processed_t.head()

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Title,FamilySize,IsAlone,Embarked_Q,Embarked_S
891,3,0,0.396467,0,0,-0.827688,2,1,1,1,0
892,3,1,1.342488,1,0,-0.929576,1,2,0,0,1
893,2,0,2.477715,0,0,-0.630353,2,1,1,1,0
894,3,0,-0.171147,0,0,-0.734514,2,1,1,0,1
895,3,1,-0.549555,1,1,-0.405395,1,3,0,0,1


In [None]:
wandb.init(project="my-project")

In [203]:
gb.fit(train_processed_t.to_numpy(),y_train.to_numpy())

In [204]:
y_pred=gb.predict(test_processed_t.to_numpy())

In [211]:
y_test = pd.read_csv('submission.csv')
y_test = y_test.drop('PassengerId',axis=1)
accuracy = accuracy_score(y_test.astype(int), (np.where(y_pred>=0.44,1,0)).astype(int))

In [212]:
accuracy

0.8803827751196173

In [207]:
print((np.where(y_pred>=0.4,1,0)).astype(int))

[0 0 0 0 1 0 0 0 1 0 0 0 1 0 1 1 0 0 1 1 0 1 1 0 1 0 1 0 1 0 0 0 1 0 1 0 0
 0 0 0 0 0 0 1 1 0 0 0 1 1 0 0 1 1 0 0 0 0 0 1 0 0 0 1 1 1 1 0 0 1 1 0 0 0
 1 0 0 1 0 1 1 0 0 0 0 0 1 0 1 1 1 0 1 0 0 0 1 0 1 0 1 0 0 0 1 0 0 0 0 0 0
 1 1 1 1 0 0 1 1 1 1 0 1 0 0 1 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0
 0 0 1 0 0 1 0 0 1 1 0 1 1 1 1 0 0 1 0 0 1 0 0 0 0 0 0 1 1 0 1 1 0 0 1 0 1
 0 1 0 0 0 0 0 1 0 1 0 1 1 0 1 1 1 0 1 0 0 0 0 1 0 0 0 0 1 0 0 1 0 1 0 1 0
 1 0 1 1 0 1 0 0 0 1 0 0 0 0 0 0 1 1 1 1 1 0 1 0 1 0 1 1 1 0 0 0 0 0 0 0 1
 0 0 0 1 1 0 0 0 0 0 0 0 0 1 1 0 1 0 0 0 0 0 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0
 1 0 0 0 0 0 0 0 1 1 0 1 0 0 0 0 0 0 1 1 0 0 0 0 0 0 0 1 1 0 1 0 0 0 1 0 0
 1 0 0 0 0 0 1 0 0 0 1 1 1 0 1 0 1 1 0 0 0 1 0 1 0 0 1 0 1 1 0 1 0 0 0 1 0
 0 1 0 0 1 1 0 0 0 1 0 0 1 1 0 1 0 0 0 0 0 1 1 0 0 1 0 1 0 0 1 0 1 0 0 0 0
 0 1 1 1 1 1 0 1 0 0 1]


In [208]:
print(y_pred)

[0.30891868 0.34940744 0.3579842  0.34328464 0.41127789 0.31316071
 0.39572898 0.32303822 0.50594791 0.31087192 0.31095003 0.34249181
 0.5288757  0.34719241 0.51047141 0.51777519 0.31840969 0.32936401
 0.43492814 0.41416312 0.36886798 0.41552512 0.5484594  0.39273086
 0.53193265 0.31211437 0.55239667 0.32936401 0.41382749 0.32611706
 0.31444056 0.31836701 0.42064086 0.3862516  0.40185638 0.32891308
 0.39415689 0.39270792 0.31067721 0.39474682 0.29880967 0.39641694
 0.30906465 0.5086086  0.54466285 0.31194916 0.39370868 0.32476802
 0.53479175 0.44166481 0.36972501 0.33894082 0.50001059 0.50301323
 0.35843452 0.32748009 0.31793961 0.31006888 0.31087192 0.55549592
 0.31223572 0.33929292 0.3123669  0.47695576 0.51849333 0.51755424
 0.46188512 0.32950791 0.38633403 0.50546926 0.4568635  0.31141812
 0.3938542  0.39471316 0.55790064 0.36545034 0.31138853 0.50699683
 0.32817512 0.4568635  0.49138753 0.28114183 0.36831365 0.31095003
 0.35964909 0.31575529 0.46517366 0.39775497 0.46239275 0.5280