In [1]:
import numpy as np
from numpy.typing import NDArray
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score

In [2]:
class Node:
    def __init__(self,value: np.float64,feature: int,answer: int = -1):   #<value
        self.value=value
        self.feature=feature
        self.answer=answer
        self.left=None
        self.right=None


class DecisionTree:
    def __init__ (self,depth:int):
        self.depth=depth
        self.node = Node(np.nan,-1,-1)

    def GiniCriteria(self, labels:NDArray[np.int32]):
        uniqueValues, counts = np.unique(labels, return_counts=True)
        return 1-np.sum((counts/labels.shape[0])**2)
    
    def lossFunction(self, fPartData:NDArray[np.float64],sPartData:NDArray[np.float64]):
        numberOfExamples = fPartData.shape[0]+sPartData.shape[0]
        return fPartData.shape[0]/numberOfExamples*self.GiniCriteria(fPartData[:,-1])+\
                    sPartData.shape[0]/numberOfExamples*self.GiniCriteria(sPartData[:,-1])

    def __fitRecursion(self,node:Node,data: NDArray[np.float64],depth):
        if depth == 0 or np.all(data[:,-1] == data[0,-1]):
            labels, counts = np.unique(data[:,-1], return_counts=True) 
            node.value = np.nan
            node.feature = -1
            node.answer = labels[np.argmax(counts)]
            return
            
        first=True
        minLoss=np.inf
        bestValue=0.0
        bestFeature=0

        chFeatures = np.random.choice(range(data.shape[1]-1), size=(data.shape[1]-1)//2, replace=False)#change 2-nd attr
        for feature in chFeatures:
            fragmentationValues = np.linspace(np.min(data[:,feature]),np.max(data[:,feature]) , num=50, dtype=np.float64)
            for value in fragmentationValues:
                fPartData = data[data[:,feature]<value]
                sPartData = data[data[:,feature]>=value]
                loss=self.lossFunction(fPartData,sPartData)
                if first:
                    minLoss=loss
                    first=False
                    bestValue,bestFeature=value,feature
                elif loss<minLoss:
                    minLoss = loss
                    bestValue,bestFeature=value,feature

        node.value = bestValue
        node.feature = bestFeature
        node.answer = np.nan
        node.left = Node(np.nan, -1, -1)
        node.right = Node(np.nan, -1, -1)
        self.__fitRecursion(node.left,data[data[:,node.feature]<node.value],depth-1)
        self.__fitRecursion(node.right,data[data[:,node.feature]>=node.value],depth-1)
    
    def fit(self,data: NDArray[np.float64]):
       self.__fitRecursion(self.node,data,self.depth)


    def predict(self,X:NDArray[np.float64])->int:
        temp = self.node.answer
        tempNode=self.node
        while np.isnan(temp):
            if X[tempNode.feature]<tempNode.value:
               tempNode = tempNode.left
            else:
               tempNode = tempNode.right
            temp = tempNode.answer
        return temp

In [3]:
class RandomForest:
    def __init__(self,modelsNumber:int,depth:int):
        self.modelsNumber=modelsNumber
        self.depth=depth
        self.modelList = [DecisionTree(self.depth) for _ in range(self.modelsNumber)]

    def fit(self,X:NDArray[np.float64]):
        for i in range(self.modelsNumber):
            self.modelList[i].fit(X[np.random.choice(X.shape[0], size=X.shape[0], replace=True)])

    def predict(self,X:NDArray[np.float64]):
        predicts=[]
        for i in range(self.modelsNumber):
            predicts.append(self.modelList[i].predict(X))
        values, counts = np.unique(np.array(predicts), return_counts=True)
        return values[np.argmax(counts)]
            

In [19]:
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')
data = pd.concat([train, test], sort=False).reset_index(drop=True)
data.tail()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
1304,1305,,3,"Spector, Mr. Woolf",male,,0,0,A.5. 3236,8.05,,S
1305,1306,,1,"Oliva y Ocana, Dona. Fermina",female,39.0,0,0,PC 17758,108.9,C105,C
1306,1307,,3,"Saether, Mr. Simon Sivertsen",male,38.5,0,0,SOTON/O.Q. 3101262,7.25,,S
1307,1308,,3,"Ware, Mr. Frederick",male,,0,0,359309,8.05,,S
1308,1309,,3,"Peter, Master. Michael J",male,,1,1,2668,22.3583,,C


In [20]:
data['Age'].fillna(data.groupby(['Sex', 'Pclass'])['Age'].transform('median'), inplace=True)
data['Embarked'].fillna(data['Embarked'].mode()[0], inplace=True)
data['Fare'].fillna(data['Fare'].median(), inplace=True)

data['Sex'] = data['Sex'].map({'male': 0, 'female': 1})

data['Title'] = data['Name'].str.extract(' ([A-Za-z]+)\.', expand=False)
data['Title'] = data['Title'].replace(['Lady', 'Countess','Capt', 'Col', 'Don', 'Dr', 
                                       'Major', 'Rev', 'Sir', 'Jonkheer', 'Dona'], 'Rare')
data['Title'] = data['Title'].map({"Master":0, "Miss":1, "Ms":1, "Mme":1, "Mlle":1, 
                                   "Mrs":1, "Mr":2, "Rare":3})
data['Title'].fillna(3, inplace=True)


data['FamilySize'] = data['SibSp'] + data['Parch'] + 1


data['IsAlone'] = 1
data.loc[data['FamilySize'] > 1, 'IsAlone'] = 0


data = pd.get_dummies(data, columns=['Embarked'], drop_first=True)


data.drop(['PassengerId', 'Name', 'Ticket', 'Cabin'], axis=1, inplace=True)

data['Fare'] = np.log1p(data['Fare'])

scaler = StandardScaler()
num_features = ['Age', 'Fare']
data[num_features] = scaler.fit_transform(data[num_features])


train_processed_t = data[:len(train)]
y_train= train_processed_t['Survived']
train_processed_t=train_processed_t.drop('Survived',axis=1)
train_processed_t['Survived'] = y_train
test_processed_t = data[len(train):]

train_processed_t.tail()

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Title,FamilySize,IsAlone,Embarked_Q,Embarked_S,Survived
886,2,0,-0.171147,0,0,-0.351432,3,1,1,0,1,0.0
887,1,1,-0.776601,0,0,0.469818,1,1,1,0,1,1.0
888,3,1,-0.549555,1,2,0.224602,1,4,0,0,1,0.0
889,1,0,-0.246828,0,0,0.469818,2,1,1,0,0,1.0
890,3,0,0.207262,0,0,-0.836997,2,1,1,1,0,0.0


In [21]:
test_processed_t.tail()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Title,FamilySize,IsAlone,Embarked_Q,Embarked_S
1304,,3,0,-0.32251,0,0,-0.80217,2,1,1,0,1
1305,,1,1,0.737034,0,0,1.777305,3,1,1,0,0
1306,,3,0,0.699194,0,0,-0.897786,2,1,1,0,1
1307,,3,0,-0.32251,0,0,-0.80217,2,1,1,0,1
1308,,3,0,-0.32251,1,1,0.177412,0,3,0,0,0


In [22]:
test_processed_t=test_processed_t.drop('Survived',axis=1)


In [63]:
rf = RandomForest(30,3)

In [64]:
rf.fit(train_processed_t.to_numpy())

In [65]:
y_pred=np.array([rf.predict(X) for X in (test_processed_t.to_numpy())])

In [66]:
y_test = pd.read_csv('submission.csv')
y_test = y_test.drop('PassengerId',axis=1)
accuracy = accuracy_score(y_test, y_pred)

In [67]:
accuracy

0.9545454545454546