In [1]:
from sklearn.datasets import load_iris
from sklearn.ensemble import RandomForestClassifier
import pandas as pd
import numpy as np
import math
from decision_tree import MyDecisionTreeClassifier

In [2]:
### Load the data
titanic = pd.read_csv('./data/titanic/train.csv')
titanic_test = pd.read_csv('./data/titanic/test.csv')

titanic_Y_df=titanic.loc[:, titanic.columns.isin(['Survived'])]
titanic_X_df=titanic.loc[:, ~titanic.columns.isin(['Survived',"Name","Ticket","Cabin","PassengerId","Embarked"])]
titanic_X_test_df=titanic_test.loc[:, ~titanic_test.columns.isin(['Survived',"Name","Ticket","Cabin",
                                                                  "PassengerId","Embarked"])]

def one_hot_encoding(df, col):
    new_col = pd.get_dummies(df[col], prefix=col)
    df = pd.concat([df,new_col], axis=1).drop([col], axis=1)
    return df

def fill_na_mean(df, col):
    df[col].fillna(df[col].mean(), inplace=True)

titanic_X_df = one_hot_encoding(titanic_X_df,"Sex")
titanic_X_df = one_hot_encoding(titanic_X_df,"Pclass")
for col in titanic_X_df:
    fill_na_mean(titanic_X_df, col)

titanic_X_test_df = one_hot_encoding(titanic_X_test_df,"Sex")
titanic_X_test_df = one_hot_encoding(titanic_X_test_df,"Pclass")
for col in titanic_X_test_df:
    fill_na_mean(titanic_X_test_df, col)
    
titanic_X=titanic_X_df.to_numpy()
titanic_X_test = titanic_X_test_df.to_numpy()
titanic_Y=titanic_Y_df.to_numpy().reshape(len(titanic_Y_df))

In [3]:
from utilities import bootstrap

class MyRandomForestClassifier:
    def __init__(self, max_depth=5, min_samples_leaf=2, n_estimator=10, criterion="gini",
                 min_impurity=0.00001):
        # Root node in dec. tree
        self.forest = [] 
        # Minimum n of samples to justify split
        self.min_samples_split = min_samples_leaf
        # The minimum impurity to justify split
        self.min_impurity = min_impurity
        # The maximum depth to grow the tree to
        self.max_depth = max_depth
        # number of trees in the random forest
        self.n_estimator = n_estimator
        
        self.max_features = None

        
    def fit(self, X, y):
        n_features = X.shape[1]
        self.max_features = int(math.sqrt(n_features))
        
        for i in range(self.n_estimator):
            X_sub, y_sub = bootstrap(X,y)
            col_index = np.random.choice(n_features, self.max_features, replace=True)
            tree = MyDecisionTreeClassifier(max_depth=5, min_samples_split=2)
            tree.fit(X_sub[:,col_index], y_sub)
            tree.feature_idx = col_index
            self.forest.append(tree)
            
    def predict(self, X):
        forest_predictions = np.ones((X.shape[0],self.n_estimator))
        for idx, tree in enumerate(self.forest):
            predictions = tree.predict(X[:, tree.feature_idx])
            forest_predictions[:, idx] = predictions
            
        vote_prediction = []
        for row in forest_predictions:
            vote = np.bincount(row.astype("int")).argmax()
            vote_prediction.append(vote)
            
        return np.array(vote_prediction)
            

In [4]:
my_rf_clf = MyRandomForestClassifier(max_depth=3, min_samples_leaf=2, n_estimator=100)
my_rf_clf.fit(titanic_X, titanic_Y)
my_prediction = my_rf_clf.predict(titanic_X_test)

In [5]:
###################### sklearn testing ####################################
rf_clf = RandomForestClassifier(max_depth=3, min_samples_leaf=2, n_estimators=100)
rf_clf.fit(titanic_X, titanic_Y)
prediction = rf_clf.predict(titanic_X_test)

In [7]:
sum([my_prediction[i]==prediction[i] for i in range(len(prediction))])/len(prediction)

0.8732057416267942