# Importing the libraries

In [None]:
import numpy as np

from math import log2, sqrt
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.tree import DecisionTreeClassifier
from sklearn import metrics
from sklearn import tree
from sklearn import preprocessing
import matplotlib.pyplot as plt
import six
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import export_graphviz
from six import StringIO  
from IPython.display import Image  
import pydotplus
from tqdm.notebook import tqdm_notebook as tqdm
from sklearn.model_selection import GridSearchCV

# Implementation from "Decision Trees.ipynb"
from decision_tree import MyDecisionTree

# Exploring the Dataset

In [None]:
# Column labels for the header-less CSV, listed in file order.
columns_names = [
    'age', 'workclass', 'fnlwg', 'education', 'education_num',
    'marital_status', 'occupation', 'relationship', 'race', 'sex',
    'capital_gain', 'capital_loss', 'hours_per_week', 'country', 'target',
]
# Read the training split and discard the 'fnlwg' column in one pass.
df_train = pd.read_csv(
    "adult_data.csv", header=None, names=columns_names
).drop(columns="fnlwg")
# Preview the first rows of the loaded frame.
df_train.head()

Unnamed: 0,age,workclass,education,education_num,marital_status,occupation,relationship,race,sex,capital_gain,capital_loss,hours_per_week,country,target
0,39,State-gov,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


In [None]:
# Column labels for the header-less CSV (same layout as the training file).
columns_names = [
    'age', 'workclass', 'fnlwg', 'education', 'education_num',
    'marital_status', 'occupation', 'relationship', 'race', 'sex',
    'capital_gain', 'capital_loss', 'hours_per_week', 'country', 'target',
]
# Read the test split, discard 'fnlwg', and skip the first row
# (presumably a non-data line at the top of the file -- TODO confirm).
df_test = (
    pd.read_csv("adult_test.csv", header=None, names=columns_names)
    .drop(columns="fnlwg")
    .iloc[1:, :]
)
# Convert 'age' to a numeric dtype.
df_test['age'] = pd.to_numeric(df_test['age'])
# Preview the first rows of the loaded frame.
df_test.head()

Unnamed: 0,age,workclass,education,education_num,marital_status,occupation,relationship,race,sex,capital_gain,capital_loss,hours_per_week,country,target
1,25,Private,11th,7.0,Never-married,Machine-op-inspct,Own-child,Black,Male,0.0,0.0,40.0,United-States,<=50K.
2,38,Private,HS-grad,9.0,Married-civ-spouse,Farming-fishing,Husband,White,Male,0.0,0.0,50.0,United-States,<=50K.
3,28,Local-gov,Assoc-acdm,12.0,Married-civ-spouse,Protective-serv,Husband,White,Male,0.0,0.0,40.0,United-States,>50K.
4,44,Private,Some-college,10.0,Married-civ-spouse,Machine-op-inspct,Husband,Black,Male,7688.0,0.0,40.0,United-States,>50K.
5,18,?,Some-college,10.0,Never-married,?,Own-child,White,Female,0.0,0.0,30.0,United-States,<=50K.


In [None]:
# Features are every column except the last one; the last column is the label.
X_train = df_train.iloc[:, :-1]
X_test = df_test.iloc[:, :-1]

# The test labels are displayed with a trailing '.' ("<=50K.") while the
# training labels are not, so normalise the label text (surrounding whitespace
# and trailing period) before encoding.
y_train = df_train.iloc[:, -1].str.strip().str.rstrip(".")
y_test = df_test.iloc[:, -1].str.strip().str.rstrip(".")

# Fit the encoder on the training labels only and reuse it for the test
# labels.  Fitting a second encoder on the test labels would only agree with
# the training mapping by coincidence of sort order; `transform` instead
# raises if the test split contains a label the training split never had.
label_enc = LabelEncoder()
y_train = label_enc.fit_transform(y_train)
y_test = label_enc.transform(y_test)

# Random Forest Implementation

In [None]:
class RandomForest(object):
    """
    Bagged ensemble of MyDecisionTree classifiers.

    Every tree is trained on a bootstrap sample of the rows (drawn with
    replacement) restricted to a random subset of the feature columns (drawn
    without replacement).  The rows a tree did not see are remembered so that
    OOB_score can evaluate that tree on them.
    """

    def __init__(self, n_estimators=50, max_depth=None, max_features=0.7):
        """
        - INPUT :
            n_estimators: number of trees in the forest
            max_depth: depth limit forwarded to every MyDecisionTree
            max_features: fraction of the feature columns handed to each tree
        """
        self.n_estimators = n_estimators
        self.max_depth = max_depth
        self.max_features = max_features
        # Per-tree bookkeeping, (re)filled by bootstrapping():
        # row indices of each tree's bootstrap sample
        self.bootstraps_row_indices = []
        # column indices of each tree's feature subset
        self.feature_indices = []
        # set of row indices each tree did NOT train on
        self.out_of_bag = []
        # The forest itself: one untrained tree per estimator
        self.decision_trees = [
            MyDecisionTree(max_depth=max_depth) for _ in range(n_estimators)
        ]

    def _bootstrapping(self, num_training, num_features):
        """
        Draw the row and column indices for a single tree.

        - INPUT :
            num_training: number of rows in the dataset (also the sample size)
            num_features: total number of feature columns in the dataset

        - OUTPUT :
            row_idx: num_training row indices drawn WITH replacement
            col_idx: column indices drawn WITHOUT replacement; their count is
                     int(num_features * max_features), but never less than one
        """
        # Sampling with replacement: the same row may appear several times
        row_idx = np.random.choice(num_training, num_training)
        # Keep at least one column so a small max_features cannot produce a
        # tree that is trained on no features at all
        num_selected = max(1, int(num_features * self.max_features))
        # A prefix of a random permutation is a sample without replacement
        col_idx = np.random.permutation(num_features)[:num_selected]
        return row_idx, col_idx

    def bootstrapping(self, num_training, num_features):
        """
        Initialize the bootstrap datasets for each tree.

        Any bookkeeping left over from a previous call is discarded first, so
        that fitting the forest again does not keep using stale indices.
        """
        self.bootstraps_row_indices = []
        self.feature_indices = []
        self.out_of_bag = []

        all_rows = set(range(num_training))
        for _ in range(self.n_estimators):
            row_idx, col_idx = self._bootstrapping(num_training, num_features)
            self.bootstraps_row_indices.append(row_idx)
            self.feature_indices.append(col_idx)
            # Rows that were never drawn are out-of-bag for this tree
            self.out_of_bag.append(all_rows - set(row_idx))

    def fit(self, X, y):
        """
        Train decision trees using the bootstrapped datasets.

        - INPUT :
            X: 2-D array-like of features (rows are samples)
            y: 1-D array-like of labels, one per row of X
        """
        # Fancy indexing below needs real arrays
        X = np.asarray(X)
        y = np.asarray(y)

        num_training, num_features = X.shape
        self.bootstrapping(num_training, num_features)

        per_tree = zip(self.decision_trees,
                       self.bootstraps_row_indices,
                       self.feature_indices)
        for i, (current_dt, rows, cols) in enumerate(per_tree):
            # rows[:, np.newaxis] broadcasts against cols, selecting the
            # (bootstrap rows x chosen columns) sub-matrix for this tree
            current_X = X[rows[:, np.newaxis], cols]
            current_y = y[rows]
            # 0 for the initial depth
            current_dt.fit(current_X, current_y, 0)
            # Which tree we are using
            print("Current Tree to fit : " ,i+1)

    def OOB_score(self, X, y):
        """
        Calculate the OOB score.

        For every sample, each tree that has the sample's row index in its
        out-of-bag set predicts it from that tree's feature subset.  The
        sample's score is the fraction of those trees whose prediction equals
        the label; samples with no out-of-bag tree are skipped.

        - INPUT :
            X: 2-D array-like of features; the out-of-bag sets hold row
               positions of the data given to fit, so this is expected to be
               that same data
            y: 1-D array-like of labels, one per row of X

        - OUTPUT :
            Mean of the per-sample scores, or NaN when no sample had any
            out-of-bag prediction.
        """
        X = np.asarray(X)
        y = np.asarray(y)

        accuracy = []
        for i in range(len(X)):
            # Predictions from the trees that did NOT train on sample i
            predictions = [
                self.decision_trees[t].predict(X[i][self.feature_indices[t]])
                for t in range(self.n_estimators)
                if i in self.out_of_bag[t]
            ]
            if predictions:
                # Explicit array so the comparison is always elementwise
                correct = np.sum(np.asarray(predictions) == y[i])
                accuracy.append(correct / float(len(predictions)))

        if not accuracy:
            return float("nan")
        return np.mean(accuracy)

# Evaluate the Random Forest

In [None]:
# Forest hyper-parameters for this run.
n_estimators = 3
max_depth = 7
max_features = 0.8
# RandomForest indexes its inputs with NumPy fancy indexing, so convert the
# feature frames to arrays first.
X_train = np.array(X_train)
X_test = np.array(X_test)
random_forest = RandomForest(n_estimators, max_depth, max_features)

random_forest.fit(X_train, y_train)
# The out-of-bag sets store row positions of the data passed to fit(), so the
# OOB score is only meaningful on that same training data; scoring X_test
# would pair those positions with unrelated rows.
accuracy = random_forest.OOB_score(X_train, y_train)

print("accuracy: %.4f" % accuracy)

Current Tree to fit :  1
Current Tree to fit :  2
Current Tree to fit :  3
accuracy: 0.7943


In [None]:
# Rebuild the feature frames (all columns but the last) from the DataFrames,
# replacing the NumPy arrays bound to these names earlier.
X_train, X_test = (frame.iloc[:, :-1] for frame in (df_train, df_test))

In [None]:
# One-hot encode the categorical columns of each split.
X_train_encoded = pd.get_dummies(X_train, drop_first=True)
X_test_encoded = pd.get_dummies(X_test, drop_first=True)

# Give the test matrix exactly the training columns, in the same order:
# indicator columns that only exist in the training split are added as
# all-zero, and any that only exist in the test split are dropped.  This
# replaces dropping one hard-coded column by name, which fails as soon as a
# different category is missing from either split.
X_test_encoded = X_test_encoded.reindex(
    columns=X_train_encoded.columns, fill_value=0
)

# Baseline: scikit-learn's random forest with default settings.
clf = RandomForestClassifier()
clf.fit(X_train_encoded, y_train)
y_pred = clf.predict(X_test_encoded)
print("Accuracy:", metrics.accuracy_score(y_test, y_pred))

Accuracy: 0.8459554081444629


# Random Forest Pros and Cons


| Pros | Cons
| --- | --- 
|Robust to outliers.|Biased when dealing with categorical variables.
|Works well with non-linear data.|Slow Training if the code isn't optimized.
|Lower risk of overfitting.|Not well suited to linear problems with many sparse features.
|Runs efficiently on a large dataset.|Greedy algorithms don’t yield the global optimum tree structure.

