# Final Ensemble Learner

## What's happening here?

You'll see that the first three sections create learners.  
This does not mean that the learners created here are the ones that go into the ensemble. I believe our Random Forest is the only one that is created in advance and reused; the KNN and neural-network members are trained again when the final ensemble is built.

## Random Forest made up of DTrees (learner #1)

In [18]:
import mltools as ml
import numpy as np
import matplotlib.pyplot as plt

# Load the training features and labels from comma-separated text files.
X = np.genfromtxt('data/X_train.txt', delimiter=',')
Y = np.genfromtxt('data/Y_train.txt', delimiter=',')

# Shuffle features and labels together before splitting.
# NOTE(review): no random seed is set in this cell, so the split is not reproducible.
X,Y = ml.shuffleData(X,Y)

# Split into training and validation parts.
# presumably 0.7 is the training fraction -- confirm against ml.splitData
Xtr, Xva, Ytr, Yva = ml.splitData(X,Y,0.7)

# Column slices of the training split, named by feature type.
# None of these three names is read again in the visible code.
X_numeric = Xtr[:,:41]
X_discrete = Xtr[:,41:69]
# NOTE(review): the `-1` stop excludes the last column -- confirm that is intended.
X_binary = Xtr[:,69:-1]

# Second, independent load of the same two files under separate names.
Xtr_kaggle = np.genfromtxt('data/X_train.txt', delimiter=',')
Ytr_kaggle = np.genfromtxt('data/Y_train.txt', delimiter=',')

In [19]:
class BaggedTree(ml.base.classifier):
    def __init__(self, learners):
        """Keep a reference to the already-trained member learners."""
        self.learners = learners

    def predictSoft(self, X):
        """Return the element-wise mean of every member's predictSoft(X)."""
        member_outputs = []
        for member in self.learners:
            member_outputs.append(member.predictSoft(X))
        # axis=0 averages across members, keeping each member's output shape.
        return np.mean(member_outputs, axis=0)

# NOTE(review): assigned here but never read in the visible code.
bootstrap_sample_size = 50

# Row and column counts of the training split.
m,n = Xtr.shape

# presumably candidate ensemble sizes for an experiment; not read in the visible code
bag_numbers = np.array([5,10,25,50])

In [20]:


# Shuffle the full (unsplit) training set; only its labels are used below,
# to define the class list of the bagged model.
Xtr_kaggle, Ytr_kaggle = ml.shuffleData(Xtr_kaggle, Ytr_kaggle)

num_bags = 50

# Divide the requested budget into three groups of trees.
# NOTE(review): despite the group names, every tree below is trained on a
# bootstrap sample of all columns of Xtr; only the tree settings differ.
num_numeric_learners = int(round(num_bags / 1.5))
num_discrete_learners = int(round(num_bags / 5))
num_binary_learners = int(round(num_bags / 10))

# Rounding means the groups need not add up to the requested budget,
# so the real total is recomputed from the group sizes.
num_bags = sum((num_numeric_learners, num_discrete_learners, num_binary_learners))

print(f"# of numeric learners = {num_numeric_learners}")
print(f"# of discrete learners = {num_discrete_learners}")
print(f"# of binary learners = {num_binary_learners}")
print(f"---- Total number of bags for this run: {num_bags}")

# One slot per tree, filled in group order.
classifiers = [None] * num_bags
classifiers_index = 0

# (group size, decision-tree settings) for each group, in training order.
tree_groups = (
    (num_numeric_learners, dict(minParent=400, minLeaf=100, maxDepth=50)),
    (num_discrete_learners, dict(minParent=300, minLeaf=100, maxDepth=10)),
    (num_binary_learners, dict(minParent=16, minLeaf=50, maxDepth=10)),
)

for group_size, tree_options in tree_groups:
    for i in range(group_size):
        # Each tree gets its own bootstrap resample of the training split.
        Xi, Yi = ml.bootstrapData(Xtr, Ytr)
        classifiers[classifiers_index] = ml.dtree.treeClassify(Xi, Yi, **tree_options)
        classifiers_index += 1

# Wrap the trees in the averaging ensemble and record the class labels.
kaggle_bagged_tree = BaggedTree(classifiers)
kaggle_bagged_tree.classes = np.unique(Ytr_kaggle)



# of numeric learners = 33
# of discrete learners = 10
# of binary learners = 5
---- Total number of bags for this run: 48


## KNN Learner (Learner #2)

In [64]:
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier

import matplotlib.pyplot as plt

import mltools as ml
import numpy as np


# Load the training features and labels from comma-separated text files.
X = np.genfromtxt('data/X_train.txt', delimiter=',')
Y = np.genfromtxt('data/Y_train.txt', delimiter=',')

# Fixed seed so the shuffle below is the same on every run.
np.random.seed(0)
X,Y = ml.shuffleData(X,Y)
X_numeric = X[:,:41]        # not used further in this section
X_categorical = X[:,41:69]  # the only columns the KNN learner is given
scaler = StandardScaler()


Xtr, Xva, Ytr, Yva = ml.splitData(X_categorical,Y,0.7)
# Fit the scaler on the training split only ...
Xtr_scaled = scaler.fit_transform(Xtr, Ytr)
# ... and reuse those training statistics for the validation split.
# Re-fitting here would scale validation data with its own mean/std, so it
# would no longer match what the model was trained on.
Xva_scaled = scaler.transform(Xva)


In [65]:
# Distance-weighted k-nearest-neighbours classifier with k = 410.
sklearn_knn = KNeighborsClassifier(n_neighbors=410, weights='distance')
# Train on the scaled training split. Left as the cell's last expression so
# the notebook echoes the fitted estimator.
sklearn_knn.fit(Xtr_scaled, Ytr)

KNeighborsClassifier(n_neighbors=410, weights='distance')

## Neural Network Learner (Learner #3)

In [58]:
from sklearn.neural_network import MLPClassifier as mlpc
from sklearn.preprocessing import StandardScaler,QuantileTransformer
import numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics import mean_squared_error, roc_auc_score

import mltools as ml
# Load the training features and labels from comma-separated text files.
X = np.genfromtxt('data/X_train.txt', delimiter=',')
Y = np.genfromtxt('data/Y_train.txt', delimiter=',')
# Fixed seed so the shuffle below is the same on every run.
np.random.seed(0)

X,Y = ml.shuffleData(X,Y)
# Keep only the first 41 columns for the neural network.
X = X[:,:41]

In [24]:
scaler = StandardScaler()
transformer = QuantileTransformer()

Xtr, Xva, Ytr, Yva = ml.splitData(X,Y,0.7)

# Fit both preprocessors on the training split only, then apply the fitted
# mapping to the validation split. Re-fitting on validation data would map
# it with different statistics from those the model was trained on.
Xtr_scaled = scaler.fit_transform(Xtr, Ytr)
Xva_scaled = scaler.transform(Xva)
Xtr_transformed = transformer.fit_transform(Xtr)
Xva_transformed = transformer.transform(Xva)

In [25]:
# Multi-layer perceptron with three hidden layers (5, 5 and 10 units), tanh
# activations and an L2 penalty of alpha=0.05.
# NOTE(review): scikit-learn documents `learning_rate` as only used with
# solver='sgd'; no solver is passed here -- confirm 'adaptive' has any effect.
learner = mlpc(max_iter=10000, alpha=0.05, hidden_layer_sizes=(5,5,10), activation='tanh', learning_rate= 'adaptive')
# Train on the quantile-transformed training split.
learner.fit(Xtr_transformed,Ytr) #the training

MLPClassifier(activation='tanh', alpha=0.05, hidden_layer_sizes=(5, 5, 10),
              learning_rate='adaptive', max_iter=10000)

In [26]:
# Load the Kaggle test features and keep the same first 41 columns the
# network was trained on.
Xte = np.genfromtxt('data/X_test.txt', delimiter=',')
# Apply the already-fitted transformer. Re-fitting on the test features would
# map them with different quantiles from those the network was trained on.
Xte_transformed = transformer.transform(Xte[:,:41])
# Two columns: row id and prediction.
# NOTE(review): `predict` returns hard class labels; if the submission is
# scored on ranking (e.g. AUC), `predict_proba(...)[:, 1]` may be intended.
Yte_hat = np.vstack((np.arange(Xte_transformed.shape[0]), learner.predict(Xte_transformed))).T

# Write the submission file with an "Id,Predicted" header row.
np.savetxt('Y_submit_neural_network.txt', Yte_hat,'%d, %.2f',comments='', header='Id,Predicted', delimiter=',')

## FINAL Ensemble

### Define Ensemble class
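In this case, we are reusing the BaggedTree class (our Random Forest wrapper) to implement the final ensemble: it averages the soft predictions of its members, which may now be either mltools learners or scikit-learn classifiers.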

In [66]:
from sklearn.base import ClassifierMixin


class BaggedTree(ml.base.classifier):
    def __init__(self, learners):
        """Keep a reference to the already-trained member learners."""
        self.learners = learners

    def predictSoft(self, X):
        """Average the soft predictions of all members for X.

        scikit-learn classifiers are queried through predict_proba; every
        other member is queried through predictSoft.
        """
        def soft_output(member):
            # The two libraries name their probability method differently.
            if isinstance(member, ClassifierMixin):
                return member.predict_proba(X)
            return member.predictSoft(X)

        # axis=0 averages across members, keeping each member's output shape.
        return np.mean([soft_output(member) for member in self.learners], axis=0)

### Get the data

In [67]:
import mltools as ml
import numpy as np
import matplotlib.pyplot as plt

# Load the training features and labels from comma-separated text files.
X = np.genfromtxt('data/X_train.txt', delimiter=',')
Y = np.genfromtxt('data/Y_train.txt', delimiter=',')

# Shuffle features and labels together before splitting.
# NOTE(review): this cell sets no seed, so the result depends on the random
# state left by whichever cells ran before it.
X,Y = ml.shuffleData(X,Y)

# Split into training and validation parts, keeping all columns.
# presumably 0.7 is the training fraction -- confirm against ml.splitData
Xtr, Xva, Ytr, Yva = ml.splitData(X,Y,0.7)


### Preprocess data

I'm giving us options to use three types of data:  
- Raw (`Xtr`/`Xva`)  
- Scaled (`Xtr_scaled`,`Xva_scaled`)  
- Transformed (`Xtr_transformed`/`Xva_transformed`)

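**KNOW THIS:** Whatever preprocessing actions we apply to the training data MUST ALSO BE APPLIED TO THE FINAL `Xte` ON WHICH OUR FINAL ENSEMBLE MAKES PREDICTIONS FOR KAGGLE SUBMISSION. That means reusing the scaler/transformer that was fitted on the training data (call `transform`, not `fit_transform`), so `Xte` is mapped exactly the way the training data was.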

In [68]:
scaler = StandardScaler()
transformer = QuantileTransformer()

Xtr, Xva, Ytr, Yva = ml.splitData(X,Y,0.7)

# Fit both preprocessors on the training split only, then apply the fitted
# mapping to the validation split. Re-fitting on validation data would map
# it with different statistics from those the models were trained on.
Xtr_scaled = scaler.fit_transform(Xtr, Ytr)
Xva_scaled = scaler.transform(Xva)
Xtr_transformed = transformer.fit_transform(Xtr)
Xva_transformed = transformer.transform(Xva)



### Create the list of classifiers for our Final Ensemble

In [69]:
# NOTE(review): assigned here but never read in the visible code.
bootstrap_sample_size = 50


num_bags = 50

# Divide the requested budget between the three kinds of member.
num_random_forest_learners = int(round(num_bags / 1.5))
num_knn_learners = int(round(num_bags / 10))
num_neural_network_learners = int(round(num_bags / 5))

# Rounding means the groups need not add up to the requested budget,
# so the real total is recomputed from the group sizes.
num_bags = num_random_forest_learners + num_knn_learners + num_neural_network_learners

# Labels name the learners actually counted; the previous labels were carried
# over from the random-forest cell and named the wrong groups.
print(f"# of random forest learners = {num_random_forest_learners}")
print(f"# of KNN learners = {num_knn_learners}")
print(f"# of neural network learners = {num_neural_network_learners}")
print(f"---- Total number of bags for this run: {num_bags}")


final_classifiers = [None]*num_bags

# keep track of which number classifier we are on
classifiers_index = 0


# The random forest is trained once, earlier, and is not retrained here.
# The same object fills every random-forest slot, which weights its
# prediction by num_random_forest_learners / num_bags in the final average.
# No bootstrap sample is drawn because nothing is trained in this loop.
for i in range(num_random_forest_learners):
    final_classifiers[classifiers_index] = kaggle_bagged_tree
    classifiers_index += 1

# NOTE(review): the KNN and neural-network members below are trained on raw
# bootstrap samples of all columns, whereas the standalone learners earlier
# in this file were trained on scaled / quantile-transformed column subsets.
# Confirm that the missing preprocessing is intended.
for i in range(num_knn_learners):
    Xi,Yi = ml.bootstrapData(Xtr, Ytr)

    knn = KNeighborsClassifier(n_neighbors=410, weights='distance')
    knn.fit(Xi, Yi)
    final_classifiers[classifiers_index] = knn
    classifiers_index += 1

for i in range(num_neural_network_learners):
    Xi,Yi = ml.bootstrapData(Xtr, Ytr)

    neural_network_learner = mlpc(max_iter=10000, alpha=0.05, hidden_layer_sizes=(5,5,10), activation='tanh', learning_rate= 'adaptive')
    neural_network_learner.fit(Xi,Yi) #the training
    final_classifiers[classifiers_index] = neural_network_learner
    classifiers_index += 1


# Wrap all members in the averaging ensemble and record the class labels.
final_ensemble = BaggedTree(final_classifiers)
final_ensemble.classes = np.unique(Ytr)



# of numeric learners = 33
# of discrete learners = 5
# of binary learners = 10
---- Total number of bags for this run: 48


In [70]:
# Score the ensemble on the raw (unscaled) validation split.
# `auc` is not defined on BaggedTree in this file, so it presumably comes from
# the ml.base.classifier base class -- confirm.
auc = final_ensemble.auc(Xva, Yva)

print("auc", auc)

auc 0.7862609338266497
