In [11]:
import pandas as pd
import numpy as np
import splearn.Metrics as spmet
import sklearn.metrics as skmet

from splearn.EnsembleLearning.AdaBooster import AdaBooster
from splearn.EnsembleLearning.TreeBagger import TreeBagger
from splearn.EnsembleLearning.RandomForest import RandomForest
from splearn.DecisionTree.DecisionTree import DecisionTree

In [7]:
def _read_bank_split(path):
    """Read one split of the bank dataset, discarding saved-index columns.

    The training CSV comes back from ``pd.read_csv`` with a leading
    "Unnamed: 0" column that simply counts rows (it looks like the file was
    written with the DataFrame index included). Because the features are
    taken as "every column except the last", that row counter would
    otherwise be fed to the models as a feature and would add one to the
    column count used when computing ``feature_frac``.

    Columns are dropped by name rather than via ``index_col=0`` so a split
    that does not carry the stray column is read unchanged.

    Parameters
    ----------
    path : str
        Location of the CSV file for this split.

    Returns
    -------
    pandas.DataFrame
        The split without any "Unnamed: ..." columns; the label is still the
        last column.
    """
    frame = pd.read_csv(path)
    stray = [name for name in frame.columns if str(name).startswith("Unnamed:")]
    return frame.drop(columns=stray)


train_df = _read_bank_split("data/bank/train.csv")
test_df = _read_bank_split("data/bank/test.csv")

# The label is the last column; everything before it is a feature.
X_train = train_df[train_df.columns[:-1]]
y_train = train_df[train_df.columns[-1]]

X_test = test_df[test_df.columns[:-1]]
y_test = test_df[test_df.columns[-1]]

train_df.head()

Unnamed: 0,age,job,marital,edu,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,y
0,41,services,married,secondary,no,0,yes,no,unknown,5,may,114,2,-1,0,unknown,no
1,48,blue-collar,single,secondary,no,312,yes,yes,cellular,3,feb,369,2,-1,0,unknown,no
2,55,technician,married,secondary,no,1938,no,yes,cellular,18,aug,193,1,386,3,success,yes
3,54,admin.,married,tertiary,no,59,yes,no,cellular,10,jul,268,1,-1,0,unknown,no
4,34,management,single,tertiary,no,2646,no,no,cellular,14,apr,142,1,-1,0,unknown,yes


In [34]:
# Prime every model with zero iterations / zero trees. Nothing is actually
# fitted here; the evaluation loop further down in this cell grows the
# ensembles one step at a time via iterate().

ab = AdaBooster(DecisionTree)
tb = TreeBagger()
rf1, rf2, rf3 = RandomForest(), RandomForest(), RandomForest()

n_features = len(X_train.columns)

ab.train(
    X_train,
    y_train,
    iterations=0,
    learner_args={"gain": "gini", "max_depth": 2},
)

tb.train(X_train, y_train, 0, seed=42, gain="gini")

# The three forests differ only in feature_frac: 3, 5 and 7 divided by the
# number of feature columns. They deliberately share the same seed.
for forest, numerator in zip((rf1, rf2, rf3), (3.0, 5.0, 7.0)):
    forest.train(
        X_train,
        y_train,
        num_trees=0,
        feature_frac=numerator / n_features,
        seed=420,
        gain="gini",
    )

# Models included in this comparison. `ab` and `tb` are primed above but are
# currently left out of the run.
models = [
    #ab,
    #tb,
    rf1,
    rf2,
    rf3
]

# Evaluate at ensemble sizes 1, 2, 4, ..., 2 ** (num_obs - 1).
num_obs = 9
points = [2 ** i for i in range(num_obs)]

# One row per checkpoint: [size, m0_train, m0_test, m1_train, m1_test, ...]
data = []

for i in points:

    # Grow every model by one step until the ensemble size reaches `i`.
    # All models advance in lockstep, so the first model's length tracks
    # progress for the whole list. Using models[0] for both the loop test and
    # the progress message keeps the cell working when only one model is
    # enabled (indexing models[1] would raise IndexError in that case).
    while len(models[0]) < i:
        for m in models:
            m.iterate(1)
        print(f"Iteration {len(models[0])} complete!")

    p = [i]

    for m in models:
        p_train = m.predict(X_train)
        p_test  = m.predict(X_test)
        p.append(skmet.accuracy_score(y_train, p_train))
        p.append(skmet.accuracy_score(y_test, p_test))

    data.append(p)
        

Iteration 1 complete!
Iteration 2 complete!
Iteration 3 complete!
Iteration 4 complete!
Iteration 5 complete!
Iteration 6 complete!
Iteration 7 complete!
Iteration 8 complete!
Iteration 9 complete!
Iteration 10 complete!
Iteration 11 complete!
Iteration 12 complete!
Iteration 13 complete!
Iteration 14 complete!
Iteration 15 complete!
Iteration 16 complete!
Iteration 17 complete!
Iteration 18 complete!
Iteration 19 complete!
Iteration 20 complete!
Iteration 21 complete!
Iteration 22 complete!
Iteration 23 complete!
Iteration 24 complete!
Iteration 25 complete!
Iteration 26 complete!
Iteration 27 complete!
Iteration 28 complete!
Iteration 29 complete!
Iteration 30 complete!
Iteration 31 complete!
Iteration 32 complete!
Iteration 33 complete!
Iteration 34 complete!
Iteration 35 complete!
Iteration 36 complete!
Iteration 37 complete!
Iteration 38 complete!
Iteration 39 complete!
Iteration 40 complete!
Iteration 41 complete!
Iteration 42 complete!
Iteration 43 complete!
Iteration 44 complet

In [29]:
# Turn the per-checkpoint rows into a 2-D array; the tree count becomes a
# float along with the accuracies, which is why the index shows 1.0, 2.0, ...
data = np.array(data)

# First column is the ensemble size, followed by a (train, test) accuracy
# pair for each model in the order they appear in `models`.
# NOTE(review): the labels say rf2/rf4/rf6, while the priming cell builds the
# forests with feature_frac numerators 3/5/7 -- confirm which is intended.
columns = [
    "num_trees",
    "rf2_train", "rf2_test",
    "rf4_train", "rf4_test",
    "rf6_train", "rf6_test",
]

df = pd.DataFrame(data, columns=columns).set_index(columns[0])
df

Unnamed: 0_level_0,rf2_train,rf2_test,rf4_train,rf4_test,rf6_train,rf6_test
num_trees,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1.0,0.8808,0.8752,0.8786,0.871,0.885,0.868
2.0,0.8908,0.8828,0.8844,0.8748,0.8866,0.8714
4.0,0.8808,0.8752,0.8868,0.8802,0.908,0.8792
8.0,0.8808,0.8752,0.8814,0.8768,0.9022,0.8842
16.0,0.8808,0.8752,0.8812,0.8754,0.8968,0.8802
32.0,0.8808,0.8752,0.8818,0.8754,0.896,0.8814
64.0,0.8808,0.8752,0.8822,0.876,0.897,0.8806
128.0,0.8808,0.8752,0.882,0.8756,0.8964,0.8792


In [33]:
# Label distribution of each split. np.unique returns the labels sorted, so
# index 0 is the first label in sort order.
train_unique, train_counts = np.unique(y_train, return_counts=True)
test_unique, test_counts = np.unique(y_test, return_counts=True)

print(train_unique, train_counts)
print(test_unique, test_counts)

# Share of the first label in each split, for comparison with the accuracy
# table above.
train_first_share = train_counts[0] / train_counts.sum()
test_first_share = test_counts[0] / test_counts.sum()

print(train_first_share)
print(test_first_share)

['no' 'yes'] [4404  596]
['no' 'yes'] [4376  624]
0.8808
0.8752


In [None]:
# Number of feature columns (the denominator used for feature_frac).
print(X_train.shape[1])