In [2]:
from gensim.utils import simple_preprocess
import datautils
from gensim.models import KeyedVectors
from gensim.models import fasttext

In [3]:
PATH = './Dataset/power-gb-train.tsv'
RES_DIR = './Results/'
EMBED_DIR = './Embeddings/'

In [4]:
X_train, y_train, X_val, y_val, _, _ = datautils.split_holdout_dataset(PATH)

X_train, vectorizer = datautils.tf_idf_preprocessing(X_train)
X_val = vectorizer.transform(X_val)

In [5]:
#bayesian optimization of random forest based on fscore
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import precision_recall_fscore_support
from bayes_opt import BayesianOptimization

def bo_params_rf(max_samples,n_estimators,max_features):
    
    params = {
        'max_samples': max_samples,
        'max_features':max_features,
        'n_estimators':int(n_estimators)
    }
    clf = RandomForestClassifier(max_samples=params['max_samples'],max_features=params['max_features'],n_estimators=params['n_estimators'])
    
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_val)
    _,_,fscore,_ = precision_recall_fscore_support(y_val, y_pred, average='macro')
    return fscore

In [10]:
rf_bo = BayesianOptimization(
    bo_params_rf,
    {
        "max_samples": (0.5, 1),
        "n_estimators": (500, 3000),
        "max_features": (0.5, 1),
    },
)

rf_bo.set_gp_params(n_restarts_optimizer=5)
results = rf_bo.maximize(n_iter = 50)

|   iter    |  target   | max_fe... | max_sa... | n_esti... |
-------------------------------------------------------------


KeyboardInterrupt: 

In [15]:
#try to make a random forest
#word_vectors = KeyedVectors.load(EMBED_DIR+f'ftx-w10-s900.kv', mmap='r')

#X_train = datautils.documents_vector_wv(tr_fold,word_vectors)
#X_val = datautils.documents_vector_wv(val_fold,word_vectors)

In [27]:
# train a random forest
from sklearn.ensemble import RandomForestClassifier

# define hyperparameters ranges
from sklearn.metrics import precision_recall_fscore_support
from sklearn.model_selection import ParameterGrid


hyperparameters = {
    "n_estimators": [500,700],
    "max_depth": [100,50],
    "min_samples_split": [2,5],
    "min_samples_leaf": [1,3],
}

param_grid = list(ParameterGrid(hyperparameters))

# initialize dataframe for results
import pandas as pd

results = pd.DataFrame(
    columns=[
        "n_estimators",
        "max_depth",
        "min_samples_split",
        "min_samples_leaf",
        "precision",
        "recall",
        "fscore",
        "p_train",
        "r_train",
        "f_train",
    ]
)

# train random forest
for par in param_grid:
    print(par)
    rf = RandomForestClassifier(
        n_estimators=par["n_estimators"],
        max_depth=par["max_depth"],
        min_samples_split=par["min_samples_split"],
        min_samples_leaf=par["min_samples_leaf"],
        n_jobs=-1,
    )

    rf.fit(X_train, y_train)

    y_pred = rf.predict(X_val)

    #compute metrics on training set
    p_train, r_train, f_train,_ = precision_recall_fscore_support(y_train, rf.predict(X_train), average="macro")

    precision, recall, fscore,_ = precision_recall_fscore_support(y_val, y_pred, average="macro")

    print(f"VAL fscore: {fscore:.4f}, TRain fscore: {f_train:.4f}")
    results = pd.concat(
        [
            results,
            pd.DataFrame(
                {
                    "n_estimators": [par["n_estimators"]],
                    "max_depth": [par["max_depth"]],
                    "min_samples_split": [par["min_samples_split"]],
                    "min_samples_leaf": [par["min_samples_leaf"]],
                    "precision": [precision],
                    "recall": [recall],
                    "fscore": [fscore],
                    "p_train": [p_train],
                    "r_train": [r_train],
                    "f_train": [f_train],
                }
            ),
        ]
    )

results.to_csv(RES_DIR + "ftx-w10-s900-rf-2.csv", index=False)



{'max_depth': None, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 500}
VAL fscore: 0.7097, TRain fscore: 1.0000
{'max_depth': None, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 700}


  results = pd.concat(


VAL fscore: 0.7087, TRain fscore: 1.0000
{'max_depth': None, 'min_samples_leaf': 1, 'min_samples_split': 5, 'n_estimators': 500}
VAL fscore: 0.7105, TRain fscore: 1.0000
{'max_depth': None, 'min_samples_leaf': 1, 'min_samples_split': 5, 'n_estimators': 700}
VAL fscore: 0.7109, TRain fscore: 1.0000


In [26]:
import pandas as pd

results = pd.read_csv(RES_DIR + "ftx-w10-s900-rf-3.csv")
results.sort_values("fscore", ascending=False).head(10)

Unnamed: 0,n_estimators,max_depth,min_samples_split,min_samples_leaf,precision,recall,fscore,p_train,r_train,f_train
3,1000,,5,1,0.770287,0.712062,0.713635,0.999963,0.999952,0.999958
1,2000,,2,1,0.767926,0.710646,0.712138,1.0,1.0,1.0
4,2000,,5,1,0.766737,0.710058,0.711531,0.999963,0.999952,0.999958
0,1000,,2,1,0.765761,0.708944,0.710294,1.0,1.0,1.0
5,5000,,5,1,0.765489,0.708558,0.709859,0.999963,0.999952,0.999958
2,5000,,2,1,0.763209,0.706423,0.707513,1.0,1.0,1.0
23,5000,50.0,5,1,0.765965,0.701073,0.700901,0.995658,0.994368,0.994987
20,5000,50.0,2,1,0.765745,0.700447,0.700162,0.997746,0.997088,0.99741
21,1000,50.0,5,1,0.765246,0.700346,0.700084,0.995293,0.99389,0.994562
22,2000,50.0,5,1,0.765112,0.700153,0.699862,0.995621,0.99432,0.994945


In [17]:
precision, recall, fscore,_ = precision_recall_fscore_support(y_val, y_pred, average="macro")