# Simple NN with pooling

In [8]:
from sklearn.metrics import f1_score
import pandas as pd
import datautils
import simpleNN
import torch
from sklearn.model_selection import ParameterGrid
import time

In [9]:
# Directories are written with a trailing slash because file paths are built
# from them by plain string concatenation further down the notebook.
EMBED_DIR = "./Embeddings/"
RES_DIR = "./Results/"

# Dataset file handed to datautils.split_kfold_dataset.
PATH = "./Dataset/power-gb-train.tsv"

## TF-IDF embeddings

In [10]:
X_train, y_train, X_val, y_val, _, _ = datautils.split_kfold_dataset(PATH)

# Overwrite each fold's raw documents with their TF-IDF representation.
# The vectorizer is obtained from the training part of the fold only and is
# then reused to transform that fold's validation part.
for fold_idx, (train_docs, val_docs) in enumerate(zip(X_train, X_val)):
    train_matrix, vectorizer = datautils.tf_idf_preprocessing(train_docs)
    X_train[fold_idx] = train_matrix
    X_val[fold_idx] = vectorizer.transform(val_docs)

In [7]:



# Hyperparameter grid for the simple NN on the TF-IDF folds. ParameterGrid
# expands the dict into every combination (1 x 3 x 2 x 3 = 18 settings).
grid = list(ParameterGrid(
    {
        "hidden_neurons": [20],
        "lr": [1e-3, 1e-2, 1e-1],
        "epochs": [10, 20],
        "batch size": [1, 5, 10],
    }
))

# Complete column layout of the results table, declared once so the table and
# the CSV always carry every column. The order matches what the previous
# concat-based version of this cell ended up writing.
RESULT_COLUMNS = [
    "hidden neurons", "optimizer", "batch size", "epochs",
    "F1 Score", "Recall", "Precision",
    "learning rate", "Fold", "Loss", "Time",
]
results_path = RES_DIR + "results-SimpleNN-tfidf.csv"

# One dict per (hyperparameter setting, fold). Rows are collected in a plain
# list and turned into a DataFrame in one go, instead of growing a DataFrame
# with pd.concat inside the loop.
records = []
results_df = pd.DataFrame(columns=RESULT_COLUMNS)

print("Training a simple NN")
for params in grid:
    hidden_neurons = params["hidden_neurons"]
    lr = params["lr"]
    epochs = params["epochs"]
    batch_size = params["batch size"]
    print(f"hyperparams: hidden neurons={hidden_neurons} lr={lr} epochs={epochs} batch size={batch_size}:")

    for k, (fold_X_train, fold_y_train, fold_X_val, fold_y_val) in enumerate(
        zip(X_train, y_train, X_val, y_val)
    ):
        print("Fold: ", k+1)

        # A new model (and therefore a new optimizer) is built for every fold,
        # sized to the number of features of that fold's training matrix.
        simple_nn = simpleNN.SimpleNN(fold_X_train.shape[1], hidden_neurons)

        # perf_counter is monotonic, so the elapsed time cannot be distorted
        # by system clock adjustments during a long run.
        start = time.perf_counter()
        fold_precision, fold_recall, fold_fscore = simpleNN.train_simpleNN(
            fold_X_train, fold_X_val,
            fold_y_train, fold_y_val,
            loss_fn=torch.nn.BCELoss(),
            model=simple_nn,
            optimizer=torch.optim.SGD(simple_nn.parameters(), lr=lr),
            epochs=epochs,
            batch_size=batch_size,
            verbose=True,
        )
        stop = time.perf_counter()
        print(f"precision={fold_precision},recall={fold_recall},score={fold_fscore}")

        records.append(
            {
                "batch size": batch_size,
                "hidden neurons": hidden_neurons,
                "learning rate": lr,
                "optimizer": "SGD",
                "epochs": epochs,
                "Fold": k,  # 0-based fold index (the progress print above is 1-based)
                "Loss": "BCELoss",
                "F1 Score": fold_fscore,
                "Recall": fold_recall,
                "Precision": fold_precision,
                "Time": stop - start,  # seconds spent inside train_simpleNN
            }
        )

    # Checkpoint: rewrite the CSV after all folds of the current hyperparameter
    # setting have finished, so a crash loses at most one setting's results.
    results_df = pd.DataFrame(records, columns=RESULT_COLUMNS)
    results_df.to_csv(results_path, index=False)
results_df

Training a simple NN
hyperparams: hidden neurons=20 lr=0.001 epochs=10 batch size=1:
Fold:  1


NameError: name 'time' is not defined

## Word2vec embeddings

In [None]:
from gensim.models import KeyedVectors
from gensim.utils import simple_preprocess

In [None]:
X_train, y_train, X_val, y_val, _, _ = datautils.split_kfold_dataset(PATH)

# Overwrite each fold's raw documents with document vectors computed from
# the KeyedVectors file stored for that fold index.
for fold_idx, (train_docs, val_docs) in enumerate(zip(X_train, X_val)):
    kv_path = EMBED_DIR + f'w2v-fold{fold_idx}.kv'
    word_vectors = KeyedVectors.load(kv_path, mmap='r')

    # Tokenise every document with gensim's simple_preprocess.
    train_tokens = [simple_preprocess(doc) for doc in train_docs]
    val_tokens = [simple_preprocess(doc) for doc in val_docs]

    X_train[fold_idx] = datautils.documents_vector_wv(train_tokens, word_vectors)
    X_val[fold_idx] = datautils.documents_vector_wv(val_tokens, word_vectors)

In [None]:
# TODO: train the simple NN on the word2vec document vectors here

## fastText embeddings

In [None]:
from gensim.models import KeyedVectors
from gensim.utils import simple_preprocess

In [None]:
X_train, y_train, X_val, y_val, _, _ = datautils.split_kfold_dataset(PATH)

# Overwrite each fold's raw documents with document vectors computed from
# the KeyedVectors file stored for that fold index.
# NOTE(review): this section is headed "Fasttext embeddings" but it loads the
# same 'w2v-fold{idx}.kv' files as the word2vec section — confirm whether a
# fastText-specific embedding file was intended here.
for fold_idx, (train_docs, val_docs) in enumerate(zip(X_train, X_val)):
    kv_path = EMBED_DIR + f'w2v-fold{fold_idx}.kv'
    word_vectors = KeyedVectors.load(kv_path, mmap='r')

    # Tokenise every document with gensim's simple_preprocess.
    train_tokens = [simple_preprocess(doc) for doc in train_docs]
    val_tokens = [simple_preprocess(doc) for doc in val_docs]

    X_train[fold_idx] = datautils.documents_vector_wv(train_tokens, word_vectors)
    X_val[fold_idx] = datautils.documents_vector_wv(val_tokens, word_vectors)

In [None]:
# TODO: train the simple NN on the fastText document vectors here
