## Test tf-idf embeddings

In [None]:
import os

import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.model_selection import ParameterGrid

import datautils



# Dataset file (TSV) that is handed to the hold-out split helper.
PATH = "./Dataset/power-gb-train.tsv"
# Output directory for result CSVs; the trailing slash matters because file
# names are appended to it by plain string concatenation.
RES_DIR = "./Results/"

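In this cell we split the dataset into training, validation and test sets, and then vectorize all three with TF-IDF. The vectorizer returned for the training set is reused to transform the validation and test sets.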

In [None]:
# Hold-out split: the helper is given the dataset path and returns six
# objects, unpacked here as features/labels for train, validation and test.
holdout_parts = datautils.split_holdout_dataset(PATH)
X_train, y_train, X_val, y_val, X_test, y_test = holdout_parts

# TF-IDF step: the helper returns the vectorized training set together with
# the vectorizer it used (presumably fitted on the training data only -
# TODO confirm in datautils). The same vectorizer then transforms the
# validation and test sets so all three share one feature space.
X_train, vectorizer = datautils.tf_idf_preprocessing(X_train)
X_val, X_test = vectorizer.transform(X_val), vectorizer.transform(X_test)

In the cell below we proceed with a grid search over the hyperparameters of the logistic regression, scoring each combination on the validation set.

In [None]:
# Hyperparameter grid for the logistic regression: every combination of the
# values below is trained once and scored on the validation set.
hyperparameters = {
    "penalty": ["l2"],
    "C": [0.1, 1.0, 10.0, 100.0, 1000.0, 1500],
    "solver": ["lbfgs"],
    "max_iter": [100, 200, 500, 700],
}

param_grid = list(ParameterGrid(hyperparameters))

# One dict per evaluated configuration. The DataFrame is built once after the
# loop instead of being re-concatenated on every iteration, which avoids
# quadratic copying and the pandas FutureWarning raised when concatenating
# onto an empty frame.
records = []

for par in param_grid:
    # Train a fresh model for this configuration on the training set.
    model = LogisticRegression(**par)
    model.fit(X_train, y_train)

    # Score on the validation set. F1 is macro-averaged, whereas precision and
    # recall use scikit-learn's default averaging (binary: positive class only).
    # NOTE(review): confirm this asymmetry is intentional.
    y_val_pred = model.predict(X_val)
    f1_macro = f1_score(y_val, y_val_pred, average="macro")
    recall = recall_score(y_val, y_val_pred)
    precision = precision_score(y_val, y_val_pred)

    print(f"Parameters: {par}")
    print(f"\tF1 score: {f1_macro}")

    records.append(
        {
            "penalty": par["penalty"],
            "C": par["C"],
            "solver": par["solver"],
            "max_iter": par["max_iter"],
            "F1 Score": f1_macro,
            "Precision": precision,
            "Recall": recall,
        }
    )

# Passing `columns` pins the column order and keeps the header row even if the
# grid happened to be empty.
results_df = pd.DataFrame(
    records,
    columns=["penalty", "C", "solver", "max_iter", "F1 Score", "Precision", "Recall"],
)

# Create the output directory if needed so a missing folder cannot throw away
# the results of the whole grid search at the very last step.
os.makedirs(RES_DIR, exist_ok=True)
results_df.to_csv(RES_DIR + "results-Logistic-tfidf1.csv", index=False)

In the last cell we take the best combination of hyperparameters found by the grid search, retrain the model on the training set, and compute the F1 score, recall and precision on the test set.

In [None]:

# Final model: the hyperparameter combination chosen from the grid search is
# hard-coded here and the model is refitted on the training data.
best_params = {"C": 1.0, "max_iter": 200, "solver": "lbfgs", "penalty": "l2"}
model = LogisticRegression(**best_params)
model.fit(X_train, y_train)

# Predict once on the held-out test set, then derive all three metrics from
# the same predictions (macro-averaged F1; default averaging for the others).
y_test_pred = model.predict(X_test)
f1_macro = f1_score(y_true=y_test, y_pred=y_test_pred, average="macro")
recall = recall_score(y_true=y_test, y_pred=y_test_pred)
precision = precision_score(y_true=y_test, y_pred=y_test_pred)

# Report the test-set scores.
print(f"\tF1 score: {f1_macro}")
print(f"\tRecall: {recall}")
print(f"\tPrecision: {precision}")