# This Notebook...
...trains a number of models on the reviews.

# Dependencies

In [46]:
import pandas as pd
import numpy as np

import pickle

from sklearn.model_selection import train_test_split

# Model design
from tensorflow.keras.optimizers import SGD
from tensorflow.keras.models import Model, Sequential
from tensorflow.keras.layers import Input, Dense
from tensorflow.keras.losses import BinaryCrossentropy
from tensorflow.keras.metrics import BinaryAccuracy, Precision, Recall, F1Score, AUC

# Model saving
from keras.models import load_model

# # Timekeeping
import time
# print("Start of timekeeping.\n")
# start_time = time.time()

# Load Preprocessed Data

In [56]:
with open("data_tfidf.pkl", "rb") as f:
    X = pickle.load(f)

with open("data_y.pkl", "rb") as f:
    y = pickle.load(f)

# Split into train, valid, and test
X_train, X_valid = X["X_train"], X["X_valid"]
y_train, y_valid = y["y_train"], y["y_valid"]

assert X_train.shape[0] == y_train.shape[0]

((24897, 43543), (24897,))

# Global Model Settings

In [41]:
hist_dict = {}
global_metrics = [BinaryCrossentropy(name="CrossEntr."), BinaryAccuracy(name="Accuracy", threshold=0.5), Precision(name="Precision"), Recall(name="Recall"), F1Score(name="F1-Score"), AUC(name="ROC-AUC")]
global_metric_names = [metric.name for metric in global_metrics]

# Training parameters
n_epochs = 1
batch_size = 200
verbose = 0

# Model 1: NN with tfidf and meta data

In [None]:
# Load data
with open("data_tfidf.pkl", "rb") as f:
    X = pickle.load(f)

with open("data_y.pkl", "rb") as f:
    y = pickle.load(f)

# Split into train, valid, and test
X_train, X_valid = X["X_train"], X["X_valid"]
y_train, y_valid = y["y_train"], y["y_valid"]

assert X_train.shape[0] == y_train.shape[0]

In [42]:
name = "Model_1_tfidf"
start_time = time.time()

model = Sequential([
    Input(shape=(X_train.shape[1],)),
    Dense(1000, activation="relu"),
    Dense(500, activation="relu"),
    #Dense(200, activation="relu"),
    #Dense(100, activation="relu"),
    Dense(1, activation="sigmoid")
])

model.compile(
    optimizer = "adam",
    loss = "binary_crossentropy",
    metrics = global_metrics# + ["accuracy"] #["accuracy"]
)

history = model.fit(X_train, y_train, epochs=n_epochs, verbose=verbose, batch_size=batch_size)
model.save("keras_models/" + name + ".keras")
print("Model " + name + " has been trained!")

print("Training time for " + name + ": " + str(time.time() - start_time) + " seconds\n")

ValueError: Data cardinality is ambiguous. Make sure all arrays contain the same number of samples.'x' sizes: 35567
'y' sizes: 24897


# Model 2: NN with DistilBERT embeddings and meta

In [None]:
# Load DistilBERT predictors
with open("data_DistilBERT.pkl", "rb") as f:
    X = pickle.load(f)

# Split into train, valid, and test
X_train, X_valid = X["X_train"], X["X_valid"]

assert X_train.shape[0] == y_train.shape[0]

In [None]:
name = "Model_1_DistilBERT"
start_time = time.time()

model = Sequential([
    Input(shape=(X_train.shape[1],)),
    Dense(1000, activation="relu"),
    Dense(500, activation="relu"),
    #Dense(200, activation="relu"),
    #Dense(100, activation="relu"),
    Dense(1, activation="sigmoid")
])

model.compile(
    optimizer = "adam",
    loss = "binary_crossentropy",
    metrics = global_metrics# + ["accuracy"] #["accuracy"]
)

history = model.fit(X_train, y_train, epochs=n_epochs, verbose=verbose, batch_size=batch_size)
model.save("keras_models/" + name + ".keras")
print("Model " + name + " has been trained!")

print("Training time for " + name + ": " + str(time.time() - start_time) + " seconds\n")

ValueError: Data cardinality is ambiguous. Make sure all arrays contain the same number of samples.'x' sizes: 35567
'y' sizes: 24897
