In [1]:
import os

import torch
from sklearn.model_selection import KFold
from torch import optim, nn
from torch.optim import Optimizer
from torch.utils.data import DataLoader
from tqdm.notebook import tqdm  # TODO: niet meer notebook als dat niet nodig is

from src.predictor.multilayer_perceptron import MultiLayerPerceptronPredictor
from src.data.data_preparer import DataPreparer
from src.data.data_reader import DataReader
from src.tools.RestaurantReviewsDataset import RestaurantReviewsDataset

# Execute from the src-directory root: walk up from the current working
# directory until a directory literally named 'src' is reached.
# The walk stops with an explicit error at the filesystem root, where
# `chdir('..')` no longer changes the directory and the loop would otherwise
# spin forever.
_start_dir = os.getcwd()
while os.path.basename(os.getcwd()) != 'src':
    _previous_dir = os.getcwd()
    os.chdir('..')
    if os.getcwd() == _previous_dir:  # Reached the filesystem root without finding 'src'
        os.chdir(_start_dir)  # Leave the process where it started
        raise RuntimeError(f"No 'src' directory found at or above {_start_dir!r}")

In [3]:
# Read the three collections returned by DataReader (businesses, reviews, tips).
businesses, reviews, tips = DataReader().read_data()

# Hand them to DataPreparer, which returns four objects: the train/test inputs
# followed by the train/test outputs.
(
    input_ml_train,
    input_ml_test,
    output_ml_train,
    output_ml_test,
) = DataPreparer.get_train_test_validate(businesses, reviews, tips)

In [4]:
def train_epoch(model: nn.Module, dataloader: DataLoader, loss_fn, optimizer: Optimizer) -> float:
    """Run one training pass over ``dataloader`` and return the mean batch loss.

    Each batch is converted with ``DataPreparer.get_tensor_for_ml``, pushed through
    ``model``, scored with ``loss_fn`` and back-propagated; ``optimizer`` is zeroed and
    stepped once per batch. Progress is shown with a ``tqdm`` bar.

    Args:
        model: Network to train; put into training mode with ``model.train()``.
        dataloader: Iterable of ``(restaurant_reviews, ratings)`` batches that supports ``len()``.
        loss_fn: Callable ``loss_fn(predictions, ratings)`` returning a loss with ``.item()`` and ``.backward()``.
        optimizer: Optimizer whose ``zero_grad()`` and ``step()`` are called for every batch.

    Returns:
        The sum of the per-batch loss values divided by ``len(dataloader)``.

    Raises:
        ZeroDivisionError: If ``len(dataloader)`` is 0.
    """
    model.train()  # Put the layers of the model in training mode
    batch_count = len(dataloader)
    running_loss = 0

    progress = tqdm(dataloader, desc="Training network in batches", leave=None)
    for batch_inputs, batch_targets in progress:
        # Convert the raw batch into what the model and loss function expect
        batch_inputs, batch_targets = DataPreparer.get_tensor_for_ml(batch_inputs, batch_targets)

        # Forward pass
        batch_loss = loss_fn(model(batch_inputs), batch_targets)
        running_loss += batch_loss.item()

        # Backward pass and parameter update
        optimizer.zero_grad()
        batch_loss.backward()
        optimizer.step()

    return running_loss / batch_count


def validate_epoch(model: nn.Module, dataloader: DataLoader, loss_fn, tolerance: float = 0.125) -> tuple[float, float]:
    """Evaluate ``model`` on ``dataloader`` without updating it.

    Each batch is converted with ``DataPreparer.get_tensor_for_ml`` and pushed through
    ``model`` under ``torch.no_grad()``. A prediction counts as correct when it lies
    within ``tolerance`` of its rating (both bounds inclusive).

    Args:
        model: Network to evaluate; put into evaluation mode with ``model.eval()``.
        dataloader: Yields ``(restaurant_reviews, ratings)`` batches; must support ``len()``
            and expose ``.dataset`` (also supporting ``len()``).
        loss_fn: Callable ``loss_fn(predictions, ratings)`` returning a loss with ``.item()``.
        tolerance: Maximum absolute distance between prediction and rating for a
            prediction to count as correct. Defaults to 0.125, the value that used to be
            hard-coded.

    Returns:
        ``(mean_loss, accuracy)``: the summed per-batch loss divided by ``len(dataloader)``,
        and the number of correct predictions divided by ``len(dataloader.dataset)``.

    Raises:
        ValueError: If ``tolerance`` is negative.
        ZeroDivisionError: If ``len(dataloader)`` or ``len(dataloader.dataset)`` is 0.
    """
    if tolerance < 0:
        raise ValueError(f"tolerance must be non-negative, got {tolerance}")

    model.eval()  # Prepare layers of model for evaluation
    # Prepare statistics
    size = len(dataloader.dataset)
    num_batches = len(dataloader)
    total_loss = 0
    correct = 0

    with torch.no_grad():
        for restaurant_reviews, ratings in dataloader:
            # Prepare data
            restaurant_reviews, ratings = DataPreparer.get_tensor_for_ml(restaurant_reviews, ratings)
            # Compute predictions and loss
            predictions = model(restaurant_reviews)
            loss = loss_fn(predictions, ratings)
            # Calculate statistics
            total_loss += loss.item()
            # NOTE(review): assumes `predictions` and `ratings` have the same shape; if they
            # differ (e.g. (batch, 1) vs (batch,)) this comparison broadcasts and over-counts
            # — confirm against DataPreparer.get_tensor_for_ml.
            within_tolerance = (ratings - tolerance <= predictions) & (predictions <= ratings + tolerance)
            correct += within_tolerance.type(torch.float).sum().item()

    mean_loss = total_loss / num_batches
    accuracy = correct / size
    return mean_loss, accuracy

In [5]:
# splits = KFold(n_splits=10, shuffle=True)
# history = {'train_loss': [], 'test_loss': [],'test_acc':[]}
# batch_size = 1024
# epochs = 10
# criterion = nn.MSELoss()
#
# for i, (train_idx_fold, val_idx_fold) in tqdm(enumerate(splits.split(input_ml_train, output_ml_train)), desc="K-Fold Cross Validation"):
#
#     train_data_fold = RestaurantReviewsDataset(input_ml_train.iloc[train_idx_fold].to_numpy(), output_ml_train.iloc[train_idx_fold].to_numpy())
#     validate_data_fold = RestaurantReviewsDataset(input_ml_train.iloc[val_idx_fold].to_numpy(), output_ml_train.iloc[val_idx_fold].to_numpy())
#
#     train_loader = DataLoader(train_data_fold, batch_size=batch_size)
#     val_loader = DataLoader(validate_data_fold, batch_size=batch_size)
#
#     model = MultiLayerPerceptronPredictor(input_size=input_ml_train.columns.size, output_size=1)
#     optimizer = optim.Adam(model.parameters(), lr=0.002)
#
#     for epoch in tqdm(range(epochs), desc="Epochs"):  # TODO: figure out how epochs and k-folds interact and in which order to run them, IF they may be combined at all (probably not, as it looks now)
#         train_loss = train_epoch(model, train_loader, criterion, optimizer)
#         test_loss, test_acc = validate_epoch(model, val_loader, criterion)
#
#         print(f"Epoch:{epoch + 1}/{epochs} AVG Training Loss:{train_loss:.3f} AVG Test Loss:{test_loss:.3f} AVG Test Acc {test_acc * 100:.2f} %")
#         history['train_loss'].append(train_loss)
#         history['test_loss'].append(test_loss)
#         history['test_acc'].append(test_acc)

In [6]:
# Per-epoch statistics; one entry is appended to each list after every epoch.
history = {'train_loss': [], 'test_loss': [], 'test_acc': []}
batch_size = 1024
epochs = 50
criterion = nn.MSELoss()

train_data = RestaurantReviewsDataset(input_ml_train.to_numpy(), output_ml_train.to_numpy())
# Evaluate on the held-out split. Building this dataset from the training split
# (as before) makes 'test_loss' / 'test_acc' a second measurement on the training data.
validate_data = RestaurantReviewsDataset(input_ml_test.to_numpy(), output_ml_test.to_numpy())

# NOTE(review): the training loader iterates in dataset order (no shuffle=True);
# confirm the data is already shuffled upstream.
train_loader = DataLoader(train_data, batch_size=batch_size)
val_loader = DataLoader(validate_data, batch_size=batch_size)

model = MultiLayerPerceptronPredictor(input_size=input_ml_train.columns.size, output_size=1)
optimizer = optim.Adam(model.parameters(), lr=0.002)

epochs_with_progressbar = tqdm(range(epochs), desc="Epochs")
for epoch in epochs_with_progressbar:
    train_loss = train_epoch(model, train_loader, criterion, optimizer)
    test_loss, test_acc = validate_epoch(model, val_loader, criterion)

    history['train_loss'].append(train_loss)
    history['test_loss'].append(test_loss)
    history['test_acc'].append(test_acc)
    # Show the most recent evaluation losses in the outer progress bar
    epochs_with_progressbar.set_description_str(f"Epochs (loss of last 5 epochs: {history['test_loss'][-5:]})")

Epochs:   0%|          | 0/50 [00:00<?, ?it/s]

Training network in batches:   0%|          | 0/3697 [00:00<?, ?it/s]

Training network in batches:   0%|          | 0/3697 [00:00<?, ?it/s]

Training network in batches:   0%|          | 0/3697 [00:00<?, ?it/s]

KeyboardInterrupt: 