In [40]:
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler

import backend.config as config
import dataprocessing
import surfacemodelclass as sf
import util

### Data preparation
extract current data from the database and prepare for processing

In [41]:
from requests.exceptions import ChunkedEncodingError

CLASSES = config.classes

raw_data = ""
while raw_data == "":
    try:
        raw_data = dataprocessing.get_data_db()
    except ChunkedEncodingError:
        print("Couldn't get Data, retrying ...")

df = pd.read_json(raw_data)

  df = pd.read_json(raw_data)


In [42]:
def pre_processing(dataframe):
    # cycling session 06.11.2023
    pavement_start = pd.Timestamp(year=2023, month=11, day=6, hour=18, minute=33)
    pavement_end = pd.Timestamp(year=2023, month=11, day=6, hour=18, minute=55)
    asphalt_start_1 = pd.Timestamp(year=2023, month=11, day=6, hour=19, minute=9)
    asphalt_end_1 = pd.Timestamp(year=2023, month=11, day=6, hour=19, minute=17)
    asphalt_start_2 = pd.Timestamp(year=2023, month=11, day=6, hour=19, minute=31)
    asphalt_end_2 = pd.Timestamp(year=2023, month=11, day=6, hour=20, minute=00)

    #cycling session 09.11.2023
    asphalt_start_3 = pd.Timestamp(year=2023, month=11, day=9, hour=20, minute=20)
    asphalt_end_3 = pd.Timestamp(year=2023, month=11, day=9, hour=21, minute=0)
    pavement_start_2 = pd.Timestamp(year=2023, month=11, day=9, hour=21, minute=5)
    pavement_end_2 = pd.Timestamp(year=2023, month=11, day=9, hour=21, minute=30)

    #cycling session 13.11.2023
    grass_start = pd.Timestamp(year=2023, month=11, day=13, hour=20, minute=00)
    grass_end = pd.Timestamp(year=2023, month=11, day=13, hour=20, minute=20)

    asphalt_count = 0
    pavement_count = 0
    grass_count = 0

    dataframe['time'] = pd.to_datetime(dataframe['time'], format='mixed')
    for i, row in dataframe.iterrows():
        if pavement_start <= row.time <= pavement_end or pavement_start_2 <= row.time <= pavement_end_2:
            dataframe.at[i, 'terrain'] = config.map_to_int('pavement')
            pavement_count += 1
        elif asphalt_start_1 <= row.time <= asphalt_end_1 or asphalt_start_2 <= row.time <= asphalt_end_2 or asphalt_start_3 <= row.time <= asphalt_end_3:
            dataframe.at[i, 'terrain'] = config.map_to_int('asphalt')
            asphalt_count += 1
        elif grass_start <= row.time <= grass_end:
            dataframe.at[i, 'terrain'] = config.map_to_int('grass')
            grass_count += 1

    print("Asphalt Data points: ", asphalt_count)
    print("Pavement Data points: ", pavement_count)
    print("Grass Data points: ", grass_count)

    return dataframe


df = pre_processing(df)

Asphalt Data points:  305
Pavement Data points:  752
Grass Data points:  81


In [43]:
BATCH_SIZE = 10
n_cols = 8


def data_preparation(df):
    global n_cols

    df.dropna(subset=['terrain'], inplace=True)

    df['time_second'] = df.time.map(lambda x: pd.Timestamp(x).floor(freq='S'))
    df['time'] = df.time.map(pd.Timestamp.timestamp)

    grouped = df.groupby([df.trip_id, df.time_second])  # grouped.get_group(1)
    x = []
    y = []

    # data verification
    data_errors = ['GOOD', 'LONG', 'SHORT']
    data_checking = [0, 0, 0]
    total_samples = 0
    total_length = 0

    for i, (trip_seconds, table) in enumerate(grouped):
        if (i + 1) % 100 == 0:
            print("# trip seconds: " + str(i + 1))

        train_input = table.drop(columns=['terrain', 'trip_id', 'crash', 'time_second', 'latitude', 'longitude'])
        n_cols = len(train_input.columns)

        train_input = train_input.to_numpy()

        input_length = len(train_input)
        total_length += input_length
        if input_length == BATCH_SIZE:
            data_checking[0] += 1
        elif input_length > BATCH_SIZE:
            data_checking[1] += 1
            train_input = train_input[:BATCH_SIZE]
        else:
            data_checking[2] += 1
            n_missing_rows = BATCH_SIZE - len(train_input)
            for _ in range(n_missing_rows):
                fake_array = [1] * n_cols
                train_input = np.append(train_input, fake_array)

        train_target = table.terrain.min()

        x.append(train_input)
        y.append(train_target)
        total_samples += 1

    print('Printing Data Accuracy to ', BATCH_SIZE, ' Hz frequency ...')
    for i in range(len(data_checking)):
        if total_samples > 0:
            print('%5s: %2d%% (%2d/%2d)' % (
                data_errors[i], 100.0 * data_checking[i] / total_samples,
                np.sum(data_checking[i]), total_samples))
        else:
            raise Exception("No Data Samples found, please check db connection")

    print('Mean Batch Length: %2.2f per Tripsecond ' % (total_length / total_samples))

    return np.array(x), np.array(y)


X, Y = data_preparation(df)

# trip seconds: 100
# trip seconds: 200
# trip seconds: 300
Printing Data Accuracy to  10  Hz frequency ...
 GOOD:  0% ( 0/332)
 LONG:  0% ( 0/332)
SHORT: 100% (332/332)
Mean Batch Length: 3.43 per Tripsecond 


### Training the Model

In [44]:
import torch.nn.functional as F
from torch.utils.data import TensorDataset, DataLoader


def gen_dataloader(x, y, test_size=0.2, random_state=0):
    scaler = MinMaxScaler()  # TODO: choose scaler
    scaler.fit(x)
    x = scaler.transform(x)

    x = torch.tensor(x)
    y = torch.tensor(y)

    train_x, test_x, train_y, test_y = train_test_split(x, y, test_size=test_size, random_state=random_state)

    train_x = torch.Tensor(train_x)
    train_y = F.one_hot(train_y.long(), len(CLASSES)).to(torch.float32)

    test_x = torch.Tensor(test_x)
    test_y = F.one_hot(test_y.long(), len(CLASSES)).to(torch.float32)

    train_dataset = TensorDataset(train_x, train_y)
    test_dataset = TensorDataset(test_x, test_y)

    train_loader = DataLoader(train_dataset)
    test_loader = DataLoader(test_dataset)

    return train_loader, test_loader


train_loader, test_loader = gen_dataloader(X, Y)  # with default values

In [45]:
from torch import optim

LR = 1e-3
MOMENTUM = 0.9
NUM_EPOCHS = 30
HIDDEN_LAYERS = 64  # TODO maybe increase with more data


def train_model(model, train_loader, num_epochs=NUM_EPOCHS, lr=LR, momentum=MOMENTUM, device=None):
    global n_cols
    criterion = nn.CrossEntropyLoss()
    # criterion = nn.MSELoss()
    # criterion = nn.NLLLoss()

    # optimizer = optim.SGD(model.parameters(), lr=lr, momentum=momentum)
    optimizer = optim.Adam(model.parameters(), lr=lr)

    print("Training STARTED")

    for e in range(0, num_epochs):
        model.train()  # set the model in training mode
        total_train_loss = 0  # initialize the total training and validation loss

        for i, (training_input, target) in enumerate(train_loader):  # loop over the training set
            hidden = model.initHidden()
            model.zero_grad()

            output = [.0, .0, .0, .0]
            for data_row in training_input:
                for i in range(0, len(data_row), n_cols):
                    model_input = data_row[None, i:i + n_cols].float()
                    output, hidden = model(model_input, hidden)

            optimizer.zero_grad()
            loss = criterion(output, target)
            loss.backward()
            optimizer.step()

            # add the loss to the total training loss so far and calculate the number of correct predictions
            total_train_loss += loss.item()

        if (e + 1) % 10 == 0:
            print("Epoch", e + 1, "Training Loss:", total_train_loss)

    print("Training FINISHED")

    return model, criterion


model = sf.RNN(n_cols, HIDDEN_LAYERS, len(CLASSES))
model, criterion = train_model(model=model, train_loader=train_loader)


Training STARTED
Epoch 10 Training Loss: 158.58728707518458
Epoch 20 Training Loss: 150.1373824167531
Epoch 30 Training Loss: 138.99942346639
Training FINISHED


## Testing

In [46]:
def print_training_accuracy(model, train_loader, criterion, classes):
    training_loss, class_correct, class_total = util.compute_accuracy(model=model, loader=train_loader,
                                                                      criterion=criterion)

    # average training loss
    training_loss = training_loss / len(train_loader.dataset)
    print('Training Loss: {:.6f}\n'.format(training_loss))
    for i in range(len(classes)):
        if class_total[i] > 0:
            print('Training Accuracy of %5s: %2d%% (%2d/%2d)' % (
                classes[i], 100.0 * class_correct[i] / class_total[i],
                np.sum(class_correct[i]), np.sum(class_total[i])))
        else:
            print('Training Accuracy of %5s: N/A ' % (classes[i]))

    print('Training Accuracy (Overall): %2d%% (%2d/%2d)' % (
        100. * np.sum(class_correct) / np.sum(class_total),
        np.sum(class_correct), np.sum(class_total)))


print_training_accuracy(model=model, train_loader=train_loader, criterion=criterion, classes=CLASSES)

Training Loss: 0.513656

Training Accuracy of asphalt: 41% (31/74)
Training Accuracy of pavement: 94% (162/171)
Training Accuracy of gravel: N/A 
Training Accuracy of grass: 100% (20/20)
Training Accuracy (Overall): 80% (213/265)


In [47]:
def print_testing_accuracy(model, test_loader, classes, criterion):
    test_loss, class_correct, class_total = util.compute_accuracy(model=model, loader=test_loader, criterion=criterion)

    # average test loss
    test_loss = test_loss / len(test_loader.dataset)
    print('Test Loss: {:.6f}\n'.format(test_loss))

    for i in range(len(classes)):
        if class_total[i] > 0:
            print('Test Accuracy of %5s: %2d%% (%2d/%2d)' % (
                classes[i], 100.0 * class_correct[i] / class_total[i],
                np.sum(class_correct[i]), np.sum(class_total[i])))
        else:
            print('Test Accuracy of %5s: N/A (no testing examples)' % (classes[i]))

    print('\nTest Accuracy (Overall): %2d%% (%2d/%2d)' % (
        100. * np.sum(class_correct) / np.sum(class_total),
        np.sum(class_correct), np.sum(class_total)))


print_testing_accuracy(model=model, test_loader=test_loader, criterion=criterion, classes=CLASSES)

Test Loss: 0.541806

Test Accuracy of asphalt: 35% ( 6/17)
Test Accuracy of pavement: 97% (47/48)
Test Accuracy of gravel: N/A (no testing examples)
Test Accuracy of grass: 100% ( 2/ 2)

Test Accuracy (Overall): 82% (55/67)
