In [1]:
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from torch.utils.data import TensorDataset, DataLoader

import cloudprocessing.dataprocessing as dp
import cloudprocessing.rnn as rnn
import cloudprocessing.util as util
import config as config
import crashmodel



In [2]:
df = dp.get_dataframe()
# df = pd.read_json(raw_data)

Couldn't get Data, retrying ...


  return pd.read_json(raw_data)


In [3]:
df = dp.pre_processing(df)

Asphalt Data points:  10089
Pavement Data points:  3485
Gravel Data points:  0
Grass Data points:  20511
Crash Data points:  6381


In [4]:
def data_preparation(df):
    df.dropna(subset=['terrain'], inplace=True)

    df['time_second'] = df['time'].map(lambda x: pd.Timestamp(x).floor(freq='S'))
    df['time'] = df['time'].map(pd.Timestamp.timestamp)

    grouped = df.groupby([df.trip_id, df.time_second])  # grouped.get_group(1)
    x, y = [], []

    # data verification
    data_errors = ['GOOD', 'LONG', 'SHORT']
    data_checking = [0, 0, 0]
    total_samples, actual_length = 0, 0

    for i, (trip_seconds, table) in enumerate(grouped):
        train_input = table.drop(columns=['terrain', 'trip_id', 'crash', 'time_second', 'latitude', 'longitude'])

        train_input = train_input.to_numpy()

        total_samples += 1
        input_length = len(train_input)
        actual_length += input_length
        if input_length == config.batch_size:
            data_checking[0] += 1
        elif input_length > config.batch_size:
            data_checking[1] += 1
            train_input = train_input[:config.batch_size]
        else:
            data_checking[2] += 1
            n_missing_rows = config.batch_size - len(train_input)
            for _ in range(n_missing_rows):
                fake_array = [1] * config.n_training_cols
                train_input = np.append(train_input, fake_array)

        train_target = table.crash.min()

        x.append(train_input)
        y.append(train_target)

    for i in range(len(data_errors)):
        print(data_errors[i], ": ", data_checking[i])

    print('Mean Frequency is: %f Hz' % (actual_length / total_samples))

    return np.array(x), np.array(y)

X, Y = data_preparation(df)

GOOD :  0
LONG :  0
SHORT :  1907
Mean Frequency is: 17.873623 Hz


In [5]:
def gen_dataloader(x, y, test_size=0.2, random_state=0):
    scaler = StandardScaler()
    scaler.fit(x)
    x = scaler.transform(x)

    x = torch.tensor(x)
    y = torch.tensor(y)

    train_x, test_x, train_y, test_y = train_test_split(x, y, test_size=test_size, random_state=random_state)

    train_x = torch.Tensor(train_x)
    train_y = F.one_hot(train_y.long(), 2).to(torch.float32)

    test_x = torch.Tensor(test_x)
    test_y = F.one_hot(test_y.long(), 2).to(torch.float32)

    train_dataset = TensorDataset(train_x, train_y)
    test_dataset = TensorDataset(test_x, test_y)

    train_loader = DataLoader(train_dataset)
    test_loader = DataLoader(test_dataset)

    return train_loader, test_loader


train_loader, test_loader = gen_dataloader(X, Y)  # with default values

In [6]:
from torch import optim


def train_model(model, train_loader, num_epochs=config.num_training_epochs, lr=config.learning_rate,
                momentum=config.momentum):
    criterion = nn.CrossEntropyLoss()
    # criterion = nn.MSELoss()
    # criterion = nn.NLLLoss()

    # optimizer = optim.SGD(model.parameters(), lr=lr, momentum=momentum)
    optimizer = optim.Adam(model.parameters(), lr=lr)

    print("Training STARTED")

    for e in range(0, num_epochs):
        model.train()  # set the model in training mode
        total_train_loss = 0  # initialize the total training and validation loss

        for i, (training_input, target) in enumerate(train_loader):  # loop over the training set
            hidden = model.initHidden()
            model.zero_grad()

            output = [.0, .0]
            for data_row in training_input:
                for i in range(0, len(data_row), config.n_training_cols):
                    model_input = data_row[None, i:i + config.n_training_cols].float()
                    output, hidden = model(model_input, hidden)

            optimizer.zero_grad()
            loss = criterion(output, target)
            loss.backward()
            optimizer.step()

            # add the loss to the total training loss so far and calculate the number of correct predictions
            total_train_loss += loss.item()

        if (e + 1) % 10 == 0:
            print("Epoch", e + 1, "Training Loss:", total_train_loss)

    print("Training FINISHED")

    return model, criterion


model = rnn.RNN(config.n_training_cols, config.n_hidden_layers, 2)
model, criterion = train_model(model=model, train_loader=train_loader)


Training STARTED
Epoch 10 Training Loss: 1229.7249710447095
Epoch 20 Training Loss: 723.936374636949
Training FINISHED


In [7]:
def print_training_accuracy(model, train_loader, criterion):
    training_loss, tp, fp, fn, tn, total = util.compute_accuracy_confusion_matrix(model=model, loader=train_loader,
                                                                                  criterion=criterion)
    # average training loss
    training_loss = training_loss / len(train_loader.dataset)
    print('Training Loss: {:.6f}\n'.format(training_loss))
    
    print('True Positive:  %2d%% (%2d/%2d)' % (100. * tp / total, tp, total))
    print('False Positive: %2d%% (%2d/%2d)' % (100. * fp / total, fp, total))
    print('False Negative: %2d%% (%2d/%2d)' % (100. * fn / total, fn, total))
    print('True Negative:  %2d%% (%2d/%2d)' % (100. * tn / total, tn, total))

    print('Training Accuracy (Overall): %2d%% (%2d/%2d)' % (100. * (tp + tn) / total, tp + tn, total))


print_training_accuracy(model=model, train_loader=train_loader, criterion=criterion)

Training Loss: 0.482522

True Positive:   0% ( 0/1525)
False Positive:  0% ( 0/1525)
False Negative: 19% (297/1525)
True Negative:  80% (1228/1525)
Training Accuracy (Overall): 80% (1228/1525)


In [8]:
def print_testing_accuracy(model, train_loader, criterion):
    testing_loss, tp, fp, fn, tn, total = util.compute_accuracy_confusion_matrix(model=model, loader=train_loader,
                                                                                  criterion=criterion)
    # average training loss
    testing_loss = testing_loss / len(train_loader.dataset)
    print('Testing Loss: {:.6f}\n'.format(testing_loss))
    
    print('True Positive:  %2d%% (%2d/%2d)' % (100. * tp / total, tp, total))
    print('False Positive: %2d%% (%2d/%2d)' % (100. * fp / total, fp, total))
    print('False Negative: %2d%% (%2d/%2d)' % (100. * fn / total, fn, total))
    print('True Negative:  %2d%% (%2d/%2d)' % (100. * tn / total, tn, total))

    print('Testing Accuracy (Overall): %2d%% (%2d/%2d)' % (100. * (tp + tn) / total, tp + tn, total))


print_testing_accuracy(model=model, train_loader=train_loader, criterion=criterion)

Testing Loss: 0.482522

True Positive:   0% ( 0/1525)
False Positive:  0% ( 0/1525)
False Negative: 19% (297/1525)
True Negative:  80% (1228/1525)
Testing Accuracy (Overall): 80% (1228/1525)
