In [None]:
import pandas as pd
from statsmodels.graphics.tsaplots import plot_acf, plot_pacf
import matplotlib.pyplot as plt
import seaborn as sns
import os
from datetime import datetime, timedelta
from helper import *

In [None]:
# Resampling interval; passed as `t` to resample_timestamp and Interpolator.interpolate below.
t = "60Min"
# NOTE(review): not referenced anywhere in the visible notebook (interpolate_historic
# is called with a literal 6) — confirm whether this constant is still needed.
long_interpolation = 4

## Import dataframes

In [None]:
def load_csv(file_path):
    """Read ``file_path`` as CSV; return an empty DataFrame when the path does not exist."""
    if not os.path.exists(file_path):
        return pd.DataFrame()
    return pd.read_csv(file_path)

def combine_dataframes(path_comb, path_live, csv_comb="combined.csv"):
    """Stack the combined CSV and every ``*.csv`` file found in ``path_live``.

    Args:
        path_comb (str): directory prefix of the combined CSV. It is joined to
            ``csv_comb`` by plain string concatenation, so it must end with a
            path separator.
        path_live (str): directory prefix that is scanned for ``.csv`` files
            (same trailing-separator requirement).
        csv_comb (str): file name of the combined CSV.

    Returns:
        pd.DataFrame: all loaded frames stacked row-wise, keeping each frame's
        own index; an empty DataFrame when nothing could be loaded.

    Note:
        If ``path_comb`` and ``path_live`` point at the same directory, the
        combined CSV is picked up by the directory scan as well and therefore
        appears twice in the result.
    """
    frames = [load_csv(path_comb + csv_comb)]

    # sorted() makes the row order independent of the OS directory listing order.
    for file in sorted(os.listdir(path_live)):
        if str(file).endswith(".csv"):
            frames.append(pd.read_csv(f"{path_live}{file}", delimiter=","))

    # A frame without any columns (e.g. the placeholder returned for a missing
    # combined CSV) contributes nothing, so leave it out of the concatenation.
    frames = [frame for frame in frames if len(frame.columns) > 0]
    if not frames:
        return pd.DataFrame()

    # DataFrame.append was removed in pandas 2.0; concatenating once is also
    # cheaper than growing the frame inside the loop.
    return pd.concat(frames)
        
# Both prefixes point at the same directory: if combined.csv exists there it is
# loaded once explicitly and once more by the directory scan in combine_dataframes.
file = "combined.csv"
path_comb = "../data/"
path_live = "../data/"
    
df = combine_dataframes(path_comb, path_live, csv_comb=file)

## Dataset preparation

### Drop duplicate values

In [None]:
# Percentage of rows that are exact duplicates; raises ZeroDivisionError when df has no rows.
print(f"{round((1 -len(df.drop_duplicates()) / len(df)) * 100, 2)} % of entries are duplicates")

In [None]:
# Remove rows that are identical in every column.
df = df.drop_duplicates()

### Update timestamp

In [None]:
# update_timestamps is not defined in this notebook (presumably provided by
# `from helper import *` — confirm); what it changes is not visible here.
df = update_timestamps(df)

### Drop irrelevant columns

In [None]:
# Drop the link, geo_point_2d and description columns (KeyError if one of them is absent).
df = df.drop(columns=["link", "geo_point_2d", "description"])

### Resample timestamp interval

In [None]:
# Resample to interval `t` with how="mean"; resample_timestamp is presumably from helper — confirm.
# NOTE(review): "description" is listed in ignore_cols although that column was dropped in the previous cell.
df = resample_timestamp(df, t=t, how="mean", ignore_cols=['published', "description"])

In [None]:
# Scatter of auslastung_prozent over published, coloured by title; rows containing NaN are left out.
plt.figure(figsize=(14,6))

ax = sns.scatterplot(data = df.dropna(), x="published", y="auslastung_prozent", hue="title")
# Place the legend outside the axes, to the right of the plot.
sns.move_legend(ax, "upper left", bbox_to_anchor=(1, 1))
plt.title("Auslastung Prozent")
plt.ylabel("Auslastung [%]")
plt.xlabel("Zeitspanne")
plt.show()

In [None]:
# Apply mark_missing_values_windows (presumably from helper — confirm) to each
# (id2, name, title) group, then replace the resulting index with a fresh one.
df = df.groupby(by=["id2", "name", "title"]).apply(mark_missing_values_windows)        
df = df.reset_index(drop=True)

In [None]:
# Rows that contain at least one NaN although their "missing" value is 0.
df.loc[(df["missing"] == 0) & df.isna().any(axis=1)]

In [None]:
# All rows that contain at least one NaN.
df.loc[df.isna().any(axis=1)]

In [None]:
# interpolate_historic is presumably from helper; the meaning of the literal 6 is
# not visible here — TODO confirm (long_interpolation = 4 is defined above but unused).
df = interpolate_historic(df, 6)

In [None]:
# Frequency of each "missing" value among the rows that still contain NaN.
df[df.isna().any(axis=1)]["missing"].value_counts()

In [None]:
# Display the frame after the historic interpolation step.
df

### Interpolate data

In [None]:
# Slicing with n = len(df) keeps every row, so sub_df1 is df with duplicate rows removed.
n = len(df)

sub_df1 = df[:n]
sub_df1 = sub_df1.drop_duplicates()

# Interpolator is not defined in this notebook (presumably from helper — confirm).
# The same input is interpolated with three methods: pad, linear and cubic.
ip = Interpolator()
sub_df2 = ip.interpolate(sub_df1, method="pad", t=t)

ip = Interpolator()
sub_df3 = ip.interpolate(sub_df1, method="linear", t=t)

ip = Interpolator()
sub_df4 = ip.interpolate(sub_df1, method="cubic", t=t)

In [None]:
# Frequency of each "missing" value among rows of the pad-interpolated frame that still contain NaN.
sub_df2[sub_df2.isna().any(axis=1)]["missing"].value_counts()

In [None]:
# First ten rows of one garage: original values vs. the three interpolation methods.
df1 = sub_df1.loc[sub_df1["title"] == "Parkhaus Storchen"][:10]
df2 = sub_df2.loc[sub_df2["title"] == "Parkhaus Storchen"][:10]
df3 = sub_df3.loc[sub_df3["title"] == "Parkhaus Storchen"][:10]
df4 = sub_df4.loc[sub_df4["title"] == "Parkhaus Storchen"][:10]

plt.figure(figsize=(12,5))
plt.plot(df1["published"], df1["auslastung"], "o", label="Real values")
plt.plot(df2["published"], df2["auslastung"], "-", label="padding")
plt.plot(df3["published"], df3["auslastung"], "--", label="linear")
plt.plot(df4["published"], df4["auslastung"], "--", label="cubic")
# Title spelling corrected (was "Interplation").
plt.title("Interpolation of missing values")
plt.xlabel("timestamp")
plt.ylabel("value")
plt.legend()
plt.show()

In [None]:
# Ratio of row counts: de-duplicated input vs. pad-interpolated output.
round(len(sub_df1) / len(sub_df2), 3)

In [None]:
# Continue with the pad-interpolated frame.
df = sub_df2

In [None]:
# Display the pad-interpolated frame.
df

Unique garages

In [None]:
# Distinct values of the title column, in order of first appearance.
names = list(df["title"].drop_duplicates())
names

In [None]:
# Column dtypes and non-null counts.
df.info()

In [None]:
# Rows that still contain at least one NaN after interpolation.
df[df.isna().any(axis=1)]

Check for impossible utilisation rates

In [None]:
# Number of rows with a utilisation above 100 %.
len(df.loc[df["auslastung_prozent"] > 100])

In [None]:
# Number of rows with a negative utilisation.
len(df.loc[df["auslastung_prozent"] < 0])

In [None]:
# Utilisation over time after interpolation, one colour per garage.
plt.figure(figsize=(14,6))
ax = sns.scatterplot(data = df, x="published", y="auslastung_prozent", hue="title")
sns.move_legend(ax, "upper left", bbox_to_anchor=(1, 1))
plt.title("Auslastung Prozent")
# x carries the timestamps and y the utilisation; the two labels were swapped before.
plt.xlabel("Zeitspanne")
plt.ylabel("Auslastung [%]")
plt.show()

In [None]:
# df.to_csv("../data/processed/live_60.csv", index=False)

## Torch

In [None]:
import torch
from torch import nn

# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("device:", device)

In [None]:
# "free" column of a single garage; this series is what train_test_split is called with below.
sub_df = df.loc[df["title"]=="Parkhaus Rebgasse"]["free"]



In [None]:
import numpy as np
import torch

class TrainTestSplit():
    """Empty placeholder class: it defines no behaviour and is not used in the visible notebook."""
    
    def __init__(self):
        pass

def sliding_window(data, seq_len):
    '''Cut ``data`` into overlapping input windows and their one-step targets.

    Window ``s`` covers positions ``s .. s+seq_len-1``; its target is the single
    element that follows it. ``data`` must support slicing and ``.to_numpy()``.

    Returns:
        tuple: (inputs, targets) as NumPy arrays with one entry per window.
    '''
    n_windows = len(data) - seq_len

    inputs = [data[start:start + seq_len].to_numpy() for start in range(n_windows)]
    targets = [
        data[start + seq_len:start + seq_len + 1].to_numpy()
        for start in range(n_windows)
    ]

    return np.array(inputs), np.array(targets)

# Window length handed to train_test_split below.
train_seq_len = 24
# NOTE(review): not referenced anywhere in the visible notebook.
test_seq_len = 6
# Fraction of the rows that goes into the training split.
train_ratio = 0.8

def train_test_split(df, train_ratio, train_seq_len):
    """Split ``df`` chronologically and window both parts with ``sliding_window``.

    The first ``int(len(df) * train_ratio)`` rows form the training part, the
    rest the test part; both are windowed with the same ``train_seq_len``.

    Returns:
        tuple: (X_train, Y_train, X_test, Y_test)
    """
    split_at = int(len(df) * train_ratio)
    head, tail = df[:split_at], df[split_at:]

    train_windows = sliding_window(head, train_seq_len)
    test_windows = sliding_window(tail, train_seq_len)

    return (*train_windows, *test_windows)

# Build the windowed train/test arrays for the selected garage series.
X_train, Y_train, X_test, Y_test = train_test_split(sub_df, train_ratio, train_seq_len)

In [None]:
# X_train = torch.from_numpy(X_train).float()
# Y_train = torch.from_numpy(Y_train).float()

In [None]:
from torch.utils.data import Dataset

class TimeSeriesDataSet(Dataset):
    """Dataset that pairs each input window with its target.

    Args:
        sequences: indexable collection of NumPy arrays (input windows).
        labels: indexable collection of NumPy arrays (targets), aligned with
            ``sequences`` by position.
    """

    def __init__(self, sequences, labels):
        self.sequences = sequences
        self.labels = labels

    def __len__(self):
        return len(self.sequences)

    def __getitem__(self, idx):
        # torch.from_numpy needs ndarray inputs; both tensors are cast to float32.
        window = torch.from_numpy(self.sequences[idx]).float()
        target = torch.from_numpy(self.labels[idx]).float()
        return window, target
    

In [None]:
# Wrap the windowed arrays so a DataLoader can batch them.
train_set = TimeSeriesDataSet(X_train, Y_train)
test_set = TimeSeriesDataSet(X_test, Y_test)

In [None]:
# Number of training windows.
len(train_set)

In [None]:
from torch.utils.data import DataLoader
# Seed torch's global RNG before the loaders are created.
torch.manual_seed(42)
batch_size = 16

# Training windows are shuffled on each pass over the loader.
train_dataloader = DataLoader(
    dataset=train_set, 
    batch_size=batch_size, 
    shuffle=True)

# Test windows keep their original order.
test_dataloader = DataLoader(
    dataset=test_set, 
    batch_size=batch_size, 
    shuffle=False)

In [None]:
# Pull a single batch to inspect the tensor shapes. Python 3 iterators have no
# .next() method, so the built-in next() is used.
example_batch = iter(train_dataloader)
sequences, labels = next(example_batch)
np.shape(sequences), np.shape(labels)

In [None]:
# Show the values of the example batch.
sequences, labels

In [None]:

class LSTM_Forecaster(nn.Module):
    '''
    LSTM model that predicts one value from a window of past time steps.

    Args:
        input_size (int): number of input features per time step
        hidden_size (int): hidden layer size
        seq_len (int): number of time steps in each input window
        num_layers (int): number of stacked LSTM layers
        dropout (float): dropout probability passed to nn.LSTM
    '''
    def __init__(self, input_size, hidden_size, seq_len, num_layers=2, dropout=0):
        super(LSTM_Forecaster, self).__init__()

        self.input_size = input_size
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.seq_len = seq_len
        self.dropout = dropout

        # batch_first=True: forward() feeds tensors laid out as
        # (batch, seq_len, features). Without it nn.LSTM would treat the batch
        # axis as time and run the recurrence across unrelated samples.
        self.lstm = nn.LSTM(input_size=self.input_size,
                            hidden_size=self.hidden_size,
                            num_layers=self.num_layers,
                            dropout=self.dropout,
                            bidirectional=False,
                            batch_first=True)

        self.linear = nn.Linear(in_features=hidden_size, out_features=1)

        # reset_hidden_state assigns self.hidden itself and returns None, so
        # its return value must not be assigned to self.hidden.
        self.hidden = None
        self.reset_hidden_state()

    def reset_hidden_state(self, batch_size=1, device=None, dtype=None):
        '''Set ``self.hidden`` to zero-filled (h_0, c_0) tensors.

        Args:
            batch_size (int): number of sequences in the next batch
            device: device to allocate the state on (torch default when None)
            dtype: dtype of the state tensors (torch default when None)
        '''
        # nn.LSTM expects states of shape (num_layers, batch, hidden_size).
        shape = (self.num_layers, batch_size, self.hidden_size)
        self.hidden = (
            torch.zeros(shape, device=device, dtype=dtype),
            torch.zeros(shape, device=device, dtype=dtype),
        )

    def forward(self, sequences):
        '''Return one prediction per sequence, shape (batch, 1).

        Args:
            sequences (Tensor): batch of windows; reshaped to
                (batch, seq_len, features) before entering the LSTM.
        '''
        batch_size = len(sequences)
        inputs = sequences.reshape(batch_size, self.seq_len, -1)

        # Fresh state per batch, allocated next to the input so the model also
        # runs after being moved to the GPU.
        self.reset_hidden_state(batch_size, device=inputs.device, dtype=inputs.dtype)

        lstm_out, self.hidden = self.lstm(inputs, self.hidden)

        # Output of the last time step for every sequence in the batch.
        last = lstm_out[:, -1, :]
        outputs = self.linear(last)

        return outputs

In [None]:
class LSTMForecaster2(nn.Module):
    '''
    Two stacked LSTM cells with a linear read-out, unrolled one time step at a time.

    Args:
        hidden_size (int): hidden layer size of both LSTM cells
        dropout (float): stored on the instance; not applied anywhere in this class
    '''
    def __init__(self, hidden_size, dropout=0):
        super(LSTMForecaster2, self).__init__()

        self.hidden_size = hidden_size
        self.dropout = dropout

        self.lstm1 = nn.LSTMCell(1, hidden_size)
        self.lstm2 = nn.LSTMCell(hidden_size, hidden_size)
        self.linear = nn.Linear(hidden_size, 1)

    def forward(self, X, future=0, return_sequence=False):
        '''Run the cells over ``X`` and optionally keep forecasting.

        Args:
            X (Tensor): input of shape (batch, steps), one feature per step
            future (int): number of extra steps to forecast by feeding each
                prediction back in as the next input
            return_sequence (bool): when True, return every per-step output
                concatenated to shape (batch, steps + future); when False
                (default) return only the final output, shape (batch, 1)

        Raises:
            ValueError: if ``X`` contains no time steps.
        '''
        if X.size(1) == 0:
            raise ValueError("X must contain at least one time step")

        outputs = []
        n_samples = X.size(0)

        # Allocate the states on the input's device so the model also runs
        # after being moved to the GPU.
        state_shape = (n_samples, self.hidden_size)
        h_t = torch.zeros(state_shape, dtype=torch.float32, device=X.device)
        c_t = torch.zeros(state_shape, dtype=torch.float32, device=X.device)
        h_t2 = torch.zeros(state_shape, dtype=torch.float32, device=X.device)
        c_t2 = torch.zeros(state_shape, dtype=torch.float32, device=X.device)

        # Each chunk is one time step of shape (batch, 1).
        for input_t in X.split(1, dim=1):
            h_t, c_t = self.lstm1(input_t, (h_t, c_t))
            h_t2, c_t2 = self.lstm2(h_t, (h_t2, c_t2))
            output = self.linear(h_t2)
            outputs.append(output)

        # Autoregressive continuation: the previous prediction is the next input.
        for _ in range(future):
            h_t, c_t = self.lstm1(output, (h_t, c_t))
            h_t2, c_t2 = self.lstm2(h_t, (h_t2, c_t2))
            output = self.linear(h_t2)
            outputs.append(output)

        if return_sequence:
            return torch.cat(outputs, dim=1)

        # Default stays the last step only; the training loops compare it with
        # labels of shape (batch, 1).
        return output

In [None]:
from torch.optim import Adam
from torch.nn import MSELoss

input_size = 1 # number of feature columns
hidden_size = 50 # hidden layer size in lstm
seq_length = 24 # nr of lags; same value as train_seq_len defined earlier

# Instantiate the forecaster and move its parameters to the selected device.
model = LSTM_Forecaster(input_size, hidden_size, seq_length).to(device)

def train_model(model, train_dataloader, test_dataloader=None, num_epochs=100):
    """Train ``model`` with Adam (lr=1e-3) on an MSE loss.

    Args:
        model (nn.Module): model called as ``model(sequences)``
        train_dataloader: iterable of ``(sequences, labels)`` batches
        test_dataloader: optional iterable of ``(sequences, labels)`` batches;
            when given, its summed loss is printed after every epoch
        num_epochs (int): number of passes over ``train_dataloader``

    Returns:
        list: summed training loss of each epoch, in order.
    """
    criterion = MSELoss()
    optimiser = Adam(model.parameters(), lr=1e-3)
    # Follow the model's own device instead of relying on a notebook global.
    model_device = next(model.parameters()).device
    epoch_losses = []

    for epoch in range(num_epochs):
        print("epoch:", epoch)

        model.train()
        epoch_loss = 0

        for sequences, labels in train_dataloader:
            sequences = sequences.to(model_device)
            labels = labels.to(model_device)

            labels_pred = model(sequences)
            loss = criterion(labels_pred.float(), labels)

            optimiser.zero_grad()
            loss.backward()
            optimiser.step()

            epoch_loss += loss.item()

        print(epoch_loss)
        epoch_losses.append(epoch_loss)

        if test_dataloader is not None:
            # Evaluation only: no gradients and no parameter updates.
            model.eval()
            test_loss = 0
            with torch.no_grad():
                for sequences, labels in test_dataloader:
                    sequences = sequences.to(model_device)
                    labels = labels.to(model_device)
                    labels_pred = model(sequences)
                    test_loss += criterion(labels_pred.float(), labels).item()
            print("test loss:", test_loss)

    return epoch_losses
                    
        
    
# Train the LSTM_Forecaster instance on the training batches.
train_model(model, train_dataloader)

In [None]:
from torch.optim import Adam
from torch.nn import MSELoss

# Hidden size handed to LSTMForecaster2.
hidden_size = 50

# Rebinds `model`, replacing the LSTM_Forecaster instance created earlier.
model = LSTMForecaster2(hidden_size).to(device)

class ModelTrainer():
    """Trains a forecasting model with Adam on an MSE loss and logs epoch losses.

    Attributes:
        model: the wrapped nn.Module
        criterion: mean-reduced MSE loss
        log (dict): ``{"train": [...], "test": [...]}`` with one summed loss
            per epoch; "test" is only filled when a test loader is passed
    """

    def __init__(self, model):
        self.model = model
        self.criterion = MSELoss(reduction="mean")
        self.log = dict(
            train=[],
            test=[]
        )

    def _model_device(self):
        """Return the device the model's parameters live on."""
        return next(self.model.parameters()).device

    def _summed_loss(self, dataloader, model_device):
        """Return the summed loss over ``dataloader`` without updating the model."""
        self.model.eval()
        total = 0
        with torch.no_grad():
            for sequences, labels in dataloader:
                sequences = sequences.to(model_device)
                labels = labels.to(model_device)
                labels_pred = self.model(sequences)
                total += self.criterion(labels_pred.float(), labels).item()
        return total

    def predict(self, x, y, future=0):
        """Return the model output for ``x`` as a NumPy array.

        Args:
            x (Tensor): input batch
            y: unused; kept so existing calls keep working
            future (int): forwarded to the model as its second argument
        """
        with torch.no_grad():
            labels_pred = self.model(x.to(self._model_device()), future)
        # .numpy() only works on CPU tensors, so move the result back first.
        return labels_pred.detach().cpu().numpy()

    def train_model(self, train_dataloader, test_dataloader=None, num_epochs = 100):
        """Run the training loop and append one entry per epoch to ``self.log``.

        Args:
            train_dataloader: iterable of ``(sequences, labels)`` batches
            test_dataloader: optional iterable of ``(sequences, labels)``
                batches evaluated after every epoch
            num_epochs (int): number of passes over ``train_dataloader``
        """
        optimiser = Adam(self.model.parameters(), lr=0.005)
        # Follow the model's own device instead of relying on a notebook global.
        model_device = self._model_device()

        for epoch in range(num_epochs):
            print("epoch:", epoch)

            self.model.train()
            epoch_loss = 0

            for sequences, labels in train_dataloader:
                sequences = sequences.to(model_device)
                labels = labels.to(model_device)

                labels_pred = self.model(sequences)
                loss = self.criterion(labels_pred.float(), labels)

                optimiser.zero_grad()
                loss.backward()
                optimiser.step()

                epoch_loss += loss.item()

            self.log["train"].append(epoch_loss)

            if test_dataloader is not None:
                self.log["test"].append(self._summed_loss(test_dataloader, model_device))

            print(epoch_loss)
        
# Wrap the LSTMForecaster2 instance and train it with the default number of epochs.
trainer = ModelTrainer(model)    
trainer.train_model(train_dataloader)

In [None]:
# Run the trained model with 12 extra forecast steps on every training batch.
# Each iteration overwrites the previous one, so only the last batch's
# sequences, labels and results remain for the cells below.
for batch, (sequences, labels) in enumerate(train_dataloader):
    sequences = sequences.to(device)
    labels = labels.to(device)
    with torch.no_grad():
        labels_pred = trainer.model(sequences, 12)
        # .numpy() only works on CPU tensors, so move the result back first.
        restults = labels_pred.detach().cpu().numpy()

In [None]:
i=5

# Copy to CPU/NumPy so plotting also works when the tensors live on the GPU.
window = sequences[i].detach().cpu().numpy()
target = labels[i].detach().cpu().numpy()

plt.figure(figsize=(10,5))
plt.plot(range(len(window)), window, label="input sequence")
# The label only holds the value(s) that follow the window, so it is drawn
# directly after the last input step instead of being stretched across the window.
plt.plot(range(len(window), len(window) + len(target)), target, "o", label="target")
# NOTE(review): the x positions assume restults[i] starts at step 1 — confirm
# against what the model actually returns.
plt.plot(range(1, len(restults[i])+1), restults[i], label="prediction")
plt.legend()
plt.show()

In [None]:
# Labels and predictions of the last batch processed by the loop above.
labels, restults