# Case study
#### Using RISF on real-life data from bus sensors

In [None]:
import numpy as np
import pandas as pd
from scipy.stats import skew, kurtosis
import datetime
import pickle
from csutils import *

In [None]:
# Timestamp conversion copied from buses-data-demo.ipynb
# Reference point of the sensor clock: the naive wall-clock datetime that
# corresponds to the Unix timestamp VACT_TIMESTAMP.
VACT_EPOCH = datetime.datetime(year=2011,month=6,day=16,hour=5,minute=23,second=0)
VACT_TIMESTAMP = 1308194580
# 1308194580 is 2011-06-16 03:23:00 UTC, so VACT_EPOCH is expressed in UTC+2.
# The offset is pinned explicitly: a bare fromtimestamp() converts with the
# machine's local timezone, which made this sanity check fail on any machine
# that is not itself running in UTC+2.
_VACT_TZ = datetime.timezone(datetime.timedelta(hours=2))
assert datetime.datetime.fromtimestamp(VACT_TIMESTAMP, tz=_VACT_TZ).replace(tzinfo=None) == VACT_EPOCH
# Durations expressed in milliseconds (the unit getVactDate expects).
MILISECOND = 1
SECOND = 1000 * MILISECOND
HOUR = 3600*SECOND
DAY = 24*HOUR
def getVactDate(value):
    """Convert a millisecond offset from VACT_EPOCH into a naive datetime.

    Parameters
    ----------
    value : anything accepted by ``int()``
        Number of milliseconds elapsed since ``VACT_EPOCH``. The value is
        truncated to an integer before the conversion.

    Returns
    -------
    datetime.datetime
        ``VACT_EPOCH`` shifted by ``value`` milliseconds.
    """
    return VACT_EPOCH + datetime.timedelta(milliseconds=int(value))

In [None]:
# Loading pandas DataFrame with recorded values from sensors and
# Adding column with transformed timestamp
# Note that this data is ignored by git, so you have to make a copy on your own
# Also the excel file seems to only contain data about the bus number 369
# TODO Remove these links
# Link to data:
# https://halmstaduniversity.box.com/s/rtm3o8dzdt4o0hxr5sredhb1y06qsi8u

def _load_bus_pickle(bus_id):
    """Unpickle the recording stored for one bus and return the loaded object.

    The file is opened in a ``with`` block so the handle is closed as soon as
    loading finishes (the previous bare ``open(...)`` calls were never closed).

    NOTE(security): pickle can execute arbitrary code while loading. Only use
    this on data files obtained from a trusted source.
    """
    with open(f"../data/bus/data-{bus_id}.pickle", "rb") as handle:
        return pickle.load(handle)


DATA_369 = _load_bus_pickle(369)
DATA_370 = _load_bus_pickle(370)
DATA_371 = _load_bus_pickle(371)
DATA_372 = _load_bus_pickle(372)
DATA_375 = _load_bus_pickle(375)


In [None]:
# Use the 'tstamp' column as the index of every bus frame and order the rows
# by that index.
DATA_369 = DATA_369.set_index('tstamp').sort_index()
DATA_370 = DATA_370.set_index('tstamp').sort_index()
DATA_371 = DATA_371.set_index('tstamp').sort_index()
DATA_372 = DATA_372.set_index('tstamp').sort_index()
DATA_375 = DATA_375.set_index('tstamp').sort_index()

In [None]:
# Remove the 'Timestamp' column from every bus frame. The drop is done in
# place so the DATA_* names keep pointing at the modified frames.
for bus_frame in (DATA_369, DATA_370, DATA_371, DATA_372, DATA_375):
    bus_frame.drop(columns=['Timestamp'], inplace=True)

## Experiments

In [None]:
# Sensor signals used as features in this experiment.
columns = ['WetTankAirPressure', 'EngineAirInletPressure']

# Training frames (X) and the evaluation frame (y), restricted to `columns`.
# The previous `for X in X_dfs: X = X.loc[:, columns]` only rebound the loop
# variable, so the selection was silently discarded. The same selection is
# applied to the evaluation frame so that both sets expose the same features.
X_dfs = [frame.loc[:, columns] for frame in (DATA_370, DATA_371, DATA_372, DATA_375)]
y_dfs = [frame.loc[:, columns] for frame in (DATA_369,)]

# Time span from which windows are extracted.
START = pd.Timestamp('2012-04-15 08:23:47.763000') 
END = START + pd.Timedelta('4W')

# FREQ and WINDOW_SIZE are forwarded to extract_time_windows as `freq` and the
# window size; both are four minutes.
FREQ = pd.Timedelta('4T')  
WINDOW_SIZE = pd.Timedelta('4T')


def bin10(column):
    """Binning rule handed to get_histograms_from_window: always 10."""
    return 10


# Forwarded to get_time_series_from_window as its AVG_SIZE argument.
AVG_SIZE = '1S'

# Per-window features of the training frames.
X_NUMERIC = []
X_HISTOGRAM = []
X_TSIRES = []

# Per-window features (and window timestamps) of the evaluation frame;
# filled by a later cell.
y_NUMERIC = []
y_HISTOGRAM = []
y_TSIRES = []
y_TSTAMPS = []

for X in X_dfs:
    for win in extract_time_windows(X, WINDOW_SIZE, start=START, end=END, freq=FREQ):
        # Skip windows with fewer than 20 rows.
        if win.shape[0] < 20:
            continue
        X_NUMERIC.append(get_numeric_from_window(win))
        X_HISTOGRAM.append(get_histograms_from_window(win, way_of_binarization=bin10))
        X_TSIRES.append(get_time_series_from_window(win,AVG_SIZE=AVG_SIZE))


In [None]:
# Build the evaluation features: one entry per sufficiently populated window,
# together with the timestamp reported for that window.
for frame in y_dfs:
    timed_windows = extract_time_windows(
        frame, WINDOW_SIZE, timestamp=True, start=START, end=END, freq=FREQ
    )
    for tstamp, win in timed_windows:
        # Windows with fewer than 20 rows are ignored.
        if win.shape[0] < 20:
            continue
        y_TSTAMPS.append(tstamp)
        y_NUMERIC.append(get_numeric_from_window(win))
        y_HISTOGRAM.append(get_histograms_from_window(win, way_of_binarization=bin10))
        y_TSIRES.append(get_time_series_from_window(win, AVG_SIZE=AVG_SIZE))

In [None]:
# Replace NaN/inf entries in the numeric features. X_NUMERIC and y_NUMERIC are
# Python lists, so np.nan_to_num has to convert them to new arrays; with
# copy=False nothing was changed in place and the cleaned result was thrown
# away. Keep the returned arrays instead.
X_NUMERIC = np.nan_to_num(X_NUMERIC)
y_NUMERIC = np.nan_to_num(y_NUMERIC)

In [None]:
def _show_head(values, **array_kwargs):
    """Print the first ten entries and the shape of `values` as an array."""
    as_array = np.array(values, **array_kwargs)
    print(as_array[:10])
    print(as_array.shape)


# Quick visual inspection of the extracted training features.
# NOTE(review): `seq[:][i]` is the same as `seq[i]`, i.e. element i of the
# outer list. If the outer lists hold one entry per window, this inspects
# window i rather than column i -- confirm the intended axis.
for idx in range(len(columns)):
    _show_head(X_NUMERIC[idx])
    _show_head(X_HISTOGRAM[idx], dtype=object)
    _show_head(X_TSIRES[idx])


In [None]:
from risf.risf_data import RisfData
from risf.forest import *
from risf.distance import * 

In [None]:
from importlib import reload
from risf.distance_functions import * 


In [None]:
# IPython magics: load the autoreload extension and switch it to mode 2.
%load_ext autoreload
%autoreload 2
from risf.distance_functions import * 
# Two separate RisfData containers: ex_X is later used to fit the forest,
# ex_y is later passed to forest.predict.
ex_X = RisfData()
ex_y = RisfData()


class MyWasser():
    """Callable distance between two histograms based on wasserstein_distance.

    Each histogram is a pair that unpacks into ``(bins, values)``; only the
    ``values`` parts take part in the computation. The most recent distance
    is kept in ``self.results["dist"]``.
    """

    def __init__(self) -> None:
        # Stores the last computed distance under the "dist" key.
        self.results = {}

    def __call__(self, *args, **kwargs):
        """Make the instance callable by forwarding to :meth:`dist`."""
        return self.dist(*args, **kwargs)

    def dist(self, hist1, hist2):
        """Return the Wasserstein distance between the values of two histograms.

        The result is also stored in ``self.results["dist"]`` and printed.
        """
        _, first_values = hist1
        _, second_values = hist2

        # Alternative kept for reference (also passes the bins):
        # wasserstein_distance(values1, values2, bins1, bins2)
        distance = wasserstein_distance(first_values, second_values)
        self.results["dist"] = distance
        print(distance)
        return distance

# class MyEuclidean():
#     def __init__(self) -> None:
#         self.results = {}

#     def __call__(self, *args, **kwargs):
#         return self.dist(*args, **kwargs)

#     def adjust(self, values1, bins1, values2, bins2):
#         min1, max1 = min(bins1), max(bins1)
#         min2, max2 = min(bins2), max(bins2)

#         bins = np.arange(min(min1, min2), max(max1, max2) + 1)

#         values1_new = [0] * len(bins)
#         for bin in bins1:
#             i = np.where(bins == bin)[0][0]
#             j = np.where(bins1 == bin)[0][0]
#             values1_new[i] = values1[j]
#         values2_new = [0] * len(bins)
#         for bin in bins2:
#             i = np.where(bins == bin)[0][0]
#             j = np.where(bins2 == bin)[0][0]
#             values2_new[i] = values2[j]
#         return values1_new, values2_new

#     def dist(self, hist1, hist2):
#         bins1, values1 = hist1
#         bins2, values2 = hist2
#         if not np.array_equal(bins1, bins2):
#             bins1, bins2 = np.array(bins1), np.array(bins2)
#             values1, values2 = self.adjust(values1, bins1, values2, bins2)
#         values1, values2 = np.array(values1), np.array(values2)

#         dist = np.linalg.norm(values1 - values2) ** 2
#         self.results["dist"] = dist
#         return dist


# distances_numerical = [SelectiveDistance(projection_func = euclidean_projection, min_n=1, max_n=3),
#                         SelectiveDistance(projection_func = cosine_projection, min_n=1, max_n=3)]

# Distances applied to every histogram feature.
distances_histogram = [TrainDistanceMixin(distance=MyWasser())]
# Disabled alternative: TrainDistanceMixin(distance = MyEuclidean())

# Disabled time-series distances:
# distanecs_tser = [TrainDistanceMixin(distance = TSEuclidean())]

# Register one histogram feature per index, first in the training container
# and then in the evaluation container. Numeric and time-series features are
# currently switched off; they would be registered the same way:
#   ex_X.add_data(np.array(X_NUMERIC[:][i]), distances_numerical)
#   ex_X.add_data(np.array(X_TSIRES[:][i]), distanecs_tser)
#   ex_y.add_data(np.array(y_NUMERIC[:][i]), distances_numerical)
#   ex_y.add_data(np.array(y_TSIRES[:][i]), distanecs_tser)
for idx in range(len(columns)):
    for dataset, histograms in ((ex_X, X_HISTOGRAM), (ex_y, y_HISTOGRAM)):
        feature = np.array(histograms[idx], dtype=object)
        dataset.add_data(feature, distances_histogram)


In [None]:
# Call precompute_distances() on both containers, training container first.
for risf_dataset in (ex_X, ex_y):
    risf_dataset.precompute_distances()

In [None]:
# Build the forest with the distances held by ex_X, then fit it on ex_X and
# keep whatever fit() returns.
forest = RandomIsolationSimilarityForest(distances=ex_X.distances)
forest = forest.fit(ex_X)

In [None]:
# Run the fitted forest on the evaluation container.
# NOTE(review): presumably one anomaly score/label per window -- confirm.
results = forest.predict(ex_y)

In [None]:
#Nr. Mode        Start time End time   Failure alarm
#1  planned     2012-02-02 2012-02-09 2012-02-01
#2  planned     2012-03-01 2012-03-02 2012-02-29
#3  unplanned   2012-03-05 2012-03-05 2012-03-03
#4  unplanned   2012-03-16 2012-03-19 2012-03-14
#5  unplanned   2012-04-01 2012-04-13 2012-03-31
#6  planned     2012-05-29 2012-05-29 2012-05-29
#7  unplanned   2012-07-10 2012-07-23 2012-07-10
#8  planned     2012-08-21 2012-08-23 2012-08-21
#9  unplanned   2012-10-08 2012-10-08 2012-10-08
#10 planned     2012-12-27 2012-12-28 2012-12-25
#11 planned     2013-02-19 2013-03-06 2013-02-19
#12 unplanned   2013-04-23 2013-04-24 2013-04-22
#13 unplanned   2013-04-29 2013-04-29 -
#14 unplanned   2013-07-11 2013-07-19 2013-07-11
#15 planned     2013-08-12 2013-08-16 2013-08-12
#16 unplanned   2013-12-11 2013-12-19 2013-12-10
#17 planned     2014-01-09 2014-01-09 2014-01-09
#18 planned     2014-01-31 2014-02-11 2014-01-31
#19 unplanned   2014-02-23 2014-02-23 2014-02-22
#20 unplanned   2014-03-13 2014-03-20 2014-03-12
#21 unplanned   2014-04-14 2014-04-14 2014-04-13
#22 unplanned   2014-04-20 2014-04-20 -
#23 unplanned   2014-06-08 2014-06-08 2014-06-05
#24 unplanned   2014-08-14 2014-08-15 -
#25 planned     2014-09-03 2014-09-05 2014-09-03
#26 planned     2014-11-28 2014-11-28 -