# Case study
#### Using RISF on real-life data from bus sensors

In [14]:
import numpy as np
import pandas as pd
import datetime
import pickle

In [15]:
# Timestamp conversion copied from buses-data-demo.ipynb
# Reference instant of the VACT clock (naive wall-clock time) and the Unix
# timestamp that corresponds to it.
VACT_EPOCH = datetime.datetime(year=2011,month=6,day=16,hour=5,minute=23,second=0)
VACT_TIMESTAMP = 1308194580
# 1308194580 is 2011-06-16 03:23:00 UTC, i.e. VACT_EPOCH is expressed two hours
# ahead of UTC. Pin that offset explicitly: a bare datetime.fromtimestamp()
# converts with the machine's local time zone, so the sanity check below would
# only pass on hosts configured for UTC+2.
_VACT_TZ = datetime.timezone(datetime.timedelta(hours=2))
assert datetime.datetime.fromtimestamp(VACT_TIMESTAMP, tz=_VACT_TZ).replace(tzinfo=None) == VACT_EPOCH
# Durations expressed in milliseconds, the unit getVactDate expects.
MILISECOND = 1
SECOND = 1000 * MILISECOND
HOUR = 3600*SECOND
DAY = 24*HOUR
def getVactDate(value):
    """Convert a VACT timestamp to a naive ``datetime``.

    Parameters
    ----------
    value : int or float
        Number of milliseconds elapsed since ``VACT_EPOCH``. Any fractional
        part is dropped by ``int()`` before the conversion.

    Returns
    -------
    datetime.datetime
        ``VACT_EPOCH`` shifted by ``value`` milliseconds.
    """
    return VACT_EPOCH + datetime.timedelta(milliseconds=int(value))

In [18]:
# Load the pandas DataFrame with the values recorded by the sensors and
# add a column with the converted timestamp.
# Note that this data is ignored by git, so you have to obtain your own copy.
# Also, the Excel file seems to contain data only for bus number 369.
# TODO Remove these links
# Link to data:
# https://halmstaduniversity.box.com/s/rtm3o8dzdt4o0hxr5sredhb1y06qsi8u

def load_data_frames(number):
    """Load the pickled frame for one bus and publish it as ``DATA_<number>``.

    Reads ``../data/bus/data-<number>.pickle``, inserts a ``tstamp`` column
    (the ``Timestamp`` column converted with ``getVactDate``) at position 0
    and stores the frame in the module globals under ``DATA_<number>``.

    Parameters
    ----------
    number : str or int
        Bus identifier; interpolated into both the file name and the name of
        the global variable.

    Returns
    -------
    The loaded frame, i.e. the same object that is bound to the
    ``DATA_<number>`` global.

    Raises
    ------
    OSError
        If the pickle file cannot be opened (for example, it does not exist).
    """
    file_path = f"../data/bus/data-{number}.pickle"
    # NOTE: unpickling can execute arbitrary code; only load files you trust.
    # The context manager closes the file handle even if unpickling fails.
    with open(file_path, "rb") as pickle_file:
        df = pickle.load(pickle_file)
    df.insert(0, 'tstamp', df.Timestamp.apply(getVactDate))
    globals()[f"DATA_{number}"] = df
    return df

In [25]:
# For each bus number, load its pickle file; load_data_frames binds the
# resulting frame to a global named DATA_<number> (e.g. DATA_369).
buses_numbers = ['369', '370', '371', '372', '375']
for number in buses_numbers:
    load_data_frames(number)

In [26]:
# Example DataFrame: the frame loaded for bus 369
# (the rendered output shows only the first and last rows)
DATA_369

Unnamed: 0,tstamp,Timestamp,WetTankAirPressure,LongitudAcc,EngineSpeed,Fuel Rate,Engine Load,Boost Pressure,EngineAirInletPressure,AcceleratorPedalPos,VehicleSpeed,BrakePedalPos
0,2012-04-15 08:23:47.763,2.627645e+10,4.27490,0.0,0.000,0.000000,0.0,0.000000,100.0,0.0,0.000000,0.0
1,2012-04-15 08:23:48.457,2.627645e+10,4.27490,0.0,0.000,0.000000,0.0,0.000000,100.0,0.0,0.000000,0.0
2,2012-04-15 08:23:49.550,2.627645e+10,4.27490,0.0,0.000,0.000000,0.0,0.000000,100.0,0.0,0.000000,0.0
3,2012-04-15 08:23:50.715,2.627645e+10,4.27490,0.0,0.000,0.000000,0.0,0.000000,100.0,0.0,0.000000,0.0
4,2012-04-15 08:23:51.435,2.627645e+10,4.27490,0.0,0.000,0.000000,0.0,0.000000,100.0,0.0,0.000000,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...
10908295,2014-01-31 19:27:06.824,8.299465e+10,11.65255,-0.2,596.500,4.081143,24.0,0.008618,102.0,0.0,2.331882,0.0
10908296,2014-01-31 19:27:07.994,8.299465e+10,11.58360,-0.4,616.750,3.430526,19.5,0.000000,102.0,0.0,1.796760,11.2
10908297,2014-01-31 19:27:08.672,8.299465e+10,11.51465,0.0,573.000,3.726261,20.5,0.000000,102.0,0.0,0.000000,18.4
10908298,2014-01-31 19:27:09.790,8.299465e+10,11.44570,0.0,617.500,3.726261,20.0,0.000000,102.0,0.0,0.000000,0.0


In [59]:
# (number of rows, number of columns) of the bus 369 frame
DATA_369.shape

(10908300, 12)

In [61]:
# Row count of the bus 369 frame divided by 240 (= 4 * 60).
# NOTE(review): the meaning of the 4 * 60 factor is not stated here —
# presumably a window length in samples; confirm and give it a named constant.
DATA_369.shape[0] / (4 * 60)

45451.25

In [27]:
# Example entry: the row at position 500 of the bus 369 frame
DATA_369.iloc[500]

tstamp                    2012-04-15 08:32:07.634000
Timestamp                              26276947634.0
WetTankAirPressure                          10.27355
LongitudAcc                                      0.1
EngineSpeed                                  551.125
Fuel Rate                                   6.801905
Engine Load                                     41.5
Boost Pressure                                   0.0
EngineAirInletPressure                         102.0
AcceleratorPedalPos                             19.2
VehicleSpeed                                1.726452
BrakePedalPos                                    0.0
Name: 500, dtype: object

In [56]:
# Data type of every column in the bus 369 frame
DATA_369.dtypes

tstamp                    datetime64[ns]
Timestamp                        float64
WetTankAirPressure               float64
LongitudAcc                      float64
EngineSpeed                      float64
Fuel Rate                        float64
Engine Load                      float64
Boost Pressure                   float64
EngineAirInletPressure           float64
AcceleratorPedalPos              float64
VehicleSpeed                     float64
BrakePedalPos                    float64
dtype: object

In [52]:
# Example plot: WetTankAirPressure against tstamp for a 2000-row window
# of the bus 369 frame, rendered as an interactive line chart.
import altair as alt
start = 26000
stop = start + 2000
pressure_window = DATA_369.iloc[start:stop]
pressure_chart = alt.Chart(pressure_window).mark_line()
pressure_chart = pressure_chart.encode(x='tstamp', y='WetTankAirPressure')
pressure_chart = pressure_chart.properties(width=800)
pressure_chart.interactive()

In [53]:
# TODO Remember to change the seed and to keep the seeds consistent
SEED = 23

In [71]:
# Read the pressure column once, then copy two consecutive 1000-row windows:
# X holds rows 0-999 and Y holds rows 1000-1999.
pressure_values = DATA_369['WetTankAirPressure'].values

column_data = pressure_values[:1000]
X = np.array(column_data)

column_data = pressure_values[1000:2000]
Y = np.array(column_data)

In [72]:
from risf.forest import RandomIsolationSimilarityForest

# Build the forest with the notebook-wide seed.
RISF = RandomIsolationSimilarityForest(random_state=SEED)

# Fit on X (rows 0-999 of WetTankAirPressure for bus 369).
RISF.fit(X)

# Predict for Y (rows 1000-1999); the displayed output contains only 0s and 1s.
# NOTE(review): presumably 1 marks a sample flagged as an outlier — confirm
# against the risf API.
RISF.predict(Y)

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1,
       1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,