# Data preprocessing

## Compute the consumption by hour

In [1]:
import pandas as pd
from pathlib import Path
from pprint import pprint
from datetime import datetime, timedelta # to convert timestamp/ID to human readable format

In [2]:
# load CSV files
raw_data = Path('rawdata')
tables = {}
for i, file in enumerate(raw_data.iterdir()):
    name = file.name.split('.')[0]
    tables[f"{name}"] = pd.read_csv(f"rawdata/{file.name}", header = None, names=('ID', name))

## Date transformation functions

In [3]:
def ticks_to_iso(ticks):
    """Convert .NET DateTime.Ticks to an ISO 8601 formatted string."""
    ticks_epoch = datetime(1, 1, 1)  # .NET starts from year 0001-01-01
    date_time = ticks_epoch + timedelta(microseconds=ticks / 10)  # Convert ticks to microseconds
    return date_time.isoformat()

def ticks_to_ymd(ticks):
    """Convert .NET DateTime.Ticks to an ISO 8601 formatted string."""
    ticks_epoch = datetime(1, 1, 1)  # .NET starts from year 0001-01-01
    date_time = ticks_epoch + timedelta(microseconds=ticks / 10)  # Convert ticks to microseconds
    return date_time.strftime("%Y-%m-%d")

# Adds date and ISO date to table in tables dictionary
def add_dates(table_name): 
    tables[table_name]["date"] = tables[table_name].ID.apply(ticks_to_ymd)
    tables[table_name]["iso_date"] = tables[table_name].ID.apply(ticks_to_iso)

# Limits the ISO date string to just date and hour e.g. "2010-02-16T12:48" -> "2010-02-16T12"
def add_hour(df):
    df["dateHour"] = df.iso_date.apply(lambda x: x[:13])
    

In [9]:
# Join fuelDensity and fuelVolumeFlowRate
df_fuel = tables["fuelDensity"].merge(tables["fuelVolumeFlowRate"], on = "ID")

# 1) Add date, hour cloumns and fuelMassFlowRate
df_fuel["iso_date"] = df_fuel.ID.apply(ticks_to_iso)
df_fuel["dateHour"] = df_fuel.ID.apply(lambda x: ticks_to_iso(x)[:13])
df_fuel["fuelMassFlowRate"] = df_fuel.fuelDensity * df_fuel.fuelVolumeFlowRate

# 2) drop unecessary cols
df_fuel = df_fuel.drop('ID', axis=1)
df_fuel = df_fuel.drop(["fuelDensity","fuelVolumeFlowRate"], axis=1)

# TIMEDIFF

# 3) Change iso_date type to timestamp in order to compute time differences
df_fuel["iso_date"] = pd.to_datetime(df_fuel["iso_date"], format='ISO8601')

# 4) Drop lines where fuel flow is 0, IMPORTANT
#    time diffs must be calculated only for the engine runtime
df_fuel = df_fuel[df_fuel["fuelMassFlowRate"] != 0]

# 5) Add time diff
df_fuel["timeDiff"] = df_fuel.iso_date.diff()

# 6) Fill border line with 0 (only a small error)
df_fuel.timeDiff = df_fuel.timeDiff.fillna(pd.Timedelta(seconds=0))

# 7) Despite the efort to count make diffs only from runtime some non-sense timediffs occured.
#    Therefore I am going to filter them out. Sampling frequency was around 1.02 s
df_fuel = df_fuel[df_fuel["timeDiff"].dt.total_seconds() < 1.2]

# Compute fuel consumption for every interval
df_fuel["fuelMassConsumption"] = df_fuel.fuelMassFlowRate * df_fuel.timeDiff.dt.total_seconds()
df_fuel

Unnamed: 0,iso_date,dateHour,fuelMassFlowRate,timeDiff,fuelMassConsumption
4,2010-02-16 12:48:13.560688,2010-02-16T12,0.266306,0 days 00:00:00,0.000000
5,2010-02-16 12:48:14.582064,2010-02-16T12,0.261940,0 days 00:00:01.021376,0.267539
6,2010-02-16 12:48:15.603488,2010-02-16T12,0.265296,0 days 00:00:01.021424,0.270979
7,2010-02-16 12:48:16.626888,2010-02-16T12,0.263480,0 days 00:00:01.023400,0.269645
8,2010-02-16 12:48:17.649280,2010-02-16T12,0.262434,0 days 00:00:01.022392,0.268311
...,...,...,...,...,...
1627319,2010-04-12 22:25:12.142864,2010-04-12T22,0.189356,0 days 00:00:01.023416,0.193790
1627320,2010-04-12 22:25:13.164264,2010-04-12T22,0.190873,0 days 00:00:01.021400,0.194958
1627321,2010-04-12 22:25:14.186664,2010-04-12T22,0.197969,0 days 00:00:01.022400,0.202404
1627322,2010-04-12 22:25:15.208072,2010-04-12T22,0.208952,0 days 00:00:01.021408,0.213426


In [10]:
df_daily_consumption = df_fuel[["dateHour","fuelMassConsumption"]].groupby("dateHour").sum().apply(lambda x: x*0.001)


In [11]:
display(df_daily_consumption)
display(df_daily_consumption.info())
display(df_daily_consumption.describe())

Unnamed: 0_level_0,fuelMassConsumption
dateHour,Unnamed: 1_level_1
2010-02-16T12,0.309810
2010-02-16T13,1.672615
2010-02-16T14,1.663355
2010-02-16T15,0.329521
2010-02-17T06,0.520858
...,...
2010-04-12T18,1.772452
2010-04-12T19,1.901760
2010-04-12T20,0.596245
2010-04-12T21,2.176730


<class 'pandas.core.frame.DataFrame'>
Index: 682 entries, 2010-02-16T12 to 2010-04-12T22
Data columns (total 1 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   fuelMassConsumption  682 non-null    float64
dtypes: float64(1)
memory usage: 10.7+ KB


None

Unnamed: 0,fuelMassConsumption
count,682.0
mean,1.330281
std,0.686605
min,0.003561
25%,0.675496
50%,1.486395
75%,2.013399
max,2.312076


In [12]:
df_fuel[df_fuel["dateHour"] == '2010-02-17T06']

Unnamed: 0,iso_date,dateHour,fuelMassFlowRate,timeDiff,fuelMassConsumption
8640,2010-02-17 06:44:08.177608,2010-02-17T06,0.325304,0 days 00:00:01.022400,0.332591
8641,2010-02-17 06:44:09.200016,2010-02-17T06,0.306870,0 days 00:00:01.022408,0.313747
8642,2010-02-17 06:44:10.221400,2010-02-17T06,0.297171,0 days 00:00:01.021384,0.303525
8643,2010-02-17 06:44:11.244800,2010-02-17T06,0.291570,0 days 00:00:01.023400,0.298393
8644,2010-02-17 06:44:12.266376,2010-02-17T06,0.291610,0 days 00:00:01.021576,0.297902
...,...,...,...,...,...
9567,2010-02-17 06:59:55.844960,2010-02-17T06,0.584873,0 days 00:00:01.022392,0.597969
9568,2010-02-17 06:59:56.866344,2010-02-17T06,0.588383,0 days 00:00:01.021384,0.600965
9569,2010-02-17 06:59:57.887776,2010-02-17T06,0.576337,0 days 00:00:01.021432,0.588689
9570,2010-02-17 06:59:58.908168,2010-02-17T06,0.569486,0 days 00:00:01.020392,0.581099


In [13]:
df_fuel.describe()

Unnamed: 0,iso_date,fuelMassFlowRate,timeDiff,fuelMassConsumption
count,1626840,1626840.0,1626840,1626840.0
mean,2010-03-16 06:06:46.706343168,0.5455613,0 days 00:00:01.022207642,0.5576771
min,2010-02-16 12:48:13.560688,0.0215926,0 days 00:00:00,0.0
25%,2010-03-02 13:38:17.303463936,0.544379,0 days 00:00:01.021376,0.5564628
50%,2010-03-15 21:02:59.776580096,0.5801077,0 days 00:00:01.022344,0.5929853
75%,2010-03-29 07:13:28.987108096,0.6053773,0 days 00:00:01.022432,0.6188304
max,2010-04-12 22:25:16.230472,0.8806616,0 days 00:00:01.056008,0.9074478
std,,0.1102407,0 days 00:00:00.001420576,0.1126915


In [16]:
df_daily_consumption.columns

Index(['fuelMassConsumption'], dtype='object')

In [17]:
df_daily_consumption.to_csv('daily_consumption.csv',header=True)

# Filter out timediffs > 1.5s