Import the required packages

In [None]:
import datetime
import scipy
import hampel
import pandas as pd
import matplotlib.pyplot as plt
from tools import rednt_tools as tools  # custom helper tools library

Read the data from the CSV files, using the timestamp as the index column

In [None]:
# Both files are read the same way: no header row, and column 0 is parsed as a
# date (via the project's date parser) and used as the index. Only the file
# name and the column names differ between the two signals.
read_csv_options = dict(sep=tools.SEP, header=None, index_col=[0],
                        parse_dates=[0], date_parser=tools.date_parser)

v_df = pd.read_csv(tools.VIBRATION_FILENAME, names=tools.V_HEADER_NAMES,
                   **read_csv_options)

t_df = pd.read_csv(tools.TEMPERATURE_FILENAME, names=tools.T_HEADER_NAMES,
                   **read_csv_options)

Make sure that both datasets are sorted by timestamp

In [None]:
# Order each signal by its 'timestamp' key before any time-based processing.
v_df, t_df = (frame.sort_values(by='timestamp') for frame in (v_df, t_df))

Determine some timestamp characteristics - start time, end time and time horizon of the signals

In [None]:
def _timestamp_summary(frame):
    """Return the first timestamp, last timestamp and covered span of `frame`.

    The values are read positionally from the frame's index, so the frame is
    expected to be sorted by time already. 'horizon' is always computed from
    the frame's own start and end.
    """
    # Positional access on the index avoids integer lookups on a
    # datetime-indexed Series, which rely on deprecated positional fallback.
    start, end = frame.index[0], frame.index[-1]
    return {'start': start, 'end': end, 'horizon': end - start}


timestamp_chars = {
    'vibration': _timestamp_summary(v_df),
    # The temperature horizon uses the temperature start time
    # (it previously subtracted the vibration start time by mistake).
    'temperature': _timestamp_summary(t_df),
}

print("Timestamp characteristics:")
print(timestamp_chars)

Calculate the timestamp differences between consecutive samples and some statistics about them

In [None]:
# Time gaps between consecutive samples, per signal and pooled together.
v_timestamp_diff = v_df.index.to_series().diff()
t_timestamp_diff = t_df.index.to_series().diff()
timestamp_diff = pd.concat([v_timestamp_diff, t_timestamp_diff])

# How often each distinct gap occurs across both signals.
timedelta_value_counts = timestamp_diff.value_counts()
print('Timestamp differences:')
print(timedelta_value_counts)

# Summary statistics of the pooled gaps; each key names the Series method used.
timedelta_statistics = {
    stat: getattr(timestamp_diff, stat)()
    for stat in ('mean', 'median', 'mode', 'max', 'min')
}
print('Timestamp differences statistics:')
print(timedelta_statistics)

Deduce the sampling time as the median of the timestamp differences

In [None]:
# The median gap between samples is taken as the sampling time.
sampling_time = timedelta = timedelta_statistics['median']

print("Median Timedelta that we choose as sampling time: {}".format(timedelta))

Resample the data to the deduced sampling time

In [None]:
# Put both signals on a regular grid of `sampling_time`, averaging the samples
# that fall into the same bin.
v_df, t_df = (frame.resample(sampling_time).mean() for frame in (v_df, t_df))

Remove outliers using a Hampel filter, keeping track of the outlier indices

In [None]:
# Switch for the optional Hampel outlier-removal step.
USE_HAMPEL = False

if USE_HAMPEL:
    hampel_args = dict(window_size=50, n=3)

    # First pass: only collect the detected outliers so their share can be reported.
    v_outliers = hampel.hampel(v_df.squeeze(), **hampel_args)
    t_outliers = hampel.hampel(t_df.squeeze(), **hampel_args)

    outlier_report = "Number of outliers in data: vibration: {}%, temperature: {}%"
    print(outlier_report.format(len(v_outliers)*100/v_df.shape[0],
                                len(t_outliers)*100/t_df.shape[0]))

    # Second pass: overwrite the first column of each frame with the imputed signal.
    v_df[v_df.columns[0]] = hampel.hampel(v_df.squeeze(), imputation=True, **hampel_args)
    t_df[t_df.columns[0]] = hampel.hampel(t_df.squeeze(), imputation=True, **hampel_args)

Interpolate the signals

In [None]:
# Switch for the optional gap-filling step.
USE_INTERPOLATION = True

if USE_INTERPOLATION:
    # Linear interpolation, filling at most 4 consecutive missing values and
    # only those surrounded by valid values (no extrapolation at the edges).
    interp_args = dict(method='linear', limit=4, limit_area='inside')
    v_df, t_df = (frame.interpolate(**interp_args) for frame in (v_df, t_df))

Merge the dataframes based on the closest timestamp, limiting the result to the length of the shorter one

In [None]:
merge_tolerance = sampling_time

# The frame with fewer rows goes on the left, so the merged result has its
# length; the other frame's columns are attached by 'timestamp' within tolerance.
left, right = (t_df, v_df) if len(v_df) > len(t_df) else (v_df, t_df)
df = pd.merge_asof(left, right, on='timestamp', tolerance=merge_tolerance)

Set index after merge operation to timestamp again

In [None]:
# Promote the 'timestamp' column of the merged frame back to the index.
df = df.set_index(keys='timestamp')

Plot the timeseries data

In [None]:
# Plot the merged frame with the project's helper.
# NOTE(review): the helper's implementation is not visible here — confirm what it draws.
tools.plot_timeseries_data(df)

Split the data into 1-hour length time horizons

In [None]:
# Bucket every row by the number of whole hours elapsed since the first sample.
# Floor division by a one-hour Timedelta yields integer hour indices on any
# pandas version. The previous astype('timedelta64[1h]') cast is rejected by
# pandas >= 2.0, which only supports 's', 'ms', 'us' and 'ns' resolutions.
hours_since_start = (df.index - df.index[0]) // pd.Timedelta(hours=1)

# The key stays wrapped in a list, as before, so the shape of the group keys
# seen by later cells does not change.
df_groups = df.groupby([hours_since_start])
groups_num = len(df_groups)

print("Number of groups: {}".format(groups_num))

Interpolate each group's time series to fill in some missing values