# EV Clustering

In [1]:
%matplotlib inline
%config InlineBackend.figure_format = 'retina'

import pandas as pd
import numpy as np
from IMPORT_DATAFRAME_JSON_HDF5 import *
import seaborn as sns
import matplotlib.pyplot as plt

## Import jsons/hdf5

Data time window: 02.03.2017 - 28.04.2017

In [2]:
data = import_trasient_from_file("jsons/")

100% (1370 of 1370) |#####################| Elapsed Time: 0:00:17 Time: 0:00:17


In [3]:
len(data)

1882

In [4]:
data = map_transients_to_PQ_data(data, "HDF5", 20, ['P'])

100% (1882 of 1882) |#####################| Elapsed Time: 0:05:27 Time: 0:05:27


In [5]:
data.info()

<class 'pandas.core.frame.DataFrame'>
Float64Index: 1882 entries, 1491026501.53 to 1490970806.68
Data columns (total 10 columns):
begin_index                1882 non-null int64
begin_timestamp_string     1882 non-null object
filtered_signal            1882 non-null object
phase_num                  1882 non-null int64
raw_signal_current         1882 non-null object
raw_signal_voltage         1882 non-null object
three_first_peaks          1882 non-null object
three_first_peaks_index    1882 non-null object
transient_rise_gradient    1882 non-null float64
P                          1882 non-null object
dtypes: float64(1), int64(2), object(7)
memory usage: 161.7+ KB


## Explore the data

This chapter should give a overview of the used dataset. Therefore step one is to study the header of the dataset. Here the column *'begin_timestamp_float'* can be used as a index, make our're events easier to find and to compare with other data sources. 

In [6]:
length = len(data)
print("Number of data points: {}".format(length))
data.head(1)

Number of data points: 1882


Unnamed: 0_level_0,begin_index,begin_timestamp_string,filtered_signal,phase_num,raw_signal_current,raw_signal_voltage,three_first_peaks,three_first_peaks_index,transient_rise_gradient,P
begin_timestamp_float,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
1491027000.0,701,01-04-2017_08:01:41_529732,"[7.9862e-06, 0.0259479208, -0.0389019368, 7.97...",2,"[0.6561279297, 0.7019042969, 0.7629394531, 0.7...","[289.647277832, 289.6991882324, 289.8029785156...","[26.198, 4.360, 2.076]","[754, 884, 1015]",245806.997513,"[105.77301788330078, 104.60313415527344, 105.3..."


In [7]:
transient_overview = pd.read_csv('transientlist.csv', sep=',', header=None)
transient_overview[0] = transient_overview[0].apply(convert_to_datetime)
len(transient_overview)

1370

In [8]:
transient_overview.sort_values(0).head(1)

Unnamed: 0,0,1,2,3
733,2017-01-04 08:01:41.528330,7681,[2],26.198


In [9]:
data.sort_values('begin_timestamp_string').head(1)

Unnamed: 0_level_0,begin_index,begin_timestamp_string,filtered_signal,phase_num,raw_signal_current,raw_signal_voltage,three_first_peaks,three_first_peaks_index,transient_rise_gradient,P
begin_timestamp_float,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
1491027000.0,701,01-04-2017_08:01:41_529732,"[7.9862e-06, 0.0259479208, -0.0389019368, 7.97...",2,"[0.6561279297, 0.7019042969, 0.7629394531, 0.7...","[289.647277832, 289.6991882324, 289.8029785156...","[26.198, 4.360, 2.076]","[754, 884, 1015]",245806.997513,"[105.77301788330078, 104.60313415527344, 105.3..."


## Preprossing Data Seb

Combine transients data with power data.

In [10]:
# A unique index is needed in order to assign the new features to each unique transient
data['unique_index'] = range(1, len(data) + 1)
data = data.reset_index()
data = data.set_index('unique_index')


data['charging_status']=0
data['P_mean']=0
data['P_delta'] = 0
data['P_Plugin']=0
data['P_Unplug'] = 0
for row in range(0, len(data)):
    # Getting the mean power before during and after the timestamp (-T, T)
    # Defining charging_status as 1 for avg(P)>300W else 0 and writing it to a new column
    P_mean = sum(data.iloc[row]['P']) / float(len(data.iloc[row]['P']))
    data.set_value(data.index[row],'P_mean', P_mean)
    if P_mean>300:
        data.set_value(data.index[row],'charging_status',int(1))
    else:
       data.set_value(data.index[row],'charging_status', int(0))
     # Defining if a new car is plugged in during the appearance of the transient
    P_mean_before = sum((data.iloc[row]['P'])[:10]) / float(len((data.iloc[row]['P'])[:10])) 
    P_mean_after = sum((data.iloc[row]['P'])[(len(data.iloc[row]['P'])-10):]) / float(len((data.iloc[row]['P'])[(len(data.iloc[row]['P'])-10):])) 
    P_delta = P_mean_after - P_mean_before
    data.set_value(data.index[row],'P_delta',P_delta)
    if P_delta > 2000:
        data.set_value(data.index[row],'P_Plugin',int(1))
    else:
        data.set_value(data.index[row],'P_Plugin',int(0))
    if P_delta < -2000:
        data.set_value(data.index[row],'P_Unplug',int(1))
    else:
        data.set_value(data.index[row],'P_Unplug',int(0))
        

#resetting index to begin_timestamp_float
data = data.set_index('begin_timestamp_float')

In [11]:
data.head()

Unnamed: 0_level_0,begin_index,begin_timestamp_string,filtered_signal,phase_num,raw_signal_current,raw_signal_voltage,three_first_peaks,three_first_peaks_index,transient_rise_gradient,P,charging_status,P_mean,P_delta,P_Plugin,P_Unplug
begin_timestamp_float,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
1491027000.0,701,01-04-2017_08:01:41_529732,"[7.9862e-06, 0.0259479208, -0.0389019368, 7.97...",2,"[0.6561279297, 0.7019042969, 0.7629394531, 0.7...","[289.647277832, 289.6991882324, 289.8029785156...","[26.198, 4.360, 2.076]","[754, 884, 1015]",245807.0,"[105.77301788330078, 104.60313415527344, 105.3...",0,105,0,0,0
1491027000.0,806,01-04-2017_08:14:53_521148,"[0.3013579845, 0.27552866940000004, 0.30145740...",2,"[0.732421875, 0.7171630859, 0.7476806641, 0.76...","[289.1282348633, 289.2579956055, 289.335845947...","[21.480, 3.592, 1.415]","[860, 1074, 1427]",195108.6,"[105.6083755493164, 105.91771697998047, 105.88...",0,105,0,0,0
1491073000.0,5717,01-04-2017_20:57:48_799036,"[-0.0624732189, -0.1910434365, 0.0152767645, -...",2,"[-1.2969970703, -1.2512207031, -1.2359619141, ...","[-292.2813720703, -292.3981628418, -292.618743...","[26.096, 4.280, 2.540]","[5769, 5892, 6032]",242317.8,"[107.5003433227539, 107.1847915649414, 107.303...",0,106,-1,0,0
1488462000.0,2750,02-03-2017_14:42:56_465712,"[-0.1300312473, 7.76288e-05, -0.0388483592, 0....",1,"[-2.6092529297000002, -2.6092529297000002, -2....","[0.1690305471, -0.1430258453, -0.3770681620000...","[58.447, 13.698, 33.141]","[2760, 2843, 3400]",-2846197.0,"[3704.182861328125, 3712.759521484375, 3715.73...",1,4850,3551,1,0
1488462000.0,9697,02-03-2017_14:45:31_804547,"[0.142554611, 0.0336101092, 0.1024524048000000...",3,"[-31.0211181641, -31.1126708984, -31.127929687...","[-297.951385498, -297.7366638184, -297.6964111...","[40.280, 21.233, 2.845]","[9709, 9791, 9874]",-1692315.0,"[7219.86328125, 7217.51416015625, 7218.6166992...",1,8313,3503,1,0


## Preprossing Data

The next step is to Preprocessing the Data. In this present case it exist four different events to detect. At first transient cause from electric cars and the second on all from outside. In the following step all transients should be seperate in:

* ```events_from_outside```
* ```transients_1_phase```
* ```transients_2_phase```
* ```transients_3_phase```

This will be save into a pandas dataframe and export as a pkl file.

In [12]:
data.head()

Unnamed: 0_level_0,begin_index,begin_timestamp_string,filtered_signal,phase_num,raw_signal_current,raw_signal_voltage,three_first_peaks,three_first_peaks_index,transient_rise_gradient,P,charging_status,P_mean,P_delta,P_Plugin,P_Unplug
begin_timestamp_float,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
1491027000.0,701,01-04-2017_08:01:41_529732,"[7.9862e-06, 0.0259479208, -0.0389019368, 7.97...",2,"[0.6561279297, 0.7019042969, 0.7629394531, 0.7...","[289.647277832, 289.6991882324, 289.8029785156...","[26.198, 4.360, 2.076]","[754, 884, 1015]",245807.0,"[105.77301788330078, 104.60313415527344, 105.3...",0,105,0,0,0
1491027000.0,806,01-04-2017_08:14:53_521148,"[0.3013579845, 0.27552866940000004, 0.30145740...",2,"[0.732421875, 0.7171630859, 0.7476806641, 0.76...","[289.1282348633, 289.2579956055, 289.335845947...","[21.480, 3.592, 1.415]","[860, 1074, 1427]",195108.6,"[105.6083755493164, 105.91771697998047, 105.88...",0,105,0,0,0
1491073000.0,5717,01-04-2017_20:57:48_799036,"[-0.0624732189, -0.1910434365, 0.0152767645, -...",2,"[-1.2969970703, -1.2512207031, -1.2359619141, ...","[-292.2813720703, -292.3981628418, -292.618743...","[26.096, 4.280, 2.540]","[5769, 5892, 6032]",242317.8,"[107.5003433227539, 107.1847915649414, 107.303...",0,106,-1,0,0
1488462000.0,2750,02-03-2017_14:42:56_465712,"[-0.1300312473, 7.76288e-05, -0.0388483592, 0....",1,"[-2.6092529297000002, -2.6092529297000002, -2....","[0.1690305471, -0.1430258453, -0.3770681620000...","[58.447, 13.698, 33.141]","[2760, 2843, 3400]",-2846197.0,"[3704.182861328125, 3712.759521484375, 3715.73...",1,4850,3551,1,0
1488462000.0,9697,02-03-2017_14:45:31_804547,"[0.142554611, 0.0336101092, 0.1024524048000000...",3,"[-31.0211181641, -31.1126708984, -31.127929687...","[-297.951385498, -297.7366638184, -297.6964111...","[40.280, 21.233, 2.845]","[9709, 9791, 9874]",-1692315.0,"[7219.86328125, 7217.51416015625, 7218.6166992...",1,8313,3503,1,0


In [13]:
# import a function that can seperate trasients by event types and extract some features
from SEPERATE_DF import *

In [14]:
%time events_from_outside, transients_1_phase, transients_2_phase, transients_3_phase = seperate_transients(data)

CPU times: user 2.46 s, sys: 17.4 ms, total: 2.48 s
Wall time: 2.49 s


In [15]:
len(events_from_outside)

1390

In [16]:
len(transients_1_phase)

254

In [17]:
len(transients_2_phase)

172

In [18]:
len(transients_3_phase)

66

In [19]:
# export df to csv files
events_from_outside.to_pickle("events_from_outside.pkl")
transients_1_phase.to_pickle("transients_1_phase.pkl")
transients_2_phase.to_pickle("transients_2_phase.pkl")
transients_3_phase.to_pickle("transients_3_phase.pkl")

In [None]:
filtered_signal_1_ph = transients_1_phase["filtered_signal"].apply(pd.Series).fillna(0)

In [None]:
filtered_signal_1_ph_split = filtered_signal_1_ph.iloc[:50,:].transpose()

In [None]:
filtered_signal_1_ph_split.columns = [i for i in range(filtered_signal_1_ph_split.shape[1])]

In [None]:
filtered_signal_1_ph_split.head()

In [None]:
# rearrange voltage column-wise, not row-wise

master_df = pd.DataFrame(filtered_signal_1_ph_split[0])
master_df['id'] = 0

bar = progressbar.ProgressBar()

with progressbar.ProgressBar(max_value=len(filtered_signal_1_ph_split.columns)) as bar:
    for i in range(1,len(filtered_signal_1_ph_split.columns)): #len(df_raw_signal_voltage_t.columns)
        signal_df = pd.DataFrame(filtered_signal_1_ph_split[i])
        signal_df['id'] = i
        master_df = pd.DataFrame(np.vstack([master_df, signal_df]))
        bar.update(i)

In [None]:
master_df.head()

In [None]:
sns.set(style="ticks")

# Initialize a grid of plots with an Axes for each transient
grid = sns.FacetGrid(master_df, col=1, hue=1, col_wrap=2, size=10)

# Draw a line plot to show the trajectory of each random walk
grid.map(plt.plot, 0, ms=4)

In [None]:
len(transients_2_phase)

In [None]:
transients_3_phase

In [None]:
len(transients_3_phase)

### Create Feature Dataframe from exist features

The Problem with time series is, that they are high dimensional. Therefore it's important to extract features from this time series to reduce the dimensions of the data. So the next step is to preprocessing our data, that we can extract features from this Dataframe. We will create a Dataframe for the current/voltage signals.

## Feature for one phase events

In [None]:
from CREATE_DF import *

In [None]:
%time feature_df_with_nan_1 = create_feature_df(transients_1_phase)

In [None]:
# sort df: The code put all NaN values in the last columns of the df
feature_df_with_nan_sort_1 = feature_df_with_nan_1.apply(squeeze_nan, axis=1)
feature_df_with_nan_sort_1.reset_index(level=0, inplace=True)

# get columns name an put them into a list
original_columns = feature_df_with_nan_sort_1.columns.tolist()
# Create new column names
new_columns = original_columns[:4] + original_columns[10:13] + original_columns[19:20] + original_columns[22:23]
# drop all columns with NaN
feature_df_1_phase = feature_df_with_nan_sort_1.dropna(axis=1)
feature_df_1_phase = feature_df_1_phase.apply(pd.to_numeric)
feature_df_1_phase.columns = new_columns

In [None]:
feature_df_1_phase.to_csv("feature_df_1_phase.csv", columns = new_columns, index=None)

## Feature for two phase events

In [None]:
%time feature_df_with_nan_2 = create_feature_df(transients_2_phase)

# sort df: The code put all NaN values in the last columns of the df
feature_df_with_nan_sort_2 = feature_df_with_nan_2.apply(squeeze_nan, axis=1)
feature_df_with_nan_sort_2.reset_index(level=0, inplace=True)
# get columns name an put them into a list
original_columns = feature_df_with_nan_sort_2.columns.tolist()
# Create new column names
new_columns = original_columns[:7] + original_columns[10:16] + original_columns[19:21] + original_columns[22:24]
# drop all columns with NaN
feature_df_2_phase = feature_df_with_nan_sort_2.dropna(axis=1)
feature_df_2_phase = feature_df_2_phase.apply(pd.to_numeric)
feature_df_2_phase.columns = new_columns

In [None]:
feature_df_2_phase.to_csv("feature_df_2_phase.csv", columns = new_columns, index=None)

## Feature for three phase events

In [None]:
%time feature_df_3_phase = create_feature_df(transients_3_phase)

In [None]:
feature_df_3_phase.reset_index(level=0, inplace=True)

In [None]:
feature_df_3_phase.head()

In [None]:
feature_df_3_phase.to_csv("feature_df_3_phase.csv", index=None)

In [None]:
test = pd.read_csv("feature_df_3_phase.csv")

In [None]:
test.head()

# Spielen

In [None]:
Z = linkage(feature_df, 'ward')

In [None]:
from scipy.cluster.hierarchy import cophenet
from scipy.spatial.distance import pdist

c, coph_dists = cophenet(Z, pdist(feature_df))
c

In [None]:
def fancy_dendrogram(*args, **kwargs):
    max_d = kwargs.pop('max_d', None)
    if max_d and 'color_threshold' not in kwargs:
        kwargs['color_threshold'] = max_d
    annotate_above = kwargs.pop('annotate_above', 0)

    ddata = dendrogram(*args, **kwargs)

    if not kwargs.get('no_plot', False):
        plt.title('Hierarchical Clustering Dendrogram (truncated)')
        plt.xlabel('sample index or (cluster size)')
        plt.ylabel('distance')
        for i, d, c in zip(ddata['icoord'], ddata['dcoord'], ddata['color_list']):
            x = 0.5 * sum(i[1:3])
            y = d[1]
            if y > annotate_above:
                plt.plot(x, y, 'o', c=c)
                plt.annotate("%.3g" % y, (x, y), xytext=(0, -5),
                             textcoords='offset points',
                             va='top', ha='center')
        if max_d:
            plt.axhline(y=max_d, c='k')
    return ddata

In [None]:
plt.figure(figsize=(10, 10))
plt.title('Hierarchical Clustering Dendrogram')
plt.xlabel('sample index')
plt.ylabel('distance')
fancy_dendrogram(
    Z,
    truncate_mode='lastp',
    p=7,
    leaf_rotation=90.,
    leaf_font_size=12.,
    show_contracted=True,
    annotate_above=10,
    max_d=16,
)
plt.show()

In [None]:
# create trasient data frame
raw_signal_current = data["raw_signal_current"]
raw_signal_voltage = data["raw_signal_voltage"]

# create dataframes
# on row contains on time series
# column is time or dimension
df_raw_signal_current = pd.DataFrame([x for x in raw_signal_current])
df_raw_signal_voltage = pd.DataFrame([x for x in raw_signal_voltage])

In [None]:
df_raw_signal_current.head(1)

In [None]:
df_raw_signal_voltage.head(1)

### Clustering

### Extract Features Voltage (mit tsfresh)

Quelle Bild:
https://tsfresh.readthedocs.io/en/latest/_images/feature_extraction_process_20160815_mc_1.png

In [None]:
from tsfresh import extract_features, extract_relevant_features, select_features
from tsfresh.utilities.dataframe_functions import impute
from tsfresh.feature_extraction import FeatureExtractionSettings

extraction_settings = FeatureExtractionSettings()
extraction_settings.IMPUTE = impute    # Fill in Infs and NaNs

# transpose since tsfresh reads times series data column-wise, not row-wise
df_raw_signal_voltage_t = df_raw_signal_voltage.copy().transpose()
df_raw_signal_voltage_t_cut = df_raw_signal_voltage_t[:100]

print(df_raw_signal_voltage_t_cut.shape)
len(df_raw_signal_voltage_t_cut.columns)

In [None]:
# rearrange voltage column-wise, not row-wise

master_df = pd.DataFrame(df_raw_signal_voltage_t_cut[0])
master_df['id'] = 0

bar = progressbar.ProgressBar()

with progressbar.ProgressBar(max_value=len(df_raw_signal_voltage_t_cut.columns)) as bar:
    for i in range(1,500): #len(df_raw_signal_voltage_t.columns)
        signal_df = pd.DataFrame(df_raw_signal_voltage_t_cut[i])
        signal_df['id'] = i
        master_df = pd.DataFrame(np.vstack([master_df, signal_df]))
        bar.update(i)

In [None]:
master_df.shape

In [None]:
new_master_df = master_df.dropna(how='any')
new_master_df.shape

In [None]:
%time X = extract_features(new_master_df, column_id=1, feature_extraction_settings=extraction_settings)