<a href="https://colab.research.google.com/github/ReggaeUlli/BipedalWalker-gists/blob/master/Clustering_Feature_space.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Theory:
--

Timeseries clustering is very complex, to reduce the complexity we extract metadata from the invervals and cluster them with the more traditional data point clustering methods

Imports
--

In [122]:
!git clone https://github.com/philippwulff/behaviour_mining.git

fatal: destination path 'behaviour_mining' already exists and is not an empty directory.


In [123]:
import os

import numpy as np
import pandas as pd

from scipy.signal import find_peaks

from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.model_selection import RandomizedSearchCV

from sklearn.cluster import KMeans

import matplotlib.pyplot as plt
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots

# Get data

In [124]:
def read_all(folder, nrows=None):
    """
    Read all .csv files from a directory and return the pandas Dataframes.
    """
    dfs = {}

    for root,dirs,files in os.walk(folder):
        for file in files:
            if file.endswith('.csv'):
                path = os.path.join(root, str(file))
                print('Creating DataFrame from {}'.format(path))
                df = pd.read_csv(path, nrows=nrows)
                dfs[path] = df

    return dfs

# Creating the meta data from trajectories

Using these meta data features:

```
    columns = ['mean', 'std', 'median', 'quant_25', 'quant_75', 'loc_max_mean', 
               'loc_max_num', 'loc_min_mean', 'loc_min_num', 'freq_iv', 'strongest_freq']
```



```
# Als Code formatiert
```

Helper functions
--

In [126]:
def split_ser(ser, n=100):
    num_splits = int(len(ser)/n)
    splits = []
    for i in range(num_splits):
        splits.append(ser[i*n:(i+1)*n])

    return splits


def ser_to_meta(x, obs_name=None):
    mean = np.mean(x)
    std = np.std(x)
    median = np.median(sorted(x))
    quant_25 = np.quantile(sorted(x), 0.25)
    quant_75 = np.quantile(sorted(x), 0.75)

    # find all local maxima with a min distance of 10 between them
    loc_max, _ = find_peaks(x, distance=10)
    loc_max_mean = np.mean(x[loc_max])
    loc_max_num = len(loc_max)
    # find all local minima with a min distance of 10 between them
    loc_min, _ = find_peaks(x*-1, distance=10)
    loc_min_mean = np.mean(x[loc_min])
    loc_min_num = len(loc_min)

    # find n_bins strongest present frequencies with fast fourier transform
    n_bins = 5
    fft = np.fft.fft(x)
    T = 1 # sampling interval
    N = len(x)
    f = np.linspace(0, 1 / T, N)
    inds = np.argsort(np.abs(fft)[:N // 2])[-n_bins:] # get indexes of strongest freqs
    inds = inds[np.where(inds!=0)] # ignore the bin at f=0 (if this was a max) because this is the mean
    n_freq = f[inds]

    freq_iv = pd.Interval(min(n_freq), max(n_freq), closed='both')
    strongest_freq = f[inds[-1]] # np.argsort places hightest at the end

    sample = [mean,
              std,
              median,
              quant_25,
              quant_75,
              loc_max_mean,
              loc_max_num,
              loc_min_mean,
              loc_min_num,
              freq_iv,
              strongest_freq]
    columns = ['mean', 'std', 'median', 'quant_25', 'quant_75', 'loc_max_mean', 
               'loc_max_num', 'loc_min_mean', 'loc_min_num', 'freq_iv', 'strongest_freq']

    if obs_name:
        columns = [name+'_'+obs_name for name in columns]

    return sample, columns


def proc_ser(ser, model_name, obs_name=None, n=100):
    splits = split_ser(ser, n)
    metas = []
    columns = None
    for split in splits:
        meta, columns = ser_to_meta(split, obs_name)
        metas.append(meta)
        if not columns:
            columns = columns
    
    df_1 = pd.DataFrame([model_name]*len(metas), columns=['model'])
    df_2 = pd.DataFrame(metas, columns=columns)
    df = pd.concat([df_1,df_2], axis=1)
    return df
  
def get_data(n_interval_lenght=100):
  obs_names = dfs['behaviour_mining/data/BipedalWalker-v3/ppo2/ppo2_BipedalWalker-v3.csv'].columns.values.tolist()
  obs_names.remove("done")
  obs_names.remove("cumulative_reward")
  model_names = [key.split('/')[-2] for key in dfs.keys()]

  df_all = pd.DataFrame()

  for model_df, model_name in zip(dfs.values(), model_names):
    model_meta_df = pd.DataFrame()
    for obs_name in obs_names:
      series = model_df[obs_name].values
      meta_df = proc_ser(series, model_name, obs_name=obs_name)
      if (obs_name!="rewards"):
        meta_df = meta_df.drop(columns="model")
      model_meta_df = pd.concat([model_meta_df, meta_df], axis=1)
    df_all = pd.concat([df_all, model_meta_df], axis=0)
  print("done with importing data")
  print (df_all)
  return df_all

def remove_unstable_columns(df_all): 
  for obs_name in obs_names:
    #min_freqs, max_freqs = [], []
    #for iv in df_all['freq_iv']:
      #min_freqs.append(iv.left)
      #max_freqs.append(iv.right)
    #df_all['min_freq'+'_'+obs_name] = min_freqs
    #df_all['max_freq'+'_'+obs_name] = max_freqs
    df_all = df_all.drop('loc_max_mean'+'_'+obs_name, axis=1)
    df_all = df_all.drop('freq_iv'+'_'+obs_name, axis=1)
    df_all = df_all.drop('loc_min_mean'+'_'+obs_name, axis=1)
  df_all
  return df_all

def cluster(df_all, n_clusters=7):
  df_without_model = df_all.drop(columns="model")
  n_clusters=n_clusters
  km = KMeans(n_clusters=n_clusters)
  km.fit(df_without_model)
  prediction = km.predict(df_without_model)
  occurance_dict={}
  for i in range(n_clusters):
    temp_dict={}
    for name in model_names:
      temp_dict[name]=0
    occurance_dict[i] = temp_dict
  for i in range(len(prediction)):
    occurance_dict[prediction[i]][df_all.iloc[i,0]] = occurance_dict[prediction[i]][df_all.iloc[i,0]]+1
  print("number of intervals that an algorithms occurs in the clusters")
  for i in range(n_clusters):
    print("cluster "+str(i)+": "+str(occurance_dict[i]))

def get_data_and_cluster(n_clusters=7, n_interval_lenght=100):
  df_all = get_data(n_interval_lenght=n_interval_lenght)
  df_all = remove_unstable_columns(df_all)
  cluster(df_all, n_clusters=n_clusters)

Concatenate meta data from all models for one observation to a single DataFrame
--

If the trajectory of a model stays constantly 0, the np.mean() function will impute NaN as the mean value. 

#clustering

In [115]:
get_data_and_cluster(n_clusters=7, n_interval_lenght=100)


Mean of empty slice.


invalid value encountered in double_scalars



done with importing data
     model  ...  strongest_freq_action_3
0    acktr  ...                 0.070707
1    acktr  ...                 0.060606
2    acktr  ...                 0.090909
3    acktr  ...                 0.212121
4    acktr  ...                 0.060606
..     ...  ...                      ...
995    sac  ...                 0.494949
996    sac  ...                 0.494949
997    sac  ...                 0.494949
998    sac  ...                 0.494949
999    sac  ...                 0.494949

[7000 rows x 320 columns]
number of intervals that an algorithms occurs in the clusters
cluster 0: {'acktr': 56, 'td3': 0, 'trpo': 92, 'ppo2': 944, 'a2c': 232, 'ddpg': 0, 'sac': 404}
cluster 1: {'acktr': 0, 'td3': 7, 'trpo': 881, 'ppo2': 0, 'a2c': 7, 'ddpg': 0, 'sac': 409}
cluster 2: {'acktr': 6, 'td3': 5, 'trpo': 9, 'ppo2': 4, 'a2c': 8, 'ddpg': 5, 'sac': 6}
cluster 3: {'acktr': 2, 'td3': 0, 'trpo': 6, 'ppo2': 7, 'a2c': 11, 'ddpg': 193, 'sac': 0}
cluster 4: {'acktr': 933, 'td3'

In [116]:
get_data_and_cluster(n_clusters=7, n_interval_lenght=30)


Mean of empty slice.


invalid value encountered in double_scalars



done with importing data
     model  ...  strongest_freq_action_3
0    acktr  ...                 0.070707
1    acktr  ...                 0.060606
2    acktr  ...                 0.090909
3    acktr  ...                 0.212121
4    acktr  ...                 0.060606
..     ...  ...                      ...
995    sac  ...                 0.494949
996    sac  ...                 0.494949
997    sac  ...                 0.494949
998    sac  ...                 0.494949
999    sac  ...                 0.494949

[7000 rows x 320 columns]
number of intervals that an algorithms occurs in the clusters
cluster 0: {'acktr': 933, 'td3': 0, 'trpo': 0, 'ppo2': 45, 'a2c': 742, 'ddpg': 0, 'sac': 5}
cluster 1: {'acktr': 55, 'td3': 0, 'trpo': 92, 'ppo2': 944, 'a2c': 230, 'ddpg': 0, 'sac': 409}
cluster 2: {'acktr': 6, 'td3': 5, 'trpo': 9, 'ppo2': 4, 'a2c': 8, 'ddpg': 5, 'sac': 6}
cluster 3: {'acktr': 0, 'td3': 0, 'trpo': 0, 'ppo2': 0, 'a2c': 0, 'ddpg': 802, 'sac': 0}
cluster 4: {'acktr': 0, 'td3': 

In [117]:
get_data_and_cluster(n_clusters=7, n_interval_lenght=15)


Mean of empty slice.


invalid value encountered in double_scalars



done with importing data
     model  ...  strongest_freq_action_3
0    acktr  ...                 0.070707
1    acktr  ...                 0.060606
2    acktr  ...                 0.090909
3    acktr  ...                 0.212121
4    acktr  ...                 0.060606
..     ...  ...                      ...
995    sac  ...                 0.494949
996    sac  ...                 0.494949
997    sac  ...                 0.494949
998    sac  ...                 0.494949
999    sac  ...                 0.494949

[7000 rows x 320 columns]
number of intervals that an algorithms occurs in the clusters
cluster 0: {'acktr': 933, 'td3': 0, 'trpo': 0, 'ppo2': 45, 'a2c': 742, 'ddpg': 0, 'sac': 5}
cluster 1: {'acktr': 3, 'td3': 988, 'trpo': 12, 'ppo2': 0, 'a2c': 1, 'ddpg': 0, 'sac': 170}
cluster 2: {'acktr': 6, 'td3': 5, 'trpo': 9, 'ppo2': 4, 'a2c': 8, 'ddpg': 5, 'sac': 6}
cluster 3: {'acktr': 55, 'td3': 0, 'trpo': 92, 'ppo2': 944, 'a2c': 230, 'ddpg': 0, 'sac': 409}
cluster 4: {'acktr': 0, 'td3

In [127]:
get_data_and_cluster(n_clusters=3, n_interval_lenght=100)


Mean of empty slice.


invalid value encountered in double_scalars



done with importing data
     model  ...  strongest_freq_action_3
0    acktr  ...                 0.070707
1    acktr  ...                 0.060606
2    acktr  ...                 0.090909
3    acktr  ...                 0.212121
4    acktr  ...                 0.060606
..     ...  ...                      ...
995    sac  ...                 0.494949
996    sac  ...                 0.494949
997    sac  ...                 0.494949
998    sac  ...                 0.494949
999    sac  ...                 0.494949

[7000 rows x 320 columns]
number of intervals that an algorithms occurs in the clusters
cluster 0: {'acktr': 155, 'td3': 25, 'trpo': 122, 'ppo2': 989, 'a2c': 564, 'ddpg': 163, 'sac': 511}
cluster 1: {'acktr': 839, 'td3': 970, 'trpo': 869, 'ppo2': 7, 'a2c': 428, 'ddpg': 832, 'sac': 483}
cluster 2: {'acktr': 6, 'td3': 5, 'trpo': 9, 'ppo2': 4, 'a2c': 8, 'ddpg': 5, 'sac': 6}


In [128]:
get_data_and_cluster(n_clusters=3, n_interval_lenght=30)


Mean of empty slice.


invalid value encountered in double_scalars



done with importing data
     model  ...  strongest_freq_action_3
0    acktr  ...                 0.070707
1    acktr  ...                 0.060606
2    acktr  ...                 0.090909
3    acktr  ...                 0.212121
4    acktr  ...                 0.060606
..     ...  ...                      ...
995    sac  ...                 0.494949
996    sac  ...                 0.494949
997    sac  ...                 0.494949
998    sac  ...                 0.494949
999    sac  ...                 0.494949

[7000 rows x 320 columns]
number of intervals that an algorithms occurs in the clusters
cluster 0: {'acktr': 6, 'td3': 5, 'trpo': 9, 'ppo2': 4, 'a2c': 8, 'ddpg': 5, 'sac': 6}
cluster 1: {'acktr': 155, 'td3': 25, 'trpo': 122, 'ppo2': 989, 'a2c': 564, 'ddpg': 163, 'sac': 511}
cluster 2: {'acktr': 839, 'td3': 970, 'trpo': 869, 'ppo2': 7, 'a2c': 428, 'ddpg': 832, 'sac': 483}


In [129]:
get_data_and_cluster(n_clusters=3, n_interval_lenght=15)


Mean of empty slice.


invalid value encountered in double_scalars



done with importing data
     model  ...  strongest_freq_action_3
0    acktr  ...                 0.070707
1    acktr  ...                 0.060606
2    acktr  ...                 0.090909
3    acktr  ...                 0.212121
4    acktr  ...                 0.060606
..     ...  ...                      ...
995    sac  ...                 0.494949
996    sac  ...                 0.494949
997    sac  ...                 0.494949
998    sac  ...                 0.494949
999    sac  ...                 0.494949

[7000 rows x 320 columns]
number of intervals that an algorithms occurs in the clusters
cluster 0: {'acktr': 994, 'td3': 995, 'trpo': 988, 'ppo2': 995, 'a2c': 991, 'ddpg': 0, 'sac': 994}
cluster 1: {'acktr': 0, 'td3': 0, 'trpo': 3, 'ppo2': 1, 'a2c': 0, 'ddpg': 995, 'sac': 0}
cluster 2: {'acktr': 6, 'td3': 5, 'trpo': 9, 'ppo2': 4, 'a2c': 9, 'ddpg': 5, 'sac': 6}
