# Notatnik obliczający odległości między szeregami (euklides, dtw, lcss)

### Import bibliotek

In [15]:
from scipy.spatial import distance
import seaborn
import pandas as pd
from dtw import *
from tslearn import metrics
from tssearch.search.query_search import time_series_search
import warnings
warnings.filterwarnings('ignore')
import copy
import os

In [16]:
# Directory that receives every generated distance CSV file.
path = "../SimilaritiesData/"
# exist_ok=True creates the directory when it is missing and is a no-op when
# it already exists, without a separate (race-prone) existence check.
os.makedirs(path, exist_ok=True)

### Wywołanie algorytmów dla odległości euklidesowej, dtw oraz lcss

In [2]:
def calculate_euclidean_distance(a_cpu, b_cpu, a_ram, b_ram):
    """Return the Euclidean distances between two CPU and two RAM series.

    Args:
        a_cpu: CPU samples of the first series.
        b_cpu: CPU samples of the second series.
        a_ram: RAM samples of the first series.
        b_ram: RAM samples of the second series.

    Returns:
        A ``(cpu_distance, ram_distance)`` tuple.
    """
    series_pairs = ((a_cpu, b_cpu), (a_ram, b_ram))
    cpu_dst, ram_dst = [distance.euclidean(first, second) for first, second in series_pairs]
    return cpu_dst, ram_dst

def calculate_dtw_dtw_library(a_cpu, b_cpu, a_ram, b_ram):
    """Compute DTW alignments of the CPU and RAM series with the ``dtw`` library.

    Both alignments use the ``symmetric2`` step pattern.

    Args:
        a_cpu: CPU samples of the first series.
        b_cpu: CPU samples of the second series.
        a_ram: RAM samples of the first series.
        b_ram: RAM samples of the second series.

    Returns:
        A 4-tuple ``(cpu_normalized, cpu_distance, ram_normalized,
        ram_distance)`` read from the ``normalizedDistance`` and ``distance``
        attributes of the two alignments.
    """
    dtw_options = {"step_pattern": symmetric2, "keep_internals": True}

    cpu_alignment = dtw(a_cpu, b_cpu, **dtw_options)
    ram_alignment = dtw(a_ram, b_ram, **dtw_options)

    return (
        cpu_alignment.normalizedDistance,
        cpu_alignment.distance,
        ram_alignment.normalizedDistance,
        ram_alignment.distance,
    )

#def calculate_dtw_dtaidistance_library(a_cpu, b_cpu, a_ram, b_ram):
    #d_cpu = dd.dtw.distance(numpy.array(a_cpu), numpy.array(b_cpu), use_c=True, use_pruning=True)
    #d_ram = dd.dtw.distance(numpy.array(a_ram), numpy.array(b_ram))
    #return d_cpu, d_ram

def calculate_lcss(a_cpu, b_cpu, a_ram, b_ram, eps_cpu=2.0, eps_ram=1.5):
    """Return the LCSS similarity of the CPU and RAM series (tslearn).

    Args:
        a_cpu: CPU samples of the first series.
        b_cpu: CPU samples of the second series.
        a_ram: RAM samples of the first series.
        b_ram: RAM samples of the second series.
        eps_cpu: ``eps`` value forwarded to ``metrics.lcss_path`` for the CPU
            series. Defaults to the previously hard-coded 2.0.
        eps_ram: ``eps`` value forwarded to ``metrics.lcss_path`` for the RAM
            series. Defaults to the previously hard-coded 1.5.

    Returns:
        A ``(cpu_similarity, ram_similarity)`` tuple.
    """
    # lcss_path returns (path, similarity); only the similarity is needed.
    _, sim_lcss_cpu = metrics.lcss_path(a_cpu, b_cpu, eps=eps_cpu)
    _, sim_lcss_ram = metrics.lcss_path(a_ram, b_ram, eps=eps_ram)
    return sim_lcss_cpu, sim_lcss_ram

def calculate_lcss2(a_cpu, b_cpu, a_ram, b_ram):
    """Return the LCSS distance of the CPU and RAM series using tssearch.

    Alternative to ``calculate_lcss`` that goes through
    ``time_series_search`` with a "Longest Common Subsequence" configuration
    (``eps`` 1.5, reporting the distance) for both CPU and RAM.

    Args:
        a_cpu: CPU samples of the first series.
        b_cpu: CPU samples of the second series.
        a_ram: RAM samples of the first series.
        b_ram: RAM samples of the second series.

    Returns:
        A ``(cpu_result, ram_result)`` tuple holding the
        ``["Longest Common Subsequence"]["distance"]`` entry of each search.
    """
    # The notebook never imports numpy explicitly (it may only be reachable
    # through `from dtw import *`), so import it here to avoid a NameError.
    import numpy

    dict_distances = {
        "elastic": {"Longest Common Subsequence": {
            "multivariate": "yes",
            "description": "",
            "function": "lcss",
            "parameters": {"eps": 1.5, "report": "distance"},
            "use": "yes"}
        }
    }
    sim_lcss_cpu = time_series_search(dict_distances, numpy.array(a_cpu), numpy.array(b_cpu), output=("number", 1))["Longest Common Subsequence"]["distance"]
    sim_lcss_ram = time_series_search(dict_distances, numpy.array(a_ram), numpy.array(b_ram), output=("number", 1))["Longest Common Subsequence"]["distance"]
    return sim_lcss_cpu, sim_lcss_ram

### Wygenerowanie plików CSV z obliczonymi odległościami

In [3]:
# Column layout of every generated distance CSV file (order matters: the
# header row is written from this list).
columns = [
    'dataType',
    'function1',
    'snapshot1',
    'function2',
    'snapshot2',
    'euclidean',
    'dtw-python-distance',
    'dtw-python-normalizedDistance',
    'lcss_similarity',
    'size1',
    'size2',
    'udf1',
    'udf2',
]

In [4]:
def _build_distance_row(data_type, data, data2, euclidean, dtw_distance,
                        dtw_normalized, lcss_similarity):
    """Assemble one result row comparing ``data`` with ``data2`` for one metric type."""
    return {
        'dataType': data_type,
        'function1': data['label'].max(),
        'snapshot1': data['snapshot'].max(),
        'udf1': data['udf'].max(),
        'function2': data2['label'].max(),
        'snapshot2': data2['snapshot'].max(),
        'udf2': data2['udf'].max(),
        'euclidean': euclidean,
        'dtw-python-distance': dtw_distance,
        'dtw-python-normalizedDistance': dtw_normalized,
        'lcss_similarity': lcss_similarity,
        'size1': data['size'].max(),
        'size2': data2['size'].max(),
    }


def calculate_distance_and_save_as_df(df, data, data2):
    """Append the CPU and RAM distances between two snapshots to ``df``.

    Euclidean, DTW (raw and normalized) and LCSS values are computed for the
    ``CPU`` and ``RAM`` columns of the two frames and stored as two new rows
    (``dataType`` "CPU" and "RAM").

    Args:
        df: Frame that accumulates the result rows.
        data: Rows of the first snapshot; the ``CPU``, ``RAM``, ``label``,
            ``snapshot``, ``udf`` and ``size`` columns are read.
        data2: Rows of the second snapshot, same columns as ``data``.

    Returns:
        A new frame made of ``df`` followed by the CPU row and the RAM row.
        ``df`` itself is not modified.
    """
    a_cpu = list(data['CPU'])
    b_cpu = list(data2['CPU'])
    a_ram = list(data['RAM'])
    b_ram = list(data2['RAM'])

    euc_cpu_dist, euc_ram_dist = calculate_euclidean_distance(a_cpu, b_cpu, a_ram, b_ram)
    dtw_cpu_dist_normalized, dtw_cpu_dist, dtw_ram_dist_normalized, dtw_ram_dist = \
        calculate_dtw_dtw_library(a_cpu, b_cpu, a_ram, b_ram)
    lcss_distance_cpu, lcss_distance_ram = calculate_lcss(a_cpu, b_cpu, a_ram, b_ram)

    # The CPU row must carry the CPU normalized DTW distance; it previously
    # stored the RAM value by mistake.
    new_cpu_row = _build_distance_row(
        'CPU', data, data2,
        euc_cpu_dist, dtw_cpu_dist, dtw_cpu_dist_normalized, lcss_distance_cpu,
    )
    new_ram_row = _build_distance_row(
        'RAM', data, data2,
        euc_ram_dist, dtw_ram_dist, dtw_ram_dist_normalized, lcss_distance_ram,
    )

    # DataFrame.append was removed in pandas 2.0; pd.concat is the supported
    # equivalent and keeps the column order of ``df``.
    new_rows = pd.DataFrame([new_cpu_row, new_ram_row])
    return pd.concat([df, new_rows], ignore_index=True)

In [5]:
def create_file_with_distances(data_path, result_path):
    """Compute the pairwise snapshot distances of a dataset and write them to CSV.

    The dataset is split by its ``snapshot`` column. For every snapshot the
    distances to itself and to every later snapshot (in sorted snapshot
    order) are computed, i.e. one half of the distance matrix including the
    diagonal. Results are appended to ``result_path`` after each outer
    iteration.

    Args:
        data_path: CSV file with the time series; must contain a ``snapshot``
            column plus the columns read by
            ``calculate_distance_and_save_as_df``.
        result_path: CSV file to create; an existing file is overwritten.
    """
    full_df = pd.read_csv(data_path)

    # Group once instead of re-filtering the whole frame for every pair.
    # groupby iterates the snapshot values in sorted order.
    snapshot_frames = [frame for _, frame in full_df.groupby('snapshot')]

    # Start the result file with only the header row.
    pd.DataFrame([], columns=columns).to_csv(result_path, index=False)

    for i, data_1 in enumerate(snapshot_frames):
        data_frame2 = pd.DataFrame([], columns=columns)
        # Only pairs (i, j) with j >= i are computed.
        for data_2 in snapshot_frames[i:]:
            data_frame2 = calculate_distance_and_save_as_df(data_frame2, data_1, data_2)
        # index=False keeps the appended rows aligned with the header, which
        # was itself written without an index column.
        data_frame2.to_csv(result_path, mode='a', header=False, index=False)
        print(f"Przetworzono {i}")

In [None]:
# Distances for the raw ("Default") test dataset.
create_file_with_distances(
    data_path="../MachineLearning/ts_datasets/Default/Default_TEST.csv",
    result_path=f"{path}comparison_distance_all.csv",
)

In [None]:
# Distances for the smoothed ("Default_smooth") test dataset.
create_file_with_distances(
    data_path="../MachineLearning/ts_datasets/Default_smooth/Default_smooth_TEST.csv",
    result_path=f"{path}smooth_comparison_distance_all.csv",
)

In [None]:
# Distances for the normalized ("Normalized") test dataset.
create_file_with_distances(
    data_path="../MachineLearning/ts_datasets/Normalized/Normalized_TEST.csv",
    result_path=f"{path}normalized_comparison_distance_all.csv",
)

In [None]:
# Distances for the normalized and smoothed ("Normalized_smooth") test dataset.
create_file_with_distances(
    data_path="../MachineLearning/ts_datasets/Normalized_smooth/Normalized_smooth_TEST.csv",
    result_path=f"{path}normalized_smooth_comparison_distance_all.csv",
)

### Wypełnienie drugiej połowy macierzy zawierającej odległości

In [12]:
def fill_second_half_of_matrix(data_path):
    """Add the mirrored rows of a half distance matrix and rewrite the CSV.

    For every row whose ``snapshot1`` differs from ``snapshot2`` a copy with
    the ``function``, ``udf``, ``snapshot`` and ``size`` columns of the two
    sides swapped is appended after the original rows. All other columns
    (the distance values) are copied unchanged. Rows on the diagonal are not
    duplicated.

    The file is overwritten in place, so calling this twice on the same file
    mirrors the already mirrored rows again.

    Args:
        data_path: CSV file to read and overwrite.
    """
    distance_df = pd.read_csv(data_path)
    distance_df.reset_index(drop=True, inplace=True)

    # Renaming with a two-way mapping swaps each column pair in one step.
    swapped_names = {}
    for field in ('function', 'udf', 'snapshot', 'size'):
        swapped_names[f'{field}1'] = f'{field}2'
        swapped_names[f'{field}2'] = f'{field}1'

    off_diagonal = distance_df[distance_df['snapshot1'] != distance_df['snapshot2']]
    # Restore the original column order after the rename.
    mirrored = off_diagonal.rename(columns=swapped_names)[distance_df.columns]

    # One concat replaces the per-row DataFrame.append, which was removed in
    # pandas 2.0 and copied the whole frame on every iteration.
    data_frame = pd.concat([distance_df, mirrored], ignore_index=True)
    data_frame.to_csv(data_path)

In [None]:
# Mirror every generated half-matrix file in place, in the same order as
# the files were produced.
for distance_file in (
    "comparison_distance_all.csv",
    "smooth_comparison_distance_all.csv",
    "normalized_comparison_distance_all.csv",
    "normalized_smooth_comparison_distance_all.csv",
):
    fill_second_half_of_matrix(f"{path}{distance_file}")