# Przygotowanie zbiorów danych testowych i treningowych

Notatnik przygotowujący dane z folderu Organised do eksperymentów. Dzieli on zbiór na testowy i treningowy i zapisuje go w formacie, który jest gotowy do odczytu przez wybraną bibliotekę sktime.
Dodatkowo zapisuje oryginalny zbiór danych w csv do eksperymentów związanych z badaniem podobieństwa.

## Przydatne funkcje i importy bibliotek

In [50]:
import random
import pandas as pd
import numpy as np
from os.path import exists
from os import makedirs
from sklearn.model_selection import train_test_split
from sktime.datatypes._panel._convert import (
    from_multi_index_to_nested,
)
# Base output directory; write_ts_file resolves it relative to the parent of
# the current working directory.
DATA_PATH = "MachineLearning/ts_datasets"

# Random state shared by the sampling helpers and train_test_split.
seed = 42

# UDF type names, in the order used for the numeric class ids.
udf_types = [
    'aggregation',
    'filtration',
    'filtration-aggregation',
    'filtration-aggregation-join',
    'filtration-join',
]

# Number of series drawn per data size for each UDF type in the data-volume
# experiments; "filtration" uses a smaller draw than the others.
samples = {udf_type: 300 for udf_type in udf_types}
samples["filtration"] = 150
    
def remap_labels(label):
    """Translate a UDF type name into its integer class id.

    Args:
        label (str): UDF type name to translate.

    Returns:
        int: Class id assigned to the name:
            0 - aggregation
            1 - filtration
            2 - filtration-aggregation
            3 - filtration-aggregation-join
            4 - filtration-join (also returned for any other value)
    """
    ordered_names = (
        "aggregation",
        "filtration",
        "filtration-aggregation",
        "filtration-aggregation-join",
    )
    for class_id, name in enumerate(ordered_names):
        if label == name:
            return class_id
    # Every value not matched above falls through to the last class.
    return 4

def grow_snapshots(snapshot, label, max_snapshots):
    """Shift a snapshot id by a per-label offset so ids form one growing sequence.

    Each UDF type is moved by a multiple of ``max_snapshots``: 0 for
    "aggregation", 1 for "filtration", 2 for "filtration-aggregation",
    3 for "filtration-aggregation-join" and 4 for every other label.

    Args:
        snapshot (int): Original snapshot id.
        label (str): UDF type name the snapshot belongs to.
        max_snapshots (int): Largest snapshot id; used as the offset step.

    Returns:
        int: The shifted snapshot id.
    """
    # NOTE(review): shifted ranges stay disjoint only if ids never exceed
    # max_snapshots and do not start at 0 - confirm against the source data.
    ordered_names = (
        "aggregation",
        "filtration",
        "filtration-aggregation",
        "filtration-aggregation-join",
    )
    multiplier = len(ordered_names)
    for position, name in enumerate(ordered_names):
        if label == name:
            multiplier = position
            break

    if multiplier == 0:
        # The first type keeps its ids untouched.
        return snapshot
    return snapshot + multiplier * max_snapshots

def read_dataset_for_sktime_udf(udf_types, organised_directory, num_of_samples = 0, include_RAM = True, file_name = "Time_series_udf_dataset.csv", seed = 42):
    """Read the time-series CSV and build a UDF-type classification dataset.

    The source file is read from ``./../{organised_directory}/{file_name}``.
    Snapshot ids are shifted per UDF type with ``grow_snapshots`` so that they
    are unique across types, every series is zero-padded to the length of the
    longest one, and the result is converted to the sktime nested format.

    Args:
        udf_types (list): UDF type names to sample from.
        organised_directory (str): Name of the Organised folder.
        num_of_samples (int, optional): Number of snapshots drawn per UDF
            type. With 0 (default) the whole file is used without sampling.
        include_RAM (bool, optional): Whether the RAM series is part of the
            nested dataset. Defaults to True.
        file_name (str, optional): Source file name. Defaults to
            "Time_series_udf_dataset.csv".
        seed (int, optional): Random state for sampling. Defaults to 42.

    Returns:
        tuple: ``(X, y, original_df)`` where ``X`` is the nested sktime
        DataFrame, ``y`` the numeric class ids (see ``remap_labels``) in the
        same order as the rows of ``X``, and ``original_df`` a long-format
        frame with the columns snapshot, label, udf, row_number, CPU, RAM and
        size. ``original_df`` always contains RAM, also when ``include_RAM``
        is False.

    Raises:
        ValueError: If a UDF type has fewer than ``num_of_samples`` snapshots.
    """
    random.seed(seed)
    full_df = pd.read_csv(f"./../{organised_directory}/{file_name}")
    # Offset step for grow_snapshots; taken from the complete file so that it
    # does not depend on which snapshots get sampled.
    max_snapshot = full_df["snapshot"].max()

    if num_of_samples > 0:
        sampled_parts = []
        for udf_type in udf_types:
            partial_df = full_df.loc[full_df.label == udf_type]
            # Draw from the ids that are really present. Sampling from
            # range(max) can pick ids that do not exist and never picks the
            # highest id, which yields fewer series than requested.
            available = sorted(partial_df.snapshot.unique())
            chosen = random.sample(available, num_of_samples)
            sampled_parts.append(partial_df.loc[partial_df.snapshot.isin(chosen)])
        full_df = pd.concat(sampled_parts)

    full_df["snapshot"] = full_df.apply(lambda row: grow_snapshots(row.snapshot, row.label, max_snapshot), axis=1)

    # The panel built below is ordered by snapshot id. Sorting first (stable,
    # so the order of rows inside a snapshot is kept) guarantees that the
    # per-snapshot metadata lines up with it regardless of the file order.
    full_df = full_df.sort_values("snapshot", kind="mergesort").reset_index(drop=True)

    # NOTE(review): assumes every snapshot has exactly one row with
    # epoch == 0.0 - confirm against the source CSV.
    first_rows = full_df[full_df.epoch == 0.0]
    ts_y = first_rows.label.apply(remap_labels).to_numpy()
    labels = first_rows.label.to_numpy()
    udf = first_rows.udf.to_numpy()
    size = first_rows["size"].to_numpy()

    # Pad every snapshot to the same number of rows; missing rows are filled
    # with zeros in all columns.
    df = full_df.set_index(["snapshot", full_df.groupby("snapshot").cumcount()])
    index = pd.MultiIndex.from_product(df.index.levels, names=df.index.names)
    output = df.reindex(index, fill_value=0).reset_index(level=1, drop=True).reset_index()
    output["row_number"] = output.groupby("snapshot").cumcount()

    # The long-format frame always carries CPU and RAM; only the nested
    # dataset honours include_RAM.
    indexed = output[["snapshot", "row_number", "CPU", "RAM"]].set_index(["snapshot", "row_number"])
    ts_x = indexed if include_RAM else indexed[["CPU"]]

    original_df = indexed.copy()
    max_length = original_df.groupby("snapshot")["CPU"].count().max()
    original_df["label"] = np.repeat(labels, max_length)
    original_df["udf"] = np.repeat(udf, max_length)
    original_df["size"] = np.repeat(size, max_length)

    return from_multi_index_to_nested(ts_x), ts_y, original_df.reset_index()[["snapshot", "label", "udf", "row_number", "CPU", "RAM", "size"]]

def read_dataset_for_sktime_size(udf_type, organised_directory, num_of_samples = 0, include_RAM = True, file_name = "Time_series_udf_dataset.csv", seed = 42, sizes = (1, 2)):
    """Read the time-series CSV and build a data-volume classification dataset.

    The source file is read from ``./../{organised_directory}/{file_name}``
    and restricted to one UDF type. Every series is zero-padded to the length
    of the longest one and the result is converted to the sktime nested
    format; the class label is the value of the ``size`` column.

    Args:
        udf_type (str): UDF type name to keep.
        organised_directory (str): Name of the Organised folder.
        num_of_samples (int, optional): Number of snapshots drawn per size
            value. With 0 (default) all snapshots of the UDF type are used.
        include_RAM (bool, optional): Whether the RAM series is part of the
            nested dataset. Defaults to True.
        file_name (str, optional): Source file name. Defaults to
            "Time_series_udf_dataset.csv".
        seed (int, optional): Random state for sampling. Defaults to 42.
        sizes (iterable, optional): Values of the ``size`` column to sample
            from when ``num_of_samples`` is positive. Defaults to (1, 2).

    Returns:
        tuple: ``(X, y, original_df)`` where ``X`` is the nested sktime
        DataFrame, ``y`` the size value of each series in the same order as
        the rows of ``X``, and ``original_df`` a long-format frame with the
        columns snapshot, label, udf, row_number, CPU, RAM and size.
        ``original_df`` always contains RAM, also when ``include_RAM`` is
        False.

    Raises:
        ValueError: If a size value has fewer than ``num_of_samples`` snapshots.
    """
    random.seed(seed)
    full_df = pd.read_csv(f"./../{organised_directory}/{file_name}")
    type_df = full_df.loc[full_df.label == udf_type].copy()

    if num_of_samples > 0:
        sampled_parts = []
        for size_value in sizes:
            partial_df = type_df.loc[type_df["size"] == size_value]
            # Candidate ids are built the same way as before so that a given
            # seed keeps selecting the same snapshots.
            candidates = list(set(partial_df.snapshot.to_numpy()))
            chosen = random.sample(candidates, num_of_samples)
            sampled_parts.append(partial_df.loc[partial_df.snapshot.isin(chosen)])
        selected_df = pd.concat(sampled_parts)
    else:
        # Without sampling the dataset must still be limited to udf_type;
        # using the unfiltered file would mix all UDF types together.
        selected_df = type_df

    # The panel built below is ordered by snapshot id. Sorting first (stable,
    # so the order of rows inside a snapshot is kept) guarantees that the
    # per-snapshot metadata lines up with it.
    selected_df = selected_df.sort_values("snapshot", kind="mergesort").reset_index(drop=True)

    # NOTE(review): assumes every snapshot has exactly one row with
    # epoch == 0.0 - confirm against the source CSV.
    first_rows = selected_df[selected_df.epoch == 0.0]
    ts_y = first_rows["size"].to_numpy()
    labels = first_rows.label.to_numpy()
    udf = first_rows.udf.to_numpy()
    size = first_rows["size"].to_numpy()

    # Pad every snapshot to the same number of rows; missing rows are filled
    # with zeros in all columns.
    df = selected_df.set_index(["snapshot", selected_df.groupby("snapshot").cumcount()])
    index = pd.MultiIndex.from_product(df.index.levels, names=df.index.names)
    output = df.reindex(index, fill_value=0).reset_index(level=1, drop=True).reset_index()
    output["row_number"] = output.groupby("snapshot").cumcount()

    # The long-format frame always carries CPU and RAM; only the nested
    # dataset honours include_RAM.
    indexed = output[["snapshot", "row_number", "CPU", "RAM"]].set_index(["snapshot", "row_number"])
    ts_x = indexed if include_RAM else indexed[["CPU"]]

    original_df = indexed.copy()
    max_length = original_df.groupby("snapshot")["CPU"].count().max()
    original_df["label"] = np.repeat(labels, max_length)
    original_df["udf"] = np.repeat(udf, max_length)
    original_df["size"] = np.repeat(size, max_length)

    return from_multi_index_to_nested(ts_x), ts_y, original_df.reset_index()[["snapshot", "label", "udf", "row_number", "CPU", "RAM", "size"]]


def _write_ts_split(file_path, header, X, y, dimensions):
    """Write one split (header plus one line per series) to a .ts file."""
    with open(file_path, "w") as ts_file:
        ts_file.write(header)
        for position, (_, series_row) in enumerate(X.iterrows()):
            # One comma-separated field per dimension, then the class label,
            # all joined with ':'.
            fields = [','.join(str(value) for value in series_row[name].to_list()) for name in dimensions]
            fields.append(str(y[position]))
            ts_file.write(':'.join(fields) + "\n")


def write_ts_file(base_path, dataset_name, possible_labels, X_train, X_test, y_train, y_test, base_df):
    """Save the train/test split as .ts files and the matching rows as CSV.

    Four files are written to ``./../{base_path}/{dataset_name}/``:
    ``{dataset_name}_TRAIN.ts``, ``{dataset_name}_TEST.ts`` and the two CSV
    files ``{dataset_name}_TRAIN.csv`` / ``{dataset_name}_TEST.csv`` holding
    the rows of ``base_df`` whose snapshot is in the respective split. The
    directory is created when missing.

    Args:
        base_path (str): Base path of the dataset directory.
        dataset_name (str): Dataset name; used for the directory and files.
        possible_labels (str or iterable): Class labels for the header. A
            string is written as given; any other iterable is written as
            space-separated values in sorted order.
        X_train (pandas.DataFrame): Nested X of the training set, indexed by
            snapshot, with a "CPU" and optionally a "RAM" column.
        X_test (pandas.DataFrame): Nested X of the test set.
        y_train (numpy.array): y of the training set.
        y_test (numpy.array): y of the test set.
        base_df (pandas.DataFrame): Long-format data with a "snapshot" column.
    """
    target_dir = f"./../{base_path}/{dataset_name}"
    makedirs(target_dir, exist_ok=True)

    # Interpolating a set directly would write its Python repr
    # ("{0, 1, ...}") into the header instead of space-separated labels.
    if isinstance(possible_labels, str):
        label_text = possible_labels
    else:
        try:
            ordered_labels = sorted(possible_labels)
        except TypeError:
            # Labels of mixed types cannot be compared; order them by text.
            ordered_labels = sorted(possible_labels, key=str)
        label_text = ' '.join(str(item) for item in ordered_labels)

    # CPU is written before RAM; RAM is skipped when X does not contain it.
    dimensions = [name for name in ("CPU", "RAM") if name in X_train.columns]
    univariate = "true" if len(dimensions) == 1 else "false"

    header = f"@problemName {dataset_name}\n@timeStamps false\n@classLabel true {label_text}\n@univariate {univariate}\n@data\n"

    _write_ts_split(f"{target_dir}/{dataset_name}_TRAIN.ts", header, X_train, y_train, dimensions)
    _write_ts_split(f"{target_dir}/{dataset_name}_TEST.ts", header, X_test, y_test, dimensions)

    # The index of the nested frames holds the snapshot ids of each split.
    base_df.loc[base_df["snapshot"].isin(X_train.index)].copy().to_csv(f"{target_dir}/{dataset_name}_TRAIN.csv", index = False)
    base_df.loc[base_df["snapshot"].isin(X_test.index)].copy().to_csv(f"{target_dir}/{dataset_name}_TEST.csv", index = False)

## Stworzenie zbiorów danych w formacie ts i csv
Domyślnie ustawiono liczbę próbek na 300, aby zachować balans danych (filtracja ma tylko 300 próbek).

Wszystkie pliki lądują w folderze MachineLearning/ts_datasets/<nazwa_zbioru> z podziałem na test i train.

### Default - zbiór danych z pliku Time_series_udf_dataset.csv, który zawiera nieprzetworzone dane.

In [31]:
# "Default" dataset: raw measurements, 300 series per UDF type, 67/33 split.
df_x, df_y, original_df = read_dataset_for_sktime_udf(
    udf_types,
    "Organised",
    num_of_samples=300,
    file_name="Time_series_udf_dataset.csv",
    seed=seed,
)
X_train, X_test, y_train, y_test = train_test_split(
    df_x, df_y, test_size=0.33, random_state=seed
)

write_ts_file(
    DATA_PATH, "Default", set(df_y), X_train, X_test, y_train, y_test, original_df
)

### Normalized - zbiór danych z pliku Time_series_udf_dataset_normalized.csv, który zawiera dane po normalizacji.

In [32]:
# "Normalized" dataset: normalized measurements, 300 series per UDF type, 67/33 split.
df_x, df_y, original_df = read_dataset_for_sktime_udf(
    udf_types,
    "Organised",
    num_of_samples=300,
    file_name="Time_series_udf_dataset_normalized.csv",
    seed=seed,
)
X_train, X_test, y_train, y_test = train_test_split(
    df_x, df_y, test_size=0.33, random_state=seed
)

write_ts_file(
    DATA_PATH, "Normalized", set(df_y), X_train, X_test, y_train, y_test, original_df
)

### Default_smooth - zbiór danych z pliku Time_series_udf_dataset_smooth_6.csv, który zawiera wygładzone nieprzetworzone dane.

In [33]:
# "Default_smooth" dataset: smoothed raw measurements, 300 series per UDF type, 67/33 split.
df_x, df_y, original_df = read_dataset_for_sktime_udf(
    udf_types,
    "Organised",
    num_of_samples=300,
    file_name="Time_series_udf_dataset_smooth_6.csv",
    seed=seed,
)
X_train, X_test, y_train, y_test = train_test_split(
    df_x, df_y, test_size=0.33, random_state=seed
)

write_ts_file(
    DATA_PATH, "Default_smooth", set(df_y), X_train, X_test, y_train, y_test, original_df
)

### Normalized_smooth - zbiór danych z pliku Time_series_udf_dataset_normalized_smooth_6.csv, który zawiera wygładzone znormalizowane dane.

In [34]:
# "Normalized_smooth" dataset: smoothed normalized measurements, 300 series per UDF type, 67/33 split.
df_x, df_y, original_df = read_dataset_for_sktime_udf(
    udf_types,
    "Organised",
    num_of_samples=300,
    file_name="Time_series_udf_dataset_normalized_smooth_6.csv",
    seed=seed,
)
X_train, X_test, y_train, y_test = train_test_split(
    df_x, df_y, test_size=0.33, random_state=seed
)

write_ts_file(
    DATA_PATH, "Normalized_smooth", set(df_y), X_train, X_test, y_train, y_test, original_df
)

## Dane do eksperymentów związanych z wolumenem danych

### Default

In [46]:
# One "Default" data-volume dataset per UDF type, stored under DATA_PATH/<udf_type>.
for udf_type in udf_types:
    df_x, df_y, original_df = read_dataset_for_sktime_size(
        udf_type,
        "Organised",
        num_of_samples=samples[udf_type],
        file_name="Time_series_udf_dataset.csv",
        seed=seed,
    )
    X_train, X_test, y_train, y_test = train_test_split(
        df_x, df_y, test_size=0.33, random_state=seed
    )

    write_ts_file(
        f"{DATA_PATH}/{udf_type}", "Default", set(df_y), X_train, X_test, y_train, y_test, original_df
    )

### Normalized

In [47]:
# One "Normalized" data-volume dataset per UDF type, stored under DATA_PATH/<udf_type>.
for udf_type in udf_types:
    df_x, df_y, original_df = read_dataset_for_sktime_size(
        udf_type,
        "Organised",
        num_of_samples=samples[udf_type],
        file_name="Time_series_udf_dataset_normalized.csv",
        seed=seed,
    )
    X_train, X_test, y_train, y_test = train_test_split(
        df_x, df_y, test_size=0.33, random_state=seed
    )

    write_ts_file(
        f"{DATA_PATH}/{udf_type}", "Normalized", set(df_y), X_train, X_test, y_train, y_test, original_df
    )

### Default_smooth

In [48]:
# One "Default_smooth" data-volume dataset per UDF type, stored under DATA_PATH/<udf_type>.
for udf_type in udf_types:
    df_x, df_y, original_df = read_dataset_for_sktime_size(
        udf_type,
        "Organised",
        num_of_samples=samples[udf_type],
        file_name="Time_series_udf_dataset_smooth_6.csv",
        seed=seed,
    )
    X_train, X_test, y_train, y_test = train_test_split(
        df_x, df_y, test_size=0.33, random_state=seed
    )

    write_ts_file(
        f"{DATA_PATH}/{udf_type}", "Default_smooth", set(df_y), X_train, X_test, y_train, y_test, original_df
    )

### Normalized_smooth

In [49]:
# One "Normalized_smooth" data-volume dataset per UDF type, stored under DATA_PATH/<udf_type>.
for udf_type in udf_types:
    df_x, df_y, original_df = read_dataset_for_sktime_size(
        udf_type,
        "Organised",
        num_of_samples=samples[udf_type],
        file_name="Time_series_udf_dataset_normalized_smooth_6.csv",
        seed=seed,
    )
    X_train, X_test, y_train, y_test = train_test_split(
        df_x, df_y, test_size=0.33, random_state=seed
    )

    write_ts_file(
        f"{DATA_PATH}/{udf_type}", "Normalized_smooth", set(df_y), X_train, X_test, y_train, y_test, original_df
    )