# Imports

In [3]:
import glob
import os
import pandas as pd
import numpy as np
# from scipy.spatial.distance import euclidean
# from tslearn.metrics import dtw_path
# from tslearn.clustering import TimeSeriesKMeans
from statsmodels.tsa.stattools import adfuller
# import evaluate

# Preparing methods

"""
# Function to calculate average Euclidean distance
def average_euclidean_distance(dataframe):
    distances = []
    for i in range(len(dataframe)):
        for j in range(i + 1, len(dataframe)):
            dist = euclidean(dataframe.iloc[i, 1:], dataframe.iloc[j, 1:])  # Skip the label column
            distances.append(dist)
    return np.mean(distances)

# Function to calculate average DTW distance with a learned window
def average_dtw_distance_learned(dataframe):
    model = TimeSeriesKMeans(n_clusters=len(np.unique(dataframe.iloc[:, 0])), metric="dtw", max_iter=10)
    model.fit(dataframe.iloc[:, 1:])  # Use only feature columns
    return model.inertia_

# Function to calculate average DTW distance with a fixed window of 100
def average_dtw_distance_fixed(dataframe, w=100):
    distances = []
    for i in range(len(dataframe)):
        for j in range(i + 1, len(dataframe)):
            path, dist = dtw_path(dataframe.iloc[i, 1:], dataframe.iloc[j, 1:], global_constraint="sakoe_chiba", sakoe_chiba_radius=w)  # Skip the label column
            distances.append(dist)
    return np.mean(distances)
"""

# Generating statistics

In [4]:
# Setup folder path for the prepared UCR datasets (TSV files)
prepared_datasets_folder_path = "Prepared_UCR_Datasets"

# Use glob to get a list of TSV file paths in the folder
tsv_files = glob.glob(os.path.join(prepared_datasets_folder_path, '*.tsv'))

# Initialize an empty DataFrame
columns = ['dataset_name', 'num_instances_train', 'num_instances_test', 'train_test_ratio', 'num_features', 'num_classes', 'label_skew', 'euclidean_distance', 'dynamic_time_warping_learned', 'dynamic_time_warping_fixed', 'default_rate', 'num_missing_values', 'perc_missing_values', 'num_duplicate_instances', 'perc_duplicate_instances', 'avg_autocorrelation']

overview_df = pd.DataFrame(columns=columns)

# Preparing the Huggingface Evaluation of label distribution
#distribution = evaluate.load("label_distribution")

# Loop through each TSV file and read it into a DataFrame - append the name of the file
for tsv_file in tsv_files:
    # Read the TSV file into a DataFrame
    df = pd.read_csv(tsv_file, sep='\t', header=None)

    # Extract the dataset name from the file path
    dataset_name = os.path.splitext(os.path.basename(tsv_file))[0]

    # Initialize an empty dictionary to store the statistical information
    stat_info = {}
    
    # Append basic statistical information
    try:
        # dataset name
        stat_info['dataset_name'] = dataset_name
        
        # number of rows/instances
        stat_info['num_instances'] = df.shape[0]

        # number of columns/features; subtract 1 for the class label
        stat_info['num_features'] = df.shape[1] - 1
        
        # select the first column (class label) and get the number of unique values
        stat_info['num_classes'] = len(df.iloc[:, 0].unique())

        # first sum: sum of missing values per column, second sum: sum of all columns
        stat_info['num_missing_values'] = df.isnull().sum().sum()   
        stat_info['perc_missing_values'] = (stat_info['num_missing_values'] / (stat_info['num_instances'] * stat_info['num_features'])) * 100
        
        # number of duplicate instances
        # keep=False marks all duplicates as True, sum() counts the Trues
        stat_info['num_duplicate_instances'] = df.duplicated().sum()
        stat_info['perc_duplicate_instances'] = (stat_info['num_duplicate_instances'] / stat_info['num_instances']) * 100
    except Exception as e:
        print(f"Error while calculating basic statistical information for {str(dataset_name)}: {str(e)}")
    """
    # Append distance information
    try:
        # Euclidean distance
        stat_info['euclidean_distance'] = average_euclidean_distance(df)
        
        # DTW distance with learned window
        stat_info['dynamic_time_warping_learned'] = average_dtw_distance_learned(df)
        
        # DTW distance with fixed window
        stat_info['dynamic_time_warping_fixed'] = average_dtw_distance_fixed(df)
    except Exception as e:
        print(f"Error while calculating distance information for {str(dataset_name)}: {str(e)}")
    """
    """
    # Append label skew information
    try:
        # get values of the first column (class labels)
        label_values = df.iloc[:, 0].values
        result = distribution.compute(data=label_values)

        # Append label skew information
        stat_info['label_skew'] = result['label_skew']
    except Exception as e:
        print(f"Error while calculating label skew for {str(dataset_name)}: {str(e)}")
    
    
    # Append autocorrelation information with try-except blocks
    try:
        stat_info['avg_autocorrelation'] = df.apply(lambda col: col.autocorr()).mean()
    except Exception as e:
        print(f"Error while calculating autocorrelation for {str(dataset_name)}: {str(e)}")
    """
    
    # Append the dictionary to the dataframe
    overview_df = overview_df.append(stat_info, ignore_index=True)
    
overview_df.to_csv('extractedMetrics.csv', index=False)

  overview_df = overview_df.append(stat_info, ignore_index=True)
  overview_df = overview_df.append(stat_info, ignore_index=True)
  overview_df = overview_df.append(stat_info, ignore_index=True)
  overview_df = overview_df.append(stat_info, ignore_index=True)
  overview_df = overview_df.append(stat_info, ignore_index=True)
  overview_df = overview_df.append(stat_info, ignore_index=True)
  overview_df = overview_df.append(stat_info, ignore_index=True)
  overview_df = overview_df.append(stat_info, ignore_index=True)
  overview_df = overview_df.append(stat_info, ignore_index=True)
  overview_df = overview_df.append(stat_info, ignore_index=True)
  overview_df = overview_df.append(stat_info, ignore_index=True)
  overview_df = overview_df.append(stat_info, ignore_index=True)
  overview_df = overview_df.append(stat_info, ignore_index=True)
  overview_df = overview_df.append(stat_info, ignore_index=True)
  overview_df = overview_df.append(stat_info, ignore_index=True)
  overview_df = overview_