# Imports

In [10]:
import glob
import os
import pandas as pd
import evaluate

# Generating statistics

In [None]:
# Build one row of summary statistics per prepared UCR dataset and write the
# resulting table to 'extractedMetrics.csv'.
#
# Each TSV file is read without a header; column 0 is treated as the class
# label and every remaining column as a feature of the instance in that row.

# Folder holding the prepared UCR datasets (TSV files)
prepared_datasets_folder_path = "Prepared_UCR_Datasets"

# Collect the TSV file paths. glob returns them in arbitrary order, so sort
# them to make the row order of the output CSV reproducible.
tsv_files = sorted(glob.glob(os.path.join(prepared_datasets_folder_path, '*.tsv')))

# Columns of the overview table. Only some of them are filled in by this
# script; the others are written as empty cells.
columns = [
    'dataset_name',
    'num_instances_train',
    'num_instances_test',
    'train_test_ratio',
    'num_features',
    'num_classes',
    'label_skew',
    'euclidean_distance',
    'dynamic_time_warping_learned',
    'dynamic_time_warping_fixed',
    'default_rate',
    'num_missing_values',
    'perc_missing_values',
    'num_duplicate_instances',
    'perc_duplicate_instances',
    'avg_autocorrelation',
]

# Preparing the Huggingface Evaluation of label distribution
distribution = evaluate.load("label_distribution")

# One dict per dataset. The DataFrame is built once after the loop instead of
# concatenating row by row, which copies the whole table on every iteration
# and warns on recent pandas when the initial frame is empty.
stat_rows = []

for tsv_file in tsv_files:
    # Read the TSV file into a DataFrame
    df = pd.read_csv(tsv_file, sep='\t', header=None)

    # Dataset name = file name without directory and extension
    dataset_name = os.path.splitext(os.path.basename(tsv_file))[0]

    # Statistics gathered for this dataset; a key that is missing because its
    # computation failed ends up as an empty cell in the CSV.
    stat_info = {'dataset_name': dataset_name}

    # Basic statistical information
    try:
        # number of features (all columns except the label column)
        stat_info['num_features'] = df.shape[1] - 1

        # first sum: missing values per column, second sum: total over columns
        stat_info['num_missing_values'] = df.isnull().sum().sum()

        # duplicated() uses keep='first' by default: the first occurrence of a
        # row is not flagged, every later repeat is. The sum is therefore the
        # number of redundant rows, not the number of rows that have a twin.
        stat_info['num_duplicate_instances'] = df.duplicated().sum()
    except Exception as e:
        print(f"Error while calculating basic statistical information for {dataset_name}: {e}")

    # Label skew information
    try:
        # values of the first column (class labels)
        label_values = df.iloc[:, 0].values
        result = distribution.compute(data=label_values)
        stat_info['label_skew'] = result['label_skew']
    except Exception as e:
        print(f"Error while calculating label skew for {dataset_name}: {e}")

    # Average autocorrelation
    try:
        # Lag-1 autocorrelation is computed along each instance (row) over the
        # feature columns only. The label column is excluded, and going
        # column-wise would correlate consecutive instances with each other
        # rather than consecutive values within one instance.
        feature_values = df.iloc[:, 1:]
        per_instance_autocorr = feature_values.apply(
            lambda row: row.autocorr(), axis=1
        )
        # mean() skips NaN results (e.g. constant rows)
        stat_info['avg_autocorrelation'] = per_instance_autocorr.mean()
    except Exception as e:
        print(f"Error while calculating autocorrelation for {dataset_name}: {e}")

    stat_rows.append(stat_info)

# Build the overview table in one step; `columns` fixes the column order and
# adds the columns that were never filled in.
overview_df = pd.DataFrame(stat_rows, columns=columns)

overview_df.to_csv('extractedMetrics.csv', index=False)