# Imports

In [15]:
import glob
import os
import pandas as pd
import numpy as np
import evaluate
from scipy.stats import kstest

# Prepare function to calculate autocorrelation

In [16]:
def autocorrelation_per_instance(series):
    """Return the autocorrelation of a single time series.

    The value comes from ``series.autocorr()`` called with its default
    arguments. A series holding fewer than two entries has no defined
    autocorrelation, so ``None`` is returned for it instead.
    """
    has_enough_points = len(series) >= 2
    return series.autocorr() if has_enough_points else None

# Generating statistics

In [None]:
# Setup folder paths for the UCR datasets (TSV files).
# The "prepared" folder is scanned for one TSV per dataset; the "unconcatenated"
# folder is expected to hold the matching <name>_TRAIN.tsv / <name>_TEST.tsv files.
prepared_datasets_folder_path = "Prepared_UCR_Datasets"
unconcatenated_datasets_folder_path = "UCR_Datasets"

# Data sets with varying length
varying_length_datasets = ["AllGestureWiimoteX", "AllGestureWiimoteY", "AllGestureWiimoteZ", "GestureMidAirD1", "GestureMidAirD2", "GestureMidAirD3", "GesturePebbleZ1", "GesturePebbleZ2",
"PickupGestureWiimoteZ", "PLAID", "ShakeGestureWiimoteZ"]

# Use glob to get a list of TSV file paths in the folder.
# glob returns paths in arbitrary (filesystem-dependent) order, so sort them to
# make the row order of the output CSV reproducible.
tsv_files = sorted(glob.glob(os.path.join(prepared_datasets_folder_path, '*.tsv')))

# Columns (and column order) of the resulting overview table
columns = ['dataset_name', 'num_instances_train', 'num_instances_test', 'train_test_ratio', 'num_time_points', 'num_classes', 'class_distribution_bias', 'default_rate', 'num_missing_values', 'null_rate', 'num_duplicate_instances', 'data_repetition_rate', 'avg_autocorrelation', 'ks_statistic', 'ks_p_value']

# One dictionary of statistics per dataset. The overview DataFrame is built once
# after the loop instead of being re-concatenated on every iteration.
stat_rows = []

# Preparing the Huggingface Evaluation of label distribution
distribution = evaluate.load("label_distribution")

# Loop through each TSV file, read it into a DataFrame and compute its statistics.
# Every group of statistics has its own try/except so that a failure in one group
# only leaves those columns empty for that dataset instead of aborting the run.
for tsv_file in tsv_files:
    # Read the TSV file into a DataFrame (first column: class label, rest: time points)
    df = pd.read_csv(tsv_file, sep='\t', header=None)

    # Extract the dataset name from the file path
    dataset_name = os.path.splitext(os.path.basename(tsv_file))[0]

    # Dictionary collecting the statistical information of this dataset
    stat_info = {}

    # Append number of instances in train and test set, train-test ratio
    try:
        # Paths for TRAIN and TEST datasets
        train_file_path = os.path.join(unconcatenated_datasets_folder_path, f"{dataset_name}_TRAIN.tsv")
        test_file_path = os.path.join(unconcatenated_datasets_folder_path, f"{dataset_name}_TEST.tsv")

        # Read TRAIN and TEST datasets
        train_df = pd.read_csv(train_file_path, sep='\t', header=None)
        test_df = pd.read_csv(test_file_path, sep='\t', header=None)

        # Number of instances in TRAIN and TEST datasets
        # -1 if dataset has varying length (drawn from comparing with values at UCR Archive website)
        row_correction = 1 if dataset_name in varying_length_datasets else 0
        stat_info['num_instances_train'] = train_df.shape[0] - row_correction
        stat_info['num_instances_test'] = test_df.shape[0] - row_correction

        # Calculate train-test ratio
        stat_info['train_test_ratio'] = round(stat_info['num_instances_train'] / stat_info['num_instances_test'], 2)

    except Exception as e:
        print(f"Error while reading TRAIN/TEST files for {dataset_name}: {e}")

    # Append basic statistical information
    try:
        # dataset name
        stat_info['dataset_name'] = dataset_name

        # number of time points per instance (all columns except the label column)
        stat_info['num_time_points'] = df.shape[1] - 1

        # number of classes
        # -1 if dataset has varying length (drawn from comparing with values at UCR Archive website)
        if dataset_name in varying_length_datasets:
            stat_info['num_classes'] = len(df.iloc[1:, 0].unique()) - 1
        else:
            stat_info['num_classes'] = len(df.iloc[:, 0].unique())

        # first sum: sum of missing values per column, second sum: sum of all columns
        stat_info['num_missing_values'] = df.isnull().sum().sum()

        # share of missing values: missing values / total values,
        # where total values excludes the label column
        total_values = df.shape[0] * df.shape[1] - df.shape[0]
        stat_info['null_rate'] = round(stat_info['num_missing_values'] / total_values, 4)

        # number of duplicate instances
        # duplicated() with its default keep='first' marks every repeat of a row
        # except its first occurrence; sum() counts the marked rows
        stat_info['num_duplicate_instances'] = df.duplicated().sum()

        # share of duplicate instances
        total_instances = df.shape[0]
        stat_info['data_repetition_rate'] = round(stat_info['num_duplicate_instances'] / total_instances, 4)
    except Exception as e:
        print(f"Error while calculating basic statistical information for {dataset_name}: {e}")

    # Append label skew information
    try:
        # get values of the first column (class labels)
        label_values = df.iloc[:, 0].values
        result = distribution.compute(data=label_values)

        # Append label skew information
        stat_info['class_distribution_bias'] = round(result['label_skew'], 4)
    except Exception as e:
        print(f"Error while calculating class imbalance for {dataset_name}: {e}")

    # Append default rate
    try:
        # Total number of instances
        total_instances = df.shape[0]
        # Count of instances in the most common class
        count_most_common_class = df[df.columns[0]].value_counts().max()
        # Default rate: instances of the most common class / total instances
        stat_info['default_rate'] = round((count_most_common_class / total_instances), 4)

    except Exception as e:
        print(f"Error while calculating default rate for {dataset_name}: {e}")

    # Append autocorrelation information
    try:
        time_series_data = df.iloc[:, 1:]  # Remaining columns as time series data
        # One autocorrelation value per row (instance), then averaged over the dataset
        autocorrelations = time_series_data.apply(autocorrelation_per_instance, axis=1)

        stat_info['avg_autocorrelation'] = round(autocorrelations.mean(), 4)
    except Exception as e:
        print(f"Error while calculating autocorrelation for {dataset_name}: {e}")

    # Perform KS test against a normal distribution
    try:
        # Flatten the dataset (excluding the label column) and drop missing values;
        # a single NaN would otherwise turn mean, std and the test result into NaN
        flattened_data = df.iloc[:, 1:].values.flatten()
        flattened_data = flattened_data[~pd.isnull(flattened_data)]
        if flattened_data.size == 0:
            raise ValueError("no non-missing time series values")

        # One-sample KS test against the CDF of a normal distribution with the same
        # mean and std as the data. Testing against the CDF (instead of against a
        # freshly drawn random normal sample) makes the result deterministic.
        data_mean = np.mean(flattened_data)
        data_std = np.std(flattened_data)
        ks_statistic, ks_p_value = kstest(flattened_data, 'norm', args=(data_mean, data_std))

        # Add KS statistic and p-value to stat_info
        stat_info['ks_statistic'] = round(ks_statistic, 4) # round to 4 digits to avoid scientific notation
        stat_info['ks_p_value'] = round(ks_p_value, 4)
    except Exception as e:
        print(f"Error while calculating KS statistic for {dataset_name}: {e}")

    # Keep the statistics of this dataset; statistics that failed are simply absent
    stat_rows.append(stat_info)

    print(f"Finished calculating statistics for {dataset_name}")

# Build the overview table once. `columns` fixes the column order and fills
# statistics that could not be computed with NaN; dtype=object keeps integer
# counts printed as integers even when another dataset lacks that value.
overview_df = pd.DataFrame(stat_rows, columns=columns, dtype=object)

overview_df.to_csv('extractedMetrics.csv', index=False)