# Imports

In [18]:
import glob
import os
import pandas as pd
import numpy as np
import evaluate
from scipy.stats import kstest

# Generating statistics

In [19]:
# Setup folder path for the UCR datasets (TSV files)
prepared_datasets_folder_path = "Prepared_UCR_Datasets"
unconcatenated_datasets_folder_path = "UCR_Datasets"

# Data sets with varying length
varying_length_datasets = ["AllGestureWiimoteX", "AllGestureWiimoteY", "AllGestureWiimoteZ", "GestureMidAirD1", "GestureMidAirD2", "GestureMidAirD3", "GesturePebbleZ1", "GesturePebbleZ2",
"PickupGestureWiimoteZ", "PLAID", "ShakeGestureWiimoteZ"]

# Use glob to get a list of TSV file paths in the folder
tsv_files = glob.glob(os.path.join(prepared_datasets_folder_path, '*.tsv'))

# Initialize an empty DataFrame
columns = ['dataset_name', 'num_instances_train', 'num_instances_test', 'train_test_ratio', 'num_features', 'num_classes', 'class_imbalance', 'default_rate', 'num_missing_values', 'perc_missing_values', 'num_duplicate_instances', 'perc_duplicate_instances', 'avg_autocorrelation', 'ks_statistic', 'ks_p_value']

overview_df = pd.DataFrame(columns=columns)

# Preparing the Huggingface Evaluation of label distribution
distribution = evaluate.load("label_distribution")

# Loop through each TSV file and read it into a DataFrame - append the name of the file
for tsv_file in tsv_files:
    # Read the TSV file into a DataFrame
    df = pd.read_csv(tsv_file, sep='\t', header=None)

    # Extract the dataset name from the file path
    dataset_name = os.path.splitext(os.path.basename(tsv_file))[0]

    # Initialize an empty dictionary to store the statistical information
    stat_info = {}
    
    # Append number of instances in train and test set, train-test ratio
    try:
        # Paths for TRAIN and TEST datasets
        train_file_path = os.path.join(unconcatenated_datasets_folder_path, f"{dataset_name}_TRAIN.tsv")
        test_file_path = os.path.join(unconcatenated_datasets_folder_path, f"{dataset_name}_TEST.tsv")

        # Read TRAIN and TEST datasets
        train_df = pd.read_csv(train_file_path, sep='\t', header=None)
        test_df = pd.read_csv(test_file_path, sep='\t', header=None)

        # Number of instances in TRAIN and TEST datasets
        # -1 if dataset has varying length (drawn from comparing with values at UCR Archive website)
        if dataset_name in varying_length_datasets:
            stat_info['num_instances_train'] = train_df.shape[0] - 1
            stat_info['num_instances_test'] = test_df.shape[0] - 1
        else:
            stat_info['num_instances_train'] = train_df.shape[0]
            stat_info['num_instances_test'] = test_df.shape[0]

        # Calculate train-test ratio
        stat_info['train_test_ratio'] = round(stat_info['num_instances_train'] / stat_info['num_instances_test'], 2)

    except Exception as e:
        print(f"Error while reading TRAIN/TEST files for {dataset_name}: {str(e)}")
    
    # Append basic statistical information
    try:
        # dataset name
        stat_info['dataset_name'] = dataset_name
        
        # number of features
        stat_info['num_features'] = df.shape[1] - 1
        
        # number of classes
        # -1 if dataset has varying length (drawn from comparing with values at UCR Archive website)
        if dataset_name in varying_length_datasets:
            stat_info['num_classes'] = len(df.iloc[1:, 0].unique()) - 1
        else:
            stat_info['num_classes'] = len(df.iloc[:, 0].unique())
        
        # first sum: sum of missing values per column, second sum: sum of all columns
        stat_info['num_missing_values'] = df.isnull().sum().sum() 
        
        # percentage of missing values
        # total values (exlude label column) - missing values / total values
        total_values = df.shape[0] * df.shape[1] - df.shape[0]
        stat_info['perc_missing_values'] = round(stat_info['num_missing_values'] / total_values * 100, 2)
        
        # number of duplicate instances
        # keep=False marks all duplicates as True, sum() counts the Trues
        stat_info['num_duplicate_instances'] = df.duplicated().sum()
        
        # percentage of duplicate instances
        total_instances = df.shape[0]
        stat_info['perc_duplicate_instances'] = round(stat_info['num_duplicate_instances'] / total_instances * 100, 2)
    except Exception as e:
        print(f"Error while calculating basic statistical information for {str(dataset_name)}: {str(e)}")

    # Append label skew information
    try:
        # get values of the first column (class labels)
        label_values = df.iloc[:, 0].values
        result = distribution.compute(data=label_values)

        # Append label skew information
        stat_info['class_imbalance'] = round(result['label_skew'], 4)
    except Exception as e:
        print(f"Error while calculating class imbalance for {str(dataset_name)}: {str(e)}")
        
    # Append default rate
    try:
        # Total number of instances
        total_instances = df.shape[0]
        # Most common class
        most_common_class = df[df.columns[0]].value_counts().idxmax()
        # Count of instances in the most common class
        count_most_common_class = df[df[df.columns[0]] == most_common_class].shape[0]
        # Default error rate: (Total instances - Instances of most common class) / Total instances
        stat_info['default_rate'] = round((count_most_common_class / total_instances), 4)
    
    except Exception as e:
        print(f"Error while calculating default rate for {dataset_name}: {str(e)}")
    
    # Append autocorrelation information with try-except blocks
    try:
        stat_info['avg_autocorrelation'] = round((df.apply(lambda col: col.autocorr()).mean()), 4)
    except Exception as e:
        print(f"Error while calculating autocorrelation for {str(dataset_name)}: {str(e)}")

    # Perform KS test against a normal distribution
    try:
        # Flatten the dataset (excluding the label column)
        flattened_data = df.iloc[:, 1:].values.flatten()
    
        # Generating a normal distribution with the same mean and std as the dataset
        norm_dist = np.random.normal(np.mean(flattened_data), np.std(flattened_data), len(flattened_data))

        # KS test
        ks_statistic, ks_p_value = kstest(flattened_data, norm_dist)

        # Add KS statistic and p-value to stat_info
        stat_info['ks_statistic'] = round(ks_statistic, 4) # round to 4 digits to avoid scientific notation
        stat_info['ks_p_value'] = round(ks_p_value, 4)
    except Exception as e:
        print(f"Error while calculating KS statistic for {str(dataset_name)}: {str(e)}")
    
    # Turn the dictionary into a DataFrame
    stat_info_df = pd.DataFrame([stat_info])
    
    # Append the statistical information to the DataFrame
    overview_df = pd.concat([overview_df, stat_info_df], ignore_index=True)
    
    print(f"Finished calculating statistics for {str(dataset_name)}")
    
overview_df.to_csv('extractedMetrics.csv', index=False)

  c = cov(x, y, rowvar, dtype=dtype)
  c *= np.true_divide(1, fact)


Finished calculating statistics for PickupGestureWiimoteZ
Finished calculating statistics for EOGVerticalSignal
Finished calculating statistics for TwoPatterns
Finished calculating statistics for DiatomSizeReduction
Finished calculating statistics for MoteStrain


  c = cov(x, y, rowvar, dtype=dtype)
  c *= np.true_divide(1, fact)


Finished calculating statistics for AllGestureWiimoteZ
Finished calculating statistics for AllGestureWiimoteX
Finished calculating statistics for Wafer
Finished calculating statistics for FreezerSmallTrain
Finished calculating statistics for DodgerLoopWeekend
Finished calculating statistics for MixedShapesSmallTrain
Finished calculating statistics for Worms
Finished calculating statistics for SemgHandSubjectCh2
Finished calculating statistics for ProximalPhalanxOutlineCorrect
Finished calculating statistics for GunPoint
Finished calculating statistics for EthanolLevel
Finished calculating statistics for AllGestureWiimoteY
Finished calculating statistics for Plane
Finished calculating statistics for HouseTwenty
Finished calculating statistics for DistalPhalanxOutlineCorrect
Finished calculating statistics for ToeSegmentation1
Finished calculating statistics for SemgHandGenderCh2
Finished calculating statistics for UMD
Finished calculating statistics for Fungi
Finished calculating statis

  c = cov(x, y, rowvar, dtype=dtype)
  c *= np.true_divide(1, fact)


Finished calculating statistics for GestureMidAirD1
Finished calculating statistics for LargeKitchenAppliances
Finished calculating statistics for GestureMidAirD3
Finished calculating statistics for ItalyPowerDemand
Finished calculating statistics for UWaveGestureLibraryZ
Finished calculating statistics for Beef
Finished calculating statistics for Wine
Finished calculating statistics for SmallKitchenAppliances
Finished calculating statistics for FaceAll
Finished calculating statistics for Meat
Finished calculating statistics for FordA
Finished calculating statistics for RefrigerationDevices
Finished calculating statistics for OliveOil
Finished calculating statistics for TwoLeadECG
Finished calculating statistics for GestureMidAirD2
Finished calculating statistics for ECG200
Finished calculating statistics for Symbols
Finished calculating statistics for SemgHandMovementCh2
Finished calculating statistics for FreezerRegularTrain
Finished calculating statistics for PowerCons
Finished calc

  c = cov(x, y, rowvar, dtype=dtype)
  c *= np.true_divide(1, fact)


Finished calculating statistics for PLAID
Finished calculating statistics for PigAirwayPressure
Finished calculating statistics for FiftyWords
Finished calculating statistics for DistalPhalanxTW
Finished calculating statistics for Lightning7
Finished calculating statistics for DodgerLoopDay
Finished calculating statistics for ECG5000
Finished calculating statistics for ProximalPhalanxOutlineAgeGroup
Finished calculating statistics for ArrowHead
Finished calculating statistics for WordSynonyms
Finished calculating statistics for Car
Finished calculating statistics for UWaveGestureLibraryAll
Finished calculating statistics for FacesUCR
Finished calculating statistics for HandOutlines
Finished calculating statistics for Lightning2
Finished calculating statistics for CricketX
Finished calculating statistics for BeetleFly
Finished calculating statistics for GesturePebbleZ2
Finished calculating statistics for FaceFour
Finished calculating statistics for Computers
Finished calculating statist