# Imports

In [10]:
import glob
import pandas as pd
import os
from statsmodels.tsa.stattools import adfuller
import evaluate

In [19]:
# Use glob to get a list of TSV file paths in the folder
tsv_files = glob.glob(os.path.join("UCR_Datasets", '*.tsv'))

# Initialize an empty DataFrame
columns = ['dataset_name', 'num_instances', 'num_features', 'num_classes', 'label_skew', 'num_missing_values', 'perc_missing_values', 'num_duplicate_instances', 'perc_duplicate_instances', 'avg_autocorrelation', 'avg_adf', 'avg_pvalue']
overview_df = pd.DataFrame(columns=columns)

# Preparing the Huggingface Evauation of label distribution
distribution = evaluate.load("label_distribution")

# Loop through each TSV file and read it into a DataFrame - append the name of the file
for tsv_file in tsv_files:
    # Read the TSV file into a DataFrame
    df = pd.read_csv(tsv_file, sep='\t')

    # Extract the dataset name from the file path
    dataset_name = os.path.splitext(os.path.basename(tsv_file))[0]

    # Initialize an empty dictionary to store the statistical information
    stat_info = {}
    
    # Append basic statistical information
    try:
        # dataset name
        stat_info['dataset_name'] = dataset_name
        
        # number of rows/instances
        stat_info['num_instances'] = df.shape[0]

        # number of columns/features; subtract 1 for the class label
        stat_info['num_features'] = df.shape[1] - 1

        # select the first column (class label) and get the number of unique values
        stat_info['num_classes'] = len(df.iloc[:, 0].unique())

        # first sum: sum of missing values per column, second sum: sum of all columns
        stat_info['num_missing_values'] = df.isnull().sum().sum()   
        stat_info['perc_missing_values'] = (stat_info['num_missing_values'] / (stat_info['num_instances'] * stat_info['num_features'])) * 100
        
        # number of duplicate instances
        # keep=False marks all duplicates as True, sum() counts the Trues
        stat_info['num_duplicate_instances'] = df.duplicated().sum()
        stat_info['perc_duplicate_instances'] = (stat_info['num_duplicate_instances'] / stat_info['num_instances']) * 100
    except Exception as e:
        print(f"Error while calculating basic statistical information for {str(dataset_name)}: {str(e)}")

    try:
        # get values of the first column (class labels)
        label_values = df.iloc[:, 0].values
        result = distribution.compute(data=label_values)

        # Append label skew information
        stat_info['label_skew'] = result['label_skew']
    except Exception as e:
        print(f"Error while calculating label skew for {str(dataset_name)}: {str(e)}")
        
    # Append autocorrelation information with try-except blocks
    try:
        stat_info['avg_autocorrelation'] = df.apply(lambda col: col.autocorr()).mean()
    except Exception as e:
        print(f"Error while calculating autocorrelation for {str(dataset_name)}: {str(e)}")

    # Append ADF information with try-except blocks
    try:
        adf_values = df.apply(lambda col: adfuller(col))
        stat_info['avg_adf'] = adf_values[0].mean()
        stat_info['avg_pvalue'] = adf_values[1].mean()
    except Exception as e:
        print(f"Error while calculating ADF information for {str(dataset_name)}: {str(e)}")
        
    # Append the dictionary to the dataframe
    overview_df = overview_df.append(stat_info, ignore_index=True)
    
overview_df.to_csv('overview_extra.csv', index=False)

Error while calculating ADF information for GunPointMaleVersusFemale_TEST: 0


  overview_df = overview_df.append(stat_info, ignore_index=True)


Error while calculating ADF information for BirdChicken_TRAIN: 0


  overview_df = overview_df.append(stat_info, ignore_index=True)


Error while calculating ADF information for ScreenType_TEST: 0


  overview_df = overview_df.append(stat_info, ignore_index=True)


Error while calculating ADF information for Earthquakes_TEST: 0


  overview_df = overview_df.append(stat_info, ignore_index=True)


Error while calculating ADF information for MixedShapesSmallTrain_TRAIN: 0


  overview_df = overview_df.append(stat_info, ignore_index=True)


Error while calculating ADF information for MiddlePhalanxOutlineAgeGroup_TEST: 0


  overview_df = overview_df.append(stat_info, ignore_index=True)


KeyboardInterrupt: 