In [2]:
from huggingface_hub import login
from datasets import load_dataset

# Prompt the user to log in if not already authenticated
login()  # This opens a prompt for your Hugging Face token

# Load just the CalIt2 dataset
calit2 = load_dataset("PLAN-Lab/mTSBench", data_dir="CalIt2")

# Convert to pandas
df_train = calit2["train"].to_pandas()
df_test = calit2["test"].to_pandas()


Generating train split: 940 examples [00:00, 203691.14 examples/s]
Generating validation split: 1008 examples [00:00, 338255.74 examples/s]
Generating test split: 4032 examples [00:00, 885183.65 examples/s]


In [3]:
!git lfs install
!git clone https://huggingface.co/datasets/PLAN-Lab/mTSBench

Updated Git hooks.
Git LFS initialized.
Cloning into 'mTSBench'...
remote: Enumerating objects: 749, done.[K
remote: Counting objects: 100% (746/746), done.[K
remote: Compressing objects: 100% (745/745), done.[K
remote: Total 749 (delta 303), reused 0 (delta 0), pack-reused 3 (from 1)[K
Receiving objects: 100% (749/749), 328.19 MiB | 3.83 MiB/s, done.
Resolving deltas: 100% (303/303), done.
Updating files: 100% (725/725), done.
Filtering content: 100% (97/97), 2.28 GiB | 23.34 MiB/s, done.
Encountered 619 files that should have been pointers, but weren't:
	CalIt2/CalIt2_traffic_test.csv
	CalIt2/CalIt2_traffic_train.csv
	CalIt2/CalIt2_traffic_val.csv
	Daphnet/Daphnet_S01R01E1_test.csv
	Daphnet/Daphnet_S01R01E1_train.csv
	Daphnet/Daphnet_S02R01E0_test.csv
	Daphnet/Daphnet_S02R01E0_train.csv
	Daphnet/Daphnet_S02R02E0_test.csv
	Daphnet/Daphnet_S02R02E0_train.csv
	Daphnet/Daphnet_S03R01E0_test.csv
	Daphnet/Daphnet_S03R01E0_train.csv
	Daphnet/Daphnet_S03R01E1_test.csv
	Daphnet/Daphnet_S0

# get dataset summary 

In [6]:
import os
import pandas as pd

def analyze_csv_files(folder_path, base_dirs, output_csv):
    all_analysis = []

    for base_dir in base_dirs:
        if not os.path.isdir(folder_path+base_dir):
            continue
        base_dir = folder_path+base_dir
        for filename in os.listdir(base_dir):
            if filename.endswith(".csv"):
                file_path = os.path.join(base_dir, filename)
                print("file_path", file_path)
                try:
                    df = pd.read_csv(file_path)

                    num_rows = len(df)
                    num_nans = df.isna().sum().sum()
                    num_channels = df.shape[1] - (1 if 'is_anomaly' in df.columns else 0)

                    if 'is_anomaly' in df.columns:
                        anomalies = df['is_anomaly'].values

                        # Identify sequences of anomalies
                        sequence_anomalies = []
                        start = None
                        for i, val in enumerate(anomalies):
                            if val == 1 and start is None:
                                start = i
                            if val == 0 and start is not None:
                                if i - start > 1:
                                    sequence_anomalies.append((start, i-1))
                                start = None
                        if start is not None and len(anomalies) - start > 1:
                            sequence_anomalies.append((start, len(anomalies)-1))

                        point_anomalies = [idx for idx, val in enumerate(anomalies) if val == 1]
                        point_anomalies = [idx for idx in point_anomalies if all(not (start <= idx <= end) for (start, end) in sequence_anomalies)]

                        num_point_anomalies = len(point_anomalies)
                        num_sequence_anomalies = len(sequence_anomalies)

                        top3_point_indices = point_anomalies[:3]
                        top3_sequence_indices = sequence_anomalies[:3]
                    else:
                        num_point_anomalies = 'N/A'
                        num_sequence_anomalies = 'N/A'
                        top3_point_indices = 'N/A'
                        top3_sequence_indices = 'N/A'

                    all_analysis.append({
                        "Folder": base_dir,
                        "File": filename,
                        "NumRows": num_rows,
                        "NumNaNs": num_nans,
                        "NumChannels": num_channels,
                        "NumPointAnomalies": num_point_anomalies,
                        "NumSequenceAnomalies": num_sequence_anomalies,
                        "Top3PointIndices": top3_point_indices,
                        "Top3SequenceIndices": top3_sequence_indices
                    })
                except Exception as e:
                    print(f"Error processing {file_path}: {e}")

    if all_analysis:
        analysis_df = pd.DataFrame(all_analysis)
        analysis_df.to_csv(output_csv, index=False)
        print(f"Saved analysis to {output_csv}")
    else:
        print("No CSV files found for analysis.")

# Example usage
base_folders = [
    "CalIt2", "cicids", "creditcard", "Daphnet", "Exathlon",
    "GECCO", "Genesis", "GHL", "GutenTAG",  "metro", "MITDB",
    "MSL", "room-occupancy", "OPPORTUNITY", "PSM", "SMAP", "SMD", "SVDB", "swan"
]
folder_path = 'mTSBench/'
analyze_csv_files(folder_path,base_folders, "data_summary.csv")


file_path mTSBench/CalIt2/CalIt2_traffic_val.csv
file_path mTSBench/CalIt2/CalIt2_traffic_train.csv
file_path mTSBench/CalIt2/CalIt2_traffic_test.csv
file_path mTSBench/cicids/cicids_5_train.csv
file_path mTSBench/cicids/cicids_3_test.csv
file_path mTSBench/cicids/cicids_2_test.csv
file_path mTSBench/cicids/cicids_3_train.csv
file_path mTSBench/cicids/cicids_0_val.csv
file_path mTSBench/cicids/cicids_2_train.csv
file_path mTSBench/cicids/cicids_5_test.csv
file_path mTSBench/cicids/cicids_7_train.csv
file_path mTSBench/cicids/cicids_0_test.csv
file_path mTSBench/cicids/cicids_7_test.csv
file_path mTSBench/cicids/cicids_6_test.csv
file_path mTSBench/cicids/cicids_6_train.csv
file_path mTSBench/cicids/cicids_0_train.csv
file_path mTSBench/creditcard/creditcard_val.csv
file_path mTSBench/creditcard/creditcard_test.csv
file_path mTSBench/creditcard/creditcard_train.csv
file_path mTSBench/Daphnet/Daphnet_S08R01E0_train.csv
file_path mTSBench/Daphnet/Daphnet_S02R02E0_train.csv
file_path mTSBe