## dataset info

In [3]:
import pandas as pd
import numpy as np
import zipfile
import os
from pathlib import Path
from tqdm import tqdm

In [4]:
data_dir = Path('../Datasets/')
zip_file = data_dir / 'EVTOL.zip'
extract_dir = data_dir / 'EVTOL'

if not extract_dir.exists():
    with zipfile.ZipFile(zip_file, 'r') as zip_ref:
        zip_ref.extractall(extract_dir)
else: # if the directory exists, skip the extraction
    pass

files = list(extract_dir.glob('*.csv'))
# files
extract_dir

PosixPath('../Datasets/EVTOL')

## base info for 1st csv

In [5]:
test_csv = files[0]
df = pd.read_csv(test_csv)
print(f"Base info for {test_csv}:")
print(f"Number of rows: {df.shape[0]}")
print(f"Number of columns: {df.shape[1]}")
print(f"Column names: {df.columns.tolist()}")
print(f"Data types: {df.dtypes}")



Base info for ../Datasets/EVTOL/VAH07.csv:
Number of rows: 205487
Number of columns: 10
Column names: ['time_s', 'Ecell_V', 'I_mA', 'EnergyCharge_W_h', 'QCharge_mA_h', 'EnergyDischarge_W_h', 'QDischarge_mA_h', 'Temperature__C', 'cycleNumber', 'Ns']
Data types: time_s                 float64
Ecell_V                float64
I_mA                   float64
EnergyCharge_W_h       float64
QCharge_mA_h           float64
EnergyDischarge_W_h    float64
QDischarge_mA_h        float64
Temperature__C         float64
cycleNumber              int64
Ns                       int64
dtype: object


In [6]:
#  根据reamdme的描述，标记异常数据
anomalies = {
"VAH05": [1000],
"VAH09": [64, 92, 154, 691],
"VAH10": [248, 631, 735, 1151],
"VAH11": [817, 1898],
"VAH13": [816, 817],
"VAH25": [461, 462],
"VAH26": [872, 873],
"VAH27": [20, 256, 257, 585],
"VAH28": [256, 257, 619, 620, 1066, 1067],
}
all_abnormal_df = []
all_normal_df = []
all_data = []
for file_name in tqdm(files):
    file_id = file_name.name.split("_")[-1].replace(".csv", "")
    df = pd.read_csv(file_name)
    df["cycleNumber"] = pd.to_numeric(df["cycleNumber"], errors="coerce").astype("Int64")
    df["FileID"] = file_id
    
    if file_id in anomalies:
        bad_cycles = anomalies[file_id]
        # 异常数据
        abnormal_df = df[df["cycleNumber"].isin(bad_cycles)].copy()
        abnormal_df["is_abnormal"] = True
        
        # 正常数据（同一文件中的其他cycle）
        normal_df = df[~df["cycleNumber"].isin(bad_cycles)].copy()  
        normal_df["is_abnormal"] = False
        
        all_data.extend([abnormal_df, normal_df])
    else:
        # 全部正常数据
        normal_df = df.copy()
        normal_df["is_abnormal"] = False
        all_data.append(normal_df)
combined_df = pd.concat(all_data, ignore_index=True)


  0%|          | 0/22 [00:00<?, ?it/s]

  5%|▍         | 1/22 [00:00<00:02,  9.33it/s]

VAH07.csv
VAH25.csv


  9%|▉         | 2/22 [00:00<00:03,  6.40it/s]

VAH12.csv


 14%|█▎        | 3/22 [00:00<00:07,  2.50it/s]

VAH23.csv


 18%|█▊        | 4/22 [00:01<00:06,  2.86it/s]

VAH09.csv


 23%|██▎       | 5/22 [00:01<00:05,  3.05it/s]

VAH17.csv


 32%|███▏      | 7/22 [00:02<00:04,  3.70it/s]

VAH15.csv
VAH06.csv


 36%|███▋      | 8/22 [00:02<00:04,  3.16it/s]

VAH01.csv


 41%|████      | 9/22 [00:02<00:05,  2.55it/s]

VAH05.csv


 45%|████▌     | 10/22 [00:03<00:04,  2.42it/s]

VAH13.csv


 55%|█████▍    | 12/22 [00:03<00:03,  3.30it/s]

VAH27.csv
VAH20.csv


 59%|█████▉    | 13/22 [00:04<00:02,  3.85it/s]

VAH10.csv


 64%|██████▎   | 14/22 [00:04<00:02,  3.27it/s]

VAH11.csv


 68%|██████▊   | 15/22 [00:05<00:02,  2.48it/s]

VAH26.csv


 73%|███████▎  | 16/22 [00:05<00:02,  2.74it/s]

VAH28.csv


 82%|████████▏ | 18/22 [00:05<00:01,  3.42it/s]

VAH22.csv
VAH16.csv


 86%|████████▋ | 19/22 [00:05<00:00,  3.99it/s]

VAH02.csv


 91%|█████████ | 20/22 [00:06<00:00,  3.09it/s]

VAH24.csv


 95%|█████████▌| 21/22 [00:06<00:00,  3.13it/s]

VAH30.csv


100%|██████████| 22/22 [00:07<00:00,  3.12it/s]
