## dataset info

In [17]:
import pandas as pd
import numpy as np
import zipfile
import os
from pathlib import Path
from tqdm import tqdm

In [24]:
data_dir = Path('../Datasets/')
zip_file = data_dir / 'EVTOL.zip'
extract_dir = data_dir / 'EVTOL'

if not extract_dir.exists():
    with zipfile.ZipFile(zip_file, 'r') as zip_ref:
        zip_ref.extractall(extract_dir)
else: # if the directory exists, skip the extraction
    pass

files = list(extract_dir.glob('*.csv'))
# files
extract_dir

PosixPath('../Datasets/EVTOL')

## base info for 1st csv

In [19]:
test_csv = files[0]
df = pd.read_csv(test_csv)
print(f"Base info for {test_csv}:")
print(f"Number of rows: {df.shape[0]}")
print(f"Number of columns: {df.shape[1]}")
print(f"Column names: {df.columns.tolist()}")
print(f"Data types: {df.dtypes}")



Base info for ../Datasets/EVTOL/VAH07.csv:
Number of rows: 205487
Number of columns: 10
Column names: ['time_s', 'Ecell_V', 'I_mA', 'EnergyCharge_W_h', 'QCharge_mA_h', 'EnergyDischarge_W_h', 'QDischarge_mA_h', 'Temperature__C', 'cycleNumber', 'Ns']
Data types: time_s                 float64
Ecell_V                float64
I_mA                   float64
EnergyCharge_W_h       float64
QCharge_mA_h           float64
EnergyDischarge_W_h    float64
QDischarge_mA_h        float64
Temperature__C         float64
cycleNumber              int64
Ns                       int64
dtype: object


In [41]:
#  根据reamdme的描述，标记异常数据
anomalies = {
    "VAH05": [1000],
    "VAH09": [64, 92, 154, 691],
    "VAH10": [248, 631, 735, 1151],
    "VAH11": [817, 1898],
    "VAH13": [816, 817],
    "VAH25": [461, 462],
    "VAH26": [872, 873],
    "VAH27": [20, 256, 257, 585],
    "VAH28": [256, 257, 619, 620, 1066, 1067],
}
all_abnormal_df = []
for file_name in tqdm(files):
    print(file_name.name)
    file_id = file_name.name.split("_")[-1].replace(".csv", "")
    df = pd.read_csv(file_name)
    df["cycleNumber"] = pd.to_numeric(df["cycleNumber"], errors="coerce").astype("Int64")
    if file_id in anomalies:
        bad_cycles = anomalies[file_id]
        abnormal_df = df[df["cycleNumber"].isin(bad_cycles)].copy()
        abnormal_df["FileID"] = file_id
        # extra label for abnormal data
        abnormal_df["is_abnormal"] = True
        abnormal_df["cycle_length"] = abnormal_df.groupby(["FileID", "cycleNumber"])["cycleNumber"].transform("count")
        all_abnormal_df.append(abnormal_df)
    else:
        pass
all_abnormal_df = pd.concat(all_abnormal_df)
all_abnormal_df.to_csv("abnormal_data.csv", index=False)
all_abnormal_df

  0%|          | 0/22 [00:00<?, ?it/s]

VAH07.csv
VAH25.csv


  9%|▉         | 2/22 [00:00<00:02,  8.51it/s]

VAH12.csv


 18%|█▊        | 4/22 [00:00<00:04,  3.93it/s]

VAH23.csv
VAH09.csv


 23%|██▎       | 5/22 [00:01<00:04,  3.87it/s]

VAH17.csv


 32%|███▏      | 7/22 [00:01<00:03,  4.64it/s]

VAH15.csv
VAH06.csv


 36%|███▋      | 8/22 [00:01<00:03,  4.13it/s]

VAH01.csv


 41%|████      | 9/22 [00:02<00:03,  3.28it/s]

VAH05.csv


 45%|████▌     | 10/22 [00:02<00:03,  3.04it/s]

VAH13.csv


 55%|█████▍    | 12/22 [00:03<00:02,  3.95it/s]

VAH27.csv
VAH20.csv


 59%|█████▉    | 13/22 [00:03<00:01,  4.53it/s]

VAH10.csv


 64%|██████▎   | 14/22 [00:03<00:02,  3.85it/s]

VAH11.csv


 68%|██████▊   | 15/22 [00:04<00:02,  2.88it/s]

VAH26.csv


 73%|███████▎  | 16/22 [00:04<00:02,  3.00it/s]

VAH28.csv


 82%|████████▏ | 18/22 [00:04<00:01,  3.69it/s]

VAH22.csv
VAH16.csv


 86%|████████▋ | 19/22 [00:05<00:00,  4.39it/s]

VAH02.csv


 91%|█████████ | 20/22 [00:05<00:00,  3.64it/s]

VAH24.csv


 95%|█████████▌| 21/22 [00:05<00:00,  3.78it/s]

VAH30.csv


100%|██████████| 22/22 [00:05<00:00,  3.73it/s]


Unnamed: 0,time_s,Ecell_V,I_mA,EnergyCharge_W_h,QCharge_mA_h,EnergyDischarge_W_h,QDischarge_mA_h,Temperature__C,cycleNumber,Ns,FileID,is_abnormal,cycle_length
55508,4.474443e+05,3.762047,3002.8000,0.093796,24.932184,0.000000,0.000000,25.431843,64,0,VAH09,True,1545
55509,4.474743e+05,3.779857,3003.1938,0.188434,49.969750,0.000000,0.000000,25.416067,64,0,VAH09,True,1545
55510,4.475043e+05,3.791875,3004.7703,0.283375,75.007521,0.000000,0.000000,25.550154,64,0,VAH09,True,1545
55511,4.475343e+05,3.802396,3005.3611,0.378577,100.045087,0.000000,0.000000,26.023399,64,0,VAH09,True,1545
55512,4.475643e+05,3.812759,3004.1792,0.474040,125.082692,0.000000,0.000000,25.897202,64,0,VAH09,True,1545
...,...,...,...,...,...,...,...,...,...,...,...,...,...
908417,1.145801e+07,3.489152,0.0000,7.525902,0.000000,-6.235343,1864.631031,33.745472,248,7,VAH10,True,1047
908418,1.145804e+07,3.489230,0.0000,7.525902,0.000000,-6.235343,1864.631031,33.501129,248,7,VAH10,True,1047
908419,1.145807e+07,3.489664,0.0000,7.525902,0.000000,-6.235343,1864.631031,33.548420,248,7,VAH10,True,1047
908420,1.145810e+07,3.489782,0.0000,7.525902,0.000000,-6.235343,1864.631031,33.264668,248,7,VAH10,True,1047
