In [4]:
import os
import pickle
import numpy as np
import pandas as pd
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
from sklearn.preprocessing import StandardScaler
from tqdm import tqdm

from sklearn.metrics import (
    accuracy_score,
    classification_report,
    roc_auc_score,
    confusion_matrix,
    f1_score
)

# 定义数据加载函数
def load_data_from_folder(folder_path):
    all_data = []
    # count = 0
    for file in tqdm(os.listdir(folder_path)):
        # if count > 5:
        #     break
        if file.endswith('.pkl'):
            file_path = os.path.join(folder_path, file)
            with open(file_path, 'rb') as f:
                data = pickle.load(f)
                # count += 1
                all_data.extend(data)  # 每个文件有 10000 条记录，合并到一起
    return all_data

# 数据处理函数
def process_data(data_list, max_days=10, sep='<SEP>'):
    X, y = [], []
    for record in tqdm(data_list):
        data = record['data']
        label = record['label']
        
        # 数据预处理：分割 <SEP>，填充缺失值
        split_data = " ".join(map(str, data)).split(sep)
        processed_record = []
        
        for day in split_data:
            # 每天的数据：[时间戳, SMART1, SMART2, ...]
            day_data = day.split()
            if len(day_data) > 1:  # 如果有有效数据
                try:
                    smart_data = [0.0 if x == "\\N" else float(x) for x in day_data[1:]] # 跳过时间戳，转换为 float
                except ValueError:
                    print(f"Error processing record: {day_data}")
                processed_record.extend(smart_data)
            else:
                processed_record.extend([0] * (len(processed_record) // max_days))  # 填充 0
        
        # 如果不足 max_days，进行填充
        missing_days = max_days - len(processed_record) // (len(processed_record) // max_days)
        for _ in range(missing_days):
            processed_record.extend([0] * (len(processed_record) // max_days))
        
        X.append(processed_record)
        y.append(label)
    
    return np.array(X), np.array(y)

# 定义 SVM 模型
def train_svm(X_train, y_train, X_test, y_test):
    scaler = StandardScaler()  # 数据标准化
    X_train = scaler.fit_transform(X_train)
    X_test = scaler.transform(X_test)
    
    model = SVC(kernel='rbf', probability=True)  # 使用 RBF 核的 SVM
    model.fit(X_train, y_train)
    
    # 模型预测
    y_pred = model.predict(X_test)
    y_proba = model.predict_proba(X_test)[:, 1]  # 获取正类的概率
    
    # 基本信息：标签为 0 和 1 的数量
    num_zeros = sum(y_test == 0)
    num_ones = sum(y_test == 1)
    print(f"Test set label counts - 0: {num_zeros}, 1: {num_ones}")
    
    # 计算混淆矩阵
    tn, fp, fn, tp = confusion_matrix(y_test, y_pred).ravel()
    
    # 计算 TPR 和 FPR
    tpr = tp / (tp + fn) if (tp + fn) > 0 else 0
    fpr = fp / (fp + tn) if (fp + tn) > 0 else 0
    
    # 计算其他指标
    f1 = f1_score(y_test, y_pred)
    auc = roc_auc_score(y_test, y_proba)
    
    # 输出结果
    print("Accuracy:", accuracy_score(y_test, y_pred))
    print("Classification Report:\n", classification_report(y_test, y_pred))
    print(f"TPR (True Positive Rate): {tpr:.2f}")
    print(f"FPR (False Positive Rate): {fpr:.2f}")
    print(f"F1-score: {f1:.2f}")
    print(f"AUC score: {auc:.2f}")
    
    return model

def main(data_folder):
    print("Loading data...")
    data = load_data_from_folder(data_folder)
    print(f"Total records loaded: {len(data)}")
    if len(data) == 0:
        raise ValueError("No valid data found in the folder. Please check the data files.")
    
    print("Processing data...")
    X, y = process_data(data)
    print(f"Processed Features Shape: {X.shape}, Labels Shape: {y.shape}")
    if X.shape[0] == 0:
        raise ValueError("Processed data is empty. Please check the preprocessing steps.")
    
    print("Splitting dataset...")
    if len(X) < 5:
        print("Insufficient data for train-test split. Using all data for training.")
        X_train, y_train = X, y
        X_test, y_test = X, y
    else:
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    
    print("Filling NaN values with 0...")
    X_train = np.nan_to_num(X_train, nan=0)
    X_test = np.nan_to_num(X_test, nan=0)
    
    print("Training SVM model...")
    model = train_svm(X_train, y_train, X_test, y_test)
    print("Model training complete.")
    return model


In [5]:
data_folder = "/mnt/raid5/sum/card/storage/AI4Storage/datasets/norm_10ahead_15window"  # 替换为你的数据文件夹路径
svm_model = main(data_folder)

Loading data...


100%|██████████| 7/7 [00:01<00:00,  4.11it/s]


Total records loaded: 61246
Processing data...


100%|██████████| 61246/61246 [00:06<00:00, 9711.99it/s] 


Processed Features Shape: (61246, 180), Labels Shape: (61246,)
Splitting dataset...
Filling NaN values with 0...
Training SVM model...
Test set label counts - 0: 7724, 1: 4526
Accuracy: 0.6302040816326531
Classification Report:
               precision    recall  f1-score   support

           0       0.63      1.00      0.77      7724
           1       0.00      0.00      0.00      4526

    accuracy                           0.63     12250
   macro avg       0.32      0.50      0.39     12250
weighted avg       0.40      0.63      0.49     12250

TPR (True Positive Rate): 0.00
FPR (False Positive Rate): 0.00
F1-score: 0.00
AUC score: 0.55
Model training complete.


In [14]:
data_folder = "/mnt/raid5/sum/card/storage/StreamDFP/dataset/train"  # 替换为你的数据文件夹路径
svm_model = main(data_folder)

Loading data...


100%|██████████| 3/3 [00:00<00:00,  4.03it/s]


Total records loaded: 30002
Processing data...


100%|██████████| 30002/30002 [00:02<00:00, 14517.78it/s]


Processed Features Shape: (30002, 120), Labels Shape: (30002,)
Splitting dataset...
Filling NaN values with 0...
Training SVM model...
Accuracy: 0.8363606065655724
Classification Report:
               precision    recall  f1-score   support

           0       0.82      0.98      0.89      4100
           1       0.92      0.53      0.67      1901

    accuracy                           0.84      6001
   macro avg       0.87      0.75      0.78      6001
weighted avg       0.85      0.84      0.82      6001

Model training complete.


In [5]:
# data_folder = "../datasets/raw_10ahead_10window"
data = load_data_from_folder(data_folder)  # 加载数据
print(f"Total records loaded: {len(data)}")

 20%|██        | 7/35 [00:02<00:11,  2.36it/s]

Total records loaded: 60045





In [7]:
data[:4]

[{'disk_id': 20195,
  'model': 'MA1',
  'data': ['2018-01-03',
   4294967295.0,
   31439.0,
   27.0,
   0.0,
   0.0,
   25.0,
   nan,
   0.0,
   0.0,
   19.0,
   165090774.0,
   0.0,
   '<SEP>',
   '2018-01-04',
   4294967295.0,
   31463.0,
   27.0,
   0.0,
   0.0,
   25.0,
   nan,
   0.0,
   0.0,
   19.0,
   166211995.0,
   0.0,
   '<SEP>',
   '2018-01-05',
   4294967295.0,
   31487.0,
   27.0,
   0.0,
   0.0,
   25.0,
   nan,
   0.0,
   0.0,
   19.0,
   167352623.0,
   0.0,
   '<SEP>',
   '2018-01-06',
   4294967295.0,
   31511.0,
   27.0,
   0.0,
   0.0,
   25.0,
   nan,
   0.0,
   0.0,
   20.0,
   168410038.0,
   0.0,
   '<SEP>',
   '2018-01-07',
   4294967295.0,
   31535.0,
   27.0,
   0.0,
   0.0,
   25.0,
   nan,
   0.0,
   0.0,
   19.0,
   169403180.0,
   0.0,
   '<SEP>',
   '2018-01-08',
   4294967295.0,
   31559.0,
   27.0,
   0.0,
   0.0,
   25.0,
   nan,
   0.0,
   0.0,
   19.0,
   170489910.0,
   0.0,
   '<SEP>',
   '2018-01-09',
   4294967295.0,
   31583.0,
   27.0,
   0.

In [None]:
X, y = process_data(data)  # 处理数据
print(f"Features shape: {X.shape}, Labels shape: {y.shape}")

In [6]:
print("Sample record:", data)

Sample record: []
