# Training

In [3]:
import os
import pandas as pd

def find_failure_rows_in_files(folder_path, n_per_class=1491):
    """Collect a class-balanced sample of rows from a folder of CSV files.

    Every ``*.csv`` file directly inside ``folder_path`` is read in sorted
    filename order. Rows with ``failure == 1`` and rows with ``failure == 0``
    are accumulated separately until each class holds ``n_per_class`` rows or
    the files run out. Files without a ``failure`` column are skipped.

    Rows are taken in file order (the first matching rows of each file), not
    sampled at random, so a class that is plentiful in the first file is
    filled entirely from that file.

    Args:
        folder_path: Directory containing the CSV files.
        n_per_class: Maximum number of rows to keep for each class. Defaults
            to 1491, the value that used to be hard-coded here.

    Returns:
        A ``pandas.DataFrame`` holding the ``failure == 1`` rows first and the
        ``failure == 0`` rows after them, on a fresh 0..N-1 index. A class
        holds fewer than ``n_per_class`` rows when the folder does not
        contain enough of them; the frame is empty when nothing matched.

    Raises:
        ValueError: If ``n_per_class`` is negative.
    """
    if n_per_class < 0:
        raise ValueError("n_per_class must be non-negative")

    # Accumulators for the positive (failure == 1) and negative (failure == 0) rows.
    failure_1_samples = []
    failure_0_samples = []

    # os.listdir() returns entries in arbitrary order; sorting makes the set of
    # sampled rows reproducible from run to run and machine to machine.
    for filename in sorted(os.listdir(folder_path)):
        if not filename.endswith('.csv'):
            continue

        file_path = os.path.join(folder_path, filename)
        print(f"Processing file: {file_path}")

        df = pd.read_csv(file_path)

        # Files without the label column cannot contribute to either class.
        if 'failure' not in df.columns:
            print(f"Skipping file {filename}: 'failure' column not found.")
            continue

        # Top up each class with at most as many rows as it is still missing.
        remaining_1 = n_per_class - len(failure_1_samples)
        if remaining_1 > 0:
            failure_1_rows = df[df['failure'] == 1]
            failure_1_samples.extend(failure_1_rows.head(remaining_1).to_dict(orient='records'))

        remaining_0 = n_per_class - len(failure_0_samples)
        if remaining_0 > 0:
            failure_0_rows = df[df['failure'] == 0]
            failure_0_samples.extend(failure_0_rows.head(remaining_0).to_dict(orient='records'))

        # Both classes are full: no need to read any further files.
        if len(failure_1_samples) >= n_per_class and len(failure_0_samples) >= n_per_class:
            print(f"Found {n_per_class} positive and {n_per_class} negative samples in {filename}. Stopping.")
            break

    # Positives first, then negatives, combined into a single frame.
    final_samples = failure_1_samples[:n_per_class] + failure_0_samples[:n_per_class]
    final_df = pd.DataFrame(final_samples)
    return final_df

# Run the sampler over the dataset folder.
folder_path = "/mnt/raid5/sum/card/storage/AI4Storage/HDD_Prediction/dataset"  # change this to your dataset folder
result_df = find_failure_rows_in_files(folder_path)

# Show the shape (rows, columns) of the sampled frame.
print(f"Final DataFrame shape: {result_df.shape}")


Processing file: /mnt/raid5/sum/card/storage/AI4Storage/HDD_Prediction/dataset/2020-01-01.csv
Processing file: /mnt/raid5/sum/card/storage/AI4Storage/HDD_Prediction/dataset/2020-01-02.csv
Processing file: /mnt/raid5/sum/card/storage/AI4Storage/HDD_Prediction/dataset/2020-01-03.csv
Processing file: /mnt/raid5/sum/card/storage/AI4Storage/HDD_Prediction/dataset/2020-01-04.csv
Processing file: /mnt/raid5/sum/card/storage/AI4Storage/HDD_Prediction/dataset/2020-01-05.csv
Processing file: /mnt/raid5/sum/card/storage/AI4Storage/HDD_Prediction/dataset/2020-01-06.csv
Processing file: /mnt/raid5/sum/card/storage/AI4Storage/HDD_Prediction/dataset/2020-01-07.csv
Processing file: /mnt/raid5/sum/card/storage/AI4Storage/HDD_Prediction/dataset/2020-01-08.csv
Processing file: /mnt/raid5/sum/card/storage/AI4Storage/HDD_Prediction/dataset/2020-01-09.csv
Processing file: /mnt/raid5/sum/card/storage/AI4Storage/HDD_Prediction/dataset/2020-01-10.csv
Processing file: /mnt/raid5/sum/card/storage/AI4Storage/HDD_

In [4]:
# Partition the sampled rows by label and report how many rows each class holds.
failed = result_df.loc[result_df['failure'] == 1]
valid = result_df.loc[result_df['failure'] == 0]
print("valid hdds:", valid.shape[0])
print("failed hdds:", failed.shape[0])

valid hdds: 1491
failed hdds: 1491


In [7]:
# Inspect the last failure == 0 row of the sample (every column of that record).
valid.iloc[-1]

date                        2020-01-01
serial_number                 ZJV2ERHT
model                    ST12000NM0007
capacity_bytes          12000138625024
failure                              0
                             ...      
smart_245_raw                      NaN
smart_247_normalized               NaN
smart_247_raw                      NaN
smart_248_normalized               NaN
smart_248_raw                      NaN
Name: 2981, Length: 149, dtype: object

In [8]:
# Save the sampled rows as CSV in the current working directory; index=False omits the row index.
result_df.to_csv('filtered_samples.csv', index=False)


In [9]:
# Number of sampled rows per drive model.
result_df.model.value_counts()

model
ST12000NM0007                         782
ST4000DM000                           513
ST8000NM0055                          317
ST12000NM0008                         229
ST8000DM002                           221
HGST HMS5C4040BLE640                  179
HGST HUH721212ALN604                  176
TOSHIBA MG07ACA14TA                   159
TOSHIBA MQ01ABF050                    103
HGST HMS5C4040ALE640                   43
TOSHIBA MQ01ABF050M                    38
ST500LM012 HN                          37
ST10000NM0086                          35
HGST HUH721212ALE600                   30
ST12000NM001G                          29
WDC WD5000LPVX                         17
ST500LM030                             14
ST14000NM001G                          13
ST6000DX000                            10
HGST HUH721212ALE604                    9
HGST HUH728080ALE600                    8
ST18000NM000J                           2
ST8000DM004                             2
TOSHIBA MD04ABA400V         

In [10]:
# Keep only the rows of one drive model, so the classifier below is fitted on that model alone.
df_data_model = result_df[result_df['model'] == 'ST12000NM0007']

In [11]:
# SMART attribute IDs used as model inputs; only the "raw" column of each is selected.
features = [5, 9, 187, 188, 193, 194, 197, 198, 241, 242]
features_specified = ["smart_{0}_raw".format(feature) for feature in features]

In [12]:
# Feature matrix: the selected raw SMART columns. Target: the failure label of the same rows.
X_data = df_data_model[features_specified]
Y_data = df_data_model['failure']

In [13]:
# Count the missing values in each selected feature column.
X_data.isnull().sum()


smart_5_raw      1
smart_9_raw      1
smart_187_raw    1
smart_188_raw    1
smart_193_raw    1
smart_194_raw    1
smart_197_raw    1
smart_198_raw    1
smart_241_raw    1
smart_242_raw    1
dtype: int64

In [14]:
# Replace missing feature values with 0 so the classifier receives no NaNs.
# NOTE(review): an imputed 0 cannot be told apart from a genuine reading of 0 -
# confirm that this is acceptable, or drop the incomplete rows instead.
X_data = X_data.fillna(0) 


In [16]:
import numpy as np
# Class balance for the selected model: the labels are 0/1, so their sum is the failure count.
print("valid hdds:", Y_data.shape[0] - Y_data.values.sum())
print("failed hdds:", Y_data.values.sum())

valid hdds: 438
failed hdds: 344


In [17]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; random_state=0 fixes the shuffle so the split is repeatable.
X_train, X_test, Y_train, Y_test = train_test_split(X_data, Y_data, test_size=0.2, random_state=0) 

In [18]:
from sklearn.ensemble import RandomForestClassifier 

# Seed the forest so that repeated runs build the same trees and report the same
# metrics; 0 matches the seed already used for the train/test split.
rfc = RandomForestClassifier(random_state=0)

In [19]:
# Fit the classifier on the training split.
rfc.fit(X_train, Y_train)


In [20]:
# Predict the failure label for every row of the held-out test split.
Y_pred = rfc.predict(X_test) 


In [21]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Score the held-out predictions against the true labels.
acc = accuracy_score(Y_test, Y_pred)
prec = precision_score(Y_test, Y_pred)
rec = recall_score(Y_test, Y_pred)
f1 = f1_score(Y_test, Y_pred)

# Report the four scores in a fixed order.
print("Model used is: Random Forest classifier")
print("The accuracy is {}".format(acc))
print("The precision is {}".format(prec))
print("The recall is {}".format(rec))
print("The F1-Score is {}".format(f1))

Model used is: Random Forest classifier
The accuracy is 0.9554140127388535
The precision is 0.984375
The recall is 0.9130434782608695
The F1-Score is 0.9473684210526315


# Prediction