# Извлечение признаков

Считываем данные и удаляем то, что посчитали лишним после EDA

In [54]:
import pandas as pd
import numpy as np

from tqdm import tqdm
from numba import njit
from scipy.integrate import trapezoid
from typing import List

In [43]:
# Load the raw SMART readings. The path is handed to pandas directly so that
# pandas opens and closes the file itself instead of relying on a manual
# handle opened with the locale's default encoding.
data = pd.read_csv('ST14000NM001G.csv')

In [44]:
# Columns that were judged redundant during EDA and are dropped up front.
cols = ['model', 'capacity_bytes', 'smart_197_raw']
data = data.drop(cols, axis='columns')

data

Unnamed: 0,date,serial_number,failure,smart_5_raw,smart_9_raw,smart_187_raw,smart_188_raw,smart_192_raw,smart_198_raw,smart_199_raw,smart_240_raw,smart_241_raw,smart_242_raw
0,2020-11-14,ZL201VZX,0,0.0,23.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000e+00,2.406400e+04
1,2020-11-15,ZL201VZX,0,0.0,55.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000e+00,3.451200e+04
2,2020-11-16,ZL201VZX,0,0.0,70.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000e+00,3.744000e+04
3,2020-11-17,ZL201VZX,0,0.0,85.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000e+00,4.036800e+04
4,2020-11-18,ZL201VZX,0,0.0,117.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000e+00,5.139200e+04
...,...,...,...,...,...,...,...,...,...,...,...,...,...
7320137,2022-12-27,ZTM096M0,0,0.0,13755.0,0.0,0.0,0.0,0.0,0.0,13067.0,9.432543e+10,1.611121e+11
7320138,2022-12-28,ZTM096M0,0,0.0,13787.0,0.0,0.0,0.0,0.0,0.0,13098.0,9.442932e+10,1.616432e+11
7320139,2022-12-29,ZTM096M0,0,0.0,13804.0,0.0,0.0,0.0,0.0,0.0,13115.0,9.448499e+10,1.619176e+11
7320140,2022-12-30,ZTM096M0,0,0.0,13836.0,0.0,0.0,0.0,0.0,0.0,13146.0,9.463098e+10,1.622885e+11


In [45]:
# Number of distinct report dates available for every disk.
dates_per_disk = data.groupby('serial_number')['date'].nunique()
data_depth_per_disk = dates_per_disk.rename('dates_count').reset_index()
data_depth_per_disk

# Serial numbers of disks with fewer than 60 distinct report dates.
too_short = data_depth_per_disk['dates_count'] < 60
elim_disks = data_depth_per_disk.loc[too_short, 'serial_number'].tolist()

# Keep only the rows of the remaining disks (copied, so later column
# assignments do not touch a view of the unfiltered frame).
data = data.loc[~data['serial_number'].isin(elim_disks)].copy()

In [51]:
# Show the frame that is left after the short-history disks were removed.
data

Unnamed: 0,date,serial_number,failure,smart_5_raw,smart_9_raw,smart_187_raw,smart_188_raw,smart_192_raw,smart_198_raw,smart_199_raw,smart_240_raw,smart_241_raw,smart_242_raw
0,2020-11-14,ZL201VZX,0,0.0,23.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000e+00,2.406400e+04
1,2020-11-15,ZL201VZX,0,0.0,55.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000e+00,3.451200e+04
2,2020-11-16,ZL201VZX,0,0.0,70.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000e+00,3.744000e+04
3,2020-11-17,ZL201VZX,0,0.0,85.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000e+00,4.036800e+04
4,2020-11-18,ZL201VZX,0,0.0,117.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000e+00,5.139200e+04
...,...,...,...,...,...,...,...,...,...,...,...,...,...
7320137,2022-12-27,ZTM096M0,0,0.0,13755.0,0.0,0.0,0.0,0.0,0.0,13067.0,9.432543e+10,1.611121e+11
7320138,2022-12-28,ZTM096M0,0,0.0,13787.0,0.0,0.0,0.0,0.0,0.0,13098.0,9.442932e+10,1.616432e+11
7320139,2022-12-29,ZTM096M0,0,0.0,13804.0,0.0,0.0,0.0,0.0,0.0,13115.0,9.448499e+10,1.619176e+11
7320140,2022-12-30,ZTM096M0,0,0.0,13836.0,0.0,0.0,0.0,0.0,0.0,13146.0,9.463098e+10,1.622885e+11


Извлечение проводилось на основе блоков по 30 дней в каждом (могут быть пропущенные значения дат), начиная с последнего известного дня для диска.

In [None]:
@njit
def create_date_blocks_numba(dates: np.ndarray, days: np.timedelta64 = np.timedelta64(30, 'D')) -> List[np.ndarray]:
    """Split the positions of ``dates`` into consecutive blocks shorter than ``days``.

    A block is anchored on its first date. A position stays in the current
    block while ``anchor - dates[i] < days``; the first position that breaks
    this condition closes the block and becomes the anchor of the next one.
    Because the anchor minus each date is compared, the input is expected to
    be ordered from the newest date to the oldest.

    Args:
        dates: Non-empty 1-D array of dates (the first element is read
            unconditionally).
        days: Maximum span of a single block.

    Returns:
        A list of integer arrays; each array holds the consecutive positions
        in ``dates`` that belong to one block, in input order.
    """
    blocks = []
    total = len(dates)
    block_start = 0
    anchor = dates[0]

    for i in range(total):
        # Still within the span of the current block: nothing to close yet.
        if anchor - dates[i] < days:
            continue

        # Blocks are contiguous runs of positions, so a block is fully
        # described by its start/stop and can be materialised in one step
        # instead of being grown element by element.
        if i > block_start:
            blocks.append(np.arange(block_start, i))
        block_start = i
        anchor = dates[i]

    # Flush the block that was still open when the input ended.
    if total > block_start:
        blocks.append(np.arange(block_start, total))

    return blocks

In [16]:
# Parse the dates and order every disk's rows from newest to oldest: the block
# builder anchors each block on its first row.
data['date'] = pd.to_datetime(data['date'])
data = data.sort_values(by=['serial_number', 'date'], ascending=[True, False])

grouped = data.groupby('serial_number')
all_blocks = []

# Counters whose change inside a block becomes a "<name>_diff" feature.
diff_columns = ["smart_5_raw", "smart_187_raw", "smart_198_raw"]

for _, group in tqdm(grouped, desc="Processing groups"):
    # Block membership is computed with numba on the raw date array for speed;
    # every block holds row positions within `group`, newest block first.
    dates = group['date'].values
    blocks_indices = create_date_blocks_numba(dates)

    # Columns are addressed by name rather than by position, so the features
    # stay correct if the column layout of `data` ever changes.
    failure = group['failure']
    smart_199 = group['smart_199_raw']

    # Pair each block with the chronologically following one. The newest block
    # (the first in the list) has no following block, hence no target label,
    # and is therefore never emitted as a sample.
    for ids_next_block, block in zip(blocks_indices, blocks_indices[1:]):
        # block[0] is the latest row of the block, block[-1] its earliest row.
        data_current_block = group.iloc[block[0]].copy()
        data_current_block_first_day = group.iloc[block[-1]]

        for column in diff_columns:
            data_current_block[f"{column}_diff"] = (
                data_current_block[column] - data_current_block_first_day[column]
            )
        data_current_block["smart_199_raw_max"] = smart_199.iloc[block].max()

        # Target: the maximum of the "failure" flag over the following block.
        data_current_block["TargetValue"] = failure.iloc[ids_next_block].max()

        all_blocks.append(data_current_block)
            

Processing groups: 100%|██████████| 10894/10894 [08:35<00:00, 21.12it/s]


In [17]:
# Output schema: the original columns followed by the engineered features.
column_names = [
    *data.columns,
    "smart_5_raw_diff",
    "smart_187_raw_diff",
    "smart_198_raw_diff",
    "smart_199_raw_max",
    "TargetValue",
]

In [18]:
# Assemble the collected per-block rows into one frame with a fixed column order.
extracted_data = pd.DataFrame(all_blocks, columns=column_names)

In [19]:
# Preview the per-block dataset.
extracted_data

Unnamed: 0,date,serial_number,failure,smart_5_raw,smart_9_raw,smart_187_raw,smart_188_raw,smart_192_raw,smart_198_raw,smart_199_raw,smart_240_raw,smart_241_raw,smart_242_raw,smart_5_raw_diff,smart_187_raw_diff,smart_198_raw_diff,smart_199_raw_max,TargetValue
733,2022-12-01,ZL201VZX,0,0.0,17955.0,0.0,0.0,3.0,0.0,0.0,17390.0,1.014789e+11,2.531417e+11,0.0,0.0,0.0,0.0,0
704,2022-11-01,ZL201VZX,0,0.0,17220.0,0.0,0.0,3.0,0.0,0.0,16658.0,9.977201e+10,2.446438e+11,0.0,0.0,0.0,0.0,0
674,2022-10-02,ZL201VZX,0,0.0,16507.0,0.0,0.0,3.0,0.0,0.0,15948.0,9.812132e+10,2.348104e+11,0.0,0.0,0.0,0.0,0
644,2022-09-02,ZL201VZX,0,0.0,15778.0,0.0,0.0,3.0,0.0,0.0,15222.0,9.597480e+10,2.249148e+11,0.0,0.0,0.0,0.0,0
614,2022-08-03,ZL201VZX,0,0.0,15064.0,0.0,0.0,3.0,0.0,0.0,14512.0,9.316893e+10,2.145396e+11,0.0,0.0,0.0,0.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7319725,2021-11-06,ZTM096M0,0,0.0,3762.0,0.0,0.0,0.0,0.0,0.0,3123.0,4.311050e+10,4.500844e+10,0.0,0.0,0.0,0.0,0
7319696,2021-10-07,ZTM096M0,0,0.0,3044.0,0.0,0.0,0.0,0.0,0.0,2407.0,3.717902e+10,3.707048e+10,0.0,0.0,0.0,0.0,0
7319667,2021-09-07,ZTM096M0,0,0.0,2334.0,0.0,0.0,0.0,0.0,0.0,1701.0,2.878232e+10,2.723231e+10,0.0,0.0,0.0,0.0,0
7319637,2021-08-08,ZTM096M0,0,0.0,1604.0,0.0,0.0,0.0,0.0,0.0,975.0,1.759385e+10,1.608903e+10,0.0,0.0,0.0,0.0,0


In [20]:
# Blocks whose target label equals 1.
extracted_data.loc[extracted_data["TargetValue"] == 1]

Unnamed: 0,date,serial_number,failure,smart_5_raw,smart_9_raw,smart_187_raw,smart_188_raw,smart_192_raw,smart_198_raw,smart_199_raw,smart_240_raw,smart_241_raw,smart_242_raw,smart_5_raw_diff,smart_187_raw_diff,smart_198_raw_diff,smart_199_raw_max,TargetValue
59108,2022-07-16,ZL25PMF9,0,0.0,14627.0,7.0,0.000000e+00,2.0,7512.0,0.0,14093.0,9.171405e+10,2.311414e+11,0.0,0.0,3952.0,0.0,1
203653,2022-06-12,ZL298AP8,0,0.0,7799.0,48.0,0.000000e+00,0.0,24.0,0.0,7390.0,6.212461e+10,9.688444e+10,0.0,11.0,0.0,0.0,1
300826,2022-08-15,ZL2A4EXH,0,440.0,9812.0,14.0,0.000000e+00,0.0,64.0,0.0,9309.0,6.800549e+10,1.195749e+11,104.0,1.0,8.0,0.0,1
984701,2022-03-03,ZL2CH3NJ,0,0.0,6577.0,0.0,6.013105e+10,0.0,0.0,174.0,5864.0,5.907168e+10,7.080569e+10,0.0,0.0,0.0,174.0,1
1045559,2022-09-04,ZL2CJDJH,0,0.0,11307.0,0.0,0.000000e+00,0.0,0.0,0.0,11073.0,8.316137e+10,1.342236e+11,0.0,0.0,0.0,0.0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6668326,2022-07-20,ZLW18V02,0,0.0,15943.0,820.0,4.295033e+09,16.0,0.0,0.0,15527.0,8.719891e+10,2.336149e+11,0.0,0.0,0.0,0.0,1
6693383,2022-03-27,ZLW18V17,0,0.0,13306.0,0.0,0.000000e+00,3.0,0.0,0.0,13086.0,8.394320e+10,2.041235e+11,0.0,0.0,0.0,0.0,1
6891092,2022-05-19,ZLW1XLGB,0,0.0,7243.0,18.0,0.000000e+00,0.0,96.0,0.0,6840.0,5.856166e+10,8.853047e+10,0.0,10.0,32.0,0.0,1
6901793,2022-04-05,ZLW1XY7K,0,0.0,6186.0,0.0,0.000000e+00,0.0,0.0,0.0,5797.0,5.516853e+10,7.491716e+10,0.0,0.0,0.0,0.0,1


In [21]:
# Save the per-block features; index=False leaves the row index out of the file.
extracted_data.to_csv("disk_data_blocks_30_days.csv", index=False)

Вычислим значения площадей под графиками интерполированных данных

In [56]:
def calculate_area(block: pd.DataFrame, column: str) -> float:
    """Return the trapezoidal area under ``block[column]`` with unit spacing.

    Args:
        block: Frame holding the rows of one block.
        column: Name of the column to integrate.

    Returns:
        The area computed by the trapezoidal rule, treating consecutive rows
        as one unit apart. A single-row block yields 0.
    """
    # The values are integrated in reversed row order. With unit spacing the
    # direction does not change the area; it only fixes the summation order.
    values = block[column].to_numpy()[::-1]
    return trapezoid(values)

In [32]:
columns_for_auc = ['smart_5_raw', 'smart_187_raw', 'smart_198_raw']
data['date'] = pd.to_datetime(data['date'])

grouped_data = data.groupby('serial_number')

aggregated_blocks_data = []

for serial, disk_data in tqdm(grouped_data, desc="Processing serial numbers"):
    # Observation window of this disk, taken straight from its own rows
    # (no per-disk scan of a global lookup table is needed).
    start_date = disk_data['date'].min()
    end_date = disk_data['date'].max()
    full_date_range = pd.date_range(start=start_date, end=end_date)

    # Build a gap-free daily frame; days without a report get NaN values.
    full_data = pd.DataFrame({'date': full_date_range})
    full_data['serial_number'] = serial
    full_data = full_data.set_index(['date', 'serial_number']).join(
        disk_data.set_index(['date', 'serial_number'])
    ).reset_index()

    # Flag the rows that carry original (non-interpolated) values.
    full_data['is_original'] = ~full_data[columns_for_auc].isna().any(axis=1)

    # Fill the gaps by linear interpolation and round to whole numbers.
    full_data[columns_for_auc] = (
        full_data[columns_for_auc]
        .interpolate(method='linear', limit_direction='forward', axis=0)
        .round(0)
    )

    # Number the 30-day blocks backwards from the last day (block 0 is the newest).
    full_data = full_data.sort_values(by='date', ascending=False)
    last_date = full_data['date'].iloc[0]
    full_data['block'] = (last_date - full_data['date']).dt.days // 30

    # Columns needed for the AUC computation.
    data_to_process = full_data[['date', 'block', 'is_original'] + columns_for_auc]

    # Build one row per block keyed by the block id, so that a block's date and
    # its areas always come from the same block, even when some block has no
    # original rows at all.
    block_rows = []
    for block_id, block in data_to_process.groupby('block'):
        # Block 0 is the newest block and is discarded.
        if block_id == 0:
            continue

        # The row is labelled with the latest original (non-interpolated) date
        # of the block; rows are ordered newest-first, so it is the first one.
        known_dates = block.loc[block['is_original'], 'date']
        if known_dates.empty:
            # Only interpolated rows: there is no real date to label the block with.
            continue

        row = {'date': known_dates.iloc[0]}
        for col in columns_for_auc:
            row[f'AUC_{col}'] = calculate_area(block, col)
        # The serial number is kept so the result can be joined with other data.
        row['serial_number'] = serial
        block_rows.append(row)

    if block_rows:
        aggregated_blocks_data.append(pd.DataFrame(block_rows))

final_result = pd.concat(aggregated_blocks_data, ignore_index=True)

final_result.head()

Processing serial numbers: 100%|██████████| 10894/10894 [03:33<00:00, 51.14it/s]


Unnamed: 0,date,AUC_smart_5_raw,AUC_smart_187_raw,AUC_smart_198_raw,serial_number
0,2022-12-01,0.0,0.0,0.0,ZL201VZX
1,2022-11-01,0.0,0.0,0.0,ZL201VZX
2,2022-10-02,0.0,0.0,0.0,ZL201VZX
3,2022-09-02,0.0,0.0,0.0,ZL201VZX
4,2022-08-03,0.0,0.0,0.0,ZL201VZX


In [33]:
# Save the per-block areas; index=False leaves the row index out of the file.
final_result.to_csv("AUCs_data.csv", index=False)

# Объединение файлов с интерполяционными данными и реальными

In [34]:
# Reload both intermediate datasets. The paths are handed to pandas directly,
# so the files are decoded with pandas' default encoding rather than the
# locale-dependent default of a manually opened handle.
auc_results = pd.read_csv('AUCs_data.csv')
extracted_main_data = pd.read_csv('disk_data_blocks_30_days.csv')

In [35]:
# Preview the reloaded per-block areas.
auc_results

Unnamed: 0,date,AUC_smart_5_raw,AUC_smart_187_raw,AUC_smart_198_raw,serial_number
0,2022-12-01,0.0,0.0,0.0,ZL201VZX
1,2022-11-01,0.0,0.0,0.0,ZL201VZX
2,2022-10-02,0.0,0.0,0.0,ZL201VZX
3,2022-09-02,0.0,0.0,0.0,ZL201VZX
4,2022-08-03,0.0,0.0,0.0,ZL201VZX
...,...,...,...,...,...
240618,2021-11-06,0.0,0.0,0.0,ZTM096M0
240619,2021-10-07,0.0,0.0,0.0,ZTM096M0
240620,2021-09-07,0.0,0.0,0.0,ZTM096M0
240621,2021-08-08,0.0,0.0,0.0,ZTM096M0


In [39]:
# Inner join: only (date, serial_number) pairs present in both frames are kept.
full_data = pd.merge(
    extracted_main_data,
    auc_results,
    how='inner',
    on=['date', 'serial_number'],
)

# Move the target label to the last position.
columns = full_data.columns.drop('TargetValue').tolist()
full_data = full_data[[*columns, 'TargetValue']]

In [40]:
# Save the merged dataset; index=False leaves the row index out of the file.
full_data.to_csv("data_for_models.csv", index=False)