In [1]:
import pandas as pd
import zipfile
import io
import os
import numpy as np
import gc
from tqdm import tqdm


# pd.set_option('display.max_rows',None)
# pd.set_option('display.max_columns',None)
pd.options.display.float_format ='{:,.2f}'.format

In [2]:
def reduce_data(df):
  """
  Фунция для сокращения размерности данных путем удаления столбцов "normalized"
  и приведения к меньшему типу числовых значений
  """
  # df['capacity_bytes'] = (df['capacity_bytes'] / 1000000000000).astype(np.int64) # to terabytes
  df = df.drop(df.filter(regex='normalized$').columns, axis=1)
  # df = df.drop(df.columns[df.isna().all()], axis = 1) #all nan
  num_features = df.select_dtypes(include="number")
  for el in num_features:
        for n_type in (np.int32, np.int16, np.int8):
            if df[el].isna().sum() == 0:
              if (df[el] == df[el].astype(n_type)).sum() == len(df):
                  df[el] = df[el].astype(n_type)
  return df

Загрузка данных из источника.

In [3]:
# !wget https://f001.backblazeb2.com/file/Backblaze-Hard-Drive-Data/data_Q1_2022.zip -d ./
# !wget https://f001.backblazeb2.com/file/Backblaze-Hard-Drive-Data/data_Q2_2022.zip -d ./
# !wget https://f001.backblazeb2.com/file/Backblaze-Hard-Drive-Data/data_Q3_2022.zip -d ./
# !wget https://f001.backblazeb2.com/file/Backblaze-Hard-Drive-Data/data_Q4_2022.zip -d ./
# !wget https://f001.backblazeb2.com/file/Backblaze-Hard-Drive-Data/data_Q1_2023.zip -d ./
# !wget https://f001.backblazeb2.com/file/Backblaze-Hard-Drive-Data/data_Q2_2023.zip -d ./
# !wget https://f001.backblazeb2.com/file/Backblaze-Hard-Drive-Data/data_Q3_2023.zip -d ./
# !wget https://f001.backblazeb2.com/file/Backblaze-Hard-Drive-Data/data_Q4_2023.zip -d ./
# !wget https://f001.backblazeb2.com/file/Backblaze-Hard-Drive-Data/data_Q1_2024.zip -d ./
# !wget https://f001.backblazeb2.com/file/Backblaze-Hard-Drive-Data/data_Q2_2024.zip -d ./

Распаковка архива.

In [4]:
data = [
    'data_Q1_2022.zip',
    'data_Q2_2022.zip',
    'data_Q3_2022.zip',
    'data_Q4_2022.zip',
    'data_Q1_2023.zip',
    'data_Q2_2023.zip',
    'data_Q3_2023.zip',
    'data_Q4_2023.zip',
    'data_Q1_2024.zip',
    'data_Q2_2024.zip'
    ]

# for filename in data:
#   with zipfile.ZipFile(filename, 'r') as zip_ref:
#       zip_ref.extractall()
#   gc.collect()

In [5]:
models = pd.read_csv('models.csv')
models_ssd_list = models[models['type'] == 'SSD'].model.unique()
models_ssd_list

array(['Seagate BarraCuda 120 SSD ZA250CM10003', 'CT250MX500SSD1',
       'WD Blue SA510 2.5 250GB', 'Seagate BarraCuda SSD ZA250CM10002',
       'Seagate SSD', 'WDC WDS250G2B0A', 'Seagate IronWolf ZA250NM10002',
       'MTFDDAV240TCB', 'Seagate FireCuda 120 SSD ZA500GM10001',
       'Micron 5300 MTFDDAK480TDS', 'SSDSCKKB480G8R',
       'Seagate BarraCuda SSD ZA500CM10002'], dtype=object)

In [6]:
def append_period_col(df):
    """Фунция добавляющая колонку с периодои между датой начала исследования и датой
    посленей записи по каждому диску"""
    df["date"] = pd.to_datetime(df["date"])
    first_date = pd.to_datetime("01-01-2022")
    df["days_between"] = df.groupby("serial_number")["date"].transform(
        lambda x: (x.max() - first_date).days
    )
    return df

In [7]:
def get_buff_fromQ(data_dir):
  """
  Фунция создающая датафрейм из каждого файла по условиям: диски с целевой меткой и случайно выбранные диски.
  """
  buff = []
  for filename in tqdm(os.listdir(data_dir)):
    if filename.endswith(".csv"):
      gc.collect()
      file_path = os.path.join(data_dir, filename)
      df = pd.read_csv(file_path, encoding='unicode_escape')
      df = reduce_data(df)
      buff.append(df.loc[df.failure==1])
      # buff.append(df[df.model.isin(models_ssd_list)].sample(5))
      buff.append(df.sample(10))
  gc.collect()
  df = pd.concat(buff, ignore_index=True)
  return df

Создание датасета.

In [8]:
# разделим на 2 части данные, так как не хватает памяти
data_1 = data[:4]
data_2 = data[4:]

In [9]:
for i, zipfile in enumerate(data_1):
  locals()[f"df_{i+1}"] = get_buff_fromQ(data_dir=zipfile[:-4])
  gc.collect()

100%|██████████| 91/91 [02:37<00:00,  1.73s/it]
100%|██████████| 91/91 [02:33<00:00,  1.69s/it]
100%|██████████| 93/93 [02:44<00:00,  1.76s/it]
100%|██████████| 93/93 [02:49<00:00,  1.82s/it]


In [10]:
df = pd.concat([df_1, df_2, df_3, df_4], ignore_index=True)

In [11]:
for i, zipfile in enumerate(data_2):
  locals()[f"df_{i+1}"] = get_buff_fromQ(data_dir=zipfile[:-4])
  gc.collect()

100%|██████████| 91/91 [02:49<00:00,  1.86s/it]
100%|██████████| 91/91 [02:52<00:00,  1.90s/it]
100%|██████████| 93/93 [03:38<00:00,  2.35s/it]
100%|██████████| 93/93 [03:56<00:00,  2.55s/it]
100%|██████████| 92/92 [03:43<00:00,  2.43s/it]
100%|██████████| 92/92 [03:54<00:00,  2.55s/it]


In [12]:
df = pd.concat([df,df_1, df_2, df_3, df_4, df_5, df_6], ignore_index=True)

Добавление колонки с периодом

In [13]:
df = append_period_col(df)

  df["days_between"] = df.groupby("serial_number")["date"].transform(


In [14]:
df.shape

(18843, 105)

In [15]:
df.failure.value_counts()

failure
1    9723
0    9120
Name: count, dtype: int64

In [16]:
output_path = 'df_2022_2024.csv'
df.to_csv(output_path)#, single_file=True)

In [17]:
df.head()

Unnamed: 0,date,serial_number,model,capacity_bytes,failure,smart_1_raw,smart_2_raw,smart_3_raw,smart_4_raw,smart_5_raw,...,smart_71_raw,smart_90_raw,datacenter,cluster_id,pod_slot_num,smart_27_raw,smart_82_raw,smart_211_raw,smart_212_raw,days_between
0,2022-01-28,10K0A0BWF97G,TOSHIBA MG07ACA14TA,14000519643136,1,0.0,0.0,2597.0,186.0,0.0,...,,,,,,,,,,27
1,2022-01-28,ZHZ62N5X,ST12000NM0008,12000138625024,1,230825528.0,,0.0,5.0,208.0,...,,,,,,,,,,27
2,2022-01-28,Z302SZP4,ST4000DM000,4000787030016,1,26511648.0,,0.0,17.0,8.0,...,,,,,,,,,,27
3,2022-01-28,ZLW18DZ3,ST14000NM001G,14000519643136,1,186696748.0,,0.0,3.0,0.0,...,,,,,,,,,,27
4,2022-01-28,Z305D2FN,ST4000DM000,4000787030016,1,169787688.0,,0.0,14.0,8.0,...,,,,,,,,,,27
