In [1]:
import pandas as pd
import zipfile
import io
import os
import numpy as np
import gc
from tqdm import tqdm


# pd.set_option('display.max_rows',None)
# pd.set_option('display.max_columns',None)
pd.options.display.float_format ='{:,.2f}'.format

In [2]:
def reduce_data(df):
    """
    Фунция для сокращения размерности данных путем удаления столбцов "normalized"
    и приведения к меньшему типу числовых значений
    """
    df.drop_duplicates(inplace=True)
    df = df.drop(df.filter(regex="normalized$").columns, axis=1)
    num_features = df.select_dtypes(include="number")
    for col in num_features:
        df[col] = df[col].astype(np.float64)
    return df

Загрузка данных из источника.

In [3]:
# !wget https://f001.backblazeb2.com/file/Backblaze-Hard-Drive-Data/data_Q1_2021.zip -d ./
# !wget https://f001.backblazeb2.com/file/Backblaze-Hard-Drive-Data/data_Q2_2021.zip -d ./
# !wget https://f001.backblazeb2.com/file/Backblaze-Hard-Drive-Data/data_Q3_2021.zip -d ./
# !wget https://f001.backblazeb2.com/file/Backblaze-Hard-Drive-Data/data_Q4_2021.zip -d ./
# !wget https://f001.backblazeb2.com/file/Backblaze-Hard-Drive-Data/data_Q1_2022.zip -d ./
# !wget https://f001.backblazeb2.com/file/Backblaze-Hard-Drive-Data/data_Q2_2022.zip -d ./
# !wget https://f001.backblazeb2.com/file/Backblaze-Hard-Drive-Data/data_Q3_2022.zip -d ./
# !wget https://f001.backblazeb2.com/file/Backblaze-Hard-Drive-Data/data_Q4_2022.zip -d ./
# !wget https://f001.backblazeb2.com/file/Backblaze-Hard-Drive-Data/data_Q1_2023.zip -d ./
# !wget https://f001.backblazeb2.com/file/Backblaze-Hard-Drive-Data/data_Q2_2023.zip -d ./
# !wget https://f001.backblazeb2.com/file/Backblaze-Hard-Drive-Data/data_Q3_2023.zip -d ./
# !wget https://f001.backblazeb2.com/file/Backblaze-Hard-Drive-Data/data_Q4_2023.zip -d ./
# !wget https://f001.backblazeb2.com/file/Backblaze-Hard-Drive-Data/data_Q1_2024.zip -d ./
# !wget https://f001.backblazeb2.com/file/Backblaze-Hard-Drive-Data/data_Q2_2024.zip -d ./

Распаковка архива.

In [3]:
data = [
    'data_Q1_2021.zip',
    'data_Q2_2021.zip',
    'data_Q3_2021.zip',
    'data_Q4_2021.zip',
    'data_Q1_2022.zip',
    'data_Q2_2022.zip',
    'data_Q3_2022.zip',
    'data_Q4_2022.zip',
    'data_Q1_2023.zip',
    'data_Q2_2023.zip',
    'data_Q3_2023.zip',
    'data_Q4_2023.zip',
    'data_Q1_2024.zip',
    'data_Q2_2024.zip'
    ]


for filename in data:
    if not os.path.exists(filename[:-4]):
        os.makedirs(filename[:-4])
    with zipfile.ZipFile(filename, 'r') as zip_ref:
        zip_ref.extractall(filename[:-4])
    gc.collect()

In [4]:
def create_serial_nums_list(data_dir):
  """
  Фунция создающая список дисков с целевой меткой 1.
  """
  serial_nums = []
  for file in data:
    for filename in tqdm(os.listdir(file[:-4])):
      if filename.endswith(".csv"):
        gc.collect()
        file_path = os.path.join(file[:-4], filename)
        df = pd.read_csv(file_path, encoding='unicode_escape')
        serial_nums.extend(df[df.failure==1].serial_number.unique())
        serial_nums = list(set(serial_nums))
  return serial_nums


In [5]:
serial_nums = create_serial_nums_list(data)

100%|██████████| 91/91 [02:00<00:00,  1.32s/it]
100%|██████████| 92/92 [01:57<00:00,  1.27s/it]
100%|██████████| 93/93 [02:11<00:00,  1.41s/it]
100%|██████████| 93/93 [02:23<00:00,  1.54s/it]
100%|██████████| 91/91 [02:25<00:00,  1.60s/it]
100%|██████████| 91/91 [02:34<00:00,  1.69s/it]
100%|██████████| 93/93 [02:41<00:00,  1.74s/it]
100%|██████████| 93/93 [02:48<00:00,  1.82s/it]
100%|██████████| 91/91 [02:49<00:00,  1.86s/it]
100%|██████████| 91/91 [02:50<00:00,  1.87s/it]
100%|██████████| 93/93 [03:37<00:00,  2.34s/it]
100%|██████████| 93/93 [03:37<00:00,  2.34s/it]
100%|██████████| 92/92 [03:43<00:00,  2.43s/it]
100%|██████████| 92/92 [03:52<00:00,  2.52s/it]


In [6]:
serial_nums = pd.DataFrame(list(set(serial_nums)))
len(serial_nums)

11875

In [8]:
output_path = 'app/data/serial_numbers.csv'
serial_nums.to_csv(output_path)
gc.collect()


0

In [12]:
serial_nums = pd.read_csv('app/data/serial_numbers.csv')

In [9]:
def get_buff_fromQ(data_dir):
  """
  Фунция создающая датафрейм из каждого файла по условиям: диски с целевой меткой и случайно выбранные диски.
  """
  buff = []
  for filename in tqdm(os.listdir(data_dir)):
    if filename.endswith(".csv"):
      gc.collect()
      file_path = os.path.join(data_dir, filename)
      df = pd.read_csv(file_path, encoding='unicode_escape')
      df = reduce_data(df[df.serial_number.isin(serial_nums['0'])])
      buff.append(df.loc[df.failure==1])
      buff.append(df.loc[df.failure==0].sample(15))
      # buff.append(df.sample(10))
  gc.collect()
  df = pd.concat(buff, ignore_index=True)
  return df

Создание датасета.

In [10]:
# разделим на 2 части данные, так как не хватает памяти
data_1 = data[:4]
data_2 = data[4:8]
data_3 = data[8:]

In [13]:
for i, zipfile in enumerate(data_1):
  locals()[f"df_{i+1}"] = get_buff_fromQ(data_dir=zipfile[:-4])
  gc.collect()

100%|██████████| 91/91 [01:53<00:00,  1.25s/it]
100%|██████████| 92/92 [01:59<00:00,  1.30s/it]
100%|██████████| 93/93 [02:11<00:00,  1.42s/it]
100%|██████████| 93/93 [02:22<00:00,  1.54s/it]


In [14]:
df = pd.concat([df_1, df_2, df_3, df_4], ignore_index=True)

In [15]:
for i, zipfile in enumerate(data_2):
  locals()[f"df_{i+1}"] = get_buff_fromQ(data_dir=zipfile[:-4])
  gc.collect()

100%|██████████| 91/91 [02:21<00:00,  1.56s/it]
100%|██████████| 91/91 [02:24<00:00,  1.58s/it]
100%|██████████| 93/93 [02:28<00:00,  1.60s/it]
100%|██████████| 93/93 [02:33<00:00,  1.65s/it]


In [17]:
df = pd.concat([df, df_1, df_2, df_3, df_4], ignore_index=True)
gc.collect()

567

In [18]:
for i, zipfile in enumerate(data_3):
  locals()[f"df_{i+1}"] = get_buff_fromQ(data_dir=zipfile[:-4])
  gc.collect()

100%|██████████| 91/91 [02:34<00:00,  1.70s/it]
100%|██████████| 91/91 [02:41<00:00,  1.78s/it]
100%|██████████| 93/93 [03:27<00:00,  2.23s/it]
100%|██████████| 93/93 [03:49<00:00,  2.46s/it]
100%|██████████| 92/92 [03:47<00:00,  2.47s/it]
100%|██████████| 92/92 [03:57<00:00,  2.58s/it]


In [19]:
df = pd.concat([df, df_1, df_2, df_3, df_4, df_5, df_6], ignore_index=True)

In [20]:
df.shape

(31036, 104)

In [21]:
df.failure.value_counts()

failure
0    19155
1    11881
Name: count, dtype: int64

In [22]:
output_path = 'df_2021_2024.csv'
df.to_csv(output_path)#, single_file=True)

In [23]:
df.head()

Unnamed: 0,date,serial_number,model,capacity_bytes,failure,smart_1_raw,smart_2_raw,smart_3_raw,smart_4_raw,smart_5_raw,...,is_legacy_format,smart_71_raw,smart_90_raw,datacenter,cluster_id,pod_slot_num,smart_27_raw,smart_82_raw,smart_211_raw,smart_212_raw
0,2021-02-17,ZA153THY,ST8000NM0055,8001563222016,1,192214128.0,,0.0,8.0,15584.0,...,,,,,,,,,,
1,2021-02-17,40T0A007F97G,TOSHIBA MG07ACA14TA,14000519643136,1,0.0,0.0,7832.0,22.0,1.0,...,,,,,,,,,,
2,2021-02-17,PL1331LAHBS5JH,HGST HMS5C4040BLE640,4000787030016,1,0.0,100.0,419.0,9.0,0.0,...,,,,,,,,,,
3,2021-02-17,ZA11WLX1,ST8000DM002,8001563222016,1,136748944.0,,0.0,15.0,14963.0,...,,,,,,,,,,
4,2021-02-17,1080A12QF9RG,TOSHIBA MG07ACA14TEY,14000519643136,1,0.0,0.0,6992.0,34.0,0.0,...,,,,,,,,,,
