In [34]:
import pandas as pd
import numpy as np
from datetime import datetime, timedelta
import random

In [32]:
def convert_str2date(line):
    """Parse a ``YYYY-MM-DD`` string into a naive ``datetime`` at midnight.

    Raises ``ValueError`` (from ``strptime``) when ``line`` does not match
    the format.
    """
    day_format = "%Y-%m-%d"
    parsed = datetime.strptime(line, day_format)
    return parsed


def convert_date2str(d):
    """Render a date-like object (anything with ``strftime``) as ``YYYY-MM-DD``."""
    day_format = "%Y-%m-%d"
    text = d.strftime(day_format)
    return text

def remove_days(date, days):
    """Shift a ``YYYY-MM-DD`` date string back by ``days`` days.

    A negative ``days`` moves the date forward. The result is again a
    ``YYYY-MM-DD`` string.
    """
    shifted = convert_str2date(date) - timedelta(days=days)
    return convert_date2str(shifted)

def choose_start_date(obj):
    """Pick a random window start date inside a drive's observed lifetime.

    ``obj`` must provide ``'first_time_seen'`` (a ``YYYY-MM-DD`` string) and
    ``'lifetime_days'``. The window length comes from the module-level
    ``history`` value.

    Returns the chosen start date as a ``YYYY-MM-DD`` string: the first-seen
    date plus a uniformly drawn offset in ``[0, lifetime_days - history - 1]``.

    Raises ``RuntimeError`` when that upper bound is negative, i.e. the
    lifetime is too short.
    """
    first_seen = convert_str2date(obj['first_time_seen'])
    lifetime = obj['lifetime_days']
    # Largest offset (in days) that is still allowed for the window start.
    max_offset = lifetime - history - 1
    if max_offset < 0:
        raise RuntimeError("Count of days is too small")
    offset = random.randint(0, max_offset)
    chosen = first_seen + timedelta(days=offset)
    return convert_date2str(chosen)

def form_interval(row):
    """Build a ``(start_date, end_date)`` pair of date strings for one drive.

    The start comes from ``choose_start_date(row)``; the end is the start
    moved forward by ``history`` days (module-level value). Any
    ``RuntimeError`` raised by ``choose_start_date`` propagates to the caller.
    """
    window_start = choose_start_date(row)
    # remove_days with a negative count moves the date forward.
    return window_start, remove_days(window_start, -history)

def get_all_csvs(folder):
    """Collect everything yielded by ``iget_next_csv(folder)`` into a list."""
    return [entry for entry in iget_next_csv(folder)]


In [None]:
def dump_data(in_path, out_path, failured_sns, healthy_sns):
    """Copy the rows inside each selected drive's date window into one CSV.

    Parameters
    ----------
    in_path
        Folder passed to ``get_all_csvs``; each item it returns is unpacked
        as ``(_, csv_filepath)``.
    out_path
        Destination CSV file; it is overwritten.
    failured_sns, healthy_sns
        Dicts mapping ``serial_number -> (start, end)``. Rows whose serial
        number is in ``failured_sns`` are written with ``failure = 1``, all
        other selected rows with ``failure = 0``. If a serial number occurs
        in both dicts, the window from ``healthy_sns`` is used.

    A row is kept when ``start <= row['date'] < end``. The values are compared
    as given (``row['date']`` is the raw CSV string), so this presumably
    relies on ISO ``YYYY-MM-DD`` strings -- confirm against the caller.

    The output header is the header of the first input file that has one.
    Columns that appear only in later files are dropped; columns missing from
    a row are left empty by ``csv.DictWriter``.

    Prints the output path and the number of rows written.
    """
    sns = {**failured_sns, **healthy_sns}

    def valid_row(row):
        """Return True when the row's drive is selected and its date is in the window."""
        serial_number = row['serial_number']
        if serial_number not in sns:
            return False
        start, end = sns[serial_number]
        # Half-open window: the notebook builds these intervals as [start, end),
        # so the end date itself must be excluded. Otherwise healthy drives
        # contribute one more day than failed ones.
        return start <= row['date'] < end

    header = None
    count = 0
    # newline='' is what the csv module requires for files it reads or writes;
    # without it the writer emits an extra blank line per row on Windows.
    with open(out_path, 'w', newline='') as out_csv:
        csv_writer = None
        # get_all_csvs instead of iget_next_csv to show a progress bar with %
        for _, csv_filepath in tqdm(get_all_csvs(in_path), desc='Iterate through files in {}'.format(in_path)):
            with open(csv_filepath, newline='') as inp_csv:
                csv_reader = csv.DictReader(inp_csv)
                if csv_writer is None:
                    # Take the header from the reader instead of from the first
                    # data row, so that row is still filtered and written like
                    # every other one.
                    if csv_reader.fieldnames is None:
                        # Completely empty file: no header to take.
                        continue
                    header = list(csv_reader.fieldnames)
                    csv_writer = csv.DictWriter(out_csv, fieldnames=header)
                    csv_writer.writeheader()
                for row in csv_reader:
                    if not valid_row(row):
                        continue
                    row['failure'] = int(row['serial_number'] in failured_sns)
                    # filter new columns
                    csv_writer.writerow({key: row[key] for key in header if key in row})
                    count += 1
    print('Dump data into: {}, (size: {})'.format(out_path, count))

In [22]:
# Folder with the pre-computed per-drive statistics (absolute, machine-specific path).
path = 'C:/ZST/0. Mine/02-code/HardDriveAnalysis/res/'
# Per-drive statistics table; the later cells filter it by model and lifetime.
df = pd.read_csv(filepath_or_buffer=path + 'stats_2019.csv')

In [27]:
# Drive model to analyse and the length (in days) of each drive's history window.
model, history = 'ST4000DM000', 120
# Upper bound on the number of healthy drives kept after sampling.
health_drives_count = 10 * 1000

In [25]:
# Keep only the selected model. Failed drives are kept only when the failure
# date equals the last day they were seen.
# .copy() makes the filtered frame independent, so adding a column below does
# not write into a slice of the original (SettingWithCopyWarning).
df = df[(df.model == model) & (~df.failure | (df.failure_date == df.last_time_seen))].copy()
# Inclusive number of days between first and last sighting. Vectorised parsing
# also works when the filtered frame is empty.
df['lifetime_days'] = (
    pd.to_datetime(df.last_time_seen, format="%Y-%m-%d")
    - pd.to_datetime(df.first_time_seen, format="%Y-%m-%d")
).dt.days + 1
# Drop drives observed for fewer than `history` days.
df = df[df.lifetime_days >= history]
# all failured drives
df_failured = df[df.failure]
df_healthy = df[~df.failure]


In [28]:
# Window per failed drive: starts `history - 1` days before the last sighting
# and ends one day after it.
df_failured_serial_numbers = {
    sn: (remove_days(last_date, history - 1), remove_days(last_date, -1))  # [) interval
    for sn, last_date in zip(df_failured['serial_number'], df_failured['last_time_seen'])
}
df_failured_serial_numbers_count = len(df_failured_serial_numbers)
# form healthy serial numbers
# Down-sample the healthy drives only when there are more than the configured cap.
if len(df_healthy) > health_drives_count:
    df_healthy = df_healthy.sample(health_drives_count)

In [33]:
# Map each sampled healthy drive to a randomly placed (start, end) window.
df_healthy_serial_numbers = {}
for _, row in df_healthy.iterrows():
    sn = row['serial_number']
    try:
        df_healthy_serial_numbers[sn] = form_interval(row)
    except RuntimeError:
        # Lifetime too short to fit a window: skip this drive.
        continue