In [1]:
import os
import warnings

# Suppress TF logs.
# This has to be in the environment *before* TensorFlow is imported; otherwise
# any message emitted while the library is being imported cannot be silenced.
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'

# Hide FutureWarning noise from the libraries used below.
warnings.simplefilter(action='ignore', category=FutureWarning)

import tensorflow as tf

print(tf.__version__)

# prevent keras/tf from allocating all gpu memory
# NOTE(review): `gpu` is not read anywhere in the visible cells - confirm it is still needed.
gpu=2 # set to number of GPU we want to leverage for training
tf.config.set_soft_device_placement(True) # set soft device placement to enabled
tf.debugging.set_log_device_placement(True) # turns logging for device placement decisions on
for each_device in tf.config.experimental.list_physical_devices('GPU'): 
    # grow GPU memory on demand instead of reserving it all up front
    tf.config.experimental.set_memory_growth(each_device, True)

2.11.0


In [2]:
import pandas as pd


# Load the pickled train / test frames from the Kaggle input datasets.
df_train = pd.read_pickle('/kaggle/input/trainingset/train.pickle')
df_test = pd.read_pickle('/kaggle/input/testpickledata/test.pickle')

# Report (rows, columns) of each split, one per line.
print(df_train.shape, df_test.shape, sep='\n')

(6699353, 179)
(1701967, 149)


In [3]:
# Preview the first rows of the raw training frame (before prepare_dataframe is applied).
df_train.head()

Unnamed: 0,date,serial_number,model,capacity_bytes,failure,smart_1_normalized,smart_1_raw,smart_2_normalized,smart_2_raw,smart_3_normalized,...,smart_250_normalized,smart_250_raw,smart_251_normalized,smart_251_raw,smart_252_normalized,smart_252_raw,smart_254_normalized,smart_254_raw,smart_255_normalized,smart_255_raw
0,2022-06-14,Z305DEMX,ST4000DM000,4000787030016,0,114.0,58759240.0,,,92.0,...,,,,,,,,,,
1,2022-06-14,S301NGZV,ST4000DM000,4000787030016,0,114.0,79309352.0,,,91.0,...,,,,,,,,,,
2,2022-06-14,S3010M7R,ST4000DM000,4000787030016,0,117.0,135418976.0,,,91.0,...,,,,,,,,,,
3,2022-06-14,S3010M7H,ST4000DM000,4000787030016,0,114.0,69451256.0,,,91.0,...,,,,,,,,,,
4,2022-06-14,S3010M7E,ST4000DM000,4000787030016,0,114.0,65508312.0,,,92.0,...,,,,,,,,,,


In [4]:
def prepare_dataframe(df):
    """Return a cleaned copy of a raw drive-stats frame, ready for sequence creation.

    Steps, in order:
      * drop every column whose name contains "normalized" (case-insensitive),
      * drop the ``model`` and ``capacity_bytes`` columns,
      * parse the ``date`` column into datetimes,
      * order rows by ``serial_number`` then ``date`` and renumber the index from 0,
      * replace every remaining NaN with 0.

    The frame passed in is left untouched; a new frame is returned.
    """
    normalized_cols = [name for name in df.columns if "normalized" in name.lower()]

    # Two separate drops: first the normalized readings, then the unused metadata.
    cleaned = (
        df.drop(columns=normalized_cols)
          .drop(columns=['model', 'capacity_bytes'])
    )

    # String dates -> datetime so that per-disk rows sort chronologically.
    cleaned['date'] = pd.to_datetime(cleaned['date'])

    return (
        cleaned.sort_values(by=['serial_number', 'date'], axis=0, ascending=True)
               .reset_index(drop=True)
               .fillna(0)
    )

In [5]:
# Run the shared cleaning routine on both splits.
# NOTE(review): df_train / df_test are rebound to their prepared versions, so this cell
# is not idempotent - running it a second time makes prepare_dataframe try to drop the
# already-removed 'model' / 'capacity_bytes' columns.
print("Processing train set...")
df_train = prepare_dataframe(df_train)

print("")
print("Processing test set...")
df_test = prepare_dataframe(df_test)

Processing train set...

Processing test set...


In [6]:
# Routine to return serial numbers of good and bad disks

def get_disk_serials(df, num_disks):
    """Split the disks in ``df`` into a normal pool and a failed pool.

    A disk counts as failed when at least one of its rows has ``failure == 1``.
    From the remaining disks, the ``num_disks`` serial numbers with the most rows
    are kept as the normal pool. Both pool sizes are printed.

    Returns:
        (normal_serials, failed_serials): two lists of serial numbers.
    """
    # Every serial that shows up on at least one failure row.
    has_failed = df['failure'] == 1
    failed_serials = df.loc[has_failed, 'serial_number'].unique().tolist()

    # Among never-failed disks, rank serials by record count and keep the top ones.
    healthy_serial_column = df.loc[~df['serial_number'].isin(failed_serials), 'serial_number']
    normal_serials = healthy_serial_column.value_counts().iloc[:num_disks].index.tolist()

    print('Normal Disk Serials:',len(normal_serials))
    print('Failed Disk Serials:',len(failed_serials))

    return normal_serials, failed_serials

In [7]:
import numpy as np

def adjust_dates(df_loc):
    """Fill calendar-day holes in one disk's records using forward fill.

    Args:
        df_loc: rows of a single disk, already sorted by ``date`` (a datetime64
            column). Each row is expected to represent one day.

    Returns:
        ``df_loc`` itself when it has fewer than two rows or no gap larger than
        one day between consecutive rows. Otherwise a new frame in which every
        missing day has an inserted row whose ``date`` is the missing day and
        whose other columns are forward-filled from the last row before the gap.
        The index is NOT renumbered (inserted rows carry their own 0-based
        labels), so callers should reset it if they need a clean index.

    Built with a list of pieces and a single ``pd.concat`` because
    ``DataFrame.append`` no longer exists in pandas 2.x.
    """
    cur_dates = df_loc['date'].values

    # Nothing can be missing between fewer than two records (also covers empty input).
    if cur_dates.shape[0] < 2:
        return df_loc

    one_day = np.timedelta64(1, 'D')

    # Whole days between each record and the next one - should be 1 everywhere.
    day_gaps = ((cur_dates[1:] - cur_dates[:-1]).astype('timedelta64[D]') / one_day).astype(int)

    col_list = df_loc.columns.tolist()
    pieces = []
    i_low = 0
    for i, diff_days in enumerate(day_gaps.tolist()):
        if diff_days > 1:
            # rows up to and including the last record before the hole
            pieces.append(df_loc.iloc[i_low:i + 1])

            # one all-NaN placeholder row per missing day; only the date is known
            df_add = pd.DataFrame(np.full((diff_days - 1, len(col_list)), np.nan), columns=col_list)
            df_add['date'] = [cur_dates[i] + one_day * j for j in range(1, diff_days)]
            pieces.append(df_add)

            i_low = i + 1

    # No holes found: hand back the input untouched.
    if not pieces:
        return df_loc

    # Remaining rows after the last hole, then forward fill the placeholders.
    pieces.append(df_loc.iloc[i_low:])
    return pd.concat(pieces).ffill()

In [8]:
def fix_date_gaps(df, normal_serials=None, failed_serials=None):
    """Fill missing days for every requested disk and stack the results.

    Args:
        df: frame holding the records of all disks; must have a
            ``serial_number`` column.
        normal_serials: serial numbers of normal disks (``None`` means none).
        failed_serials: serial numbers of failed disks (``None`` means none).

    Returns:
        A frame with a fresh 0..n-1 index containing, for each serial in
        ``normal_serials`` followed by ``failed_serials``, that disk's rows after
        ``adjust_dates`` has filled its date gaps. Serials with no rows in ``df``
        are skipped. When nothing is selected, an empty frame with ``df``'s
        columns is returned.
    """
    # Treat an omitted pool as empty so the documented defaults actually work.
    serials_list = ([] if normal_serials is None else list(normal_serials)) + \
                   ([] if failed_serials is None else list(failed_serials))

    # Collect per-disk frames and concatenate once; DataFrame.append is gone in
    # pandas 2.x and growing a frame inside the loop was quadratic anyway.
    fixed_frames = []
    for cur_serial in serials_list:
        df_disk = df[df['serial_number'] == cur_serial]
        if df_disk.empty:
            # no records for this serial - nothing to fix
            continue
        fixed_frames.append(adjust_dates(df_disk))

    if not fixed_frames:
        return df.iloc[0:0].reset_index(drop=True)

    return pd.concat(fixed_frames).reset_index(drop=True)

In [9]:
# Routine to return failed sequences 
def create_failed_sequences(df, sequence_length, lookahead):
    """Extract one pre-failure window per failed disk.

    For each disk (rows are renumbered 0..n-1 per disk, in the order they appear
    in ``df``) the first row with ``failure == 1`` is located at position ``f``.
    The window is the ``sequence_length`` rows ending at position
    ``f - lookahead``, i.e. its last row lies ``lookahead`` rows before the
    failure row.

    Args:
        df: records of failed disks, sorted by date within each serial number.
        sequence_length: number of rows per window.
        lookahead: number of rows between the end of the window and the failure.

    Returns:
        All windows stacked with ``pd.concat`` (each keeps its per-disk row
        positions as index). Disks with too few rows before the failure, or with
        no failure row at all, contribute nothing. If no disk yields a window, an
        empty frame with ``df``'s columns is returned.
    """
    failed_serials = df.serial_number.unique().tolist()
    print("Number of failed serials : ", len(failed_serials)) 

    min_records = sequence_length + lookahead

    failed_seq_list = []
    # One pass over the frame; sort=False keeps disks in order of first appearance.
    for _, df_disk in df.groupby('serial_number', sort=False):
        df_disk = df_disk.reset_index(drop=True)

        # A window plus the lookahead needs at least this many rows.
        if len(df_disk) < min_records:
            continue

        # positions of failure rows; skip disks that never report a failure
        failure_positions = df_disk.index[df_disk['failure'] == 1]
        if len(failure_positions) == 0:
            continue

        # find end of sequence - going back "lookahead" days from first failure
        idx2 = failure_positions[0] - lookahead + 1

        # find beginning of sequence
        idx1 = idx2 - sequence_length

        # idx1 == 0 is a valid window that starts on the disk's first record
        if idx1 >= 0:
            failed_seq_list.append(df_disk.iloc[idx1:idx2, :])

    print("Number of failed sequences :", len(failed_seq_list)) 

    # pd.concat raises on an empty list, so return an empty frame instead.
    if not failed_seq_list:
        return df.iloc[0:0].copy()

    return pd.concat(failed_seq_list)

In [10]:
# Routine to pick some serial_numbers and create all sequences from those disks up to num_normal sequences
def create_normal_sequences(df, sequence_length, num_normal, lookahead, day_step=1):
    """Build sliding windows from normal disks, up to ``num_normal`` windows.

    Disks are visited in the order their serial numbers first appear in ``df``.
    For each disk with ``n`` rows, windows of ``sequence_length`` rows start at
    positions ``0, day_step, 2*day_step, ...`` up to ``n - (sequence_length +
    lookahead)``, so every window is followed by at least ``lookahead`` rows.

    Args:
        df: records of normal disks, sorted by date within each serial number.
        sequence_length: number of rows per window.
        num_normal: maximum total number of windows to produce.
        lookahead: rows that must remain after each window.
        day_step: stride between consecutive window starts.

    Returns:
        ``None`` if ``df`` contains any row with ``failure == 1``. Otherwise the
        windows stacked with ``pd.concat`` (rows keep their index labels from
        ``df``); an empty frame with ``df``'s columns if no window could be built.
    """
    # ensure no failed sequences
    if df[df['failure'] == 1].index.size > 0: return None 

    # get list of normal serial numbers
    normal_serials = df.serial_number.unique().tolist()

    print("Number of normal serials : ", len(normal_serials)) 

    window_span = sequence_length + lookahead

    normal_seq_list = []
    # One pass over the frame; sort=False keeps disks in order of first appearance.
    for _, df_disk in df.groupby('serial_number', sort=False):
        # stop scanning further disks once the cap has been reached
        if len(normal_seq_list) >= num_normal:
            break

        last_start = len(df_disk) - window_span
        for start in range(0, last_start + 1, day_step):
            if len(normal_seq_list) >= num_normal:
                break
            normal_seq_list.append(df_disk.iloc[start:start + sequence_length])

    print("Number of normal sequences :", len(normal_seq_list))

    # pd.concat raises on an empty list, so return an empty frame instead.
    if not normal_seq_list:
        return df.iloc[0:0].copy()

    return pd.concat(normal_seq_list)


In [11]:
# Routine to add column "sequence_label" indicating whether this was a normal or failed sequence.
# We will use this later for as label for training.
def label_sequence(df, label):
    """Tag every row of ``df`` with a constant ``sequence_label`` column, in place.

    The column is inserted as the third column (position 2) and holds ``label``
    on every row; duplicate column names are permitted. Nothing is returned -
    the frame passed in is modified directly.
    """
    constant_labels = np.full(len(df), label)
    df.insert(loc=2, column='sequence_label', value=constant_labels, allow_duplicates=True)

In [12]:
# Specify key parameters - you can change and re-run to test different configurations
sequence_length = 5          # Number of consecutive daily records in each training sequence
lookahead_days = 1           # Passed as `lookahead`: a failed sequence ends this many days before the failure record
num_normal_disks = 20        # Maximum number of normal (never-failed) disks to draw sequences from
max_normal_sequences = 4000  # Maximum number of normal sequences to create

In [13]:
# Pick the disks to sample from: a pool of normal disks plus every failed disk
print("Determining good and bad disk serial numbers")
normal_serials, bad_serials = get_disk_serials(df_train, num_normal_disks)

# Keep only the records that belong to the selected disks
df_selected = df_train.loc[df_train['serial_number'].isin(normal_serials + bad_serials)]
print('Total Records before fixing date gaps :', df_selected.shape)

# Insert forward-filled rows for days that are missing from a disk's history
df_selected = fix_date_gaps(df_selected, normal_serials=normal_serials, failed_serials=bad_serials)
print('Total Records after fixing date gaps  :', df_selected.shape)

# Sliding windows from the normal disks, tagged with label 0
print("Creating sequences for normal disks")
normal_sequences = create_normal_sequences(
    df_selected.loc[df_selected['serial_number'].isin(normal_serials)],
    sequence_length=sequence_length,
    num_normal=max_normal_sequences,
    lookahead=lookahead_days,
)
label_sequence(normal_sequences, 0)

# One pre-failure window per failed disk, tagged with label 1
print("Creating sequences for failed disks")
failure_sequences = create_failed_sequences(
    df_selected.loc[df_selected['serial_number'].isin(bad_serials)],
    sequence_length=sequence_length,
    lookahead=lookahead_days,
)
label_sequence(failure_sequences, 1)

# Stack both classes into the training set with a fresh 0..n-1 index
train_samples = pd.concat([normal_sequences, failure_sequences], ignore_index=True)

Determining good and bad disk serial numbers
Normal Disk Serials: 20
Failed Disk Serials: 633
Total Records before fixing date gaps : (127868, 90)
Total Records after fixing date gaps  : (128351, 90)
Creating sequences for normal disks
Number of normal serials :  20
Number of normal sequences : 4000
Creating sequences for failed disks
Number of failed serials :  633
Number of failed sequences : 623


In [14]:
# Show the first two normal sequences (2 * sequence_length rows); consecutive
# windows from the same disk overlap, so rows repeat.
normal_sequences.head(2*sequence_length)

Unnamed: 0,date,serial_number,sequence_label,failure,smart_1_raw,smart_2_raw,smart_3_raw,smart_4_raw,smart_5_raw,smart_7_raw,...,smart_244_raw,smart_245_raw,smart_246_raw,smart_247_raw,smart_248_raw,smart_250_raw,smart_251_raw,smart_252_raw,smart_254_raw,smart_255_raw
0,2022-01-01,Z304JW6N,0,0.0,25996168.0,0.0,0.0,19.0,0.0,98186184.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,2022-01-02,Z304JW6N,0,0.0,128209576.0,0.0,0.0,19.0,0.0,99008758.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,2022-01-03,Z304JW6N,0,0.0,210850872.0,0.0,0.0,19.0,0.0,99950493.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,2022-01-04,Z304JW6N,0,0.0,97628104.0,0.0,0.0,19.0,0.0,100966476.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,2022-01-05,Z304JW6N,0,0.0,242619856.0,0.0,0.0,19.0,0.0,101986428.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,2022-01-02,Z304JW6N,0,0.0,128209576.0,0.0,0.0,19.0,0.0,99008758.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,2022-01-03,Z304JW6N,0,0.0,210850872.0,0.0,0.0,19.0,0.0,99950493.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,2022-01-04,Z304JW6N,0,0.0,97628104.0,0.0,0.0,19.0,0.0,100966476.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,2022-01-05,Z304JW6N,0,0.0,242619856.0,0.0,0.0,19.0,0.0,101986428.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,2022-01-06,Z304JW6N,0,0.0,159723328.0,0.0,0.0,19.0,0.0,102794383.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [15]:
# Show the first failure sequence (its sequence_length rows)
print('First Failure Sequence')
failure_sequences.head(sequence_length)

First Failure Sequence


Unnamed: 0,date,serial_number,sequence_label,failure,smart_1_raw,smart_2_raw,smart_3_raw,smart_4_raw,smart_5_raw,smart_7_raw,...,smart_244_raw,smart_245_raw,smart_246_raw,smart_247_raw,smart_248_raw,smart_250_raw,smart_251_raw,smart_252_raw,smart_254_raw,smart_255_raw
103,2022-04-14,S300VKW9,1,0.0,2101856.0,0.0,0.0,11.0,0.0,422221438.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
104,2022-04-15,S300VKW9,1,0.0,141112840.0,0.0,0.0,11.0,0.0,423052568.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
105,2022-04-16,S300VKW9,1,0.0,83385056.0,0.0,0.0,11.0,0.0,425661334.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
106,2022-04-17,S300VKW9,1,0.0,5495848.0,0.0,0.0,11.0,0.0,427798373.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
107,2022-04-18,S300VKW9,1,0.0,195585848.0,0.0,0.0,11.0,0.0,430883722.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [16]:
# Cross-check: look up the disk behind the first failure sequence in the prepared
# training frame and display its first failure record.
first_fail_serial = failure_sequences['serial_number'].iloc[0]
df_tmp = df_train[df_train['serial_number'] == first_fail_serial]
# NOTE(review): despite its name this holds the first failure *row* (a Series), not an
# index, and it is not read again in the visible cells - confirm it is still needed.
first_fail_index = df_tmp[df_tmp['failure'] == 1].iloc[0]

print("We should see a failure below",lookahead_days, "days after the sequence above.")
# First record of this disk with failure == 1
df_tmp[df_tmp['failure'] == 1].head(1)

We should see a failure below 1 days after the sequence above.


Unnamed: 0,date,serial_number,failure,smart_1_raw,smart_2_raw,smart_3_raw,smart_4_raw,smart_5_raw,smart_7_raw,smart_8_raw,...,smart_244_raw,smart_245_raw,smart_246_raw,smart_247_raw,smart_248_raw,smart_250_raw,smart_251_raw,smart_252_raw,smart_254_raw,smart_255_raw
22743,2022-04-19,S300VKW9,1,98031816.0,0.0,0.0,11.0,0.0,433039199.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [17]:
# check samples: each sequence contributes sequence_length rows to train_samples
num_seq = len(train_samples) // sequence_length
print('Training Sequences:', num_seq)

Training Sequences: 4623


In [18]:
import gc

# Store filenames for later use; the sequence length is embedded in each name
train_file = 'traindata' + str(sequence_length) + '.pkl'
test_file = 'testdata' + str(sequence_length) + '.pkl'

# Save away training data, then drop the in-memory copy and run the garbage collector
train_samples.to_pickle(train_file)
del (train_samples)
gc.collect()

105

In [19]:
# Pick the disks to sample from in the test split: a pool of normal disks plus every failed disk
print("Determining good and bad disk serial numbers")
normal_serials, bad_serials = get_disk_serials(df_test, num_normal_disks)

# Keep only the records that belong to the selected disks
df_selected = df_test.loc[df_test['serial_number'].isin(normal_serials + bad_serials)]
print('Total Records before fixing date gaps :', df_selected.shape)

# Insert forward-filled rows for days that are missing from a disk's history
df_selected = fix_date_gaps(df_selected, normal_serials=normal_serials, failed_serials=bad_serials)
print('Total Records after fixing date gaps  :', df_selected.shape)

# Sliding windows from the normal disks, tagged with label 0
print("Creating sequences for normal disks")
normal_sequences = create_normal_sequences(
    df_selected.loc[df_selected['serial_number'].isin(normal_serials)],
    sequence_length=sequence_length,
    num_normal=max_normal_sequences,
    lookahead=lookahead_days,
)
label_sequence(normal_sequences, 0)

# One pre-failure window per failed disk, tagged with label 1
print("Creating sequences for failed disks")
failure_sequences = create_failed_sequences(
    df_selected.loc[df_selected['serial_number'].isin(bad_serials)],
    sequence_length=sequence_length,
    lookahead=lookahead_days,
)
label_sequence(failure_sequences, 1)

# Stack both classes into the test set with a fresh 0..n-1 index
test_samples = pd.concat([normal_sequences, failure_sequences], ignore_index=True)

Determining good and bad disk serial numbers
Normal Disk Serials: 20
Failed Disk Serials: 59
Total Records before fixing date gaps : (4459, 75)
Total Records after fixing date gaps  : (4463, 75)
Creating sequences for normal disks
Number of normal serials :  20
Number of normal sequences : 1700
Creating sequences for failed disks
Number of failed serials :  59
Number of failed sequences : 55


In [20]:
# Store away the test samples, then drop the in-memory copy and run the garbage collector
test_samples.to_pickle(test_file)
del (test_samples)
gc.collect()

21