In [619]:
import warnings 
warnings.filterwarnings("ignore") 
import numpy as np
import pandas as pd 
import os
from os import listdir
import seaborn as sb
from matplotlib import pyplot as plt
from collections import Counter
from sklearn.utils import resample
from sklearn.linear_model import LogisticRegression,SGDClassifier
from sklearn.preprocessing import StandardScaler,MinMaxScaler
from sklearn.calibration import CalibratedClassifierCV
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.metrics import precision_score, recall_score,roc_auc_score,accuracy_score,confusion_matrix,f1_score,log_loss
from sklearn.naive_bayes import MultinomialNB, GaussianNB
from scipy.stats import randint as sp_randint
import xgboost as xgb
from sklearn.utils import compute_class_weight
import math
from datetime import datetime
from datetime import timedelta
from scipy.sparse import hstack
from tqdm import tqdm_notebook as tqdm
from itertools import repeat
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from statistics import mode,mean,stdev,variance
from mlxtend.classifier import EnsembleVoteClassifier
import pickle

In [620]:
# Every relative path below (the input CSV and the pickled artifacts) is resolved
# against this working directory. The author's local folder stays the default;
# set the HDD_FAILURE_DIR environment variable to run the notebook elsewhere.
os.chdir(os.environ.get("HDD_FAILURE_DIR", r"D:\vysh\Hard_drive_failure"))

### 1. Get Test Data - raw_input, target_values

In [621]:
# Load the drive records and keep only the columns the pipeline uses.
df_segate_backtrack=pd.read_csv("df_segate_backtrack.csv")
features=['date', 'model', 'serial_number', 'capacity_bytes', 'failure',
       'smart_5_raw', 'smart_9_raw', 'smart_12_raw', 'smart_187_raw',
       'smart_188_raw', 'smart_190_raw', 'smart_193_raw', 'smart_194_raw',
       'smart_197_raw', 'smart_198_raw', 'smart_199_raw', 'smart_241_raw',
       'smart_242_raw']
df_segate_backtrack=df_segate_backtrack[features]

#selecting different serial number hard drives (some working, some failed)
# First 20 serials that have a failure==0 row, then first 20 that have a failure==1 row.
# NOTE(review): a serial with rows of both kinds could land in both halves and
# then be included twice below -- confirm whether that is intended.
serial_numbers=list(df_segate_backtrack[df_segate_backtrack['failure']==0].serial_number.unique()[0:20])
serial_numbers.extend(list(df_segate_backtrack[df_segate_backtrack['failure']==1].serial_number.unique()[0:20]))

#input_data
# DataFrame.append was removed in pandas 2.0 and re-copied the whole frame on
# every iteration; gather the per-drive slices and concatenate them once.
# Rows stay grouped per drive, in the order of serial_numbers.
df=pd.concat([df_segate_backtrack[df_segate_backtrack['serial_number']==serial]
              for serial in serial_numbers])

# Labels as an (n, 1) column vector; model input is everything except the label.
target_values=np.array(df['failure']).reshape(-1,1)
raw_input=df.drop(columns=['failure'])
print("shape of test_data: ",raw_input.shape)

shape of test_data:  (1537, 17)


In [622]:
# Preview the first rows of the model input (the 'failure' label was dropped above).
raw_input.head()

Unnamed: 0,date,model,serial_number,capacity_bytes,smart_5_raw,smart_9_raw,smart_12_raw,smart_187_raw,smart_188_raw,smart_190_raw,smart_193_raw,smart_194_raw,smart_197_raw,smart_198_raw,smart_199_raw,smart_241_raw,smart_242_raw
0,2019-07-01,ST4000DM000,Z305B2QN,4000787030016,0.0,31048.0,13.0,0.0,0.0,21.0,34185.0,21.0,0.0,0.0,0.0,48633990000.0,142505000000.0
19569,2019-07-02,ST4000DM000,Z305B2QN,4000787030016,0.0,31072.0,13.0,0.0,0.0,21.0,34185.0,21.0,0.0,0.0,0.0,48649710000.0,142644600000.0
39137,2019-07-03,ST4000DM000,Z305B2QN,4000787030016,0.0,31096.0,13.0,0.0,0.0,21.0,34185.0,21.0,0.0,0.0,0.0,48676700000.0,142803300000.0
58705,2019-07-04,ST4000DM000,Z305B2QN,4000787030016,0.0,31119.0,13.0,0.0,0.0,22.0,34185.0,22.0,0.0,0.0,0.0,48701820000.0,142958900000.0
78272,2019-07-05,ST4000DM000,Z305B2QN,4000787030016,0.0,31143.0,13.0,0.0,0.0,21.0,34185.0,21.0,0.0,0.0,0.0,48720820000.0,143148700000.0


In [623]:
# Names of the raw SMART attribute columns that the preprocessing and
# feature-engineering helpers iterate over, built from the attribute ids.
columns=['smart_{}_raw'.format(attribute_id)
         for attribute_id in (5, 9, 12, 187, 188, 190, 193, 194, 197, 198, 199, 241, 242)]

### 2. Functions for pre-processing, feature engineering

In [624]:
def preprocessing(df, smart_columns=None):
    """Fill missing SMART readings with the column mean, leaving the input untouched.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing every column named in ``smart_columns``.
    smart_columns : list of str, optional
        Columns to impute. Defaults to the notebook-level ``columns`` list.

    Returns
    -------
    pandas.DataFrame
        A copy of ``df`` in which each NaN of the selected columns is replaced
        by that column's mean, or by 0 when the whole column is NaN.
    """
    if smart_columns is None:
        smart_columns = columns

    # Work on a copy: the caller's frame stays as loaded, so cells calling this
    # function can be re-run and always start from the same input.
    df = df.copy()

    #filling missing values with mean
    # NOTE(review): the mean is taken from the frame being scored, not from the
    # training data -- confirm this matches how the model was trained.
    for column in smart_columns:
        column_mean = df[column].mean()
        # An all-NaN column has a NaN mean; fall back to 0 in that case.
        fill_value = 0 if math.isnan(column_mean) else column_mean
        df[column] = df[column].fillna(fill_value)
    return df

In [625]:
def rolling_mean_stdev(df, smart_columns=None):
    """Add per-drive rolling mean / stdev columns (window of 15 readings).

    Rows are expected to be grouped by drive: a new series starts on the first
    row and on every row whose ``serial_number`` differs from the previous row.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``serial_number`` column and every column in ``smart_columns``.
    smart_columns : list of str, optional
        Columns to summarise. Defaults to the notebook-level ``columns`` list.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with ``<col>_rolling_mean`` and
        ``<col>_rolling_stdev`` added for each selected column.
    """
    if smart_columns is None:
        smart_columns = columns
    window = 15
    serial_numbers = df['serial_number'].values

    for column in smart_columns:
        readings = df[column].values
        rolling_mean = []
        rolling_stdev = []
        values = []
        for i in range(df.shape[0]):
            # Row 0 always opens a series. Without the explicit check, index
            # i-1 == -1 compares it with the LAST row, and a frame whose first
            # and last rows share a serial would continue a series that was
            # never started (or one left over from the previous column).
            if i == 0 or serial_numbers[i] != serial_numbers[i-1]:
                values = [readings[i]]
                rolling_mean.append(mean(values))
                # statistics.stdev needs two points; the first row of a drive
                # stores the reading itself in the stdev column.
                rolling_stdev.append(values[-1])
            else:
                values.append(readings[i])
                # At most the last `window` readings of this drive, current one included.
                recent = values[-window:]
                rolling_mean.append(mean(recent))
                rolling_stdev.append(stdev(recent))
        df[column+'_rolling_mean'] = rolling_mean
        df[column+'_rolling_stdev'] = rolling_stdev

    return df

In [626]:
def expanding_mean_stdev(df, smart_columns=None):
    """Add per-drive expanding (cumulative) mean / stdev columns.

    Rows are expected to be grouped by drive: a new series starts on the first
    row and on every row whose ``serial_number`` differs from the previous row.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``serial_number`` column and every column in ``smart_columns``.
    smart_columns : list of str, optional
        Columns to summarise. Defaults to the notebook-level ``columns`` list.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with ``<col>_expanding_mean`` and
        ``<col>_expanding_stdev`` added for each selected column.
    """
    if smart_columns is None:
        smart_columns = columns
    serial_numbers = df['serial_number'].values

    for column in smart_columns:
        readings = df[column].values
        expanding_mean = []
        expanding_stdev = []
        values = []
        for i in range(df.shape[0]):
            # Row 0 always opens a series; comparing it with index -1 (the last
            # row) would wrongly continue a series when the first and last rows
            # share a serial number.
            if i == 0 or serial_numbers[i] != serial_numbers[i-1]:
                values = [readings[i]]
                # With one reading the mean is the reading itself, and the
                # stdev column stores the reading too (stdev needs two points).
                expanding_mean.append(sum(values))
                expanding_stdev.append(values[-1])
            else:
                values.append(readings[i])
                expanding_mean.append(mean(values))
                expanding_stdev.append(stdev(values))
        df[column+'_expanding_mean'] = expanding_mean
        df[column+'_expanding_stdev'] = expanding_stdev

    return df

In [627]:
def exponential_smoothing(df, smart_columns=None):
    """Add a per-drive exponentially smoothed column (alpha = 0.15) per SMART column.

    The first reading of each drive seeds the series; every later reading is
    blended in as ``alpha * reading + (1 - alpha) * previous_smoothed``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``serial_number`` column and every column in ``smart_columns``,
        with rows grouped by drive.
    smart_columns : list of str, optional
        Columns to smooth. Defaults to the notebook-level ``columns`` list.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with ``<col>_exp_avg`` added for each selected column.
    """
    if smart_columns is None:
        smart_columns = columns
    alpha = 0.15
    serial_numbers = df['serial_number'].values

    for column in smart_columns:
        readings = df[column].values
        predicted_values = []
        predicted_value = None
        for i in range(df.shape[0]):
            # Row 0 always seeds a series; comparing it with index -1 (the last
            # row) would reuse an unset or stale smoothed value when the first
            # and last rows share a serial number.
            if i == 0 or serial_numbers[i] != serial_numbers[i-1]:
                predicted_value = readings[i]
            else:
                predicted_value = (alpha*readings[i]) + ((1-alpha)*predicted_value)
            predicted_values.append(predicted_value)
        df[column+'_exp_avg'] = predicted_values

    return df

In [628]:
def model_id_features(df):
    """Attach model-id derived features using lookup tables pickled on disk.

    Adds four columns to ``df`` and returns the same object:
    ``model_char_count``, ``model_second_last_char``,
    ``model_second_last_char_working`` and ``model_second_last_char_fail``.
    Model ids or character codes missing from a lookup table get a fallback:
    the most frequent stored count, ``'NULL'``, and ``[0.5, 0.5]`` respectively.
    """
    #saving and loading dicts using pickle
    #https://stackoverflow.com/questions/11218477/how-can-i-use-pickle-to-save-a-dict
    def _load_lookup(path):
        with open(path,"rb") as fp:
            return pickle.load(fp)

    model_ids=df['model']

    #Model ID characters count
    char_count_by_model=_load_lookup("model_char_count.pickle")
    df['model_char_count']=[
        char_count_by_model[model_id] if model_id in char_count_by_model
        #for model not in train but in cv/test, using most appeared count
        else mode(char_count_by_model.values())
        for model_id in model_ids
    ]

    #Model ID second and last characters
    chars_by_model=_load_lookup("model_second_last_chars.pickle")
    df['model_second_last_char']=[chars_by_model.get(model_id,'NULL') for model_id in model_ids]

    #response coding for Model ID second_last_chars
    prob_by_chars=_load_lookup("model_prob_dict.pickle")
    response_codes=np.array([
        prob_by_chars.get(chars,[0.5,0.5])
        for chars in df['model_second_last_char']
    ])
    # column 0 -> "working" value, column 1 -> "fail" value
    df['model_second_last_char_working']=response_codes[:,0]
    df['model_second_last_char_fail']=response_codes[:,1]

    return df

In [629]:
def serial_number_features(df):
    """Attach serial-number derived features using lookup tables pickled on disk.

    Adds three columns to ``df`` and returns the same object:
    ``serial_number_second_last_char``, ``serial_second_last_char_working``
    and ``serial_second_last_char_fail``. Serial numbers missing from the first
    table are coded ``'NULL'``; character codes missing from the probability
    table are coded ``[0.5, 0.5]``.
    """
    def _load_lookup(path):
        with open(path,"rb") as fp:
            return pickle.load(fp)

    #serial_number second, last chars
    chars_by_serial=_load_lookup("serial_second_last_chars.pickle")
    df['serial_number_second_last_char']=[
        chars_by_serial.get(serial,'NULL') for serial in df['serial_number']
    ]

    #response coding for serial_number second_last chars
    prob_by_chars=_load_lookup("serial_prob_dict.pickle")
    response_codes=np.array([
        prob_by_chars.get(chars,[0.5,0.5])
        for chars in df['serial_number_second_last_char']
    ])
    # column 0 -> "working" value, column 1 -> "fail" value
    df['serial_second_last_char_working']=response_codes[:,0]
    df['serial_second_last_char_fail']=response_codes[:,1]

    return df

In [630]:
def get_top_50_features():
    """Return the object pickled in ``top_50_features.txt``.

    Callers use the result to select DataFrame columns for the model.
    """
    with open("top_50_features.txt","rb") as feature_file:
        return pickle.load(feature_file)

In [631]:
def feature_engineering(df):
    """Run the full feature pipeline on ``df`` and return the enriched frame.

    First adds row-wise ``mean``, ``std``, ``min`` and ``max`` over the SMART
    columns, then applies the time-series, model-id and serial-number helpers
    in a fixed order.
    """
    #calculating mean,std,min,max for all the smart parameters row wise in the data
    for statistic in ('mean','std','min','max'):
        df[statistic]=getattr(df[columns],statistic)(axis=1)

    feature_steps=(
        rolling_mean_stdev,      #rolling_mean, rolling_stdev for smart parameters - window 15
        expanding_mean_stdev,    #expanding_mean, expanding_stdev for smart parameters
        exponential_smoothing,   #exponential smoothing for smart parameters with alpha=0.15
        model_id_features,       #model_id features
        serial_number_features,  #serial_number features
    )
    for add_features in feature_steps:
        df=add_features(df)

    return df

### 3. Final function 1 which takes raw_input values and outputs predictions

In [632]:
def final_fun_1(raw_input):
    """Predict a label for every row of ``raw_input``.

    Parameters
    ----------
    raw_input : pandas.DataFrame
        Raw drive records, as expected by ``preprocessing`` and
        ``feature_engineering``.

    Returns
    -------
    The output of the pickled model's ``predict`` on the selected features.
    """
    #pre-processing
    raw_input=preprocessing(raw_input)

    #feature engineering
    raw_input=feature_engineering(raw_input)

    #get top 50 features of best model
    top_50_features=get_top_50_features()
    # DataFrame.as_matrix() was removed in pandas 1.0; .values gives the same
    # ndarray and exists in old and new pandas alike.
    feature_matrix=raw_input[top_50_features].values

    #loading best model
    # NOTE: unpickling runs arbitrary code -- only load model files you trust.
    # The with-block closes the file handle, which the bare open() never did.
    with open("cal_xgb_model_imp_new.pickle.dat", "rb") as model_file:
        cal_xgb_model_imp_new = pickle.load(model_file)
    predictions=cal_xgb_model_imp_new.predict(feature_matrix)

    return predictions

### 4. Final function 2 which takes raw_input values, target values and outputs scores

In [633]:
def final_fun_2(raw_input,target_values):
    """Score the pickled model on ``raw_input`` against ``target_values``.

    Parameters
    ----------
    raw_input : pandas.DataFrame
        Raw drive records, as expected by ``preprocessing`` and
        ``feature_engineering``.
    target_values : array-like
        True labels, one per row of ``raw_input``; a 1-D array or an
        ``(n, 1)`` column vector.

    Returns
    -------
    tuple
        ``(f1Score, precisionScore, recallScore)``.
    """
    #pre-processing
    raw_input=preprocessing(raw_input)

    #feature engineering
    raw_input=feature_engineering(raw_input)

    #get top 50 features of best model
    top_50_features=get_top_50_features()
    # DataFrame.as_matrix() was removed in pandas 1.0; .values gives the same
    # ndarray and exists in old and new pandas alike.
    feature_matrix=raw_input[top_50_features].values

    #loading best model
    # NOTE: unpickling runs arbitrary code -- only load model files you trust.
    # The with-block closes the file handle, which the bare open() never did.
    with open("cal_xgb_model_imp_new.pickle.dat", "rb") as model_file:
        cal_xgb_model_imp_new = pickle.load(model_file)
    predictions=cal_xgb_model_imp_new.predict(feature_matrix)

    # Flatten a column-vector target so the metrics receive a 1-D label array.
    true_labels=np.ravel(target_values)
    f1Score=f1_score(true_labels,predictions)
    precisionScore=precision_score(true_labels,predictions)
    recallScore=recall_score(true_labels,predictions)

    return f1Score,precisionScore,recallScore

### 5. Executing final functions to output predictions, scores

#### 5.1. Calling final_fun_1 to output predictions

In [646]:
# Run the prediction pipeline on the sampled drives and show the first 100 predicted labels.
predictions=final_fun_1(raw_input)
print("First 100 predictions:\n", predictions[0:100])

First 100 predictions:
 [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]


In [647]:
#printing target values
# target_values is an (n, 1) column vector; np.concatenate joins its rows into a flat array.
print("First 100 target_values:\n",np.concatenate(target_values)[0:100])

First 100 target_values:
 [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]


In [648]:
# Show the final 100 predicted labels, to compare with the targets printed in the next cell.
print("Last 100 predictions:\n",predictions[-100:])

Last 100 predictions:
 [1 1 1 1 1 1 1 1 1 1 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0 0 0 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 0 0 0 0 0 0 0 0 0 1 0 0 1 1 0 0 1 0 1 1 1 1 1 1 1]


In [651]:
# True labels for the final 100 rows, flattened from the (n, 1) column vector.
print("Last 100 target_values:\n",np.concatenate(target_values)[-100:])

Last 100 target_values:
 [1 1 1 1 1 1 1 1 1 1 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1]


#### 5.2. Calling final_fun_2 to output scores

In [650]:
# Score the model on the sampled drives, then report each metric on its own line.
f1Score,precisionScore,recallScore=final_fun_2(raw_input,target_values)
for metric_label,metric_value in (("f1 score: ",f1Score),
                                  ("precision score: ",precisionScore),
                                  ("recall score: ",recallScore)):
    print(metric_label,metric_value)

f1 score:  0.9609856262833675
precision score:  0.9915254237288136
recall score:  0.9322709163346613


### Summary:

1. Best model is XGBClassifier with top 50 important features.

    __Hyper-Parameters: n_estimators=1000, max_depth=9__
    

2. Selected some working, some failed drives for test input data


3. Section 5.1 prints the predicted outputs next to the target values; the two agree on most rows, showing that the model predicts well on this test sample. Section 5.2 reports the resulting scores:

    __f1 score:  0.9609856262833675__
    
    __precision score:  0.9915254237288136__
    
    __recall score:  0.9322709163346613__