In [None]:
import pandas as pd
import numpy as np
import gc
from scipy import interpolate
from tqdm import tqdm
import matplotlib.pyplot as plt
import random
import pickle
from sklearn.model_selection import GridSearchCV
from sklearn import tree
from sklearn.model_selection import cross_val_score
from lightgbm.sklearn import LGBMClassifier
from imblearn.over_sampling import SMOTE, BorderlineSMOTE
from imblearn.combine import SMOTEENN
from imblearn.under_sampling import RandomUnderSampler
from datetime import datetime, timedelta
from sklearn.metrics import plot_roc_curve,roc_curve,auc,roc_auc_score
from sklearn.metrics import roc_auc_score, f1_score, accuracy_score, precision_score, recall_score, roc_curve, auc, classification_report
from sklearn.model_selection import train_test_split
import seaborn as sns
from sklearn.metrics import confusion_matrix 
from sklearn.ensemble import AdaBoostClassifier
from sklearn.feature_selection import SelectKBest, chi2, f_classif
from sklearn.ensemble import RandomForestClassifier
from sklearn import svm

In [None]:
# Reduce memory by downcasting numeric columns
def reduce_memory(df, use_float16=True):
    """Downcast the numeric columns of ``df`` in place to narrower dtypes.

    Signed and unsigned integer columns are cast to the smallest integer type
    of the same signedness whose range contains the column's min and max
    (bounds are inclusive). Floating-point columns are cast to float16 or
    float32 when their range fits strictly inside that type, otherwise to
    float64. Every other column (bool, datetime, object/string, categorical,
    nullable extension dtypes) is left untouched.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to shrink. It is modified in place.
    use_float16 : bool, default True
        When True (the historical behaviour) float columns may be stored as
        float16. float16 keeps only about three significant decimal digits,
        so pass False when exact values matter.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in.
    """
    start_mem = df.memory_usage().sum() / 1024 ** 2  # total size before, in MiB

    signed_types = (np.int8, np.int16, np.int32, np.int64)
    unsigned_types = (np.uint8, np.uint16, np.uint32, np.uint64)
    float_types = (np.float16, np.float32) if use_float16 else (np.float32,)

    for col in df.columns:
        col_type = df[col].dtype
        # Only plain numpy integer/float columns can be range-checked and
        # downcast safely; anything else is skipped instead of being pushed
        # through the float branch.
        if not isinstance(col_type, np.dtype) or col_type.kind not in 'iuf':
            continue

        c_min = df[col].min()
        c_max = df[col].max()

        if col_type.kind == 'f':
            target = np.float64
            for candidate in float_types:
                limits = np.finfo(candidate)
                # NaN or infinite extremes fail both comparisons and fall
                # through to float64.
                if c_min > limits.min and c_max < limits.max:
                    target = candidate
                    break
        else:
            candidates = signed_types if col_type.kind == 'i' else unsigned_types
            target = None
            for candidate in candidates:
                limits = np.iinfo(candidate)
                if limits.min <= c_min and c_max <= limits.max:
                    target = candidate
                    break
            if target is None:
                # min/max are NaN for a column without rows: nothing to do.
                continue

        df[col] = df[col].astype(target)

    end_mem = df.memory_usage().sum() / 1024 ** 2
    saved_pct = 100 * (start_mem - end_mem) / start_mem if start_mem else 0.0
    print('{:.2f} Mb, {:.2f} Mb ({:.2f} %)'.format(start_mem, end_mem, saved_pct))
    gc.collect()
    return df



# Shrink the frame, parse the integer YYYYMMDD ``dt`` column and sort by it
def process_dt(data):
    """Downcast ``data``, convert its ``dt`` column to datetimes and sort by it.

    ``dt`` is expected to hold integers of the form YYYYMMDD (for example
    20180301). The column is parsed in a single vectorised call instead of
    building and re-parsing one string per row.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame with an integer ``dt`` column. It is passed through
        ``reduce_memory`` first and its ``dt`` column is overwritten.

    Returns
    -------
    pandas.DataFrame
        The rows ordered by ``dt`` with a fresh 0..n-1 index.
    """
    data = reduce_memory(data)  # downcast numeric columns first
    data['dt'] = pd.to_datetime(data['dt'].astype(str), format='%Y%m%d')
    data = data.sort_values('dt').reset_index(drop=True)  # order by sampling time
    return data


# Derive "change since first observation" features
def init_change(df):
    """Add the disk age and the drift of selected SMART attributes to ``df``.

    ``server_time`` is the number of whole days between ``dt`` and
    ``init_dt``. For each tracked attribute ``X`` the column ``init_diff_X``
    is set to ``X - init_X`` and the ``init_X`` column is removed.
    The frame is modified in place and also returned.
    """
    # columns for which a change-since-initial-value feature is built
    tracked = ('smart_7_normalized', 'smart_187raw', 'smart_199raw',
               'smart_188_normalized', 'smart_195_normalized')

    # days elapsed since the first log of the disk
    df['server_time'] = (df['dt'] - df['init_dt']).dt.days

    baseline_cols = []
    for name in tracked:
        baseline = 'init_' + name
        df['init_diff_' + name] = df[name] - df[baseline]
        baseline_cols.append(baseline)

    # the baseline values are no longer needed once the differences exist
    df.drop(columns=baseline_cols, inplace=True)

    gc.collect()  # release memory
    return df
    
    
    
# labelling
def get_label(df, fault):
    """Attach a binary ``tag`` column marking log rows close to a recorded fault.

    For every (model, serial_number, fault_time) entry in ``fault`` the
    fault date is expanded to the window from 7 days before to 1 day after
    it. A log row gets ``tag == 1`` when its disk matches and its ``dt``
    falls on one of those dates; every other row gets ``tag == 0``.

    The join is keyed on the date as well as on the disk. Joining on the
    disk alone would repeat each row of a failed disk once per window day
    and mark the disk's whole history as positive.

    Parameters
    ----------
    df : pandas.DataFrame
        Log rows with ``model``, ``serial_number`` and a datetime ``dt``.
    fault : pandas.DataFrame
        Fault records with ``model``, ``serial_number`` and a datetime
        ``fault_time``. Other columns are ignored and the frame is not
        modified.

    Returns
    -------
    pandas.DataFrame
        ``df`` with an added integer ``tag`` column, same rows, same order,
        fresh 0..n-1 index.
    """
    disk_key = ["model", "serial_number"]
    faults = fault[disk_key + ["fault_time"]].drop_duplicates()

    # offset -1 is the day after the fault, offsets 0..7 are the fault day
    # and the seven days before it
    windows = [
        faults.assign(fault_time=faults["fault_time"] - timedelta(days=offset))
        for offset in range(-1, 8)
    ]
    all_fault = pd.concat(windows, ignore_index=True).drop_duplicates()
    all_fault = all_fault.rename(columns={"fault_time": "dt"})
    all_fault["tag"] = 1

    df = df.merge(all_fault, on=disk_key + ["dt"], how="left")
    df["tag"] = df["tag"].fillna(0).astype(int)
    df = df.reset_index(drop=True)
    return df
    
    
# Drop features that are >= 80% null, mean-fill the remaining gaps, then drop constant features
def drop_null_nunique(df):
    """Clean the feature columns of ``df`` in place and return it.

    The identifier columns ``'Unnamed: 0'``, ``serial_number``,
    ``manufacturer`` and ``model`` are never inspected. For every other
    column, in this order:

    1. columns whose share of missing values is at least 80% are dropped;
    2. missing values in the remaining numeric columns are replaced by the
       column mean;
    3. columns left with exactly one distinct value are dropped.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to clean; it is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in.
    """
    id_cols = {'Unnamed: 0', 'serial_number', 'manufacturer', 'model'}

    def feature_cols():
        # recomputed after each drop so removed columns are not visited again
        return [col for col in df.columns if col not in id_cols]

    # isna() works for any dtype, unlike np.isnan which rejects object columns
    null_cols = [col for col in tqdm(feature_cols())
                 if df[col].isna().mean() * 100 >= 80]
    df.drop(columns=null_cols, inplace=True)

    for col in tqdm(feature_cols()):
        # Assign the filled column back: an in-place fillna on df[col] may
        # act on a temporary copy and leave df unchanged.
        if pd.api.types.is_numeric_dtype(df[col]) and df[col].isna().any():
            df[col] = df[col].fillna(df[col].mean())

    constant_cols = [col for col in tqdm(feature_cols())
                     if df[col].nunique() == 1]
    df.drop(columns=constant_cols, inplace=True)

    return df

    
    
# Collect the March-July 2018 logs, clean them, parse the dates and pickle the result
def generate_data(raw_dir='/Users/yangxianjie/PycharmProjects/pythonProject/disk/raw_data/',
                  out_path='/Users/yangxianjie/测试项目/AI/cube_data/data_34567_12.pkl'):
    """Build the combined March-July 2018 SMART log table and pickle it.

    Each monthly CSV is reduced to the selected columns, cleaned with
    ``drop_null_nunique`` and passed through ``process_dt``. The monthly
    frames are then concatenated once, sorted by ``dt`` and written to
    ``out_path``. Nothing is returned.

    Parameters
    ----------
    raw_dir : str
        Directory containing the ``disk_sample_smart_log_YYYYMM.csv`` files.
    out_path : str
        Destination of the pickled frame.

    The defaults are the locations that used to be hard-coded here.
    """
    import os

    names = ['disk_sample_smart_log_201803.csv', 'disk_sample_smart_log_201804.csv',
             'disk_sample_smart_log_201805.csv', 'disk_sample_smart_log_201806.csv',
             'disk_sample_smart_log_201807.csv']
    paths = [os.path.join(raw_dir, filename) for filename in names]

    chosen_col = ['serial_number', 'manufacturer', 'model', 'smart_5raw', 'smart_5_normalized', 'smart_197raw', 'smart_197_normalized', 'smart_198raw', 'smart_187_normalized', 'smart_198_normalized', 'smart_7_normalized', 'smart_187raw', 'smart_199raw', 'smart_188_normalized', 'smart_195_normalized', 'dt']

    monthly = []
    for path in paths:
        # Parse only the needed columns, then restore the chosen_col order
        # (usecols keeps the file's own column order).
        data = pd.read_csv(path, usecols=chosen_col)[chosen_col]
        # NOTE(review): cleaning runs per month, so a column can be dropped
        # for one month and kept for another; concat then fills it with NaN.
        data = drop_null_nunique(data)
        data = process_dt(data)  # downcast, parse dt, sort by dt
        monthly.append(data)

    # A single concat instead of growing the frame inside the loop
    # (DataFrame.append was removed in pandas 2.0).
    data_34567 = pd.concat(monthly, ignore_index=True)
    del monthly
    data_34567 = data_34567.sort_values('dt').reset_index(drop=True)  # order all months by sampling time

    print('save data to', out_path)
    data_34567.to_pickle(out_path)
    del data_34567
    gc.collect()

   
   
# Generate the serial file: the earliest log row of every disk
def generate_serial(raw_dir='/Users/yangxianjie/PycharmProjects/pythonProject/disk/raw_data/',
                    out_path='/Users/yangxianjie/测试项目/AI/serial_col_12.csv'):
    """Record the earliest logged row of each disk and write it as CSV.

    The July 2017 - July 2018 monthly CSVs are streamed in chunks of
    500000 rows. After every chunk only the row with the smallest ``dt``
    per (serial_number, model) is kept, so the running table holds one row
    per disk. The result goes through ``process_dt``, the selected columns
    are renamed with an ``init_`` prefix (``dt`` becomes ``init_dt``) and
    the table is written to ``out_path`` without the index. Nothing is
    returned.

    Parameters
    ----------
    raw_dir : str
        Directory containing the ``disk_sample_smart_log_YYYYMM.csv`` files.
    out_path : str
        Destination CSV.

    The defaults are the locations that used to be hard-coded here.

    Raises
    ------
    ValueError
        If no rows could be read from any input file.
    """
    import os

    names = ['disk_sample_smart_log_201707.csv', 'disk_sample_smart_log_201708.csv', 'disk_sample_smart_log_201709.csv'
        , 'disk_sample_smart_log_201710.csv', 'disk_sample_smart_log_201711.csv', 'disk_sample_smart_log_201712.csv'
        , 'disk_sample_smart_log_201801.csv', 'disk_sample_smart_log_201802.csv', 'disk_sample_smart_log_201803.csv'
        , 'disk_sample_smart_log_201804.csv', 'disk_sample_smart_log_201805.csv', 'disk_sample_smart_log_201806.csv'
        , 'disk_sample_smart_log_201807.csv']
    paths = [os.path.join(raw_dir, filename) for filename in names]

    disk_key = ['serial_number', 'model']
    init_cols = ['smart_7_normalized', 'smart_187raw', 'smart_199raw', 'smart_188_normalized', 'smart_195_normalized', 'dt']  # attributes whose first value is recorded
    wanted = disk_key + init_cols

    serial = None
    for path in paths:
        # Only the needed columns are parsed from each chunk.
        for data in pd.read_csv(path, usecols=wanted, chunksize=500000):
            data = data[wanted].sort_values('dt').drop_duplicates(disk_key)
            serial = data if serial is None else pd.concat((serial, data), axis=0)
            # keep the earliest row seen so far for every disk
            serial = serial.sort_values('dt').drop_duplicates(disk_key).reset_index(drop=True)

    if serial is None:
        raise ValueError('no SMART log rows were read from ' + raw_dir)

    serial = process_dt(serial)

    serial.rename(columns=dict(zip(init_cols, ['init_' + col for col in init_cols])), inplace=True)  # prefix the recorded attributes

    serial.to_csv(out_path, index=None)

In [None]:
if __name__ == '__main__':
    # Files exchanged between the steps of this cell. The first two are
    # written by generate_data() / generate_serial() with their default
    # arguments and read back below.
    cube_pkl = '/Users/yangxianjie/测试项目/AI/cube_data/data_34567_12.pkl'
    serial_csv = '/Users/yangxianjie/测试项目/AI/serial_col_12.csv'
    fault_tag_csv = '/Users/yangxianjie/MSc Project/Dataset/disk_sample_fault_tag.csv'
    train_pkl = '/Users/yangxianjie/测试项目/AI/cube_data/process_data_34567_12_feature_engineering.pkl'

    # Build the cleaned March-July table and the per-disk first-log table
    print("###### generate data ######")
    generate_data()
    print('###### generate serial ######')
    generate_serial()

    # read data
    print('###### read data ######')
    data_34567 = pd.read_pickle(cube_pkl)

    # read tag
    print('###### read tag ######')
    tag = pd.read_csv(fault_tag_csv)
    tag['fault_time'] = pd.to_datetime(tag['fault_time'])  # parse the fault dates

    # Some disks have several fault records on the same day: collapse them
    # to one row per (serial_number, fault_time, model), joining the fault
    # codes with '|'. The labelling step below produces its own binary
    # ``tag`` column from these rows.
    tag['tag'] = tag['tag'].astype(str)
    tag = tag.groupby(['serial_number', 'fault_time', 'model'])['tag'].apply(lambda x: '|'.join(x)).reset_index()

    # read serial
    print('###### read serial ######')
    serial = pd.read_csv(serial_csv)
    serial['init_dt'] = pd.to_datetime(serial['init_dt'])

    # calculate the init state change features
    print('###### calculate Initial change feature ######')
    data_34567 = data_34567.merge(serial, how='left', on=['serial_number', 'model'])  # attach each disk's first-log values
    data_34567 = init_change(data_34567)

    # label the log rows
    print('###### tag label ######')
    data_34567 = get_label(data_34567, tag)

    # serial numbers look like '<prefix>_<number>'; keep the numeric part
    data_34567['serial_number'] = data_34567['serial_number'].apply(lambda x: int(x.split('_')[1]))

    print('###### save data for train ######')
    data_34567.to_pickle(train_pkl)

In [None]:
# read training data
data_34567 = pd.read_pickle('/Users/yangxianjie/测试项目/AI/cube_data/process_data_34567_12_feature_engineering.pkl')

# training features list: every column except identifiers, dates and label-related columns
non_feature_cols = ['log_cumsum', 'Unnamed: 0', 'serial_number', 'model', 'dt', 'fault_time', 'tag', 'label', 'gap_bad_day', 'manufacturer', 'init_dt']
smartcol = [col for col in data_34567.columns if col not in non_feature_cols]
print('training features : ', smartcol)
print(len(smartcol))

# keep only disks of model 1
data_34567 = data_34567.loc[data_34567['model'] == 1]


def _month_subset(frame, month_mask, feature_cols):
    """Return (rows, labels) for the rows selected by ``month_mask``.

    Missing values in ``feature_cols`` are replaced by 0 on a new frame, so
    the fill does not depend on in-place modification of a slice of
    ``frame``.
    """
    subset = frame.loc[month_mask].fillna({col: 0 for col in feature_cols})
    return subset, subset['tag']


# split by calendar month: months before May are used for training and June
# for validation; rows from the remaining months are not used in this cell
train_x, train_y = _month_subset(data_34567, data_34567['dt'].dt.month < 5, smartcol)
print(train_x.isnull().sum())

# class counts of the training rows, used later to size the resampling
health = len(train_x[train_x['tag'] == 0])
print('health disks = ', health)
faild = len(train_x[train_x['tag'] == 1])
print('faild disks = ', faild)

val_x, val_y = _month_subset(data_34567, data_34567['dt'].dt.month == 6, smartcol)
print(val_x.isnull().sum())

del data_34567
gc.collect()


def up_sampling(train_x, train_y, faild, ratio=3):
    """Oversample class 1 with BorderlineSMOTE.

    The sampler is asked for ``int(faild * ratio)`` samples of class 1 and
    uses a fixed ``random_state`` of 1.

    Returns the resampled ``(train_x, train_y)`` pair.
    """
    target_counts = {1: int(faild * ratio)}
    sampler = BorderlineSMOTE(sampling_strategy=target_counts, random_state=1)
    resampled_x, resampled_y = sampler.fit_resample(train_x, train_y)
    return resampled_x, resampled_y


def down_sampling(train_x, train_y, health, ratio=10):
    """Undersample class 0 with RandomUnderSampler.

    The sampler is asked for ``int(health / ratio)`` samples of class 0 and
    uses a fixed ``random_state`` of 1.

    Returns the resampled ``(train_x, train_y)`` pair.
    """
    target_counts = {0: int(health / ratio)}
    sampler = RandomUnderSampler(sampling_strategy=target_counts, random_state=1)
    resampled_x, resampled_y = sampler.fit_resample(train_x, train_y)
    return resampled_x, resampled_y

In [None]:
# Rebalance the training rows: first grow class 1 to three times its
# original count, then shrink class 0 to a tenth of its original count.
train_x, train_y = up_sampling(train_x[smartcol], train_y, faild, ratio=3)
train_x, train_y = down_sampling(train_x, train_y, health, ratio=10)

In [None]:
# Write the resampled training features and labels to CSV (index included).
train_x.to_csv(path_or_buf='/Users/yangxianjie/测试项目/AI/cube_data/train_x_12_feature_engineering.csv')
train_y.to_csv(path_or_buf='/Users/yangxianjie/测试项目/AI/cube_data/train_y_12_feature_engineering.csv')

In [None]:
# Write the validation features (feature columns only) and labels to CSV (index included).
val_features = val_x[smartcol]
val_features.to_csv(path_or_buf='/Users/yangxianjie/测试项目/AI/cube_data/valid_x_12_feature_engineering.csv')
val_y.to_csv(path_or_buf='/Users/yangxianjie/测试项目/AI/cube_data/valid_y_12_feature_engineering.csv')