In [None]:
#'cffex-host-alarm.csv' process code
#Author: 普俊韬
#Last update: 20180626

# Import the pandas and numpy libraries
import pandas as pd
import numpy as np


# TODO: split the raw data into fields

# Read the raw data (GBK-encoded, no header row). The path is a raw string so
# the backslash is kept literally instead of forming the invalid escape "\c".
data = pd.read_csv(r'raw_data\cffex-host-alarm.csv', encoding='GBK', header=None)

# Split column 0 on runs of '|' (the file separates fields with '|||'; the
# regex '[|]+' matches one or more pipes) and expand the pieces into columns.
data_processed = data[0].str.split('[|]+', expand=True)

# Name the columns; this requires the split to have produced exactly nine columns.
data_processed.columns = [
    'node_name',
    'node_alias',
    'component',
    'category',
    'alarm_count',
    'first_time',
    'last_time',
    'alarm_level',
    'alarm_content',
]


# TODO: build the lookup table for the 'component' field

# Pull the 'component' field out as a one-column DataFrame.
data_component = data_processed.loc[:, ['component']]

# Drop duplicate values. The explicit copy makes the result an independent
# frame, so adding a column below cannot be flagged as a chained assignment.
data_component_processed = data_component.drop_duplicates().copy()

# Insert a 1-based id as the first column, giving the order ['id', 'component'].
data_component_processed.insert(0, 'id', range(1, len(data_component_processed) + 1))

# Write the lookup table to 'cffex-host-alarm-component.csv' (no row labels,
# GBK-encoded). Raw string: keeps the backslash literal, no invalid "\c" escape.
data_component_processed.to_csv(
    r'output_data\cffex-host-alarm-component.csv', index=False, encoding='GBK'
)


# TODO: build the lookup table for the 'category' field

# Pull the 'category' field out as a one-column DataFrame.
data_category = data_processed.loc[:, ['category']]

# Drop duplicate values. The explicit copy makes the result an independent
# frame, so adding a column below cannot be flagged as a chained assignment.
data_category_processed = data_category.drop_duplicates().copy()

# Insert a 1-based id as the first column, giving the order ['id', 'category'].
data_category_processed.insert(0, 'id', range(1, len(data_category_processed) + 1))

# Write the lookup table to 'cffex-host-alarm-category.csv' (no row labels,
# GBK-encoded). Raw string: keeps the backslash literal, no invalid "\c" escape.
data_category_processed.to_csv(
    r'output_data\cffex-host-alarm-category.csv', index=False, encoding='GBK'
)


# TODO: replace the 'component' and 'category' values with their 'id' so that
# later processing can work with the ids

# Look up each 'component' value in the lookup table and substitute its id.
data_processed['component'] = data_processed['component'].replace(
    data_component_processed['component'].tolist(),
    data_component_processed['id'].tolist(),
)

# Look up each 'category' value in the lookup table and substitute its id.
data_processed['category'] = data_processed['category'].replace(
    data_category_processed['category'].tolist(),
    data_category_processed['id'].tolist(),
)

# Write the result to 'cffex-host-alarm-processed.csv' (no row labels,
# GBK-encoded). Raw string: keeps the backslash literal, no invalid "\c" escape.
data_processed.to_csv(
    r'output_data\cffex-host-alarm-processed.csv', index=False, encoding='GBK'
)



In [None]:
import pandas as pd
import numpy as np
import matplotlib.pylab as plt
from matplotlib.pylab import rcParams
from statsmodels.tsa.stattools import adfuller
from statsmodels.tsa.seasonal import seasonal_decompose
from statsmodels.tsa.stattools import acf,pacf
from statsmodels.tsa.arima_model import ARIMA
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import r2_score


# Default figure size (width, height) for every plot drawn below.
rcParams['figure.figsize'] = 15, 6


def dateparse(dates):
    """Parse 'YYYYMMDDHH' stamps (a scalar or an array-like) into datetimes.

    Uses pd.to_datetime rather than pd.datetime.strptime: the pd.datetime
    alias is no longer available in current pandas releases.
    """
    return pd.to_datetime(dates, format='%Y%m%d%H')


# Read 'archour' as text and convert it after loading, which avoids the
# deprecated `date_parser` argument of read_csv.
# NOTE(review): the path is absolute and machine-specific.
data = pd.read_csv(r'C:\Users\silen\Github\Cffex_Data_Analysis\output_data\cffex-host-info\2018_config3_hourly_cpu.csv',
                   encoding='UTF-8', dtype={'archour': str})
data['archour'] = dateparse(data['archour'])
data = data.set_index('archour')

# Work on the natural log of the 'maxvalue' column.
ts_maxvalue = data['maxvalue']
ts_maxvalue_log = np.log(ts_maxvalue)


# Stationarity check
def test_stationarity(timeseries, window=168):
    """Plot rolling statistics of a series and print an ADF test summary.

    Parameters
    ----------
    timeseries : pandas.Series
        Series to inspect; it must support ``.rolling``.
    window : int, optional
        Size of the rolling window, in observations. Defaults to 168
        (= 7 x 24, i.e. one week if the series is hourly).

    Returns
    -------
    None
        The figure is shown and the test summary is printed.
    """
    rolmean = timeseries.rolling(window=window).mean()
    rolstd = timeseries.rolling(window=window).std()

    fig = plt.figure()
    fig.add_subplot()
    plt.plot(timeseries, color='blue', label='Original')
    plt.plot(rolmean, color='red', label='rolling mean')
    plt.plot(rolstd, color='black', label='Rolling standard deviation')

    plt.legend(loc='best')
    plt.title('Rolling Mean & Standard Deviation')
    plt.show(block=False)

    # Dickey-Fuller test; the lag length is chosen automatically by AIC.
    print ('Results of Dickey-Fuller Test:')
    dftest = adfuller(timeseries, autolag='AIC')

    # The first four entries are labelled below; entry 4 maps each
    # significance level to its critical value.
    dfoutput = pd.Series(dftest[0:4], index=['Test Statistic', 'p-value', '#Lags Used', 'Number of Observations Used'])
    for key, value in dftest[4].items():
        dfoutput['Critical value (%s)' % key] = value

    print (dfoutput)
        
        
        
# Time-series decomposition
def decompose(timeseries):
    """Decompose a series into trend, seasonal and residual parts and plot them.

    Four stacked panels are drawn: the input series, then the trend,
    seasonal and residual components returned by ``seasonal_decompose``.

    Parameters
    ----------
    timeseries : pandas.Series
        Series to decompose.

    Returns
    -------
    tuple
        ``(trend, seasonal, residual)`` as returned by the decomposition.
    """
    # A cycle length of 168 observations (= 7 x 24).
    # NOTE(review): newer statsmodels releases name this keyword `period`;
    # `freq` is kept to match the statsmodels version this file imports from.
    decomposition = seasonal_decompose(timeseries, freq=168)

    trend = decomposition.trend
    seasonal = decomposition.seasonal
    residual = decomposition.resid

    plt.subplot(411)
    # Plot the series that was passed in, not the module-level global.
    plt.plot(timeseries, label='Original')
    plt.legend(loc='best')
    plt.subplot(412)
    plt.plot(trend, label='Trend')
    plt.legend(loc='best')
    plt.subplot(413)
    plt.plot(seasonal, label='Seasonality')
    plt.legend(loc='best')
    plt.subplot(414)
    plt.plot(residual, label='Residuals')
    plt.legend(loc='best')
    plt.tight_layout()
    plt.show()

    return trend, seasonal, residual

# Example: decompose the series, then test the residual for stationarity
# trend,seasonal,residual = decompose(ts_maxvalue_log)
# residual.dropna(inplace=True)
# test_stationarity(residual)


# Autocorrelation and partial-autocorrelation plots
def ACF_PACF(timeseries):
    """Plot the ACF and PACF of a series side by side for lags 0..20.

    Each panel gets dashed grey guide lines at 0 and at +/-1.96/sqrt(N),
    where N is the length of the series.

    Parameters
    ----------
    timeseries : sequence or pandas.Series
        Series whose correlation structure is plotted.
    """
    bound = 1.96 / np.sqrt(len(timeseries))

    # (subplot position, values to plot, panel title); both correlation
    # functions are evaluated before any drawing starts.
    panels = (
        (121, acf(timeseries, nlags=20), 'Autocorrelation Function'),
        (122, pacf(timeseries, nlags=20, method='ols'), 'Partial Autocorrelation Function'),
    )

    for position, values, title in panels:
        plt.subplot(position)
        plt.plot(values)
        for level in (0, -bound, bound):
            plt.axhline(y=level, linestyle='--', color='gray')
        plt.title(title)

    plt.tight_layout()
    plt.show()


# ARIMA model
def ARIMA_Pre(timeseries, order):
    """Fit an ARIMA model, plot its fitted values and return them.

    The series is drawn in blue and the fitted values in red; the plot title
    shows the sum of squared differences between the two.

    Parameters
    ----------
    timeseries : pandas.Series
        Series to model.
    order : tuple
        Model order passed straight through to ``ARIMA``.

    Returns
    -------
    The ``fittedvalues`` of the fitted model.
    """
    fitted = ARIMA(timeseries, order).fit(disp=-1).fittedvalues

    for series, colour in ((timeseries, 'blue'), (fitted, 'red')):
        plt.plot(series, color=colour)

    squared_errors = (fitted - timeseries) ** 2
    plt.title('RSS: %.4f' % sum(squared_errors))
    plt.show(block=False)

    return fitted

# Moving-average model
def MA(timeseries, ordered_window):
    """Plot and return the rolling mean of a series.

    Parameters
    ----------
    timeseries : pandas.Series
        Series to smooth.
    ordered_window : int
        Rolling window size passed to ``Series.rolling``.

    Returns
    -------
    pandas.Series
        Rolling mean of ``timeseries``.
    """
    smoothed = timeseries.rolling(window=ordered_window).mean()

    # Input in blue, moving average in red.
    for series, colour in ((timeseries, 'blue'), (smoothed, 'red')):
        plt.plot(series, color=colour)
    plt.title('Moving Average')
    plt.show(block=False)

    return smoothed

# Exponential-smoothing model
def ES(alpha, timeseries):
    """Exponentially smooth a series, plot the result and return it.

    The recurrence is ``s[i] = alpha * x[i] + (1 - alpha) * s[i - 1]`` for
    ``i >= 1``. It runs on plain NumPy arrays, so it does not depend on how
    the Series index treats integer keys.

    NOTE(review): ``s[0]`` is left at 0.0 rather than seeded with ``x[0]``,
    which pulls the first few smoothed values towards zero. Kept so the
    numeric results stay the same.

    NOTE(review): the blue reference line is always the module-level
    ``ts_maxvalue_log``, not ``timeseries``.

    Parameters
    ----------
    alpha : float
        Smoothing coefficient applied to the current observation.
    timeseries : pandas.Series
        Series to smooth.

    Returns
    -------
    pandas.Series
        Smoothed values on the same index as ``timeseries``.
    """
    observed = np.asarray(timeseries, dtype=float)
    smoothed = np.zeros(len(observed))

    previous = 0.0
    for i, value in enumerate(observed[1:], start=1):
        previous = alpha * value + (1 - alpha) * previous
        smoothed[i] = previous

    ES_TS = pd.Series(smoothed, index=timeseries.index, name=timeseries.name)

    plt.plot(ts_maxvalue_log, color='blue')
    plt.plot(ES_TS, color='red')
    plt.title('Exponential Smoothing')
    plt.show(block=False)
    return ES_TS

    
# Regression-style evaluation of a prediction
def evaluate(timeseries, results):
    """Print MSE, RMSE, MAE and R-squared of ``results`` against ``timeseries``.

    Parameters
    ----------
    timeseries : array-like
        Reference values.
    results : array-like
        Values being scored, aligned with ``timeseries``.
    """
    mse = mean_squared_error(timeseries, results)
    mae = mean_absolute_error(timeseries, results)
    r_square = r2_score(timeseries, results)

    print('MSE: %.4f' % mse)
    # RMSE is the square root of the MSE computed above.
    print('RMSE: %.4f' % mse ** 0.5)
    print('MAE: %.4f' % mae)
    print('R-Square: %.4f' % r_square)
    

# Stationarity check
test_stationarity(ts_maxvalue_log)

# ACF-PACF
print('ACF-PACF:')
ACF_PACF(ts_maxvalue_log)

# Time-series decomposition
print('Time Series Decompose:')
decompose(ts_maxvalue_log)

# Moving-average results
print('MA(Moving Average):')
MA_results = MA(ts_maxvalue_log, 24)
# The rolling mean is NaN until a full window is available. Score only the
# rows where it is defined, instead of overwriting the NaNs with 0 and
# counting those zeros as predictions.
nan = np.isnan(MA_results)
evaluate(ts_maxvalue_log[~nan], MA_results[~nan])

# ARIMA results
print('ARIMA:')
ARIMA_results = ARIMA_Pre(ts_maxvalue_log, (1, 0, 1))
evaluate(ts_maxvalue_log, ARIMA_results)

# Exponential-smoothing results
print('ES[1]:')
alpha = .70  # smoothing coefficient
ES_results = ES(alpha, ts_maxvalue_log)
evaluate(ts_maxvalue_log, ES_results)
# Second pass: smooth the already-smoothed series again.
print('ES[2]:')
ES2_results = ES(alpha, ES_results)
evaluate(ts_maxvalue_log, ES2_results)