In [1]:
import matplotlib.pyplot as plt
import pandas as pd
import datetime
import numpy as np
from tsfresh import extract_features, extract_relevant_features, select_features
#from tsfresh.feature_extraction import ComprehensiveFCParameters

  import pandas.util.testing as tm
  data_klasses = (pandas.Series, pandas.DataFrame, pandas.Panel)


# https://tsfresh.readthedocs.io/en/latest/text/data_formats.html
### Input Option 1. Flat DataFrame or Wide DataFrame

|id	|time	|x	|y|
|---|---|---|---|
|A	|t1	|x(A, t1)	|y(A, t1)|
|A	|t2	|x(A, t2)	|y(A, t2)|
|A	|t3	|x(A, t3)	|y(A, t3)|
|B	|t1	|x(B, t1)	|y(B, t1)|
|B	|t2	|x(B, t2)	|y(B, t2)|
|B	|t3	|x(B, t3)	|y(B, t3)|

### Input Option 2. Stacked DataFrame or Long DataFrame
### Input Option 3. Dictionary of flat DataFrames


In [2]:
data = pd.read_csv('DB_TIME_HISTORY.csv')
data0 = data.groupby('INST_NAME').get_group('ACTDB11')

In [3]:
#输入某一个实例的数据data，输出处理后的时间与缺失值datanew
def time_miss_fix(data):
    time = data['SNAP_PERIOD']
    time = [i[:15] for i in time]
    time = pd.to_datetime(time,format = '%Y-%m-%d-%H%M')
    time = list(time)
    
    #时间取整
    for i in range(len(time)):
        if time[i].minute>0 and time[i].minute<30:
            time[i] = time[i] - datetime.timedelta(minutes=time[i].minute) #向下
            if time[i] == time[i-1]:
                time[i] = time[i] + datetime.timedelta(minutes=30) #向上
        elif time[i].minute>30:
            time[i] = time[i] - datetime.timedelta(minutes=(time[i].minute-30))
            if time[i] == time[i-1]:
                time[i] = time[i] + datetime.timedelta(minutes=30)
    
    time_new = pd.date_range(min(time),max(time),freq='1800s')
    datanew = pd.DataFrame(index=time_new,columns=['DB_TIME'])  #生成完整的时间
    datanew.loc[time,'DB_TIME']=list(data['DB_TIME'])                               #根据时间填充取值
    datanew.loc[datanew['DB_TIME']<0,'DB_TIME']=np.nan                     #把负值当成缺失值
    
    datanew.loc[datanew['DB_TIME']>100,'DB_TIME']=np.nan                #把大于100的当成缺失值
    
    return datanew

In [4]:
series = time_miss_fix(data0).DB_TIME

In [5]:
#把时间序列数据转换为监督学习数据,n_in是X的维度,n_out是Y的维度
def series_to_supervised(data, n_in=1, n_out=1, dropnan=True):
    #n_vars = 1 if type(data) is list else data.shape[0]
    df = pd.DataFrame(data)
    cols = list()
    # input sequence (t-n, ... t-1)  自变量X
    for i in range(n_in, 0, -1):
        cols.append(df.shift(i))
    # forecast sequence (t, t+1, ... t+n)  因变量Y
    for i in range(0, n_out):
        cols.append(df.shift(-i))
    # put it all together  合并X和Y
    agg = pd.concat(cols, axis=1)
    # drop rows with NaN values  只要有缺失值，就删除那个样本
    if dropnan:
        agg.dropna(inplace=True)
    
    X = agg.values[:, :n_in]
    Y = agg.values[:,n_in:]
    
    return X, Y

In [6]:
dataX, dataY = series_to_supervised(series,48,1)

In [7]:
dataX.shape

(33932, 48)

In [8]:
df_dbtime = dataX.reshape(-1)
df_id = np.repeat(range(dataX.shape[0]),dataX.shape[1])
df_time = np.array(list(range(0,dataX.shape[1]))*dataX.shape[0])
df = np.array([df_dbtime,df_id,df_time]).T

In [9]:
df = pd.DataFrame(df,columns=['DB_TIME','id','time'])
df['DB_TIME'] = [np.float(i) for i in df['DB_TIME']] #好像要变成numpy格式的数值才能提取特征

In [10]:
df #共33932个子序列

Unnamed: 0,DB_TIME,id,time
0,19.55,0,0
1,36.33,0,1
2,28.35,0,2
3,31.55,0,3
4,33.29,0,4
...,...,...,...
1628731,21.02,33931,43
1628732,8.73,33931,44
1628733,6.86,33931,45
1628734,5.98,33931,46


In [11]:
y = pd.Series(dataY.reshape(-1)) #y的id要与df对应
y

0        15.92
1         15.4
2        13.83
3        21.87
4        13.83
         ...  
33927     8.73
33928     6.86
33929     5.98
33930     5.58
33931     5.23
Length: 33932, dtype: object

In [12]:
#extraction_settings = ComprehensiveFCParameters() #可以设置要提取的特征
#X_feature = extract_features(df_new,column_id='id', column_sort='time') #提取特征，根据id聚合

#直接根据y提取过滤后的特征
X_filtered = extract_relevant_features(df, y, column_id='id', column_sort='time')

Feature Extraction: 100%|██████████████████████████████████████████████████████████████| 20/20 [09:42<00:00, 17.39s/it]


KeyboardInterrupt: 

In [None]:
print(X_filtered.shape)
X_filtered

In [5]:
datanew = time_miss_fix(data0)

In [6]:
datanew

Unnamed: 0,DB_TIME
2019-01-01 00:00:00,19.55
2019-01-01 00:30:00,36.33
2019-01-01 01:00:00,28.35
2019-01-01 01:30:00,31.55
2019-01-01 02:00:00,33.29
...,...
2020-12-21 11:30:00,8.73
2020-12-21 12:00:00,6.86
2020-12-21 12:30:00,5.98
2020-12-21 13:00:00,5.58


In [7]:
datanew['id'] = [str(i)[:10] for i in datanew.index]

In [8]:
datanew['time'] = list(range(0,48))*(len(datanew)//48)+list(range(len(datanew)%48))

In [9]:
datanew

Unnamed: 0,DB_TIME,id,time
2019-01-01 00:00:00,19.55,2019-01-01,0
2019-01-01 00:30:00,36.33,2019-01-01,1
2019-01-01 01:00:00,28.35,2019-01-01,2
2019-01-01 01:30:00,31.55,2019-01-01,3
2019-01-01 02:00:00,33.29,2019-01-01,4
...,...,...,...
2020-12-21 11:30:00,8.73,2020-12-21,23
2020-12-21 12:00:00,6.86,2020-12-21,24
2020-12-21 12:30:00,5.98,2020-12-21,25
2020-12-21 13:00:00,5.58,2020-12-21,26


In [10]:
datanew['DB_TIME'] = datanew['DB_TIME'].fillna(datanew['DB_TIME'].mean())

In [11]:
#特征提取（每一段子序列的特征）
#extraction_settings = ComprehensiveFCParameters() #可以设置要提取的特征
X = extract_features(datanew,column_id='id', column_sort='time') #根据id聚合

Feature Extraction: 100%|██████████████████████████████████████████████████████████████| 20/20 [00:27<00:00,  1.30it/s]


In [12]:
print(X.shape)
X

(721, 784)


Unnamed: 0,DB_TIME__variance_larger_than_standard_deviation,DB_TIME__has_duplicate_max,DB_TIME__has_duplicate_min,DB_TIME__has_duplicate,DB_TIME__sum_values,DB_TIME__abs_energy,DB_TIME__mean_abs_change,DB_TIME__mean_change,DB_TIME__mean_second_derivative_central,DB_TIME__median,...,DB_TIME__fourier_entropy__bins_2,DB_TIME__fourier_entropy__bins_3,DB_TIME__fourier_entropy__bins_5,DB_TIME__fourier_entropy__bins_10,DB_TIME__fourier_entropy__bins_100,DB_TIME__permutation_entropy__dimension_3__tau_1,DB_TIME__permutation_entropy__dimension_4__tau_1,DB_TIME__permutation_entropy__dimension_5__tau_1,DB_TIME__permutation_entropy__dimension_6__tau_1,DB_TIME__permutation_entropy__dimension_7__tau_1
2019-01-01,1.0,0.0,0.0,0.0,957.54,22727.3366,6.418511,-0.183617,-0.185652,17.075,...,0.551080,0.848677,1.090778,1.721678,3.087042,1.677354,2.663142,3.445339,3.696721,3.737670
2019-01-02,1.0,0.0,0.0,1.0,538.20,7074.6364,2.511915,-0.193617,-0.010652,9.685,...,0.278769,0.334221,0.498758,0.970403,2.300611,1.728977,2.643565,3.319312,3.620074,3.671656
2019-01-03,1.0,0.0,0.0,1.0,487.59,6462.5337,4.391915,-0.437021,0.170978,8.850,...,0.366925,0.962268,1.321164,1.812368,3.107972,1.747801,2.799534,3.362711,3.587835,3.671656
2019-01-04,1.0,0.0,0.0,0.0,422.96,4191.5418,1.852979,-0.153404,0.010978,7.775,...,0.366925,0.443307,0.443307,0.711386,2.054389,1.738888,2.802966,3.476845,3.696721,3.737670
2019-01-05,1.0,0.0,0.0,1.0,409.73,3827.3025,1.747660,-0.085957,-0.018261,8.175,...,0.278769,0.707786,1.028624,1.351309,2.788854,1.765691,2.947554,3.626656,3.761200,3.737670
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2020-12-17,1.0,0.0,0.0,1.0,319.97,2297.3335,1.377447,-0.125957,-0.009891,6.555,...,0.278769,0.550573,1.108572,1.561577,2.788854,1.689990,2.761176,3.413832,3.664482,3.704663
2020-12-18,1.0,0.0,0.0,1.0,362.16,3174.8344,2.299787,-0.041064,-0.020435,6.970,...,0.551080,0.917357,1.408813,2.065212,2.997069,1.747801,2.890017,3.413832,3.696721,3.737670
2020-12-19,1.0,0.0,0.0,0.0,328.00,2500.2560,1.275745,-0.074894,-0.008043,6.055,...,0.439670,0.635005,0.895071,1.278064,2.809783,1.707877,2.673544,3.242174,3.600003,3.704663
2020-12-20,1.0,0.0,0.0,1.0,323.21,2435.6943,1.722766,-0.078511,-0.002174,5.960,...,0.167944,0.529644,0.793817,1.308213,2.633321,1.746517,2.680384,3.350819,3.632243,3.737670


In [13]:
X.info()

<class 'pandas.core.frame.DataFrame'>
Index: 721 entries, 2019-01-01 to 2020-12-21
Columns: 784 entries, DB_TIME__variance_larger_than_standard_deviation to DB_TIME__permutation_entropy__dimension_7__tau_1
dtypes: float64(784)
memory usage: 4.3+ MB


In [15]:
#特征过滤，根据预测值y来提取最相关的特征
y = pd.Series(datanew.loc[datanew['time']==0,'DB_TIME'],index=[str(i)[:10] for i in X.index])

In [16]:
y

2019-01-01    19.55
2019-01-02    15.92
2019-01-03    28.21
2019-01-04    13.37
2019-01-05    10.88
              ...  
2020-12-17     9.29
2020-12-18     6.88
2020-12-19     8.32
2020-12-20     8.38
2020-12-21     8.28
Name: DB_TIME, Length: 721, dtype: float64

In [17]:
#特征过滤
X_filtered = extract_relevant_features(datanew, y, column_id='id', column_sort='time')
print(X_filtered.shape)
X_filtered

Feature Extraction: 100%|██████████████████████████████████████████████████████████████| 20/20 [00:28<00:00,  1.04it/s]


(721, 278)


Unnamed: 0,"DB_TIME__cwt_coefficients__coeff_0__w_20__widths_(2, 5, 10, 20)","DB_TIME__cwt_coefficients__coeff_1__w_20__widths_(2, 5, 10, 20)","DB_TIME__cwt_coefficients__coeff_2__w_20__widths_(2, 5, 10, 20)","DB_TIME__cwt_coefficients__coeff_3__w_20__widths_(2, 5, 10, 20)","DB_TIME__cwt_coefficients__coeff_4__w_20__widths_(2, 5, 10, 20)","DB_TIME__cwt_coefficients__coeff_5__w_20__widths_(2, 5, 10, 20)","DB_TIME__cwt_coefficients__coeff_6__w_20__widths_(2, 5, 10, 20)","DB_TIME__linear_trend_timewise__attr_""intercept""","DB_TIME__linear_trend__attr_""intercept""",DB_TIME__time_reversal_asymmetry_statistic__lag_2,...,"DB_TIME__change_quantiles__f_agg_""var""__isabs_False__qh_0.6__ql_0.2","DB_TIME__change_quantiles__f_agg_""mean""__isabs_True__qh_0.6__ql_0.0","DB_TIME__fft_coefficient__attr_""real""__coeff_7","DB_TIME__change_quantiles__f_agg_""var""__isabs_False__qh_0.8__ql_0.0","DB_TIME__fft_aggregated__aggtype_""variance""","DB_TIME__fft_aggregated__aggtype_""skew""","DB_TIME__change_quantiles__f_agg_""var""__isabs_True__qh_0.6__ql_0.2","DB_TIME__fft_coefficient__attr_""abs""__coeff_9",DB_TIME__index_mass_quantile__q_0.8,"DB_TIME__fft_coefficient__attr_""real""__coeff_14"
2019-01-01,52.255022,56.046387,58.572763,62.386184,66.006701,68.535659,72.039106,24.442092,24.442092,-1583.650928,...,1.498261,1.142381,-2.285534,5.112504,54.836819,0.756761,0.415372,45.863203,0.750000,-24.445440
2019-01-02,36.266985,38.715427,41.004912,43.244191,45.368117,47.326876,49.140724,15.845561,15.845561,-277.285311,...,0.485485,0.649167,18.308899,1.722159,45.831545,1.303206,0.239024,27.381165,0.750000,-10.285762
2019-01-03,27.790206,29.695661,31.951014,33.967417,36.122987,38.092114,39.903085,13.543597,13.543597,-298.951489,...,2.712265,1.693810,25.482599,8.340993,66.433882,0.686264,0.771865,28.470036,0.750000,40.914892
2019-01-04,26.865565,28.659286,30.411393,32.068436,33.604709,35.056584,36.362753,11.557330,11.557330,-119.762448,...,0.364202,0.737200,0.397487,1.005067,65.370884,1.503610,0.098852,6.995899,0.791667,4.516097
2019-01-05,23.549264,25.152855,26.684745,28.254357,29.790161,31.145382,32.495013,10.112509,10.112509,-64.246823,...,1.314873,0.789000,-3.725836,2.568153,49.717390,1.475056,0.594605,26.357274,0.812500,6.147681
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2020-12-17,17.113184,18.498225,19.894791,21.246032,22.556308,23.682090,24.780375,8.274116,8.274116,-51.035218,...,0.799729,0.958333,1.136389,2.114775,56.872811,1.298065,0.335729,10.588441,0.750000,0.220067
2020-12-18,16.255600,18.004962,19.797806,21.559166,23.305513,24.860727,26.398532,8.562321,8.562321,-40.962751,...,0.929124,1.014762,-15.134534,3.001668,65.985087,1.001216,0.245541,18.395570,0.750000,0.837100
2020-12-19,17.829323,19.188108,20.547259,21.865547,23.137003,24.203313,25.173810,8.413639,8.413639,-50.770778,...,0.510433,0.592800,3.936205,2.194752,47.090593,1.687410,0.177046,11.793249,0.770833,3.983189
2020-12-20,18.299108,19.670546,21.053421,22.416708,23.752858,24.932260,26.049226,8.812585,8.812585,-45.740302,...,0.332699,1.012400,-5.503454,3.009390,64.138794,1.373829,0.162517,7.271272,0.729167,-4.072315


In [18]:
X_filtered.info()

<class 'pandas.core.frame.DataFrame'>
Index: 721 entries, 2019-01-01 to 2020-12-21
Columns: 278 entries, DB_TIME__cwt_coefficients__coeff_0__w_20__widths_(2, 5, 10, 20) to DB_TIME__fft_coefficient__attr_"real"__coeff_14
dtypes: float64(278)
memory usage: 1.5+ MB


In [None]:
X_train,X_test,X_filtered_train,X_filtered_test,y_train,y_test = train_test_split(X,X_filtered,y,test_size=0.3)

dtc = DecisionTreeClassifier()
dtc.fit(X_train, y_train)
print(classification_report(y_test, dtc.predict(X_test)))
dtc.n_features_

In [None]:
dtc2 = DecisionTreeClassifier()
dtc2.fit(X_filtered_train, y_train)
print(classification_report(y_test, dtc2.predict(X_filtered_test)))
dtc2.n_features_