In [1]:
import gc
import glob
import itertools
import os

import lightgbm as lgb
import matplotlib
import matplotlib.pyplot as plt
import numpy as np  
import pandas as pd  
import seaborn as sns
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import make_scorer, mean_squared_error
from sklearn.model_selection import StratifiedKFold, cross_val_score, train_test_split

from hyperopt import hp, tpe
from hyperopt.fmin import fmin

%matplotlib inline 
# Turn on matplotlib's xkcd sketch-style drawing mode. The call is the last
# expression of the cell, so the context-manager object it returns is echoed
# as the cell output.
plt.xkcd()

<contextlib._GeneratorContextManager at 0x7f95bc093f98>

## 读取数据

In [2]:
# Sort-key helper for list.sort(): orders feature files by their numeric suffix.
def takeNumber(str):
    """Return the trailing run number encoded in a feature file name.

    The number is read from the end of the file's base name once the
    extension has been removed, e.g. ``.../time_train01.csv`` -> ``1``.
    Only the final path component is inspected, so dots or digits in
    directory names cannot affect the result, and numeric suffixes of any
    length (``time_7.csv``, ``time_123.csv``) are handled.

    Args:
        str: Path or file name. The parameter name shadows the builtin
            ``str``; it is kept so existing callers keep working.

    Returns:
        int: The trailing digits of the base name.

    Raises:
        ValueError: If the base name does not end in a digit.
    """
    stem = os.path.splitext(os.path.basename(str))[0]
    # Whatever rstrip() removes is exactly the trailing run of digits.
    digits = stem[len(stem.rstrip('0123456789')):]
    if not digits:
        raise ValueError('no trailing number in file name: {!r}'.format(str))
    return int(digits)

In [3]:
def load_data(path, name, columns_name=None):
    """Read every CSV in ``path`` matching ``name`` and stack them row-wise.

    Matching files are ordered by the number ``takeNumber`` extracts from
    each path, and the ordered list is printed before reading.

    Args:
        path: Directory to search.
        name: Glob pattern for the file names, e.g. ``'time_*.csv'``.
        columns_name: Optional column names forwarded to ``pd.read_csv`` as
            ``names``. When ``None``, pandas infers the header from the
            first row of each file.

    Returns:
        pandas.DataFrame: Rows of all matching files in sorted file order.
        Each file's own row index is kept (it is not reset), so index
        labels repeat across files.

    Raises:
        ValueError: If no file matches the pattern.
    """
    pattern = os.path.join(path, name)
    all_file_path = sorted(glob.glob(pattern), key=takeNumber)
    print('file read list : ', all_file_path)
    if not all_file_path:
        # pd.concat would fail with an opaque "No objects to concatenate";
        # keep the same exception type but report the pattern that matched nothing.
        raise ValueError('no files match pattern {!r}'.format(pattern))
    return pd.concat(
        [pd.read_csv(f, names=columns_name) for f in all_file_path])

In [4]:
# Directories holding the pre-computed feature CSV files.
# NOTE(review): these are absolute, machine-specific paths, and the PLC
# features sit under a different root than the sensor features — confirm both
# locations exist wherever this notebook is re-run.
sensor_feature_train_path = '/home/dell/cutting_tool/train_feature'
plc_feature_train_path = '/usr/yushuyang/cutting_tool/feature/train/plc'
sensor_feature_test_path = '/home/dell/cutting_tool/test_feature'
# No PLC test-feature directory is configured; the disabled line below still
# points at a train directory.
# plc_feature_test_path = 'feature/train/plc'

In [5]:
# Column names for the time-domain feature files: nine statistic prefixes for
# each of four signal suffixes, grouped suffix by suffix (spindle, x, y, z).
_time_stat_prefixes = ['ave', 'std', 'max', 'min', 'median', 'rmse', 'pp', 'hdiff', 'fdiff']
_time_signal_suffixes = ['spindle', 'x', 'y', 'z']
time_feature_columns_name = ['{}_{}'.format(stat, signal)
                             for signal in _time_signal_suffixes
                             for stat in _time_stat_prefixes]

In [6]:
# Load the time-domain training features from the sensor and PLC directories.
# NOTE(review): the PLC call passes no column names, so load_data forwards
# names=None to pd.read_csv and the first row of each PLC file is taken as the
# header — confirm those files really start with a header row.
sensor_time_train_feature = load_data(sensor_feature_train_path, 'time_*.csv', time_feature_columns_name)
plc_time_train_feature = load_data(plc_feature_train_path, 'time_*.csv')

file read list :  ['/home/dell/cutting_tool/train_feature/time_train01.csv', '/home/dell/cutting_tool/train_feature/time_train02.csv', '/home/dell/cutting_tool/train_feature/time_train03.csv']
file read list :  ['/usr/yushuyang/cutting_tool/feature/train/plc/time_01.csv', '/usr/yushuyang/cutting_tool/feature/train/plc/time_02.csv', '/usr/yushuyang/cutting_tool/feature/train/plc/time_03.csv']


In [7]:
# Load the time-domain sensor features of the test set with the same column names
# as the training set. Only sensor features are loaded for the test set here.
sensor_time_test_feature = load_data(sensor_feature_test_path, 'time_*.csv', time_feature_columns_name)

file read list :  ['/home/dell/cutting_tool/test_feature/time_test01.csv', '/home/dell/cutting_tool/test_feature/time_test02.csv', '/home/dell/cutting_tool/test_feature/time_test03.csv', '/home/dell/cutting_tool/test_feature/time_test04.csv', '/home/dell/cutting_tool/test_feature/time_test05.csv']


In [8]:
# Sanity check: report (rows, columns) of each loaded time-domain table.
# The two training tables must have the same number of rows to be joined column-wise.
print('shape of train sensor time feature : ', sensor_time_train_feature.shape)
print('shape of train plc time feature : ',plc_time_train_feature.shape)

print('shape of test sensor time feature : ', sensor_time_test_feature.shape)

shape of train sensor time feature :  (133, 36)
shape of train plc time feature :  (133, 36)
shape of test sensor time feature :  (50, 36)


In [9]:
# Put the sensor and PLC time-domain features side by side (column-wise).
# NOTE(review): pd.concat(axis=1) aligns rows on index labels, so this relies on
# both frames carrying matching index labels — confirm the per-file row counts agree.
train_feature = pd.concat((sensor_time_train_feature, plc_time_train_feature), axis=1)

In [10]:
# The two inputs are now merged into train_feature: drop the names and run a
# garbage-collection pass. gc.collect() is the last expression, so its return
# value is what the cell displays.
del sensor_time_train_feature, plc_time_train_feature
gc.collect()

67

## 添加特征

### 频域

In [11]:
# Frequency-domain columns reuse the time-domain names behind a 'freq_' prefix.
freq_feature_columns_name = ['freq_{}'.format(column) for column in time_feature_columns_name]

In [12]:
# Load the frequency-domain sensor features of the training set.
sensor_freq_train_feature = load_data(sensor_feature_train_path, 'freq_train*.csv', freq_feature_columns_name)

file read list :  ['/home/dell/cutting_tool/train_feature/freq_train01.csv', '/home/dell/cutting_tool/train_feature/freq_train02.csv', '/home/dell/cutting_tool/train_feature/freq_train03.csv']


In [13]:
# Sanity check: the row count must match train_feature for the column-wise join that follows.
print('shape of sensor freq feature : ', sensor_freq_train_feature.shape)

shape of sensor freq feature :  (133, 36)


In [14]:
# Append the frequency-domain features column-wise to the growing feature table.
# Note this rebinds train_feature, so re-running the cell would append the columns again.
train_feature = pd.concat((train_feature, sensor_freq_train_feature), axis=1)

In [15]:
# The frequency features now live in train_feature: drop the name and collect garbage.
del sensor_freq_train_feature
gc.collect()

21

### 小波

In [16]:
# Column names for the wavelet feature files: four groups (vibration_1,
# vibration_2, vibration_3 and a last group that appears to be the current
# signal), each with four sub-band columns (cA3, cD3, cD2, cD1), four matching
# "*_enengy" columns and one total-energy column — 36 names in all.
# NOTE(review): "enengy" is spelled inconsistently with "total_energy", and the
# first five names of the last group ('wavelet_cA3' ... 'wavelet_cA3_enengy')
# lack the 'current_' infix used by the rest of that group. They are left
# untouched because they are runtime column labels; confirm nothing depends on
# them before renaming.
wavelet_feature_columns_name = ['wavelet_vibration_1_cA3', 'wavelet_vibration_1_cD3', 'wavelet_vibration_1_cD2', 'wavelet_vibration_1_cD1',
                                'wavelet_vibration_1_cA3_enengy', 'wavelet_vibration_1_cD3_enengy', 'wavelet_vibration_1_cD2_enengy', 'wavelet_vibration_1_cD1_enengy',
                                'wavelet_vibration_1_total_energy', 'wavelet_vibration_2_cA3', 'wavelet_vibration_2_cD3', 'wavelet_vibration_2_cD2', 'wavelet_vibration_2_cD1',
                                'wavelet_vibration_2_cA3_enengy', 'wavelet_vibration_2_cD3_enengy', 'wavelet_vibration_2_cD2_enengy', 'wavelet_vibration_2_cD1_enengy',
                                'wavelet_vibration_2_total_energy', 'wavelet_vibration_3_cA3', 'wavelet_vibration_3_cD3', 'wavelet_vibration_3_cD2', 'wavelet_vibration_3_cD1',
                                'wavelet_vibration_3_cA3_enengy', 'wavelet_vibration_3_cD3_enengy', 'wavelet_vibration_3_cD2_enengy', 'wavelet_vibration_3_cD1_enengy',
                                'wavelet_vibration_3_total_energy', 'wavelet_cA3', 'wavelet_cD3', 'wavelet_cD2', 'wavelet_cD1',
                                'wavelet_cA3_enengy', 'wavelet_current_cD3_enengy', 'wavelet_current_cD2_enengy', 'wavelet_current_cD1_enengy',
                                'wavelet_current_total_energy']

In [17]:
# Load the wavelet features of the training set.
wavelet_train_feature = load_data(sensor_feature_train_path, 'wavelet_train*.csv', wavelet_feature_columns_name)

file read list :  ['/home/dell/cutting_tool/train_feature/wavelet_train01.csv', '/home/dell/cutting_tool/train_feature/wavelet_train02.csv', '/home/dell/cutting_tool/train_feature/wavelet_train03.csv']


In [18]:
# Sanity check: the row count must match train_feature for the column-wise join that follows.
print('shape of sensor wavelet feature : ', wavelet_train_feature.shape)

shape of sensor wavelet feature :  (133, 36)


In [19]:
# Append the wavelet features column-wise to the feature table.
# Note this rebinds train_feature, so re-running the cell would append the columns again.
train_feature = pd.concat((train_feature, wavelet_train_feature), axis=1)

In [20]:
# The wavelet features now live in train_feature: drop the name and collect garbage.
del wavelet_train_feature
gc.collect()

21

## 添加标签

In [21]:
# Row counts used to build the labels, one entry per data file in file order
# (presumably the number of run cycles recorded in each file — TODO confirm).
# They sum to the feature-table row counts printed earlier (133 train, 50 test).
train_run_cycle = [48, 48, 37]
test_run_cycle = [10, 10, 10, 10, 10]

In [22]:
def _countdown_labels(run_cycles, step=5):
    """For each count n, emit step*(n-1), ..., step, 0 and join the runs in order."""
    labels = []
    for n_cycles in run_cycles:
        labels.extend(reversed(range(0, n_cycles * step, step)))
    return labels


# One label per feature row: within each run the label counts down to 0 in steps of 5.
train_label = _countdown_labels(train_run_cycle)
test_label = _countdown_labels(test_run_cycle)

In [23]:
# Attach the labels as a new `label` column; the label list must have exactly
# one entry per feature row. Later cells take "all columns but the last" as features.
train_data = train_feature.assign(label=train_label)

## 验证集划分

In [24]:
# Random split: 20% of the rows are held out for validation (fixed seed for repeatability).
# NOTE(review): rows of the same run can land in both splits — confirm that a
# random row-level split is intended for this run-ordered data.
train, validation = train_test_split(train_data, test_size=0.20, random_state=0)

In [25]:
# Sanity check of the split sizes as (rows, columns) for each part.
train.shape, validation.shape

((106, 145), (27, 145))

# model

In [26]:
# Learning-rate schedule
def learning_rate_010_decay_power_099(current_iter):
    """Exponentially decayed learning rate with a lower bound.

    The rate starts at 0.1, is scaled by 0.99 for every iteration, and is
    never allowed to fall below 1e-3.
    """
    floor = 1e-3
    decayed = 0.1 * np.power(.99, current_iter)
    if decayed > floor:
        return decayed
    return floor

In [230]:
def base_model():
    """Fit a baseline LightGBM regressor and return it.

    Reads the notebook-level ``train`` and ``validation`` frames: every column
    except the last is used as a feature and the ``label`` column as target.
    Both splits are passed as ``eval_set`` so the metric is reported per
    iteration for the training data (``valid_0``) and the validation data
    (``valid_1``).

    Returns:
        The fitted ``lgb.LGBMRegressor``, so the trained model can be used
        for prediction or inspection after the call.
    """
    params = {
        'num_leaves': 31,
        'n_estimators': 100
    }
    # NOTE(review): `silent` is kept for the LightGBM version this notebook was
    # run with; newer releases are believed to have dropped it — confirm before upgrading.
    clf = lgb.LGBMRegressor(
        learning_rate=0.05,
        n_jobs=-1,
        silent=False,
        **params
    )
    # Assumes `label` is the last column of both frames.
    train_x, train_y = train.iloc[:, :-1], train['label']
    valid_x, valid_y = validation.iloc[:, :-1], validation['label']
    clf.fit(
        train_x,
        train_y,
        eval_set=[(train_x, train_y), (valid_x, valid_y)])
    return clf

In [231]:
# Train the baseline; LightGBM logs l2 per boosting iteration for the two eval
# sets (valid_0 = training rows, valid_1 = validation rows).
base_model()

[1]	valid_0's l2: 4047.61	valid_1's l2: 4409.12
[2]	valid_0's l2: 3776.6	valid_1's l2: 4209.64
[3]	valid_0's l2: 3531.89	valid_1's l2: 4014.58
[4]	valid_0's l2: 3310.01	valid_1's l2: 3858.31
[5]	valid_0's l2: 3109.69	valid_1's l2: 3705.31
[6]	valid_0's l2: 2896.02	valid_1's l2: 3538.97
[7]	valid_0's l2: 2702.1	valid_1's l2: 3389.43
[8]	valid_0's l2: 2547.12	valid_1's l2: 3240.53
[9]	valid_0's l2: 2382.67	valid_1's l2: 3115.01
[10]	valid_0's l2: 2208.67	valid_1's l2: 2973.1
[11]	valid_0's l2: 2059.25	valid_1's l2: 2832.4
[12]	valid_0's l2: 1934.42	valid_1's l2: 2738.9
[13]	valid_0's l2: 1808.94	valid_1's l2: 2620.53
[14]	valid_0's l2: 1706.34	valid_1's l2: 2503.72
[15]	valid_0's l2: 1601.76	valid_1's l2: 2406.63
[16]	valid_0's l2: 1503.37	valid_1's l2: 2318.18
[17]	valid_0's l2: 1440.45	valid_1's l2: 2259.29
[18]	valid_0's l2: 1360.47	valid_1's l2: 2185.74
[19]	valid_0's l2: 1292.85	valid_1's l2: 2120.98
[20]	valid_0's l2: 1221.68	valid_1's l2: 2058.22
[21]	valid_0's l2: 1143.6	valid_1'

## Hyperopt 调参

In [232]:
def params_search(params):
    """Hyperopt objective: negated mean 5-fold cross-validated R^2.

    ``params`` must provide ``num_leaves`` and ``n_estimators``; both are cast
    to int before being handed to LightGBM. The score is computed on the
    notebook-level ``train_data`` (all columns but the last as features,
    ``label`` as target), printed together with the parameters, and returned.
    Lower is better, because the R^2 mean is negated.
    """
    model_kwargs = {
        'num_leaves': int(params['num_leaves']),
        'n_estimators': int(params['n_estimators'])
    }
    regressor = lgb.LGBMRegressor(learning_rate=0.05, n_jobs=-1, **model_kwargs)

    features = train_data.iloc[:, :-1]
    target = train_data['label']
    fold_r2 = cross_val_score(regressor, features, target, scoring='r2', cv=5)

    score = -fold_r2.mean()
    print("-R2 {:.3f} params {}".format(score, model_kwargs))
    return score

In [233]:
# Hyperopt search space. hp.quniform yields floats quantised to the given step:
# num_leaves in [31, 128] with step 1, n_estimators in [25, 500] with step 25.
# params_search casts both to int before use.
space = {
    'num_leaves': hp.quniform('num_leaves', 31, 128, 1),
    'n_estimators': hp.quniform('n_estimators', 25, 500, 25)
}

In [234]:
# Run 100 TPE evaluations minimising params_search (i.e. maximising mean CV R^2).
# NOTE(review): no random state is passed to fmin, so the search is presumably
# not reproducible between runs — confirm and seed it if repeatability matters.
best = fmin(fn=params_search,
            space=space,
            algo=tpe.suggest,
            max_evals=100)

-R2 0.404 params {'num_leaves': 45, 'n_estimators': 475}
-R2 0.573 params {'num_leaves': 80, 'n_estimators': 50}
-R2 0.402 params {'num_leaves': 77, 'n_estimators': 425}
-R2 0.390 params {'num_leaves': 83, 'n_estimators': 225}
-R2 0.573 params {'num_leaves': 39, 'n_estimators': 50}
-R2 0.387 params {'num_leaves': 117, 'n_estimators': 250}
-R2 0.404 params {'num_leaves': 101, 'n_estimators': 500}
-R2 0.409 params {'num_leaves': 45, 'n_estimators': 125}
-R2 0.390 params {'num_leaves': 62, 'n_estimators': 300}
-R2 0.389 params {'num_leaves': 65, 'n_estimators': 200}
-R2 0.704 params {'num_leaves': 73, 'n_estimators': 25}
-R2 0.390 params {'num_leaves': 61, 'n_estimators': 225}
-R2 0.399 params {'num_leaves': 88, 'n_estimators': 375}
-R2 0.397 params {'num_leaves': 77, 'n_estimators': 350}
-R2 0.389 params {'num_leaves': 72, 'n_estimators': 275}
-R2 0.389 params {'num_leaves': 121, 'n_estimators': 275}
-R2 0.389 params {'num_leaves': 96, 'n_estimators': 200}
-R2 0.386 params {'num_leaves':

In [235]:
# Show the best point found; hyperopt reports the values as floats.
print("Hyperopt estimated optimum {}".format(best))

Hyperopt estimated optimum {'n_estimators': 175.0, 'num_leaves': 115.0}


In [237]:
# Re-evaluate the objective at the reported optimum. params_search prints its
# own line and print() then echoes the returned score, hence two output lines.
print(params_search(best))

-R2 0.386 params {'num_leaves': 115, 'n_estimators': 175}
0.3856380483282835
