In [1]:
import numpy as np
from glob import glob
import pickle
import pandas as pd
import random
from tqdm import tqdm
from collections import OrderedDict
from sklearn import metrics
from sklearn.metrics import auc

In [2]:
data_path='./data/Train'#存放数据的路径
pkl_files = glob(data_path+'/*.pkl')
ind_pkl_files = []#存放标签为0的文件
ood_pkl_files = []#存放标签为1的文件
for each_path in tqdm(pkl_files):
    pic = open(each_path,'rb')
    this_pkl_file= pickle.load(pic)#下载pkl文件
    if this_pkl_file[1]['label'] == '00':
        ind_pkl_files.append(each_path)
    else:
        ood_pkl_files.append(each_path)

all_pkl_files=ind_pkl_files+ood_pkl_files

random.seed(0)
#排序并打乱存放车辆序号的集合
random.shuffle(all_pkl_files)

100%|██████████| 28389/28389 [00:01<00:00, 16193.22it/s]


In [15]:
def  load_data(pkl_list,label=True):
    '''
    输入pkl的列表，进行文件加载
    label=True用来加载训练集
    label=False用来加载真正的测试集，真正的测试集无标签
    '''
    X = []
    y = []
    

    for  each_pkl in pkl_list:
        pic = open(each_pkl,'rb')
        item= pickle.load(pic)#下载pkl文件
        # 此处选取的是每个滑窗的最后一条数据，仅供参考，可以选择其他的方法，比如均值或者其他处理时序数据的网络
        # 此处选取了前7个特征，可以需求选取特征数量
        X.append(item[0][:,0:7][-1])
        if label:
            y.append(int(item[1]['label'][0]))
    X = np.vstack(X)
    if label:
        y = np.vstack(y)
    return X, y

In [88]:
train_pkl_files=[]

for i in range(int(len(all_pkl_files)*0.9)):
    train_pkl_files.append(all_pkl_files[i])
test_pkl_files=[]
for j in range(int(len(all_pkl_files)*0.9),len(all_pkl_files)):
    test_pkl_files.append(all_pkl_files[j])

X_train,y_train=load_data(train_pkl_files)
X_test,y_test=load_data(test_pkl_files)

In [89]:
# 归一化
_mean = np.mean(X_train, axis=0)
_std = np.std(X_train, axis=0)
X_train = (X_train - _mean) / (_std + 1e-4)
X_test = (X_test - _mean) / (_std + 1e-4)
y_train=y_train.ravel()
y_test=y_test.ravel()

In [6]:
def evaluate(label,score):
    fpr, tpr, thresholds = metrics.roc_curve(label, score, pos_label=1)
    AUC = auc(fpr, tpr)
    return AUC

In [7]:
from sklearn.ensemble import GradientBoostingClassifier as GBC
from sklearn.model_selection import cross_validate,KFold

In [29]:
import hyperopt
from hyperopt import fmin,tpe,hp,Trials
from hyperopt.early_stop import no_progress_loss

In [90]:
#benchmark 
clf=GBC(random_state=4869)
clf.fit(X,y)
y=y.ravel()
cv=KFold(n_splits=5,shuffle=True,random_state=4869)
cross_validate(clf,X_train,y_train,cv=cv)['test_score'].mean()

0.8962035225048923

In [91]:
score=clf.predict_proba(X_test)[:,1]
evaluate(y_test,score)

0.9313168173974065

In [92]:
def hyperopt_objective(params):
    clf=GBC(n_estimators=int(params['n_estimators'])
            ,learning_rate=params['learning_rate']
            ,loss = params['loss']
            ,max_depth=int( params['max_depth'])
            ,subsample=params['subsample']
            ,max_features=params['max_features']
            ,min_impurity_decrease=params['min_impurity_decrease']
            ,random_state=4869,
            verbose=False
            )
    cv=KFold(n_splits=5,shuffle=True,random_state=4869)
    err=cross_validate(clf,X_train,y_train,cv=cv)['test_score'].mean()
    return err

In [85]:
def param_hyperopt(max_eval=100):
    trials=Trials()
    early = no_progress_loss(100)
    params_best = fmin(hyperopt_objective,
                  space=para_grids,
                  algo = tpe.suggest,
                  verbose=True,
                  max_evals=max_eval,
                  trials=trials,
                  early_stop_fn = early
                  )
    print("-------"+'\n'+"best parameters:",params_best)
    return params_best,trials

In [95]:
para_grids = {
    'n_estimators':hp.quniform('n_estimators',25,200,25)
            ,'learning_rate':hp.quniform('learning_rate',0.05,2.05,0.05)
            ,'loss': hp.choice('loss',["deviance","exponential"])
            ,'max_depth' : hp.quniform('max_depth',2,10,2)
            ,'subsample':hp.quniform('subsample',0.1,0.8,0.1)
            ,'max_features':hp.choice('max_features',[3,4,5,6,7])
            ,'min_impurity_decrease':hp.quniform('min_impurity_decrease',0,5,1)
    
    }
"""
{'learning_rate': 1.35,
  'max_depth': 8.0,
  'max_features': 0,
  'n_estimators': 200.0,
  'subsample': 0.8},
"""
# round 2  96

para_grids = {
    'n_estimators':hp.quniform('n_estimators',25,200,25)
            ,'learning_rate':hp.quniform('learning_rate',0.5,0.8,0.05)
            ,'loss': "deviance"
            ,'max_depth' : 6
            ,'subsample':hp.quniform('subsample',0.6,1.0,0.05)
            ,'max_features':hp.choice('max_features',[3,4,5])
            ,'min_impurity_decrease':0
    }
"""
({'learning_rate': 0.6000000000000001,
  'loss': 1,
  'max_depth': 6.0,
  'max_features': 1,
  'n_estimators': 75.0,
  'subsample': 0.6000000000000001},
 <hyperopt.base.Trials at 0x7fe6b983d100>)
 """
#round 3
"""
({'learning_rate': 0.65,
  'max_depth': 8.0,
  'max_features': 4,
  'n_estimators': 150.0,
  'subsample': 0.9},
 <hyperopt.base.Trials at 0x7fe6b966c190>)
"""

"\n({'learning_rate': 0.65,\n  'max_depth': 8.0,\n  'max_features': 4,\n  'n_estimators': 150.0,\n  'subsample': 0.9},\n <hyperopt.base.Trials at 0x7fe6b966c190>)\n"

In [96]:
param_hyperopt(30)

100%|██████████| 30/30 [09:15<00:00, 18.50s/trial, best loss: 0.9027788649706459]
-------
best parameters: {'learning_rate': 0.75, 'max_features': 1, 'n_estimators': 200.0, 'subsample': 0.6000000000000001}


({'learning_rate': 0.75,
  'max_features': 1,
  'n_estimators': 200.0,
  'subsample': 0.6000000000000001},
 <hyperopt.base.Trials at 0x7fe6ab86b790>)

In [99]:
clf=GBC(n_estimators=150
            ,learning_rate=0.75
            ,max_depth=6
            ,subsample=0.6
            ,max_features=4
            ,min_impurity_decrease=0
            ,random_state=4869,
            verbose=False
            )
clf.fit(X_train,y_train)

GradientBoostingClassifier(learning_rate=0.75, max_depth=6, max_features=4,
                           min_impurity_decrease=0, n_estimators=150,
                           random_state=4869, subsample=0.6, verbose=False)

In [101]:
score=clf.predict_proba(X_test)[:,1]
evaluate(y_test,score)

0.93649975295912

In [102]:
data_path3='./data/Test_A'
test1_files = glob(data_path3+'/*.pkl')
X_val,_=load_data(test1_files,label=False)
_mean = np.mean(X_val, axis=0)
_std = np.std(X_val, axis=0)
X_val = (X_val - _mean) / (_std + 1e-4)
y_val_pred = clf.predict(X_val) # 返回未知数据上的分类标签 (0: 正常值, 1: 异常值) # outlier labels (0 or 1)
y_val_scores = clf.predict_proba(X_val)[:,1]

predict_result={}
for i in tqdm(range(len(test1_files))):
    file=test1_files[i]
    name=file.split('/')[-1]
    predict_result[name]=y_val_scores[i]
predict_score=pd.DataFrame(list(predict_result.items()),columns=['file_name','score'])#列名必须为这俩个
predict_score.to_csv('submision.csv',index = False) #保存为比赛要求的csv文件

100%|██████████| 6234/6234 [00:00<00:00, 746361.75it/s]
