# Readme

### 文件格式
 - inputs (请把所有的文件合并到一个文件中)
   - logs
      - xxxx_log.csv
      - ....
   - trace
      - xxxx_trace.csv
      - ....
   - metric
      - xxxx_metric.csv
      - ....
 - labels
   - training_label.csv
   - training_ground_truth.csv
   
### 代码思路
    1. 合并数据文件，多线程处理
    2. 特征工程
    3. OVR + Xgboost + MultiSKF 训练
    4. 数据表逆透视生成提交文件

### 线下得分
|                             |   score   |
|-----------------------------|:---------:|
| LTE4MDk5Mzk2NjU1NjM1ODI0NDc= |  0.745216 |
| LTcxMDU4NjY3NDcyMTgwNTc5MDE= |  0.890485 |
| LTkyMDExNjM1MjY4NDg4ODU5Mjk= |  0.785117 |
| NDExNzk3NjQ4ODg3NTY0OTQ3OA== |  0.796378 |
| NO_FAULT                    |  0.723080 |
| ODI4MTMxNDkzODEzNTg5OTE4Mg== |  0.798341 |
| node-worker1                |  0.718737 |
| node-worker2                |  0.791683 |
| node-worker3                |  0.734310 |
| Weighted AVG.               |  0.775928 |

### 线上得分
sAUC = 0.6443698288049552	


In [1]:
import os
import warnings

warnings.filterwarnings('ignore')

import numpy as np
import pandas as pd
from iterstrat.ml_stratifiers import MultilabelStratifiedKFold
from joblib import Parallel, delayed
from sklearn.metrics import roc_auc_score
from sklearn.multiclass import OneVsRestClassifier
from sklearn.preprocessing import LabelEncoder, StandardScaler
from tqdm.auto import tqdm
from xgboost import XGBClassifier


def sScore(y_true, y_pred):
    score = []
    for i in range(num_classes):
        score.append(roc_auc_score(y_true[:, i], y_pred[:, i]))
        
    return score

def processing_feature(file):
    log, trace, metric, metric_df = pd.DataFrame(), pd.DataFrame(), pd.DataFrame(), pd.DataFrame()
    if os.path.exists(f"./inputs/log/{file}_log.csv"):
        log = pd.read_csv(f"./inputs/log/{file}_log.csv").sort_values(by=['timestamp']).reset_index(drop=True)
    
    if os.path.exists(f"./inputs/trace/{file}_trace.csv"):
        trace = pd.read_csv(f"./inputs/trace/{file}_trace.csv").sort_values(by=['timestamp']).reset_index(drop=True)
        
    if os.path.exists(f"./inputs/metric/{file}_metric.csv"):
        metric = pd.read_csv(f"./inputs/metric/{file}_metric.csv").sort_values(by=['timestamp']).reset_index(drop=True)
    
    feats = {"id" : file}
    if len(trace) > 0:
        feats['trace_length'] = len(trace)
        feats[f"trace_status_code_std"] = trace['status_code'].apply("std")
        
        for stats_func in ['mean', 'std', 'skew', 'nunique']:
            feats[f"trace_timestamp_{stats_func}"] = trace['timestamp'].apply(stats_func)
            
        for stats_func in ['nunique']:
            for i in ['host_ip', 'service_name', 'endpoint_name', 'trace_id', 'span_id', 'parent_id', 'start_time', 'end_time']:
                feats[f"trace_{i}_{stats_func}"] = trace[i].agg(stats_func)
                
    else:
        feats['trace_length'] = -1
                
    if len(log) > 0:
        feats['log_length'] = len(log)
        log['message_length'] = log['message'].fillna("").map(len)
        log['log_info_length'] = log['message'].map(lambda x:x.split("INFO")).map(len)
        
    else:
        feats['log_length'] = -1

    if len(metric) > 0:
        feats['metric_length'] = len(metric)
        feats['metric_value_timestamp_value_mean_std'] = metric.groupby(['timestamp'])['value'].mean().std()
        
    else:
        feats['metric_length'] = -1

    return feats

def gen_label(train):
    col = np.zeros((train.shape[0], 9))
    for i, label in enumerate(train['label'].values):
        col[i][label] = 1
        
    return col

### 特征工程

In [2]:
all_ids = set([i.split("_")[0] for i in os.listdir("./inputs/metric/")]) |\
          set([i.split("_")[0] for i in os.listdir("./inputs/log/")]) |\
          set([i.split("_")[0] for i in os.listdir("./inputs/trace/")])
all_ids = list(all_ids)
print("IDs Length =", len(all_ids))
feature = pd.DataFrame(Parallel(n_jobs=16, backend="multiprocessing")(delayed(processing_feature)(f) for f in tqdm(all_ids)))

IDs Length = 26442


  0%|          | 0/26442 [00:00<?, ?it/s]

In [3]:
label = pd.read_csv("./labels/training_label.csv")
lb_encoder = LabelEncoder()
label['label'] = lb_encoder.fit_transform(label['source'])

In [4]:
all_data = feature.merge(label[['id', 'label']].groupby(['id'], as_index=False)['label'].agg(list), how='left', on=['id']).set_index("id")
not_use = ['id', 'label']
feature_name = [i for i in all_data.columns if i not in not_use]
X = all_data[feature_name].replace([np.inf, -np.inf], 0).clip(-1e9, 1e9)
print(f"Feature Length = {len(feature_name)}")
print(f"Feature = {feature_name}")

Feature Length = 17
Feature = ['trace_length', 'trace_status_code_std', 'trace_timestamp_mean', 'trace_timestamp_std', 'trace_timestamp_skew', 'trace_timestamp_nunique', 'trace_host_ip_nunique', 'trace_service_name_nunique', 'trace_endpoint_name_nunique', 'trace_trace_id_nunique', 'trace_span_id_nunique', 'trace_parent_id_nunique', 'trace_start_time_nunique', 'trace_end_time_nunique', 'log_length', 'metric_length', 'metric_value_timestamp_value_mean_std']


### OVR + MSKFold

In [5]:
num_classes = 9
n_splits = 5
kf = MultilabelStratifiedKFold(n_splits=n_splits, random_state=3407, shuffle=True)
scaler = StandardScaler()
scaler_X = scaler.fit_transform(X.fillna(0).replace([np.inf, -np.inf], 0))

y = gen_label(all_data[all_data['label'].notnull()])
train_scaler_X = scaler_X[all_data['label'].notnull()]
test_scaler_X = scaler_X[all_data['label'].isnull()]

ovr_oof = np.zeros((len(train_scaler_X), num_classes))
ovr_preds = np.zeros((len(test_scaler_X), num_classes))

for train_index, valid_index in kf.split(train_scaler_X, y):
    X_train, X_valid = train_scaler_X[train_index], train_scaler_X[valid_index]
    y_train, y_valid = y[train_index], y[valid_index]
    clf = OneVsRestClassifier(XGBClassifier(random_state=0, n_jobs=8))
    clf.fit(X_train, y_train)
    ovr_oof[valid_index] = clf.predict_proba(X_valid)
    ovr_preds = clf.predict_proba(test_scaler_X) / n_splits
    score = sScore(y_valid, ovr_oof[valid_index])
    print(f"Score = {np.mean(score)}")

Score = 0.7730915826391094
Score = 0.7729447913392381
Score = 0.7705186347617492
Score = 0.7856047168307696
Score = 0.7797373690996593


In [6]:
each_score = sScore(y, ovr_oof)
score_metric = pd.DataFrame(each_score, columns=['score'], index=list(lb_encoder.classes_))
score_metric.loc["Weighted AVG.", "score"] = np.mean(score_metric['score'])
print(score_metric)

                                 score
LTE4MDk5Mzk2NjU1NjM1ODI0NDc=  0.745216
LTcxMDU4NjY3NDcyMTgwNTc5MDE=  0.890485
LTkyMDExNjM1MjY4NDg4ODU5Mjk=  0.785117
NDExNzk3NjQ4ODg3NTY0OTQ3OA==  0.796378
NO_FAULT                      0.723080
ODI4MTMxNDkzODEzNTg5OTE4Mg==  0.798341
node-worker1                  0.718737
node-worker2                  0.791683
node-worker3                  0.734310
Weighted AVG.                 0.775928


### Submit

In [7]:
submit = pd.DataFrame(ovr_preds, columns=lb_encoder.classes_)
submit.index = X[all_data['label'].isnull()].index
submit.reset_index(inplace=True)
submit = submit.melt(id_vars="id", value_vars=lb_encoder.classes_, value_name="score", var_name="source")
submit.to_csv("baseline.csv", index=False)