# 模型算法

## 各种采样算法

In [3]:
from sklearn import metrics
from collections import Counter

from imblearn.over_sampling import RandomOverSampler, SMOTE, ADASYN
from imblearn.under_sampling import RandomUnderSampler, NearMiss,  TomekLinks, EditedNearestNeighbours

def _positive_class_scores(model, X_test, predictions, pos_label=1):
    """Return the scores used to build the ROC curve for ``pos_label``.

    A ROC curve built from hard 0/1 predictions has a single operating point,
    so continuous scores are preferred whenever the fitted model exposes them.

    Args:
        model: fitted classifier.
        X_test: test-set feature vectors.
        predictions: hard labels already predicted for ``X_test``; used as the
            fallback when no continuous score can be obtained.
        pos_label: label treated as the positive class.

    Return:
        Probability of ``pos_label`` (``predict_proba``), a signed margin
        oriented towards ``pos_label`` (binary ``decision_function``), or
        ``predictions`` when neither is available.
    """
    classes = list(getattr(model, 'classes_', []))
    if pos_label not in classes:
        return predictions

    pos_index = classes.index(pos_label)
    if hasattr(model, 'predict_proba'):
        return model.predict_proba(X_test)[:, pos_index]
    if len(classes) == 2 and hasattr(model, 'decision_function'):
        scores = model.decision_function(X_test)
        # Positive margins point at classes_[1]; flip them when the positive
        # label is stored first.
        return scores if pos_index == 1 else -scores
    return predictions


def evalute_model(X_train, X_test, y_train, y_test, model):
    """Fit ``model`` on the training data and score it on the test data.

    The model is fitted in place, so the caller's object is modified.

    Args:
        X_train: training-set feature vectors
        X_test: test-set feature vectors
        y_train: training-set labels
        y_test: test-set labels
        model: estimator providing ``fit`` and ``predict``

    Return:
        result: dict with the following entries
        result['accuracy']: accuracy
        result['precision']: precision, one value per class
        result['recall']: recall, one value per class
        result['fscore']: F-score, one value per class
        result['n_occurences']: number of true samples per class
        result['predictions_count']: Counter of the predicted labels
        result['tp']: true positives
        result['tn']: true negatives
        result['fp']: false positives
        result['fn']: false negatives
        result['auc']: area under the ROC curve for the class labelled 1
    """
    model.fit(X_train, y_train)
    predictions = model.predict(X_test)

    accuracy = metrics.accuracy_score(y_test, predictions)
    # Passing pos_label=1, average='binary' would return the values for class
    # label 1 only; the per-class arrays are kept on purpose.
    precision, recall, fscore, support = metrics.precision_recall_fscore_support(
        y_test, predictions)
    tn, fp, fn, tp = metrics.confusion_matrix(y_test, predictions).ravel()

    # pos_label names the positive class.
    scores = _positive_class_scores(model, X_test, predictions, pos_label=1)
    fpr, tpr, _ = metrics.roc_curve(y_test, scores, pos_label=1)
    auc = metrics.auc(fpr, tpr)

    return {
        'accuracy': accuracy,
        'precision': precision,
        'recall': recall,
        'fscore': fscore,
        'n_occurences': support,
        'predictions_count': Counter(predictions),
        'tp': tp,
        'tn': tn,
        'fp': fp,
        'fn': fn,
        'auc': auc,
    }
    
    
def _evaluate_samplers(samplers, X_train, X_test, y_train, y_test, model):
    """Resample the training data with each sampler and evaluate ``model``.

    Args:
        samplers: iterable of imbalanced-learn sampler instances.
        X_train: training-set feature vectors
        X_test: test-set feature vectors
        y_train: training-set labels
        y_test: test-set labels
        model: model passed on to ``evalute_model``

    Return:
        dict mapping the technique name to its ``evalute_model`` result.
    """
    outcomes = {}
    for sampler in samplers:
        technique = sampler.__class__.__name__
        if technique == 'NearMiss':
            # Both NearMiss variants share a class name; keep them apart.
            technique += str(sampler.version)
        print(f'Technique:{technique}')
        print(f'Before resampling: {sorted(Counter(y_train).items())}')
        # fit_sample was removed from recent imbalanced-learn releases in
        # favour of fit_resample; fall back only for old installations.
        resample = getattr(sampler, 'fit_resample', None) or sampler.fit_sample
        X_resampled, y_resampled = resample(X_train, y_train)
        print(f'After resampling: {sorted(Counter(y_resampled).items())}')

        outcomes[technique] = evalute_model(
            X_resampled, X_test, y_resampled, y_test, model)
    return outcomes


def model_resampling_pipeline(X_train, X_test, y_train, y_test, model):
    """Measure how different resampling strategies affect the model.

    Args:
        X_train: training-set feature vectors
        X_test: test-set feature vectors
        y_train: training-set label vector
        y_test: test-set label vector
        model: model; it is refitted for every configuration

    Return:
        results: dict with the modelling results for the original data
            ('ordinary'), class weighting ('class_weight'), over-sampling
            ('oversample') and under-sampling ('undersample'). The last two
            map each technique name to its result; 'class_weight' stays empty
            when the model has no ``class_weight`` parameter.
    """
    results = {'ordinary': {},
               'class_weight': {},
               'oversample': {},
               'undersample': {}}

    # ------- Original data ----------
    results['ordinary'] = evalute_model(X_train, X_test, y_train, y_test, model)

    # ------- Class weight -------
    params = model.get_params()
    if 'class_weight' in params:
        original_weight = params['class_weight']
        model.set_params(class_weight='balanced')
        try:
            results['class_weight'] = evalute_model(
                X_train, X_test, y_train, y_test, model)
        finally:
            # Restore the caller's setting so that the resampling runs below
            # are not silently combined with class weighting.
            model.set_params(class_weight=original_weight)

    # ------ OverSampling techniques -----
    print('-------- Oversampling methods ---------')
    # ADASYN is left out: per the original author's note it did not finish
    # running on macOS.
    oversamplers = [RandomOverSampler(), SMOTE()]
    results['oversample'] = _evaluate_samplers(
        oversamplers, X_train, X_test, y_train, y_test, model)

    # ------ UnderSampling techniques --------
    print('-------- Undersampling methods ---------')
    # TomekLinks and EditedNearestNeighbours are left out for the same reason
    # (author's note: they did not finish running on macOS).
    undersamplers = [RandomUnderSampler(),
                     NearMiss(version=1),
                     NearMiss(version=2)]
    results['undersample'] = _evaluate_samplers(
        undersamplers, X_train, X_test, y_train, y_test, model)

    return results

def evaluate_method(results, method, metrics=('precision', 'recall', 'fscore')):
    """Visualise the results.

    Plots every requested per-class metric plus the AUC for each sampling
    technique of ``method``. Each metric gets two panels (class index 0 titled
    'Alexa domain', class index 1 titled 'DGA domain'); the last panel shows
    the AUC. The baseline without resampling is drawn as a horizontal line,
    the class-weighted run (if present) as bar 0 and each technique as one
    further bar.

    Args:
        results: result data as returned by ``model_resampling_pipeline``
        method: either "oversample" or "undersample"
        metrics: names of the per-class metrics to plot (precision, recall,
            F-score by default). Inside this function the name shadows any
            module of the same name.

    Return:
        None
    """
    # Two panels per metric plus one for the AUC. squeeze=False keeps the
    # result indexable even when only a single panel is requested.
    n_panels = 2 * len(metrics) + 1
    _, grid = plt.subplots(1, n_panels, sharey=True, figsize=(20, 6),
                           squeeze=False)  # sharey shares the y axis
    ax = grid[0]

    baseline = results['ordinary']
    weighted = results['class_weight']
    techniques = list(results[method].items())

    for i, metric in enumerate(metrics):
        for class_index, domain in enumerate(('Alexa domain', 'DGA domain')):
            panel = ax[i * 2 + class_index]
            # Baseline obtained from the original, unresampled data.
            panel.axhline(baseline[metric][class_index], label='No Resampling')
            if weighted:
                panel.bar(0, weighted[metric][class_index],
                          label='Adjust Class Weight')
            for j, (technique, result) in enumerate(techniques):
                panel.bar(j + 1, result[metric][class_index], label=technique)
            panel.set_title(f'{domain}:\n{metric}')

    # AUC
    auc_panel = ax[-1]
    auc_panel.set_title('Area under curve')
    auc_panel.axhline(baseline['auc'], label='No Resampling')
    if weighted:
        auc_panel.bar(0, weighted['auc'], label='Adjust Class Weight')
    for j, (technique, result) in enumerate(techniques):
        auc_panel.bar(j + 1, result['auc'], label=technique)

    # Draw the legend once, after every artist on the first panel exists.
    ax[0].legend()

In [None]:
# Order in which the functions above are called
from sklearn.tree import DecisionTreeClassifier

# Decision tree with default hyperparameters.
model = DecisionTreeClassifier()
# NOTE(review): X_train, X_test, y_train and y_test are not defined in this
# cell; presumably they are created elsewhere in the notebook - confirm.
results = model_resampling_pipeline(X_train, X_test, y_train, y_test, model)
# One figure for the over-sampling techniques, one for the under-sampling ones.
evaluate_method(results, 'oversample')
evaluate_method(results, 'undersample')

## 定义各种模型

In [None]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.neural_network import MLPClassifier

# One instance per classifier type, each created with its default
# hyperparameters.
model_tree = DecisionTreeClassifier()
model_rfc = RandomForestClassifier()
model_svc = SVC()
model_lg = LogisticRegression()
model_knn = KNeighborsClassifier()
model_ada = AdaBoostClassifier()
model_mlp = MLPClassifier()