In [13]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from scipy.special import xlogy

# 自带特征重要性
from autogluon.tabular import TabularDataset, TabularPredictor

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import BaggingClassifier, AdaBoostClassifier
from sklearn.metrics import accuracy_score, balanced_accuracy_score, f1_score, recall_score, precision_score, confusion_matrix, ConfusionMatrixDisplay

from imblearn.over_sampling import RandomOverSampler, SMOTE, ADASYN, SMOTEN, BorderlineSMOTE
from imblearn.under_sampling import ClusterCentroids, RandomUnderSampler, NearMiss, EditedNearestNeighbours, AllKNN, CondensedNearestNeighbour, OneSidedSelection, TomekLinks, RepeatedEditedNearestNeighbours, NeighbourhoodCleaningRule, InstanceHardnessThreshold
from imblearn.combine import SMOTEENN, SMOTETomek
from imblearn.ensemble import BalancedBaggingClassifier, BalancedRandomForestClassifier, EasyEnsembleClassifier, RUSBoostClassifier

# 数据预处理

In [14]:
def data_preprocess(path):
    """Load the survey CSV, one-hot encode its categorical columns and split it.

    The file at ``path`` is read with GBK encoding and every column parsed as
    float. Each column listed in ``categorical_cols`` is replaced by its dummy
    (indicator) columns, the ``ID`` column is dropped, and ``tag`` is used as
    the target.

    Returns:
        ``(X_train, X_test, y_train, y_test)`` from a stratified 80/20 split
        with ``random_state=42``; both target Series are cast to ``int``.
    """
    frame = pd.read_csv(path, encoding='gbk', dtype=float)

    # Columns that are expanded into dummy variables (order defines the
    # order of the generated dummy blocks).
    categorical_cols = [
        'a1b1_a', 'a1e1', 'a1_1', 'a2', 'a3', 'a4a', 'a5', 'b1', 'b8b', 'c1_0', 'c3a',
        'd1_1', 'd1_2', 'd1_3', 'd1_4', 'd1_5', 'd1_6', 'd1_7', 'd1_8', 'd1_9', 'd1_10', 'd1_11',
        'd4b2_1', 'd4b2_2', 'd4b2_3', 'd4b2_4', 'd4b2_5', 'd4b2_6', 'd4b2_7', 'd4b2_8', 'd4b2_9', 'd4b2_10', 'd4b2_11',
        'd4b3_1', 'd4b3_2', 'd4b3_3', 'd4b3_4', 'd4b3_5', 'd4b3_6',
        'e1a_1', 'e1a_2', 'e1a_3', 'e1a_4', 'e1a_5', 'e1a_6',
        'f4b2_1', 'f4b2_2', 'f4b2_3', 'f4b3_1', 'f4b3_2', 'f4b3_3', 'g4a', 'g4b',
        'g5_1', 'g5_2', 'g5_3', 'g5_4', 'g5_5', 'g5_6', 'g5_7', 'g5_8', 'g5_9', 'g5_10',
        'g5_12', 'g5_13', 'g5_14', 'g5_11', 'g5_15',
        'g7_1', 'g7_2', 'g7_3', 'g7_4', 'g7_5', 'g7_6', 'g7_7', 'g7_8', 'g7_9', 'g7_10', 'g7_11',
        'g7_12', 'g7_13', 'g7_14', 'g7_15', 'g7_16', 'g7_17', 'g7_18', 'g7_19', 'g7_20', 'g7_21',
        'h1a_1', 'h1a_2', 'h1a_3', 'h1a_4', 'h1a_5', 'h1a_6', 'h1c_a',
        'i1_1', 'i1_2', 'i1_3', 'i1_4', 'i1_5', 'i1_6', 'i1_7', 'i1_8', 'i1_9', 'i1_10',
        'i1_12', 'i1_13', 'i1_11',
        'i6', 'i7', 'i8_1', 'i8_2', 'i8_3', 'i8_4', 'i8_5', 'i8_6', 'i8_7', 'i8_8',
        'i9_1', 'i9_2', 'i9_3', 'i9_4', 'i9_5', 'i9_6', 'i9_7', 'i9_8', 'i9_9', 'i9_10',
        'i9_11', 'i9_12', 'i9_13', 'i9_14',
        'cx2', 'dzx',
    ]

    # The original frame comes first, followed by one dummy block per column.
    pieces = [frame] + [
        pd.get_dummies(frame[col], prefix=col) for col in categorical_cols
    ]
    encoded = pd.concat(pieces, axis=1)

    # The raw categorical columns are replaced by their dummies; ID is not a feature.
    encoded = encoded.drop(columns=categorical_cols).drop(columns=['ID'])

    target = encoded['tag']
    features = encoded.drop(columns=['tag'])
    X_train, X_test, y_train, y_test = train_test_split(
        features, target, test_size=0.2, random_state=42, stratify=target
    )

    return X_train, X_test, y_train.astype('int'), y_test.astype('int')

# 数据采样

In [15]:
# Resampling strategies to compare. Each key is the sampler name that is
# printed and written to the results file; the value is the sampler instance,
# or None to train on the data as-is. Commented-out entries are alternatives
# that are currently disabled.
sampling_methods = {
    'None': None,
    # -- over-sampling --
    'RandomOverSampler': RandomOverSampler(random_state=42),
    # Seeded like the other samplers so repeated runs generate the same
    # synthetic samples.
    'SMOTE': SMOTE(random_state=42),
    # 'ADASYN': ADASYN(),
    # 'BorderlineSMOTE': BorderlineSMOTE(),
    # 'SMOTEN': SMOTEN(),
    # -- under-sampling --
    'ClusterCentroids': ClusterCentroids(random_state=42),
    # 'TomekLinks': TomekLinks(sampling_strategy='auto', n_jobs=None),
    'RandomUnderSampler': RandomUnderSampler(random_state=42),
    # 'NearMiss-1': NearMiss(version=1),
    # 'NearMiss-2': NearMiss(version=2),
    # 'NearMiss-3': NearMiss(version=3),
    # 'EditedNearestNeighbours': EditedNearestNeighbours(),
    # 'AllKNN': AllKNN(),
    # 'CondensedNearestNeighbour': CondensedNearestNeighbour(random_state=42),
    # 'OneSidedSelection': OneSidedSelection(random_state=42),
    # 'RepeatedEditedNearestNeighbours': RepeatedEditedNearestNeighbours(),
    # 'NeighbourhoodCleaningRule': NeighbourhoodCleaningRule(),
    # 'InstanceHardnessThreshold': InstanceHardnessThreshold(random_state=42, estimator=LogisticRegression(solver='lbfgs', multi_class='auto')),
    # -- combined over- and under-sampling --
    # 'SMOTEENN': SMOTEENN(random_state=42),
    'SMOTETomek': SMOTETomek(random_state=42)
}


# 定义模型

In [16]:
# AdaCost (cost-sensitive AdaBoost)
class AdaCostClassifier(AdaBoostClassifier):
    """AdaBoost classifier whose sample-weight update is scaled by a cost factor.

    ``_boost_real`` re-implements one boosting round and multiplies the
    exponent of the sample-weight update by the per-sample factor returned by
    ``_beta``. Labels are expected to be encoded as -1 / +1.

    NOTE(review): this relies on scikit-learn's AdaBoostClassifier calling a
    ``_boost_real`` hook (the SAMME.R code path) and providing
    ``_make_estimator`` -- confirm against the installed scikit-learn version.
    """

    # Cost factors applied to the exponent of the weight update. They are all 1
    # at the moment, so the update is the same as without cost adjustment;
    # raise a misclassification factor to penalise that kind of error more.
    _COST_CORRECT = 1
    _COST_POS_AS_NEG = 1  # true label +1 predicted as -1
    _COST_NEG_AS_POS = 1  # true label -1 predicted as +1

    def _boost_real(self, iboost, X, y, sample_weight, random_state):
        '''
        Run one boosting round; the sample-weight update formula lives here.

        Fits a new base estimator on the weighted samples and returns
        ``(sample_weight, 1.0, estimator_error)``, or
        ``(sample_weight, 1.0, 0.0)`` when the estimator makes no weighted
        error. ``sample_weight`` is updated in place on every round except
        the last one.
        '''
        estimator = self._make_estimator(random_state=random_state)
        estimator.fit(X, y, sample_weight=sample_weight)

        y_predict_proba = estimator.predict_proba(X)

        if iboost == 0:
            # Take the class list from the first fitted estimator.
            self.classes_ = getattr(estimator, 'classes_', None)
            self.n_classes_ = len(self.classes_)

        y_predict = self.classes_.take(np.argmax(y_predict_proba, axis=1), axis=0)

        # Misclassified instances
        incorrect = y_predict != y

        # Weighted error fraction
        estimator_error = np.mean(np.average(incorrect, weights=sample_weight, axis=0))

        # Stop if the classifier is perfect
        if estimator_error <= 0:
            return sample_weight, 1.0, 0.0

        n_classes = self.n_classes_
        classes = self.classes_
        # Code each sample as 1 for its true class and -1/(K-1) for the others.
        y_codes = np.array([-1.0 / (n_classes - 1), 1.0])
        y_coding = y_codes.take(classes == y[:, np.newaxis])

        proba = y_predict_proba  # alias
        # Clip in place so that the logarithm below never sees a zero probability.
        np.clip(proba, np.finfo(proba.dtype).eps, None, out=proba)

        estimator_weight = (
            -1.0
            * self.learning_rate
            * ((n_classes - 1.0) / n_classes)
            * xlogy(y_coding, y_predict_proba).sum(axis=1)
        )

        # The weight update happens here, scaled by the cost-sensitivity factor.
        if not iboost == self.n_estimators - 1:
            # Only boost positive weights
            sample_weight *= np.exp(
                estimator_weight * ((sample_weight > 0) | (estimator_weight < 0)) * self._beta(y, y_predict)
            )

        return sample_weight, 1.0, estimator_error

    def _beta(self, y, y_hat):
        '''
        Cost-adjustment function.

        Returns one cost factor per sample, chosen by comparing the true label
        ``y`` with the prediction ``y_hat``: correct predictions, +1 predicted
        as -1, and -1 predicted as +1 each use their own class-level factor.

        Raises:
            ValueError: if a misclassified pair is not (+1, -1) or (-1, +1),
                i.e. the labels are not encoded as -1 / +1. Such pairs used to
                be printed and skipped, which left the result shorter than the
                input and out of step with the sample weights.
        '''
        y = np.asarray(y)
        y_hat = np.asarray(y_hat)

        correct = y == y_hat
        pos_as_neg = (y == 1) & (y_hat == -1)
        neg_as_pos = (y == -1) & (y_hat == 1)

        unexpected = ~(correct | pos_as_neg | neg_as_pos)
        if unexpected.any():
            examples = list(zip(y[unexpected][:5].tolist(), y_hat[unexpected][:5].tolist()))
            raise ValueError(
                'AdaCostClassifier expects labels encoded as -1/+1; '
                f'got unexpected (y, y_hat) pairs such as {examples}'
            )

        res = np.full(y.shape, self._COST_CORRECT, dtype=float)
        res[pos_as_neg] = self._COST_POS_AS_NEG
        res[neg_as_pos] = self._COST_NEG_AS_POS
        return res

In [22]:
from sklearn.preprocessing import StandardScaler
# 使用LogisticRegression类构建Logistic回归模型
from sklearn.linear_model import LogisticRegression
def LogisticR(X_train, y_train, X_test, y_test):
    """Fit an L2-penalised logistic regression (liblinear solver) and predict X_test.

    ``y_test`` is not used; it is accepted so that the signature matches the
    other classifier wrappers in this notebook.
    """
    classifier = LogisticRegression(penalty='l2', solver='liblinear', random_state=0)
    return classifier.fit(X_train, y_train).predict(X_test)
# AutoGluon
def auto_gluon(X_train, y_train, X_test, y_test):
    """Train an AutoGluon TabularPredictor on the 'tag' label and predict X_test.

    The predictor is created with ``path='ag_models'`` and is loaded again
    from that same path before predicting.
    """
    # Features and label are joined column-wise into one table per split.
    labelled_train = TabularDataset(pd.concat([X_train, y_train], axis=1))
    labelled_test = TabularDataset(pd.concat([X_test, y_test], axis=1))

    # Training
    TabularPredictor(label='tag', path='ag_models').fit(labelled_train)

    # Prediction uses the test table with the label column removed.
    unlabelled_test = labelled_test.drop(columns=['tag'])
    return TabularPredictor.load('ag_models').predict(unlabelled_test)

# BaggingDT
def BaggingDT(X_train, y_train, X_test, y_test):
    """Fit a bagging ensemble of decision trees and return predictions for X_test.

    ``y_test`` is not used; it is accepted so that the signature matches the
    other classifier wrappers in this notebook.
    """
    # The base learner is passed positionally because its keyword was renamed
    # from ``base_estimator`` to ``estimator`` in scikit-learn 1.2 (and the old
    # name later removed); the first positional slot is the same in both.
    bc = BaggingClassifier(DecisionTreeClassifier(), random_state=0)
    bc.fit(X_train, y_train)
    y_pred = bc.predict(X_test)
    return y_pred

# BalancedBaggingDT
def BalancedBaggingDT(X_train, y_train, X_test, y_test):
    """Fit a balanced bagging ensemble of 100 decision trees and predict X_test.

    ``y_test`` is not used; it is accepted so that the signature matches the
    other classifier wrappers in this notebook.
    """
    # The base learner is passed positionally: imbalanced-learn renamed its
    # keyword from ``base_estimator`` to ``estimator``, and the first
    # positional slot is the base learner under either name.
    bbc = BalancedBaggingClassifier(DecisionTreeClassifier(),
                                    n_estimators=100,
                                    sampling_strategy='auto',
                                    replacement=False,
                                    random_state=42)
    bbc.fit(X_train, y_train)
    y_pred = bbc.predict(X_test)
    return y_pred

# BalancedRF
def BalancedRF(X_train, y_train, X_test, y_test):
    """Fit a 200-tree balanced random forest and return predictions for X_test.

    ``y_test`` is not used; it is accepted so that the signature matches the
    other classifier wrappers in this notebook.
    """
    forest = BalancedRandomForestClassifier(n_estimators=200, random_state=42)
    return forest.fit(X_train, y_train).predict(X_test)

# RUSBoost
def RUSBoost(X_train, y_train, X_test, y_test):
    """Fit a 200-round RUSBoost classifier (SAMME.R) and predict X_test.

    ``y_test`` is not used; it is accepted so that the signature matches the
    other classifier wrappers in this notebook.
    """
    booster = RUSBoostClassifier(
        n_estimators=200,
        algorithm='SAMME.R',
        random_state=0,
    )
    return booster.fit(X_train, y_train).predict(X_test)

# EasyEnsemble
def EasyEnsemble(X_train, y_train, X_test, y_test):
    """Fit an EasyEnsemble classifier and return predictions for X_test.

    ``y_test`` is not used; it is accepted so that the signature matches the
    other classifier wrappers in this notebook.
    """
    ensemble = EasyEnsembleClassifier(random_state=0)
    return ensemble.fit(X_train, y_train).predict(X_test)

# AdaCast
def AdaCast(X_train, y_train, X_test, y_test):
    """Fit a 100-round AdaCostClassifier and return 0/1 predictions for X_test.

    The caller is expected to pass ``y_train`` encoded as -1/+1 (the
    experiment loop maps 0 -> 1 and every other label -> -1). Predictions are
    mapped back before returning: -1 becomes 1, anything else becomes 0.
    ``y_test`` is not used.
    """
    booster = AdaCostClassifier(n_estimators=100)
    raw_pred = booster.fit(X_train, y_train).predict(X_test)
    # Undo the label encoding: -1 -> 1, +1 -> 0.
    return np.where(raw_pred == -1, 1, 0)

# Registry of classifier wrappers to evaluate. Each key is the model name that
# is printed and written to the results file; each value is a function taking
# (X_train, y_train, X_test, y_test) and returning predictions for X_test.
# Commented-out entries are disabled for the current run.
classifiers = {
    # 'AutoGluon': auto_gluon,
    # 'BaggingDT': BaggingDT,
    # 'BalancedBaggingDT': BalancedBaggingDT,
    # 'BalancedRF': BalancedRF,
    # 'RUSBoost': RUSBoost,
    # 'EasyEnsemble': EasyEnsemble,
    # 'AdaCast': AdaCast,  # the experiment loop re-encodes the labels to -1/+1 for this one
    'LogisticR': LogisticR
}

In [18]:
def print_result(y_pred, y_test):
    """Print accuracy, balanced accuracy, precision, recall, F1 and the confusion matrix.

    Note the argument order: predictions first, ground truth second.
    """
    scalar_reports = (
        ('Accuracy: %lf', accuracy_score),
        ('Balanced Accuracy: %lf', balanced_accuracy_score),
        ('Precision: %lf', precision_score),
        ('Recall: %lf', recall_score),
        ('F1: %lf', f1_score),
    )
    for template, metric in scalar_reports:
        print(template % metric(y_test, y_pred))
    print('Confusion Matrix:', confusion_matrix(y_test, y_pred))

def result_to_csv_item(y_pred, y_test):
    """Return the evaluation metrics as one comma-separated string.

    Field order: accuracy, balanced accuracy, precision, recall, F1, then the
    confusion-matrix cells [0][0], [0][1], [1][0], [1][1].
    Note the argument order: predictions first, ground truth second.
    """
    scorers = (
        accuracy_score,
        balanced_accuracy_score,
        precision_score,
        recall_score,
        f1_score,
    )
    fields = [scorer(y_test, y_pred) for scorer in scorers]

    matrix = confusion_matrix(y_test, y_pred)
    fields += [matrix[0][0], matrix[0][1], matrix[1][0], matrix[1][1]]

    return ','.join(f'{field}' for field in fields)

In [19]:
def draw_confusion_matrix(y_test, y_pred, suffix='none'):
    """Plot the confusion matrix and save it as cms/confusion_matrix_<suffix>.png.

    Args:
        y_test: ground-truth labels.
        y_pred: predicted labels.
        suffix: text appended to the output file name.
    """
    # Imported locally because the notebook only imports os in a later cell.
    import os

    cm = confusion_matrix(y_test, y_pred)
    ConfusionMatrixDisplay(cm, display_labels=[0, 1]).plot(cmap=plt.cm.Blues)
    plt.tight_layout()
    # savefig does not create missing directories, so make sure 'cms' exists.
    os.makedirs('cms', exist_ok=True)
    plt.savefig(f'cms/confusion_matrix_{suffix}.png')

# 训练

In [20]:
def classify_and_sampling(classifier, method, X_train, y_train, X_test, y_test):
    """Optionally resample the training data, then train and predict.

    Args:
        classifier: callable ``(X_train, y_train, X_test, y_test) -> y_pred``.
        method: object with ``fit_resample(X, y)``, or ``None`` to skip
            resampling.
        X_train, y_train: training features and labels.
        X_test, y_test: test features and labels, passed through to
            ``classifier``.

    Returns:
        Whatever ``classifier`` returns (the predictions for ``X_test``).
    """
    # Resample. Compare against None explicitly: a sampler object may define
    # __len__/__bool__ and evaluate as falsy, which must not disable it.
    if method is not None:
        X_train, y_train = method.fit_resample(X_train, y_train)
    # Size of the training set after resampling
    print('采样后训练集维数为：', X_train.shape, '\n', '样本个数为：', y_train.shape)
    # Train and predict
    y_pred = classifier(X_train, y_train, X_test, y_test)
    return y_pred

In [23]:
import os
#os.system("export OMP_NUM_THREADS=8")
# Data preprocessing
X_train, X_test, y_train, y_test = data_preprocess('数据集1.csv')
print('训练集维数为：', X_train.shape, '\t', '样本个数为：', y_train.shape)
print('测试集维数为：', X_test.shape, '\t', '样本个数为：', y_test.shape)

# Results are appended so earlier runs are kept; the header row is written
# only when the file is new or empty, so re-running does not duplicate it.
result_path = 'result.csv'
needs_header = not os.path.exists(result_path) or os.path.getsize(result_path) == 0

# The context manager closes the file even if a model or sampler raises.
with open(result_path, 'a') as f_res:
    if needs_header:
        f_res.write('Model,Sampler,Accuracy,Balanced Accuracy,Precision,Recall,F1-Score,A,B,C,D\n')

    # Model selection
    for classifier, classifier_fn in classifiers.items():
        if classifier == 'AdaCast':
            # AdaCast needs labels encoded as -1/+1 (0 -> 1, otherwise -1).
            # Use a per-classifier copy: overwriting y_train itself would feed
            # the re-encoded labels to every classifier that runs afterwards.
            y_train_clf = np.where(y_train == 0, 1, -1)
        else:
            y_train_clf = y_train

        # Sampling-method selection
        for method, sampler in sampling_methods.items():
            print("Classifier:", classifier, '\tSampling method:', method)
            # Train and predict
            y_pred = classify_and_sampling(classifier_fn, sampler, X_train, y_train_clf, X_test, y_test)

            # Optional diagnostics
            # print_result(y_pred, y_test)
            # draw_confusion_matrix(y_test, y_pred, f'{classifier}_{method}')

            f_res.write(f'{classifier},{method},{result_to_csv_item(y_pred, y_test)}\n')
            print('------------------------------------------')
            # Flush after every row so partial results survive a later failure.
            f_res.flush()

训练集维数为： (3101, 465) 	 样本个数为： (3101,)
测试集维数为： (776, 465) 	 样本个数为： (776,)
Classifier: LogisticR 	Sampling method: None
采样后训练集维数为： (3101, 465) 
 样本个数为： (3101,)
------------------------------------------
Classifier: LogisticR 	Sampling method: RandomOverSampler
采样后训练集维数为： (3932, 465) 
 样本个数为： (3932,)
------------------------------------------
Classifier: LogisticR 	Sampling method: SMOTE
采样后训练集维数为： (3932, 465) 
 样本个数为： (3932,)
------------------------------------------
Classifier: LogisticR 	Sampling method: ClusterCentroids
采样后训练集维数为： (2270, 465) 
 样本个数为： (2270,)
------------------------------------------
Classifier: LogisticR 	Sampling method: RandomUnderSampler
采样后训练集维数为： (2270, 465) 
 样本个数为： (2270,)
------------------------------------------
Classifier: LogisticR 	Sampling method: SMOTETomek
采样后训练集维数为： (3390, 465) 
 样本个数为： (3390,)
------------------------------------------
