In [1]:
import pandas as pd
import numpy as np
from sklearn.metrics import roc_curve, auc
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import LabelEncoder
from xgboost import XGBClassifier as XGB
from sklearn.ensemble import RandomForestClassifier as RF
from sklearn.linear_model import LogisticRegression as LR
from sklearn.model_selection import train_test_split  # 随机划分数据集
import matplotlib.pyplot as plt
from warnings import simplefilter

simplefilter(action='ignore') #忽略警告

In [2]:
#读取数据

df = pd.read_csv('model_feature.csv')  # 读入csv文件中的特征

print("初始数据如下：")
pd.DataFrame(df)

UnicodeDecodeError: 'utf-8' codec can't decode byte 0xb8 in position 1: invalid start byte

In [None]:
#分离特征及标签

X = df.drop('fatigue_result', axis=1)  # 移除fatigue_result列,其它列都是特征
Y = df['fatigue_result']  # 提取出标签页

print("提取出标签后的特征如下：")
pd.DataFrame(X)

In [None]:
print("标签如下：")
pd.DataFrame(Y)

In [None]:
#对字符型特征进行特征编码（age_level和is_fatigue）

le = LabelEncoder()# 进行特征编码
X['age_level'] = le.fit_transform(X['age_level'])
X['is_fatigue'] = le.fit_transform(X['is_fatigue'])

print("特征编码后的数据如下：")
pd.DataFrame(X)

In [None]:
from sklearn.decomposition import PCA

pca = PCA(n_components=2,svd_solver="full")
#加载PCA算法，设置降维后主成分数目为2
 
pca = pca.fit(X) #拟合模型
X_pca=pca.transform(X)

In [None]:
#只保留两个特征时的图像

plt.figure()
plt.scatter(X_pca[Y==0,0],X_pca[Y==0,1],c="green",label="0")
plt.scatter(X_pca[Y==1,0],X_pca[Y==1,1],c="orange",label="1")
plt.scatter(X_pca[Y==2,0],X_pca[Y==2,1],c="red",label="2")
plt.legend()
plt.title("PCA of Data")
plt.show()

In [None]:
pca.explained_variance_  #查看降维后每个新特征向量上所带的信息量大小（可理解为方差大小

In [None]:
pca.explained_variance_ratio_  #查看降维后每个新特征向量所占原始数据信息量的百分比（可理解为方差贡献率

In [None]:
pca.explained_variance_ratio_.sum() #信息保留量

In [None]:
#PCA中，选择合适的n_components参数的过程

pca_line=PCA().fit(X)
plt.plot([1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24],np.cumsum(pca_line.explained_variance_ratio_))
plt.xticks([1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24])
plt.xlabel("number of components after dimension reduction")
plt.ylabel("cumulative explained variance")
plt.show()

In [None]:
# # 参数优化过程
# model_range=range(1,20)
# model_score = []
# for i in model_range:
#     rf = RF(n_estimators=130,random_state=i)
#     # loss=-cross_val_score(rf,x_test,y_test,cv=5)
#     # model_score.append(loss.mean())
#     scores = cross_val_score(rf, X, Y, cv=5)
#     model_score.append(scores.mean())
# plt.plot(model_range,model_score,label = 'test_score')
# plt.xlabel('random_state')
# plt.ylabel('Accuracy')
# plt.show()


#没有进行PCA时的模型表现
x_train, x_test, y_train, y_test = train_test_split(X, Y,random_state = 1,test_size=0.20)

'''随机森林'''
rf = RF(random_state=50,n_estimators=130,max_depth=30)
rf.fit(x_train, y_train)
print(f'随机森林准确率为{rf.score(x_test, y_test)}')


'''XGBoost'''
xgb = XGB(random_state=50,n_estimators=10,eval_metric = 'logloss')
xgb.fit(x_train,y_train)
print(f'XGBoost准确率为{rf.score(x_test, y_test)}')

In [None]:
#寻找最佳降维数据的过程

from sklearn.metrics import confusion_matrix
ans1 = []
ans2 = []
ans1_pca = []
ans2_pca = []


#计算PCA前的AUC
recall_rf = []
recall_xgb = []
FPR_rf = []
FPR_xgb = []
y_true = y_test
prob_rf = rf.predict_proba(x_test)
prob_xgb = xgb.predict_proba(x_test)
probrange_rf = np.linspace(prob_rf[:, 1].min(), prob_rf[:, 1].max(), 50)
probrange_xgb = np.linspace(prob_xgb[:, 1].min(), prob_xgb[:, 1].max(),50)
for i in probrange_rf:
    y_pre = []
    for k in range(prob_rf.shape[0]):
        if prob_rf[k, 1] > i:
            y_pre.append(1)
        else:
            y_pre.append(0)
    C = confusion_matrix(y_true, y_pre)
    recall_rf.append(C[0, 0] / C[0, :].sum())
    FPR_rf.append(C[1, 0] / C[1, :].sum())
for i in probrange_xgb:
    y_pre = []
    for k in range(prob_xgb.shape[0]):
        if prob_xgb[k, 1] > i:
            y_pre.append(1)
        else:
            y_pre.append(0)
    C = confusion_matrix(y_true, y_pre)
    recall_xgb.append(C[0, 0] / C[0, :].sum())
    FPR_xgb.append(C[1, 0] / C[1, :].sum())
    
'''进行特征选择'''
X = X.drop(['is_fatigue','age_level','accumulate_driving_after_maxSpare','accumulate_times_before24',
                            'accumulate_times_after_maxSpare','alarm_face_before_5min'],axis=1)
    
#循环遍历不同的降维数据对结果的影响
for i in  range(1,18):
    
    #PCA前
    ans1.append(round(auc(FPR_rf, recall_rf), 3))
    ans2.append(round(auc(FPR_xgb, recall_xgb), 3))
    
    
    #进行PCA
    n_pca = PCA(n_components=i,svd_solver="full")    #加载PCA算法，设置降维后主成分数目为i
    n_pca = n_pca.fit(X) #拟合模型
    X_pca=n_pca.transform(X)
    x_train_pca, x_test_pca, y_train_pca, y_test_pca = train_test_split(X_pca, Y,random_state = 1, test_size=0.20)
   

    rf_pca = RF(random_state=50,n_estimators=130,max_depth=30)
    rf_pca.fit(x_train_pca, y_train_pca)
    
    xgb_pca = XGB(random_state=50,n_estimators=10,eval_metric = 'logloss')
    xgb_pca.fit(x_train_pca,y_train_pca)

    recall_rf_pca = []
    recall_xgb_pca = []
    FPR_rf_pca = []
    FPR_xgb_pca = []
    y_true = y_test_pca
    prob_rf_pca = rf_pca.predict_proba(x_test_pca)
    prob_xgb_pca = xgb_pca.predict_proba(x_test_pca)
    probrange_rf_pca = np.linspace(prob_rf_pca[:, 1].min(), prob_rf_pca[:, 1].max(), 50)
    probrange_xgb_pca = np.linspace(prob_xgb_pca[:, 1].min(), prob_xgb_pca[:, 1].max(),50)
    for i in probrange_rf_pca:
        y_pre = []
        for k in range(prob_rf_pca.shape[0]):
            if prob_rf_pca[k, 1] > i:
                y_pre.append(1)
            else:
                y_pre.append(0)
        C = confusion_matrix(y_true, y_pre)
        recall_rf_pca.append(C[0, 0] / C[0, :].sum())
        FPR_rf_pca.append(C[1, 0] / C[1, :].sum())
    for i in probrange_xgb_pca:
        y_pre = []
        for k in range(prob_xgb_pca.shape[0]):
            if prob_xgb_pca[k, 1] > i:
                y_pre.append(1)
            else:
                y_pre.append(0)
        C = confusion_matrix(y_true, y_pre)
        recall_xgb_pca.append(C[0, 0] / C[0, :].sum())
        FPR_xgb_pca.append(C[1, 0] / C[1, :].sum())
    
    ans1_pca.append(round(auc(FPR_rf_pca, recall_rf_pca), 6)) #随机森林
    ans2_pca.append(round(auc(FPR_xgb_pca, recall_xgb_pca), 6)) #XGBoost
    
plt.figure()
plt.xlabel("number of components after dimension reduction")
plt.ylabel("AUC of model")
plt.plot(range(1,24,1),ans1,c='darkblue',label = 'RF')
plt.plot(range(1,24,1),ans1_pca,c='lightblue',label = 'RF PCA')
plt.plot(range(1,24,1),ans2,c='red',label = 'XGB')
plt.plot(range(1,24,1),ans2_pca,c='pink',label='XGB PCA')
plt.legend()
plt.show()


In [None]:
#用上段代码测出的最佳降维参数进行PCA

'''随机森林'''
n_pca = PCA(n_components=16,svd_solver="full")    #加载PCA算法，设置降维后主成分数目为16
n_pca = n_pca.fit(X) #拟合模型
X_pca1=n_pca.transform(X)
x_train_pca1, x_test_pca1, y_train_pca1, y_test_pca1 = train_test_split(X_pca1, Y,random_state = 1, test_size=0.20)
   
rf_pca = RF(random_state=50,n_estimators=130,max_depth=30)
rf_pca.fit(x_train_pca1, y_train_pca1)


'''XGBoost'''
n_pca = PCA(n_components=18,svd_solver="full")    #加载PCA算法，设置降维后主成分数目为18
n_pca = n_pca.fit(X) #拟合模型
X_pca2=n_pca.transform(X)
x_train_pca2, x_test_pca2, y_train_pca2, y_test_pca2 = train_test_split(X_pca2, Y,random_state = 1, test_size=0.20)

xgb_pca = XGB(random_state=50,n_estimators=10,eval_metric = 'logloss')
xgb_pca.fit(x_train_pca2,y_train_pca2)

pd.DataFrame(X_pca1) #随机森林降维后的特征数据
pd.DataFrame(X_pca2) #XGBoost降维后的特征数据

In [None]:
 from sklearn.metrics import confusion_matrix

plt.figure(figsize=(12, 8.4))
plt.xlabel('false positive rate', fontsize=16)
plt.ylabel('true positive rate', fontsize=16)
plt.plot([0, 1], [0, 1], c='black', linestyle='--')

#pca前
recall_rf = []
recall_xgb = []
FPR_rf = []
FPR_xgb = []
y_true = y_test
prob_rf = rf.predict_proba(x_test)
prob_xgb = xgb.predict_proba(x_test)
probrange_rf = np.linspace(prob_rf[:, 1].min(), prob_rf[:, 1].max(), 50)
probrange_xgb = np.linspace(prob_xgb[:, 1].min(), prob_xgb[:, 1].max(),50)
for i in probrange_rf:
    y_pre = []
    for k in range(prob_rf.shape[0]):
        if prob_rf[k, 1] > i:
            y_pre.append(1)
        else:
            y_pre.append(0)
    C = confusion_matrix(y_true, y_pre)
    recall_rf.append(C[0, 0] / C[0, :].sum())
    FPR_rf.append(C[1, 0] / C[1, :].sum())
for i in probrange_xgb:
    y_pre = []
    for k in range(prob_xgb.shape[0]):
        if prob_xgb[k, 1] > i:
            y_pre.append(1)
        else:
            y_pre.append(0)
    C = confusion_matrix(y_true, y_pre)
    recall_xgb.append(C[0, 0] / C[0, :].sum())
    FPR_xgb.append(C[1, 0] / C[1, :].sum())
area_rf = "AUC area of RF is " + str(round(auc(FPR_rf, recall_rf), 3))
area_xgb = "AUC area of XGB is " + str(round(auc(FPR_xgb, recall_xgb), 3))
plt.plot(FPR_rf, recall_rf, c='darkblue',label=area_rf)
plt.plot(FPR_xgb, recall_xgb, c='red',label=area_xgb)


#pca后
recall_rf_pca = []
recall_xgb_pca = []
FPR_rf_pca = []
FPR_xgb_pca = []
y_true = y_test_pca
prob_rf_pca = rf_pca.predict_proba(x_test_pca1)
prob_xgb_pca = xgb_pca.predict_proba(x_test_pca2)
probrange_rf_pca = np.linspace(prob_rf_pca[:, 1].min(), prob_rf_pca[:, 1].max(), 50)
probrange_xgb_pca = np.linspace(prob_xgb_pca[:, 1].min(), prob_xgb_pca[:, 1].max(),50)
for i in probrange_rf_pca:
    y_pre = []
    for k in range(prob_rf_pca.shape[0]):
        if prob_rf_pca[k, 1] > i:
            y_pre.append(1)
        else:
            y_pre.append(0)
    C = confusion_matrix(y_true, y_pre)
    recall_rf_pca.append(C[0, 0] / C[0, :].sum())
    FPR_rf_pca.append(C[1, 0] / C[1, :].sum())
for i in probrange_xgb_pca:
    y_pre = []
    for k in range(prob_xgb_pca.shape[0]):
        if prob_xgb_pca[k, 1] > i:
            y_pre.append(1)
        else:
            y_pre.append(0)
    C = confusion_matrix(y_true, y_pre)
    recall_xgb_pca.append(C[0, 0] / C[0, :].sum())
    FPR_xgb_pca.append(C[1, 0] / C[1, :].sum())
area_rf_pca = "(PCA) AUC area of RF is " + str(round(auc(FPR_rf_pca, recall_rf_pca), 3))
area_xgb_pca = "(PCA) AUC area of XGB is " + str(round(auc(FPR_xgb_pca, recall_xgb_pca), 3))
plt.plot(FPR_rf_pca, recall_rf_pca, c='lightblue',label=area_rf_pca)
plt.plot(FPR_xgb_pca, recall_xgb_pca, c='pink',label=area_xgb_pca)

plt.legend()
plt.show()

In [None]:
#模型指标
from sklearn.metrics import f1_score
from sklearn.metrics import precision_score, recall_score

#PCA前
print("\nPCA之前的指标")
#随机森林
print('\n随机森林')
print(f'准确率为{rf.score(x_test, y_test)}')
print(f'AUC面积为{round(auc(FPR_rf, recall_rf), 6)}')
y_predict = rf.predict(x_test)
print(f'宏F1为{f1_score(y_test, y_predict,average="macro")}')
print(f'微F1为{f1_score(y_test, y_predict,average="micro")}')
print(f'宏查准率为{precision_score(y_test, y_predict,average="macro")}')
print(f'微查准率为{precision_score(y_test, y_predict,average="micro")}')
print(f'宏查全率为{recall_score(y_test, y_predict,average="macro")}')
print(f'微查全率为{recall_score(y_test, y_predict,average="micro")}')

#XGBoost
print('\nXGBoost')
print(f'准确率为{xgb.score(x_test, y_test)}')
print(f'AUC面积为{round(auc(FPR_xgb, recall_xgb), 6)}')
y_predict = xgb.predict(x_test)
print(f'宏F1为{f1_score(y_test, y_predict,average="macro")}')
print(f'微F1为{f1_score(y_test, y_predict,average="micro")}')
print(f'宏查准率为{precision_score(y_test, y_predict,average="macro")}')
print(f'微查准率为{precision_score(y_test, y_predict,average="micro")}')
print(f'宏查全率为{recall_score(y_test, y_predict,average="macro")}')
print(f'微查全率为{recall_score(y_test, y_predict,average="micro")}')


#PCA后
print("\nPCA之后的指标")
#随机森林
print('\n随机森林')
print(f'准确率为{rf_pca.score(x_test_pca1, y_test_pca1)}')
print(f'AUC面积为{round(auc(FPR_rf_pca, recall_rf_pca), 6)}')
y_predict = rf_pca.predict(x_test_pca1)
print(f'宏F1为{f1_score(y_test_pca1, y_predict,average="macro")}')
print(f'微F1为{f1_score(y_test_pca1, y_predict,average="micro")}')
print(f'宏查准率为{precision_score(y_test_pca1, y_predict,average="macro")}')
print(f'微查准率为{precision_score(y_test_pca1, y_predict,average="micro")}')
print(f'宏查全率为{recall_score(y_test_pca1, y_predict,average="macro")}')
print(f'微查全率为{recall_score(y_test_pca1, y_predict,average="micro")}')

#XGBoost
print('\nXGBoost')
print(f'准确率为{xgb_pca.score(x_test_pca2, y_test_pca2)}')
print(f'AUC面积为{round(auc(FPR_xgb_pca, recall_xgb_pca), 6)}')
y_predict = xgb_pca.predict(x_test_pca2)
print(f'宏F1为{f1_score(y_test_pca2, y_predict,average="macro")}')
print(f'微F1为{f1_score(y_test_pca2, y_predict,average="micro")}')
print(f'宏查准率为{precision_score(y_test_pca2, y_predict,average="macro")}')
print(f'微查准率为{precision_score(y_test_pca2, y_predict,average="micro")}')
print(f'宏查全率为{recall_score(y_test_pca2, y_predict,average="macro")}')
print(f'微查全率为{recall_score(y_test_pca2, y_predict,average="micro")}')

In [None]:



# pd.DataFrame(x_test)