In [1]:
import os
import numpy as np
from time import time
import datetime
import pandas as pd
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.metrics import roc_auc_score, recall_score, accuracy_score, f1_score
from sklearn.multiclass import OneVsRestClassifier, OneVsOneClassifier
from sklearn.preprocessing import label_binarize

In [2]:
filepath ='Categorical_fusion_4/fusion_categorical_1234.csv'
data = pd.read_csv(filepath)
x = data.iloc[:, :-1]  # 数据特征
y = data.iloc[:,-1]  # 标签

In [3]:
x

Unnamed: 0,opensmile_prediction_p,vggish_prediction_p,hog_prediction_p,pnn_lstm_prediction
0,0,3,1,1
1,0,4,1,1
2,4,0,5,0
3,4,0,3,3
4,0,4,4,0
...,...,...,...,...
3634,4,4,1,1
3635,4,0,0,4
3636,0,0,4,4
3637,2,4,2,2


In [4]:
# 将数据划分为训练集和测试集，test_size=.3表示30%的测试集, 随机数种子, 保证可复现性
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=.3, random_state=423)

In [5]:
x_train

Unnamed: 0,opensmile_prediction_p,vggish_prediction_p,hog_prediction_p,pnn_lstm_prediction
1612,3,0,3,3
1354,4,4,4,4
3609,3,3,4,3
1722,0,0,4,0
3567,4,2,4,4
...,...,...,...,...
1984,0,0,3,3
3482,3,3,3,3
3355,4,4,4,4
56,0,0,0,0


In [6]:
# 修正测试集和训练集的索引
for i in [x_train, x_test, y_train, y_test ]:
    i.index  = range(i.shape[0])

# Y 将标签编码
encoder = LabelEncoder().fit(y_train) # #训练LabelEncoder, 把y_train中的类别编码为0，1，2，3，4，5
y = encoder.transform(y_train)
print("class = ", y)
y_train = pd.DataFrame(encoder.transform(y_train)) # 使用训练好的LabelEncoder对源数据进行编码
y_test = pd.DataFrame(encoder.transform(y_test))
print("y_test = ", y_test[0:10])

# 标签降维度
y_train = y_train.iloc[:,0].ravel()
y_test = y_test.iloc[:,0].ravel()
print("y.shape=", y_train.shape)

# X标准化
scaler = StandardScaler()
x_train_std = scaler.fit_transform(x_train)  # 标准化
x_test_std = scaler.fit_transform(x_test)

class =  [3 4 3 ... 4 0 4]
y_test =     0
0  0
1  1
2  2
3  3
4  0
5  5
6  0
7  0
8  0
9  3
y.shape= (2547,)


In [None]:
times_all = time()
clf = OneVsRestClassifier(
        SVC(kernel = 'linear', 
        gamma = 0.00010537927559772052,
        degree=1, 
        cache_size=5000, 
        class_weight='balanced'))
    clf.fit(x_train_std, y_train)

In [8]:
import tqdm
times_all = time()
# 调试两个参数 gamma & C  ，默认情况下C为1，通常来说这都是一个合理的参数。
accuracy_list = []
f1_list = []
auc_list = []

gamma_range = np.logspace(0, 4, 100, base=2) # 返回13个数字，底是2
print("gamma_rang:", gamma_range)
count=1
for gamma_item in gamma_range:
    time0 = time()
    print("Start-{0}, gamma={1}".format(count, gamma_item))
    count = count+1
    clf = OneVsRestClassifier(
        SVC(kernel = 'linear', 
        gamma = gamma_item,
        degree=1, 
        cache_size=5000, 
        class_weight='balanced'))
    clf.fit(x_train_std, y_train)
    
    y_test_prediction = clf.predict(x_test_std)
    # accuracy
    accuracy = accuracy_score(y_test,y_test_prediction)
    accuracy_list.append(accuracy)
    print("accuracy = ", accuracy)
    # F1-score
    f1 = f1_score(y_test,y_test_prediction, average="weighted")
    print("f1 score = ", f1)
    f1_list.append(f1)
    # AUC
    y_test_binary = label_binarize(y_test, classes=list(range(6))) # 转化为one-hot
    result = clf.decision_function(x_test_std)
    auc = roc_auc_score(y_test_binary, result, average = 'micro') # 多类分类下，要用概率值（形式二） ，加参数 average='micro'  （不能用ont-hot (形式三) ）
    print("AUC = ", auc)
    auc_list.append(auc)
    print(datetime.datetime.fromtimestamp(time()-time0).strftime("%M:%S:%f"))
    print()

print("Gamma = ", gamma_range[accuracy_list.index(max(accuracy_list))])
print("Accuracy = ", max(accuracy_list))
print("F1-score = ", f1_list[accuracy_list.index(max(accuracy_list))])
print("AUC-score = ", auc_list[accuracy_list.index(max(accuracy_list))]) 
print(datetime.datetime.fromtimestamp(time()-times_all).strftime("%M:%S:%f"))

gamma_rang: [ 1.          1.0284018   1.05761026  1.0876483   1.11853947  1.150308
  1.18297882  1.21657755  1.25113054  1.2866649   1.3232085   1.36079
  1.39943889  1.43918547  1.48006093  1.52209732  1.56532762  1.60978575
  1.65550656  1.70252593  1.75088073  1.80060889  1.85174942  1.90434244
  1.95842919  2.01405211  2.07125481  2.13008218  2.19058035  2.25279677
  2.31678026  2.38258098  2.45025057  2.5198421   2.59141015  2.66501086
  2.74070197  2.81854284  2.89859453  2.98091983  3.06558332  3.1526514
  3.24219238  3.33427648  3.42897593  3.52636502  3.62652013  3.72951983
  3.83544491  3.94437845  4.0564059   4.17161513  4.2900965   4.41194297
  4.53725009  4.66611616  4.79864226  4.93493233  5.07509329  5.21923508
  5.36747075  5.51991658  5.67669215  5.83792042  6.00372787  6.17424455
  6.34960421  6.5299444   6.71540657  6.90613621  7.1022829   7.30400052
  7.51144729  7.72478591  7.94418373  8.16981285  8.40185024  8.64047791
  8.88588304  9.13825811  9.39780109  9.66471

accuracy =  0.2793040293040293
f1 score =  0.2080168189218506
AUC =  0.7217517677682515
00:01:196294

Start-53, gamma=4.290096504220116
accuracy =  0.2793040293040293
f1 score =  0.2080168189218506
AUC =  0.7217517677682515
00:01:232679

Start-54, gamma=4.411942967067392
accuracy =  0.2793040293040293
f1 score =  0.2080168189218506
AUC =  0.7217517677682515
00:01:191001

Start-55, gamma=4.537250088781851
accuracy =  0.2793040293040293
f1 score =  0.2080168189218506
AUC =  0.7217517677682515
00:01:202080

Start-56, gamma=4.666116158304467
accuracy =  0.2793040293040293
f1 score =  0.2080168189218506
AUC =  0.7217517677682515
00:01:200645

Start-57, gamma=4.798642256159061
accuracy =  0.2793040293040293
f1 score =  0.2080168189218506
AUC =  0.7217517677682515
00:01:148007

Start-58, gamma=4.93493233373827
accuracy =  0.2793040293040293
f1 score =  0.2080168189218506
AUC =  0.7217517677682515
00:01:236038

Start-59, gamma=5.075093294841398
accuracy =  0.2793040293040293
f1 score =  0.2080

In [9]:
import tqdm
times_all = time()
# 调试两个参数 gamma & C  ，默认情况下C为1，通常来说这都是一个合理的参数。
accuracy_list = []
f1_list = []
auc_list = []

c_range =  np.logspace(-10, 2, 100, base=2) # 返回13个数字，底是2
print("c_range:",c_range)
print()
count=1
for c_item in c_range:
    time0 = time()
    print("Start-{0}, c_item={1}".format(count, c_item))
    count = count+1
    clf = OneVsRestClassifier(
        SVC(kernel = 'linear',
        C = c_item,
        degree=1, 
        cache_size=5000, 
        class_weight='balanced'))
    clf.fit(x_train_std, y_train)
    
    y_test_prediction = clf.predict(x_test_std)
    # accuracy
    accuracy = accuracy_score(y_test,y_test_prediction)
    accuracy_list.append(accuracy)
    print("accuracy = ", accuracy)
    # F1-score
    f1 = f1_score(y_test,y_test_prediction, average="weighted")
    print("f1 score = ", f1)
    f1_list.append(f1)
    # AUC
    y_test_binary = label_binarize(y_test, classes=list(range(6))) # 转化为one-hot
    result = clf.decision_function(x_test_std)
    auc = roc_auc_score(y_test_binary, result, average = 'micro') # 多类分类下，要用概率值（形式二） ，加参数 average='micro'  （不能用ont-hot (形式三) ）
    print("AUC = ", auc)
    auc_list.append(auc)
    print(datetime.datetime.fromtimestamp(time()-time0).strftime("%M:%S:%f"))
    print()
    print()

print(max(accuracy_list), gamma_range[accuracy_list.index(max(accuracy_list))])
print("F1-score = ", f1_list[accuracy_list.index(max(accuracy_list))])
print("AUC-score = ", auc_list[accuracy_list.index(max(accuracy_list))]) 
print(datetime.datetime.fromtimestamp(time()-times_all).strftime("%M:%S:%f"))

c_range: [9.76562500e-04 1.06215654e-03 1.15525275e-03 1.25650869e-03
 1.36663954e-03 1.48642317e-03 1.61670562e-03 1.75840712e-03
 1.91252851e-03 2.08015838e-03 2.26248072e-03 2.46078330e-03
 2.67646677e-03 2.91105452e-03 3.16620349e-03 3.44371584e-03
 3.74555167e-03 4.07384290e-03 4.43090829e-03 4.81926986e-03
 5.24167065e-03 5.70109416e-03 6.20078536e-03 6.74427364e-03
 7.33539774e-03 7.97833286e-03 8.67762015e-03 9.43819878e-03
 1.02654408e-02 1.11651892e-02 1.21437991e-02 1.32081824e-02
 1.43658571e-02 1.56250000e-02 1.69945046e-02 1.84840440e-02
 2.01041390e-02 2.18662326e-02 2.37827706e-02 2.58672900e-02
 2.81345139e-02 3.06004562e-02 3.32825340e-02 3.61996915e-02
 3.93725328e-02 4.28234683e-02 4.65768724e-02 5.06592559e-02
 5.50994534e-02 5.99288267e-02 6.51814863e-02 7.08945326e-02
 7.71083177e-02 8.38667305e-02 9.12175066e-02 9.92125657e-02
 1.07908378e-01 1.17366364e-01 1.27653326e-01 1.38841922e-01
 1.51011181e-01 1.64247053e-01 1.78643028e-01 1.94300785e-01
 2.11330918e-01

accuracy =  0.2793040293040293
f1 score =  0.2080168189218506
AUC =  0.7228825356572609
00:00:983130


Start-50, c_item=0.05992882671622804
accuracy =  0.2793040293040293
f1 score =  0.2080168189218506
AUC =  0.7227143125494773
00:01:007722


Start-51, c_item=0.06518148634039972
accuracy =  0.2793040293040293
f1 score =  0.2080168189218506
AUC =  0.7213049618269399
00:01:007098


Start-52, c_item=0.0708945326372164
accuracy =  0.2793040293040293
f1 score =  0.2080168189218506
AUC =  0.7205916488883523
00:00:992255


Start-53, c_item=0.07710831771466045
accuracy =  0.2793040293040293
f1 score =  0.2080168189218506
AUC =  0.7205386493848033
00:00:998643


Start-54, c_item=0.08386673047427361
accuracy =  0.2793040293040293
f1 score =  0.2080168189218506
AUC =  0.7206768506219057
00:01:010023


Start-55, c_item=0.09121750660509041
accuracy =  0.2793040293040293
f1 score =  0.2080168189218506
AUC =  0.7209299400233466
00:00:993178


Start-56, c_item=0.09921256574801249
accuracy =  0.2793040