In [1]:
import os
import numpy as np
from time import time
import datetime
import pandas as pd
import tqdm
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.metrics import roc_auc_score, recall_score, accuracy_score, f1_score
from sklearn.multiclass import OneVsRestClassifier, OneVsOneClassifier
from sklearn.preprocessing import label_binarize
from sklearn.model_selection import cross_val_score


In [2]:
filepath ='Vggish_featurees_20210322_add0_6pnn.csv'
data = pd.read_csv(filepath)
x = data.iloc[:, 7:]  # 数据特征
y = data.iloc[:,2]  # 标签

# 将数据划分为训练集和测试集，test_size=.3表示30%的测试集, 随机数种子, 保证可复现性
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=.2, random_state=324)

In [3]:
x_train

Unnamed: 0,Vggish_1,Vggish_2,Vggish_3,Vggish_4,Vggish_5,Vggish_6,Vggish_7,Vggish_8,Vggish_9,Vggish_10,...,Vggish_375,Vggish_376,Vggish_377,Vggish_378,Vggish_379,Vggish_380,Vggish_381,Vggish_382,Vggish_383,Vggish_384
720,170.000000,15.500000,220.000000,113.000000,206.000000,103.000000,131.500000,119.000000,138.000000,150.500000,...,338.000000,1152.000000,72.000000,72.00,6728.000000,128.000000,40.500000,5304.500000,2812.500000,0
6402,156.000000,12.000000,189.000000,140.000000,208.500000,88.000000,113.000000,95.500000,125.000000,232.000000,...,0.500000,32.000000,18.000000,3960.50,72.000000,612.500000,8.000000,13778.000000,4.500000,0
3788,155.000000,3.250000,170.500000,95.750000,204.500000,79.000000,107.500000,119.750000,158.500000,185.500000,...,2116.000000,1253.666667,2913.000000,8588.00,11382.916670,844.666667,6564.000000,0.000000,15260.250000,0
5714,166.500000,6.500000,198.500000,107.500000,189.000000,83.500000,110.500000,95.000000,159.000000,100.000000,...,1152.000000,12.500000,882.000000,0.00,4050.000000,10804.500000,98.000000,12.500000,24.500000,0
7868,167.000000,11.666667,214.333333,112.333333,207.666667,113.333333,108.333333,108.666667,159.333333,124.000000,...,2899.000000,559.000000,1322.333333,0.00,1957.000000,926.333333,2028.000000,1641.000000,0.000000,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6416,159.000000,9.000000,207.000000,136.500000,209.500000,87.000000,132.000000,97.000000,124.000000,205.500000,...,1104.500000,3872.000000,2664.500000,3698.00,4704.500000,4418.000000,11250.000000,1512.500000,11100.500000,0
600,161.000000,6.000000,194.750000,114.250000,187.500000,81.000000,114.250000,106.500000,146.250000,161.500000,...,4436.666667,3370.916667,929.666667,472.25,5642.250000,738.916667,3611.333333,1356.916667,5390.250000,0
9723,155.750000,4.250000,187.500000,124.750000,199.500000,96.000000,98.250000,120.250000,119.250000,215.000000,...,1466.000000,7231.583333,8294.666667,5016.25,1392.333333,636.666667,4224.250000,10228.666670,3220.916667,0
908,166.333333,10.666667,223.666667,120.666667,200.000000,122.666667,142.000000,95.000000,137.666667,108.666667,...,4933.333333,1158.333333,217.000000,0.00,3766.333333,3250.333333,739.000000,5396.333333,1633.333333,0


In [4]:
y_train

720      Confused
6402      Relaxed
3788        Happy
5714      Anxious
7868      Anxious
           ...   
6416        Happy
600        Neural
9723        Happy
908       Anxious
10905     Anxious
Name: Categorical, Length: 9691, dtype: object

In [5]:
# 修正测试集和训练集的索引
for i in [x_train, x_test, y_train, y_test ]:
    i.index  = range(i.shape[0])

# Y 将标签编码
encoder = LabelEncoder().fit(y_train) # #训练LabelEncoder, 把y_train中的类别编码为0，1，2，3，4，5
y = encoder.transform(y_train)
# print("class = ", y)
y_train = pd.DataFrame(encoder.transform(y_train)) # 使用训练好的LabelEncoder对源数据进行编码
y_test = pd.DataFrame(encoder.transform(y_test))
# print("y_test = ", y_test[0:10])

# 标签降维度
y_train = y_train.iloc[:,0].ravel()
y_test = y_test.iloc[:,0].ravel()

# X标准化
scaler = StandardScaler()
x_train_std = scaler.fit_transform(x_train)  # 标准化
x_test_std = scaler.fit_transform(x_test)

# Gamma

In [6]:
# Spend Time
time0 = time()

# Basic SVM Model (gamma =  0.04888888888888889,  C = 1.13333333333,)      
# sklearn通过OneVsRestClassifier实现svm.SVC的多分类
clf = OneVsRestClassifier(SVC(kernel = 'rbf', 
        degree=1,
        C=1, # default
        cache_size=5000, 
        probability=True,
        class_weight='balanced'))


# 超参数 Gamma
gamma_range = np.logspace(-10, 1, 10, base=2) # 返回13个数字，底是2
print(gamma_range)

parameters = {
"estimator__C": [1],
"estimator__kernel": ["rbf"],
"estimator__degree":[1],
"estimator__gamma":gamma_range,
}

# evaluation Metrics
score = 'accuracy'

# Grid Search params
model_tunning = GridSearchCV(clf, 
                             param_grid=parameters,
                             n_jobs=-1, 
                             cv=5,
                             verbose = 32,
                             scoring=score)
model_tunning.fit(x_train_std, y_train)

[9.76562500e-04 2.27837703e-03 5.31558594e-03 1.24015707e-02
 2.89335848e-02 6.75037337e-02 1.57490131e-01 3.67433623e-01
 8.57243983e-01 2.00000000e+00]
Fitting 5 folds for each of 10 candidates, totalling 50 fits


GridSearchCV(cv=5,
             estimator=OneVsRestClassifier(estimator=SVC(C=1, cache_size=5000,
                                                         class_weight='balanced',
                                                         degree=1,
                                                         probability=True)),
             n_jobs=-1,
             param_grid={'estimator__C': [1], 'estimator__degree': [1],
                         'estimator__gamma': array([9.76562500e-04, 2.27837703e-03, 5.31558594e-03, 1.24015707e-02,
       2.89335848e-02, 6.75037337e-02, 1.57490131e-01, 3.67433623e-01,
       8.57243983e-01, 2.00000000e+00]),
                         'estimator__kernel': ['rbf']},
             scoring='accuracy', verbose=32)

In [7]:
bst = model_tunning.best_estimator_
preds = bst.predict(x_test)
y_test_prediction = bst.predict(x_test)
# accuracy
accuracy = accuracy_score(y_test,y_test_prediction)
print("accuracy = ", accuracy)
# F1-score
f1 = f1_score(y_test,y_test_prediction, average="weighted")
print("f1 score = ", f1)
    
# AUC
y_test_binary = label_binarize(y_test, classes=list(range(6))) # 转化为one-hot
result = bst.decision_function(x_test_std)
auc = roc_auc_score(y_test_binary, result, average = 'micro') # 多类分类下，要用概率值（形式二） ，加参数 average='micro'  （不能用ont-hot (形式三) ）
print("AUC = ", auc)

print(datetime.datetime.fromtimestamp(time()-time0).strftime("%M:%S:%f"))
print()

accuracy =  0.34378869170449855
f1 score =  0.18301489560142029
AUC =  0.7647821665020986
00:51:612632



In [8]:
result = bst.predict_proba(x_test_std)
df = pd.DataFrame(result)
df.to_csv("./results_0327/categorical_vggish_6pnn_20210327_prediction.csv")
df2 = pd.DataFrame(y_test)
df2.to_csv("./results_0327/categorical_vggish_6pnn_20210327_GT.csv")
print("save success!")

save success!
