In [1]:
import os
import numpy as np
from time import time
import datetime
import pandas as pd
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.metrics import roc_auc_score, recall_score, accuracy_score, f1_score
from sklearn.multiclass import OneVsRestClassifier, OneVsOneClassifier
from sklearn.preprocessing import label_binarize

filepath ='/home/dell/Xinda/SVM/server/Audio/data_vggish/vggish_6pnn_add0.csv'
data = pd.read_csv(filepath)
x = data.iloc[:, :-1]  # 数据特征
y = data.iloc[:,-1]  # 标签

# 将数据划分为训练集和测试集，test_size=.3表示30%的测试集, 随机数种子, 保证可复现性
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=.3, random_state=423)
x_train

# 修正测试集和训练集的索引
for i in [x_train, x_test, y_train, y_test ]:
    i.index  = range(i.shape[0])

# Y 将标签编码
encoder = LabelEncoder().fit(y_train) # #训练LabelEncoder, 把y_train中的类别编码为0，1，2，3，4，5
y = encoder.transform(y_train)
print("class = ", y)
y_train = pd.DataFrame(encoder.transform(y_train)) # 使用训练好的LabelEncoder对源数据进行编码
y_test = pd.DataFrame(encoder.transform(y_test))
print("y_test = ", y_test[0:10])

# 标签降维度
y_train = y_train.iloc[:,0].ravel()
y_test = y_test.iloc[:,0].ravel()
print("y.shape=", y_train.shape)

# X标准化
scaler = StandardScaler()
x_train_std = scaler.fit_transform(x_train)  # 标准化
x_test_std = scaler.fit_transform(x_test)

class =  [2 5 0 ... 2 0 0]
y_test =     0
0  1
1  0
2  0
3  3
4  0
5  4
6  5
7  4
8  3
9  4
y.shape= (8490,)


## Gamma

In [2]:
time0 = time()
clf = OneVsRestClassifier(
        SVC(kernel = 'rbf', 
        gamma =  0.005315585938181161,
            C = 1.13333333333,
        degree=1, 
        probability=True,
        cache_size=5000, 
        class_weight='balanced'))
clf.fit(x_train_std, y_train)
    
y_test_prediction = clf.predict(x_test_std)
# accuracy
accuracy = accuracy_score(y_test,y_test_prediction)
print("accuracy = ", accuracy)
# F1-score
f1 = f1_score(y_test,y_test_prediction, average="weighted")
print("f1 score = ", f1)
    
# AUC
y_test_binary = label_binarize(y_test, classes=list(range(6))) # 转化为one-hot
result = clf.decision_function(x_test_std)
auc = roc_auc_score(y_test_binary, result, average = 'micro') # 多类分类下，要用概率值（形式二） ，加参数 average='micro'  （不能用ont-hot (形式三) ）
print("AUC = ", auc)

print(datetime.datetime.fromtimestamp(time()-time0).strftime("%M:%S:%f"))
print()

accuracy =  0.4256663918658972
f1 score =  0.4036103678304959
AUC =  0.7582464735600354
07:46:672626



In [3]:
result = clf.predict_proba(x_test_std)
df = pd.DataFrame(result)
df.to_csv("categorical03_vggish_6pnn.csv")
print("save success!")

save success!


In [None]:
time0 = time()
clf = OneVsRestClassifier(
        SVC(kernel = 'rbf', 
        gamma =  0.005315585938181161,
            C = 1.2599210498948,
        degree=1, 
        probability=True,
        cache_size=5000, 
        class_weight='balanced'))
clf.fit(x_train_std, y_train)
    
y_test_prediction = clf.predict(x_test_std)
# accuracy
accuracy = accuracy_score(y_test,y_test_prediction)
print("accuracy = ", accuracy)
# F1-score
f1 = f1_score(y_test,y_test_prediction, average="weighted")
print("f1 score = ", f1)
    
# AUC
y_test_binary = label_binarize(y_test, classes=list(range(6))) # 转化为one-hot
result = clf.decision_function(x_test_std)
auc = roc_auc_score(y_test_binary, result, average = 'micro') # 多类分类下，要用概率值（形式二） ，加参数 average='micro'  （不能用ont-hot (形式三) ）
print("AUC = ", auc)

print(datetime.datetime.fromtimestamp(time()-time0).strftime("%M:%S:%f"))
print()

In [None]:
time0 = time()
clf = OneVsRestClassifier(
        SVC(kernel = 'rbf', 
        gamma =  0.005315585938181161,
            C = 1.0,
        degree=1, 
        probability=True,
        cache_size=5000, 
        class_weight='balanced'))
clf.fit(x_train_std, y_train)
    
y_test_prediction = clf.predict(x_test_std)
# accuracy
accuracy = accuracy_score(y_test,y_test_prediction)
print("accuracy = ", accuracy)
# F1-score
f1 = f1_score(y_test,y_test_prediction, average="weighted")
print("f1 score = ", f1)
    
# AUC
y_test_binary = label_binarize(y_test, classes=list(range(6))) # 转化为one-hot
result = clf.decision_function(x_test_std)
auc = roc_auc_score(y_test_binary, result, average = 'micro') # 多类分类下，要用概率值（形式二） ，加参数 average='micro'  （不能用ont-hot (形式三) ）
print("AUC = ", auc)

print(datetime.datetime.fromtimestamp(time()-time0).strftime("%M:%S:%f"))
print()

In [None]:
import tqdm
times_all = time()
# 调试两个参数 gamma & C  ，默认情况下C为1，通常来说这都是一个合理的参数。
accuracy_list = []
f1_list = []
auc_list = []

gamma_range = np.logspace(-10, 1, 10, base=2) # 返回13个数字，底是2
print("gamma_rang:", gamma_range)

for gamma_item in gamma_range:
    count=1
    time0 = time()
    print("Start-{0}, gamma={1}".format(count, gamma_item))
    count = count+1
    clf = OneVsRestClassifier(
        SVC(kernel = 'rbf', 
        gamma = gamma_item,
        degree=1, 
        cache_size=5000, 
        class_weight='balanced'))
    clf.fit(x_train_std, y_train)
    
    y_test_prediction = clf.predict(x_test_std)
    # accuracy
    accuracy = accuracy_score(y_test,y_test_prediction)
    accuracy_list.append(accuracy)
    print("accuracy = ", accuracy)
    # F1-score
    f1 = f1_score(y_test,y_test_prediction, average="weighted")
    print("f1 score = ", f1)
    f1_list.append(f1)
    # AUC
    y_test_binary = label_binarize(y_test, classes=list(range(6))) # 转化为one-hot
    result = clf.decision_function(x_test_std)
    auc = roc_auc_score(y_test_binary, result, average = 'micro') # 多类分类下，要用概率值（形式二） ，加参数 average='micro'  （不能用ont-hot (形式三) ）
    print("AUC = ", auc)
    auc_list.append(auc)
    print(datetime.datetime.fromtimestamp(time()-time0).strftime("%M:%S:%f"))
    print()
    print()

print(max(accuracy_list), gamma_range[accuracy_list.index(max(accuracy_list))])
print("F1-score = ", f1_list[accuracy_list.index(max(accuracy_list))])
print("AUC-score = ", auc_list[accuracy_list.index(max(accuracy_list))]) 
print(datetime.datetime.fromtimestamp(time()-times_all).strftime("%M:%S:%f"))

In [None]:
import tqdm
times_all = time()
# 调试两个参数 gamma & C  ，默认情况下C为1，通常来说这都是一个合理的参数。
accuracy_list = []
f1_list = []
auc_list = []

gamma_range = np.logspace(-10, 1, 10, base=2) # 返回13个数字，底是2
print("gamma_rang:", gamma_range)

for gamma_item in gamma_range:
    count=1
    time0 = time()
    print("Start-{0}, gamma={1}".format(count, gamma_item))
    count = count+1
    clf = OneVsRestClassifier(
        SVC(kernel = 'rbf', 
        gamma = gamma_item,
        degree=1, 
        cache_size=5000, 
        class_weight='balanced'))
    clf.fit(x_train_std, y_train)
    
    y_test_prediction = clf.predict(x_test_std)
    # accuracy
    accuracy = accuracy_score(y_test,y_test_prediction)
    accuracy_list.append(accuracy)
    print("accuracy = ", accuracy)
    # F1-score
    f1 = f1_score(y_test,y_test_prediction, average="weighted")
    print("f1 score = ", f1)
    f1_list.append(f1)
    # AUC
    y_test_binary = label_binarize(y_test, classes=list(range(6))) # 转化为one-hot
    result = clf.decision_function(x_test_std)
    auc = roc_auc_score(y_test_binary, result, average = 'micro') # 多类分类下，要用概率值（形式二） ，加参数 average='micro'  （不能用ont-hot (形式三) ）
    print("AUC = ", auc)
    auc_list.append(auc)
    print(datetime.datetime.fromtimestamp(time()-time0).strftime("%M:%S:%f"))
    print()
    print()

print(max(accuracy_list), gamma_range[accuracy_list.index(max(accuracy_list))])
print("F1-score = ", f1_list[accuracy_list.index(max(accuracy_list))])
print("AUC-score = ", auc_list[accuracy_list.index(max(accuracy_list))]) 
print(datetime.datetime.fromtimestamp(time()-times_all).strftime("%M:%S:%f"))

gamma_rang: [9.76562500e-04 2.27837703e-03 5.31558594e-03 1.24015707e-02
 2.89335848e-02 6.75037337e-02 1.57490131e-01 3.67433623e-01
 8.57243983e-01 2.00000000e+00]
Start-1, gamma=0.0009765625
accuracy =  0.3437757625721352
f1 score =  0.3503221732868482
AUC =  0.6989527289060581
22:32:761230


Start-1, gamma=0.0022783770304221013
accuracy =  0.39433910414949164
f1 score =  0.3884344559175234
AUC =  0.7310131811485313
21:03:972387


Start-1, gamma=0.005315585938181161
