In [1]:
import os
import numpy as np
from time import time
import datetime
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LinearRegression
from sklearn.svm import SVR
from sklearn.metrics import mean_squared_error
from math import sqrt
import pandas as pd

In [2]:
filepath ='/home/dell/Xinda/SVM/server/Audio/data_opensmile/eGeMAPs_Dominance.csv'
data = pd.read_csv(filepath)
x = data.iloc[:, :-1]  # 数据特征
y = data.iloc[:,-1]  # 标签

# 将数据划分为训练集和测试集，test_size=.3表示30%的测试集, 随机数种子, 保证可复现性
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=.3, random_state=420)

# 修正测试集和训练集的索引
for i in [x_train, x_test, y_train, y_test ]:
    i.index  = range(i.shape[0])

# 标准化
scaler_x = StandardScaler()
# scaler_y = StandardScaler()
x_train_std = scaler_x.fit_transform(x_train)
x_test_std = scaler_x.fit_transform(x_test)
# y_train = scaler_y.fit_transform(np.array(y_train).reshape([-1,1])).reshape(-1)
# y_test = scaler_y.fit_transform(np.array(y_test).reshape([-1,1])).reshape(-1)
print(x_train_std.shape)
print(y_train.shape)

(10244, 88)
(10244,)


In [3]:
def getPvar(vals, mean):
    N = len(vals)
    su = 0
    for i in range(len(vals)):
        su = su + ((vals[i]-mean)*(vals[i]-mean))
    pvar = (1/N) * su
    return pvar

def getMean(vals):
    su = 0
    for i in range(len(vals)):
        su = su + vals[i]
    mean = su/(len(vals))
    return mean

def getMeanofDiffs(xvals, yvals):
    su = 0
    for i in range(len(xvals)):
        su = su + ((xvals[i] - yvals[i])*(xvals[i] - yvals[i]))
    meanodiffs = su/(len(xvals))
    return meanodiffs

def getCCC(pvarfe,pvarexp,meanofdiff,meanfe,meanexp):
    bottom = pvarfe + pvarexp + ((meanfe - meanexp)*(meanfe - meanexp))
    answer = 1 - (meanofdiff / bottom)
    return answer

In [4]:
RMSE_list = []
Spearman_list = []
CCC_list = []
clf = SVR(kernel = 'rbf', gamma = 0.028933584758977834, cache_size=5000)
clf.fit(x_train_std, y_train)
result = clf.predict(x_test_std)
# accuracy
rmse = sqrt(mean_squared_error(y_test, result))
print("(1) Evaluation - RMSE = ", rmse)

# Spearman
data = {'result':result, 'y_test':y_test}
df = pd.DataFrame(data, columns=['result','y_test'])
spearman = df.corr(method="spearman" )
print("(2) Evaluation - Spearmman = \n", spearman)

# CCC
prediction = result
ground = y_test
meanfe = getMean(ground)
meanexp = getMean(prediction)
meanofdiff = getMeanofDiffs(ground,prediction)
pvarfe = getPvar(ground, meanfe)
pvarexp = getPvar(prediction, meanexp)
ccc = getCCC(pvarfe,pvarexp,meanofdiff,meanfe,meanexp)
print('(3) Evaluation - CCC =  ' + str(ccc))

df = pd.DataFrame(data={"opensmile_prediction_d": prediction, "opensmile_groundtruth_d": y_test.values.tolist()})
df.to_csv("eval_opensmile_dominance.csv")
print("save success!")

(1) Evaluation - RMSE =  0.23301819253262135
(2) Evaluation - Spearmman = 
          result   y_test
result  1.00000  0.34908
y_test  0.34908  1.00000
(3) Evaluation - CCC =  0.41874196779468087
save success!


### Gamma

In [None]:
times_all = time()
# 调试两个参数 gamma & C  ，默认情况下C为1，通常来说这都是一个合理的参数。
RMSE_list = []
Spearman_list = []
CCC_list = []

gamma_range = np.logspace(-10, 1, 10, base=2) # 返回13个数字，底是2
print("gamma_rang:", gamma_range)

for gamma_item in gamma_range:
    count=1
    time0 = time()
    print("Start-{0}, gamma={1}".format(count, gamma_item))
    count = count+1
    clf = SVR(kernel = 'rbf', gamma = gamma_item, cache_size=5000)
    clf.fit(x_train_std, y_train)
    
    result = clf.predict(x_test_std)
    # accuracy
    rmse = sqrt(mean_squared_error(y_test, result))
    RMSE_list.append(rmse)
    print("(1) Evaluation - RMSE = ", rmse)
    
    # Spearman
    data = {'result':result, 'y_test':y_test}
    df = pd.DataFrame(data, columns=['result','y_test'])
    spearman = df.corr(method="spearman" )
    print("(2) Evaluation - Spearmman = \n", spearman)
    Spearman_list.append(spearman)
    
    # CCC
    prediction = result
    ground = y_test
    meanfe = getMean(ground)
    meanexp = getMean(prediction)
    meanofdiff = getMeanofDiffs(ground,prediction)
    pvarfe = getPvar(ground, meanfe)
    pvarexp = getPvar(prediction, meanexp)
    ccc = getCCC(pvarfe,pvarexp,meanofdiff,meanfe,meanexp)
    CCC_list.append(ccc)
    print('(3) Evaluation - CCC =  ' + str(ccc))
    print(datetime.datetime.fromtimestamp(time()-time0).strftime("%M:%S:%f"))
    print()
    print()

print("Gamma = ", gamma_range[CCC_list.index(max(CCC_list))])
print("RMSE = ", RMSE_list[CCC_list.index(max(CCC_list))])
print("Spearman = ", Spearman_list[CCC_list.index(max(CCC_list))])
print("CCC = ", max(CCC_list) ) 
print(datetime.datetime.fromtimestamp(time()-times_all).strftime("%M:%S:%f"))
print("Test over")

gamma_rang: [9.76562500e-04 2.27837703e-03 5.31558594e-03 1.24015707e-02
 2.89335848e-02 6.75037337e-02 1.57490131e-01 3.67433623e-01
 8.57243983e-01 2.00000000e+00]
Start-1, gamma=0.0009765625
(1) Evaluation - RMSE =  0.2663717707941254
(2) Evaluation - Spearmman = 
           result    y_test
result  1.000000  0.149081
y_test  0.149081  1.000000
(3) Evaluation - CCC =  0.004494714508979203
01:40:372568


Start-1, gamma=0.0022783770304221013
(1) Evaluation - RMSE =  0.26438796797029884
(2) Evaluation - Spearmman = 
           result    y_test
result  1.000000  0.145784
y_test  0.145784  1.000000
(3) Evaluation - CCC =  0.020203526205654576
01:36:689892


Start-1, gamma=0.005315585938181161


In [None]:
0