In [2]:
import os
import numpy as np
from time import time
import datetime
import pandas as pd
import tqdm

from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.metrics import roc_auc_score, recall_score, accuracy_score, f1_score
from sklearn.multiclass import OneVsRestClassifier, OneVsOneClassifier
from sklearn.preprocessing import label_binarize
from sklearn.model_selection import cross_val_score

import logging
logger = logging.getLogger(__name__)
logger.setLevel(level = logging.INFO)
handler = logging.FileHandler("OpenSMILE_Classificaiton_log.txt")
handler.setLevel(logging.INFO)
formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
handler.setFormatter(formatter)
logger.addHandler(handler)

# 一、Load Data

In [None]:
for i in range(1,10):
    
    # ----------
    # load data
    # ----------
    train_x = pd.read_csv(f'./CV_Features/ClassificationFeatures/Train_CV_{i}.csv').iloc[:,8:]
    train_y = pd.read_csv(f'./CV_Features/ClassificationFeatures/Train_CV_{i}.csv').iloc[:,3]
    validation_x = pd.read_csv(f'./CV_Features/ClassificationFeatures/Validation_CV_{i}.csv').iloc[:,8:]
    validation_y = pd.read_csv(f'./CV_Features/ClassificationFeatures/Validation_CV_{i}.csv').iloc[:,3]
    test_x = pd.read_csv(f'./CV_Features/ClassificationFeatures/Test_CV_{i}.csv').iloc[:,8:]
    test_y = pd.read_csv(f'./CV_Features/ClassificationFeatures/Test_CV_{i}.csv').iloc[:,3]
    
    
    # Y 将标签编码
    encoder = LabelEncoder().fit(train_y) # #训练LabelEncoder, 把y_train中的类别编码为0，1，2，3，4，5
    y = encoder.transform(train_y)
    y_train = pd.DataFrame(encoder.transform(train_y)) # 使用训练好的LabelEncoder对源数据进行编码
    y_valid = pd.DataFrame(encoder.transform(validation_y))
    y_test = pd.DataFrame(encoder.transform(test_y))

    # 标签降维度
    y_train = y_train.iloc[:,0].ravel()
    y_valid = y_valid.iloc[:,0].ravel()
    y_test = y_test.iloc[:,0].ravel()

    # X标准化
    scaler = StandardScaler()
    x_train_std = scaler.fit_transform(train_x)
    x_valid_std = scaler.fit_transform(validation_x)
    x_test_std = scaler.fit_transform(test_x)
    
    # Spend Time
    time0 = time()
    
    # result 
    accuracy_list = []
    f1_list = []
    auc_list = []

    
    # 超参数 Gamma
    gamma_range = np.logspace(-10, 1, 10, base=2) # 返回13个数字，底是2
    print(gamma_range)
    logger.info(gamma_range)
    for idx, gamma in enumerate(gamma_range):
        # sklearn通过OneVsRestClassifier实现svm.SVC的多分类
        
        # ------------
        # Training 
        # ------------
        clf = OneVsRestClassifier(
            SVC(kernel = 'rbf',  # 
                gamma = gamma,
                C=1, # default
                degree=1,
                cache_size=5000, 
                probability=True,
                class_weight='balanced'))
        print(f">>>>>>>Start Trainng {idx}/{len(gamma_range)}>>>>>>>")
        logger.info(f">>>>>>>Start Trainng {idx}/{len(gamma_range)}>>>>>>>")
        clf.fit(x_train_std,y_train)
        print(">>>>>>>Over Trainng>>>>>>>")
        logger.info(">>>>>>>Over Trainng>>>>>>>")
    
        # ------------
        # Fine-tuning
        # ------------
        y_valid_prediction = clf.predict(x_valid_std)
        
        # accuracy
        accuracy = accuracy_score(y_valid,y_valid_prediction)
        accuracy_list.append(accuracy)
        print("accuracy = ", accuracy)
        logger.info(f"accuracy = {accuracy}")
        # F1-score
        f1 = f1_score(y_valid,y_valid_prediction, average="weighted")
        f1_list.append(f1)
        print("f1 score = ", f1)
        logger.info(f"f1 score =  {f1}")

        # AUC
        y_valid_binary = label_binarize(y_valid, classes=list(range(6))) # 转化为one-hot
        result = clf.decision_function(x_valid_std)
        auc = roc_auc_score(y_valid_binary, result, average = 'micro') # 多类分类下，要用概率值（形式二） ，加参数 average='micro'  （不能用ont-hot (形式三) ）
        auc_list.append(auc)
        print("AUC = ", auc)
        logger.info(f"AUC =   {auc}")
        print(datetime.datetime.fromtimestamp(time()-time0).strftime("%M:%S:%f"))
    
    print("Gamma = ", gamma_range[accuracy_list.index(max(accuracy_list))])
    logger.info(f"Beat Gamma =    {gamma_range[accuracy_list.index(max(accuracy_list))]}")
    print("Accuracy = ", max(accuracy_list))
    logger.info(f"Best Acc. = {max(accuracy_list)}")
    print("F1-score = ", f1_list[accuracy_list.index(max(accuracy_list))])
    logger.info(f"Best F1-Score. = {f1_list[accuracy_list.index(max(accuracy_list))]}")
    print("AUC-score = ", auc_list[accuracy_list.index(max(accuracy_list))]) 
    logger.info(f"Best AUC. = {auc_list[accuracy_list.index(max(accuracy_list))]}")
    break

[9.76562500e-04 2.27837703e-03 5.31558594e-03 1.24015707e-02
 2.89335848e-02 6.75037337e-02 1.57490131e-01 3.67433623e-01
 8.57243983e-01 2.00000000e+00]
>>>>>>>Start Trainng 0/10>>>>>>>


In [2]:
# features and GT
filepath ='Speaker_Audio_opensmile_6pnn.csv'
data = pd.read_csv(filepath)

# features
x = data.iloc[:, 7:]  # 数据特征

#GT
y = data.iloc[:,2]  # 标签

# 将数据划分为训练集和测试集，test_size=.2表示20%的测试集, 随机数种子, 保证可复现性
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=.2, random_state=324)

In [3]:
x_train

Unnamed: 0,opensmile_1,opensmile_2,opensmile_3,opensmile_4,opensmile_5,opensmile_6,opensmile_7,opensmile_8,opensmile_9,opensmile_10,...,opensmile_79,opensmile_80,opensmile_81,opensmile_82,opensmile_83,opensmile_84,opensmile_85,opensmile_86,opensmile_87,opensmile_88
720,32.55397,0.082655,31.23685,32.61992,34.61238,3.375534,44.436700,51.57228,21.34766,7.153147,...,0.005480,-0.015594,0.036465,5.627706,3.539823,0.211250,0.155035,0.051250,0.025709,-48.77987
6402,32.00434,0.111577,28.55338,32.41410,34.32557,5.772186,85.421750,104.56340,25.72923,17.601850,...,-0.068029,-0.005903,0.062938,4.285715,0.490196,1.820000,0.000000,0.200000,0.000000,-34.28742
3788,21.16741,0.146642,18.63361,20.49452,24.58158,5.947968,0.674986,0.00000,0.00000,0.000000,...,-0.017042,-0.001552,0.021021,1.136364,0.459770,0.045000,0.005000,1.045000,1.298855,-60.95603
5714,36.61082,0.106690,35.03804,36.70648,38.29864,3.260601,85.312820,84.90221,74.71027,67.060180,...,-0.010725,-0.005818,0.020776,1.923077,2.352941,0.168333,0.097197,0.470000,0.509706,-48.36210
7868,32.88428,0.160811,27.70544,35.47149,36.35062,8.645187,326.178700,453.37200,47.10888,28.570710,...,0.001275,-0.016488,0.030348,4.687500,2.229300,0.320000,0.165616,0.125000,0.100457,-47.46463
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6416,27.28275,0.205768,23.50034,26.39191,31.78178,8.281441,311.029500,244.23690,94.43201,42.076830,...,-0.050569,-0.020428,0.125648,3.765691,2.586207,0.288333,0.311952,0.078333,0.034359,-44.00177
600,35.15535,0.166328,31.01224,37.14463,38.70065,7.688412,214.615600,200.04930,79.19790,41.454620,...,0.008144,-0.007408,0.020975,2.603037,1.758242,0.161250,0.070078,0.390000,0.643078,-49.10515
9723,27.50911,0.108251,25.68822,27.56144,29.67979,3.991573,185.285600,231.66320,50.06115,40.773690,...,-0.003432,-0.002454,0.029866,2.985075,1.727862,0.371250,0.222735,0.185000,0.263201,-43.26650
908,35.91882,0.166875,34.19972,38.27375,39.61871,5.418987,213.023800,209.55730,99.47053,111.068300,...,0.001779,-0.017245,0.072958,4.207120,2.631579,0.288750,0.236296,0.091667,0.056691,-40.64891


In [4]:
y_train

720      Confused
6402      Relaxed
3788        Happy
5714      Anxious
7868      Anxious
           ...   
6416        Happy
600        Neural
9723        Happy
908       Anxious
10905     Anxious
Name: Categorical, Length: 9691, dtype: object

In [5]:
# 修正测试集和训练集的索引
for i in [x_train, x_test, y_train, y_test ]:
    i.index  = range(i.shape[0])

# Y 将标签编码
encoder = LabelEncoder().fit(y_train) # #训练LabelEncoder, 把y_train中的类别编码为0，1，2，3，4，5
y = encoder.transform(y_train)
y_train = pd.DataFrame(encoder.transform(y_train)) # 使用训练好的LabelEncoder对源数据进行编码
y_test = pd.DataFrame(encoder.transform(y_test))

# 标签降维度
y_train = y_train.iloc[:,0].ravel()
y_test = y_test.iloc[:,0].ravel()

# X标准化
scaler = StandardScaler()
x_train_std = scaler.fit_transform(x_train)  # 标准化
x_test_std = scaler.fit_transform(x_test)

# Gamma

In [6]:
# Spend Time
time0 = time()

# Basic SVM Model (gamma =  0.04888888888888889,  C = 1.13333333333,)      
# sklearn通过OneVsRestClassifier实现svm.SVC的多分类
clf = OneVsRestClassifier(SVC(kernel = 'rbf', 
        degree=1,
        C=1, # default
        cache_size=5000, 
        probability=True,
        class_weight='balanced'))


# 超参数 Gamma
gamma_range = np.logspace(-10, 1, 10, base=2) # 返回13个数字，底是2
print(gamma_range)

parameters = {
"estimator__C": [1],
"estimator__kernel": ["rbf"],
"estimator__degree":[1],
"estimator__gamma":gamma_range,
}

# evaluation Metrics
score = 'accuracy'

# Grid Search params
model_tunning = GridSearchCV(clf, 
                             param_grid=parameters,
                             n_jobs=-1, 
                             cv=5,
                             verbose = 32,
                             scoring=score)
model_tunning.fit(x_train_std, y_train)

[9.76562500e-04 2.27837703e-03 5.31558594e-03 1.24015707e-02
 2.89335848e-02 6.75037337e-02 1.57490131e-01 3.67433623e-01
 8.57243983e-01 2.00000000e+00]
Fitting 5 folds for each of 10 candidates, totalling 50 fits


GridSearchCV(cv=5,
             estimator=OneVsRestClassifier(estimator=SVC(C=1, cache_size=5000,
                                                         class_weight='balanced',
                                                         degree=1,
                                                         probability=True)),
             n_jobs=-1,
             param_grid={'estimator__C': [1], 'estimator__degree': [1],
                         'estimator__gamma': array([9.76562500e-04, 2.27837703e-03, 5.31558594e-03, 1.24015707e-02,
       2.89335848e-02, 6.75037337e-02, 1.57490131e-01, 3.67433623e-01,
       8.57243983e-01, 2.00000000e+00]),
                         'estimator__kernel': ['rbf']},
             scoring='accuracy', verbose=32)

In [7]:
bst = model_tunning.best_estimator_
preds = bst.predict(x_test)
y_test_prediction = bst.predict(x_test)
# accuracy
accuracy = accuracy_score(y_test,y_test_prediction)
print("accuracy = ", accuracy)
# F1-score
f1 = f1_score(y_test,y_test_prediction, average="weighted")
print("f1 score = ", f1)
    
# AUC
y_test_binary = label_binarize(y_test, classes=list(range(6))) # 转化为one-hot
result = bst.decision_function(x_test_std)
auc = roc_auc_score(y_test_binary, result, average = 'micro') # 多类分类下，要用概率值（形式二） ，加参数 average='micro'  （不能用ont-hot (形式三) ）
print("AUC = ", auc)

print(datetime.datetime.fromtimestamp(time()-time0).strftime("%M:%S:%f"))
print()

accuracy =  0.34667767230705737
f1 score =  0.1784917221807712
AUC =  0.8291496967515702
44:37:603033



In [8]:
result = bst.predict_proba(x_test_std)
df = pd.DataFrame(result)
df.to_csv("./results_0327/categorical_opensmile_6pnn_20210327_prediction.csv")
df2 = pd.DataFrame(y_test)
df2.to_csv("./results_0327/categorical_opensmile_6pnn_20210327_GT.csv")
print("save success!")

save success!
