In [None]:
import os
import numpy as np
from time import time
import datetime
import pandas as pd
import tqdm
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.metrics import roc_auc_score, recall_score, accuracy_score, f1_score
from sklearn.multiclass import OneVsRestClassifier, OneVsOneClassifier
from sklearn.preprocessing import label_binarize
from sklearn.model_selection import cross_val_score

In [None]:
# Load the HOG feature table from CSV.
filepath = 'Speaker_Visual_Hog_6PNN.csv'
data = pd.read_csv(filepath)

# Columns from position 6 onward are the features; column position 1 is the label.
x, y = data.iloc[:, 6:], data.iloc[:, 1]

# Split into training and test sets: test_size=0.2 holds out 20% for testing,
# and the fixed random seed keeps the split reproducible.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=324,
)

In [None]:
# Display the training feature matrix as this cell's output.
x_train

In [None]:
# Display the training labels as this cell's output.
y_train

In [None]:
# Reset the row index of every split to 0..n-1 (in place).
for split in (x_train, x_test, y_train, y_test):
    split.index = range(split.shape[0])

# Encode the labels: fit the LabelEncoder on the training labels only, mapping
# each class to an integer code 0..n_classes-1.
encoder = LabelEncoder().fit(y_train)
# LabelEncoder.transform already returns a 1-D integer ndarray, so no
# DataFrame round-trip / ravel() is needed to flatten the labels.
y_train = encoder.transform(y_train)
y_test = encoder.transform(y_test)
# The original cell also rebound `y` to the encoded training labels; keep that
# name available for any later cell that relies on it.
y = y_train.copy()

# Standardize the features. The scaler is fitted on the training set ONLY and
# then applied unchanged to the test set; re-fitting on the test set would
# scale train and test differently and leak test statistics into evaluation.
scaler = StandardScaler()
x_train_std = scaler.fit_transform(x_train)
x_test_std = scaler.transform(x_test)

# Start of the wall-clock timer (read again by the evaluation cell).
time0 = time()

# Basic SVM model (gamma = 0.04888888888888889, C = 1.13333333333)
# NOTE(review): the values above look like the result of an earlier tuning run - confirm.
# Multi-class classification is done by wrapping svm.SVC in OneVsRestClassifier.
# `degree` only affects the 'poly' kernel and is ignored by 'rbf'; it is kept
# so the estimator configuration stays identical.
clf = OneVsRestClassifier(
    SVC(
        kernel='rbf',
        degree=1,
        C=1,  # default
        cache_size=5000,
        probability=True,
        class_weight='balanced',
    )
)

# Hyper-parameter gamma: 10 values log-spaced (base 2) from 2**-10 to 2**1.
gamma_range = np.logspace(-10, 1, 10, base=2)
print(gamma_range)

# The "estimator__" prefix routes each parameter to the SVC wrapped by
# OneVsRestClassifier.
parameters = {
    "estimator__C": [1],
    "estimator__kernel": ["rbf"],
    "estimator__degree": [1],
    "estimator__gamma": gamma_range,
}

# Evaluation metric used to rank the grid-search candidates.
score = 'accuracy'

# 5-fold cross-validated grid search over gamma, using all CPU cores.
model_tunning = GridSearchCV(
    clf,
    param_grid=parameters,
    n_jobs=-1,
    cv=5,
    verbose=32,
    scoring=score,
)
model_tunning.fit(x_train_std, y_train)

In [None]:
# Report the best cross-validated score and the parameters that achieved it.
print(model_tunning.best_score_)
print(model_tunning.best_params_)
bst = model_tunning.best_estimator_

# The model was fitted on standardized features, so it must be evaluated on the
# standardized test set (x_test_std), not on the raw x_test.
y_test_prediction = bst.predict(x_test_std)
preds = y_test_prediction  # alias kept so the name `preds` remains defined

# Accuracy
accuracy = accuracy_score(y_test, y_test_prediction)
print("accuracy = ", accuracy)

# F1-score (weighted by class support)
f1 = f1_score(y_test, y_test_prediction, average="weighted")
print("f1 score = ", f1)

# AUC: multi-class AUC needs continuous scores rather than hard (one-hot)
# predictions, so the one-vs-rest decision values are compared against the
# binarized ground truth with micro-averaging.
# Taking the classes from the fitted model keeps the indicator columns aligned
# with the decision_function columns instead of hard-coding 6 classes.
y_test_binary = label_binarize(y_test, classes=bst.classes_)
result = bst.decision_function(x_test_std)
auc = roc_auc_score(y_test_binary, result, average='micro')
print("AUC = ", auc)

# Elapsed wall-clock time since time0, printed as MM:SS:microseconds.
# Computed arithmetically: a duration is not an epoch timestamp, and
# datetime.fromtimestamp() would apply the local time-zone offset to it.
elapsed = time() - time0
minutes, seconds = divmod(elapsed, 60)
microseconds = int((seconds % 1) * 1_000_000)
print(f"{int(minutes):02d}:{int(seconds):02d}:{microseconds:06d}")
print()

In [None]:
# Make sure the output directory exists; DataFrame.to_csv does not create
# missing parent directories and would raise otherwise.
os.makedirs("./results_0327", exist_ok=True)

# Save the per-class predicted probabilities for the standardized test set.
result = bst.predict_proba(x_test_std)
df = pd.DataFrame(result)
df.to_csv("./results_0327/categorical_hog_6pnn_20210327_prediction.csv")

# Save the encoded ground-truth test labels alongside the predictions.
df2 = pd.DataFrame(y_test)
df2.to_csv("./results_0327/categorical_hog_6pnn_20210327_GT.csv")
print("save success!")