In [1]:
import os
import numpy as np
from time import time
import datetime
import pandas as pd
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.metrics import roc_auc_score, recall_score, accuracy_score, f1_score
from sklearn.multiclass import OneVsRestClassifier, OneVsOneClassifier
from sklearn.preprocessing import label_binarize

## (1) load data

In [2]:
filepath ='/home/dell/Xinda/SVM/server/Visual/1_HOG/visual_part2_hog_pca.95_6PNN.csv'
data = pd.read_csv(filepath)
x = data.iloc[:, :-1]  # 数据特征
y = data.iloc[:,-1]  # 标签

# 将数据划分为训练集和测试集，test_size=.3表示30%的测试集, 随机数种子, 保证可复现性
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=.3, random_state=423)
x_train

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,970,971,972,973,974,975,976,977,978,979
11460,-3.610809,0.181031,2.727854,-0.941375,0.672853,1.070195,0.235489,-0.543350,-0.829860,-0.318676,...,0.049048,-0.068235,-0.064671,0.067637,-0.012281,-0.067218,0.074663,0.002308,0.072913,-0.013657
9610,-1.162428,-2.319746,2.544356,0.793281,1.347436,-1.824822,0.855603,1.427280,-0.349576,0.365939,...,0.036044,-0.077791,0.026761,0.016202,-0.008609,-0.040978,-0.002687,-0.006296,0.022981,0.018608
2186,2.606377,1.070886,-3.198655,-2.085576,-0.435705,-0.566083,-1.007178,1.235195,-2.404571,1.025328,...,-0.017286,-0.101345,-0.050659,-0.050403,-0.063824,-0.042028,-0.090006,0.035064,0.023151,0.101714
9744,4.561780,-1.153944,0.305514,1.241234,-0.017566,-1.480878,-1.259760,0.253054,-0.589190,0.983693,...,-0.058310,0.027396,-0.040016,-0.052323,-0.004030,-0.040622,-0.013122,-0.021193,0.009634,0.005165
10576,2.124794,0.142205,-0.080546,-2.063062,0.866341,2.338448,-0.582162,1.297541,-2.168625,-0.867786,...,-0.025800,-0.021690,-0.024977,0.106156,0.058931,-0.004661,0.010015,-0.013832,-0.024116,0.068163
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6170,-5.219104,5.048948,1.344968,1.433352,0.382586,0.138933,-1.248825,-0.492654,0.576605,0.352858,...,-0.025263,0.035595,0.072347,-0.022745,0.080069,-0.045385,0.056829,0.019796,0.009451,-0.021349
10176,-3.680493,-0.978627,0.705475,2.205018,0.192231,-0.635897,1.776641,-1.439958,-0.275041,1.140577,...,-0.000683,0.021049,-0.001623,0.050642,-0.069458,0.064935,0.031490,-0.022713,-0.041155,-0.115849
3482,0.001629,-2.309934,-3.296829,0.544592,-3.328425,-1.253247,-0.654076,1.366200,1.337794,-0.973880,...,-0.022813,-0.007143,-0.042940,-0.014514,0.026099,-0.048444,0.027770,0.078176,-0.032505,0.020063
7451,2.177468,-2.193183,1.687635,1.620445,1.170794,-1.002194,-0.602838,-1.293128,-0.169717,-0.094562,...,0.023281,-0.013745,-0.014041,0.078828,0.015814,0.082901,0.018532,0.001661,0.021490,-0.059198


In [3]:
# 修正测试集和训练集的索引
for i in [x_train, x_test, y_train, y_test ]:
    i.index  = range(i.shape[0])

# Y 将标签编码
encoder = LabelEncoder().fit(y_train) # #训练LabelEncoder, 把y_train中的类别编码为0，1，2，3，4，5
y = encoder.transform(y_train)
print("class = ", y)
y_train = pd.DataFrame(encoder.transform(y_train)) # 使用训练好的LabelEncoder对源数据进行编码
y_test = pd.DataFrame(encoder.transform(y_test))
print("y_test = ", y_test[0:10])

# 标签降维度
y_train = y_train.iloc[:,0].ravel()
y_test = y_test.iloc[:,0].ravel()
print("y.shape=", y_train.shape)

# X标准化
scaler = StandardScaler()
x_train_std = scaler.fit_transform(x_train)  # 标准化
x_test_std = scaler.fit_transform(x_test)

class =  [2 5 0 ... 2 0 0]
y_test =     0
0  1
1  0
2  0
3  3
4  0
5  4
6  5
7  4
8  3
9  4
y.shape= (8490,)


In [7]:
time0 = time()
clf = OneVsRestClassifier(
        SVC(kernel = 'rbf', 
        gamma =  0.0009765625,
        degree=1, 
        cache_size=5000,
        probability=True,
        class_weight='balanced'))
clf.fit(x_train_std, y_train)
    
y_test_prediction = clf.predict(x_test_std)
# accuracy
accuracy = accuracy_score(y_test,y_test_prediction)
print("accuracy = ", accuracy)
# F1-score
f1 = f1_score(y_test,y_test_prediction, average="weighted")
print("f1 score = ", f1)
    
# AUC
y_test_binary = label_binarize(y_test, classes=list(range(6))) # 转化为one-hot
result = clf.decision_function(x_test_std)
auc = roc_auc_score(y_test_binary, result, average = 'micro') # 多类分类下，要用概率值（形式二） ，加参数 average='micro'  （不能用ont-hot (形式三) ）
print("AUC = ", auc)

print(datetime.datetime.fromtimestamp(time()-time0).strftime("%M:%S:%f"))
print()

accuracy =  0.8147842813959879
f1 score =  0.8105617423661162
AUC =  0.9647957182128422
19:49:739191



In [8]:
clf.predict_proba(x_test_std)  # 输出分类zhidao概率

array([[0.13693332, 0.58321393, 0.0215648 , 0.07441558, 0.10204588,
        0.0818265 ],
       [0.31981876, 0.24446412, 0.02124643, 0.03131904, 0.19812238,
        0.18502926],
       [0.41383896, 0.00881336, 0.007748  , 0.00866418, 0.09923633,
        0.46169917],
       ...,
       [0.03432759, 0.03021851, 0.01662373, 0.01772152, 0.88344551,
        0.01766313],
       [0.02009031, 0.19728632, 0.73783727, 0.01192205, 0.03046829,
        0.00239576],
       [0.04712273, 0.03548223, 0.08156636, 0.78796752, 0.00629756,
        0.04156361]])

In [9]:
result = clf.predict_proba(x_test_std)

In [11]:
df = pd.DataFrame(result)
df.to_csv("categorical03_hog_6pnn.csv")

In [None]:
clf.predict_log_proba(x_test_std)  # 输出分类概率的对数

In [None]:
df = pd.DataFrame(data={"hog_prediction_p": y_test_prediction, "hog_groundtruth_p": y_test.tolist()})
df.to_csv("eval_hog_6pnn.csv")
print("save success!")

### Gamma-1

In [None]:
import tqdm
times_all = time()
# 调试两个参数 gamma & C  ，默认情况下C为1，通常来说这都是一个合理的参数。
accuracy_list = []
f1_list = []
auc_list = []

gamma_range = np.logspace(-10, 1, 10, base=2) # 返回13个数字，底是2
print("gamma_rang:", gamma_range)

for gamma_item in gamma_range:
    count=1
    time0 = time()
    print("Start-{0}, gamma={1}".format(count, gamma_item))
    count = count+1
    clf = OneVsRestClassifier(
        SVC(kernel = 'rbf', 
        gamma = gamma_item,
        degree=1, 
        cache_size=5000, 
        class_weight='balanced'))
    clf.fit(x_train_std, y_train)
    
    y_test_prediction = clf.predict(x_test_std)
    # accuracy
    accuracy = accuracy_score(y_test,y_test_prediction)
    accuracy_list.append(accuracy)
    print("accuracy = ", accuracy)
    # F1-score
    f1 = f1_score(y_test,y_test_prediction, average="weighted")
    print("f1 score = ", f1)
    f1_list.append(f1)
    # AUC
    y_test_binary = label_binarize(y_test, classes=list(range(6))) # 转化为one-hot
    result = clf.decision_function(x_test_std)
    auc = roc_auc_score(y_test_binary, result, average = 'micro') # 多类分类下，要用概率值（形式二） ，加参数 average='micro'  （不能用ont-hot (形式三) ）
    print("AUC = ", auc)
    auc_list.append(auc)
    print(datetime.datetime.fromtimestamp(time()-time0).strftime("%M:%S:%f"))
    print()
    print()

print(max(accuracy_list), gamma_range[accuracy_list.index(max(accuracy_list))])
print("F1-score = ", f1_list[accuracy_list.index(max(accuracy_list))])
print("AUC-score = ", auc_list[accuracy_list.index(max(accuracy_list))]) 
print(datetime.datetime.fromtimestamp(time()-times_all).strftime("%M:%S:%f"))

gamma_rang: [9.76562500e-04 2.27837703e-03 5.31558594e-03 1.24015707e-02
 2.89335848e-02 6.75037337e-02 1.57490131e-01 3.67433623e-01
 8.57243983e-01 2.00000000e+00]
Start-1, gamma=0.0009765625
accuracy =  0.8147842813959879
f1 score =  0.8105617423661162
AUC =  0.9647957182128422
40:40:513247


Start-1, gamma=0.0022783770304221013
accuracy =  0.8037922506183017
f1 score =  0.7944480785258063
AUC =  0.9574171929528065
01:50:716114


Start-1, gamma=0.005315585938181161
