In [9]:
import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.svm import LinearSVC
from sklearn.neighbors import KNeighborsClassifier
from xgboost.sklearn import XGBClassifier
from lightgbm.sklearn import LGBMClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

In [10]:
data = pd.read_csv('./voice.csv')
data.head()

Unnamed: 0,meanfreq,sd,median,Q25,Q75,IQR,skew,kurt,sp.ent,sfm,...,centroid,meanfun,minfun,maxfun,meandom,mindom,maxdom,dfrange,modindx,label
0,0.059781,0.064241,0.032027,0.015071,0.090193,0.075122,12.863462,274.402906,0.893369,0.491918,...,0.059781,0.084279,0.015702,0.275862,0.007812,0.007812,0.007812,0.0,0.0,male
1,0.066009,0.06731,0.040229,0.019414,0.092666,0.073252,22.423285,634.613855,0.892193,0.513724,...,0.066009,0.107937,0.015826,0.25,0.009014,0.007812,0.054688,0.046875,0.052632,male
2,0.077316,0.083829,0.036718,0.008701,0.131908,0.123207,30.757155,1024.927705,0.846389,0.478905,...,0.077316,0.098706,0.015656,0.271186,0.00799,0.007812,0.015625,0.007812,0.046512,male
3,0.151228,0.072111,0.158011,0.096582,0.207955,0.111374,1.232831,4.177296,0.963322,0.727232,...,0.151228,0.088965,0.017798,0.25,0.201497,0.007812,0.5625,0.554688,0.247119,male
4,0.13512,0.079146,0.124656,0.07872,0.206045,0.127325,1.101174,4.333713,0.971955,0.783568,...,0.13512,0.106398,0.016931,0.266667,0.712812,0.007812,5.484375,5.476562,0.208274,male


In [11]:
#检查数据是否存在空值
data.isnull().sum()

meanfreq    0
sd          0
median      0
Q25         0
Q75         0
IQR         0
skew        0
kurt        0
sp.ent      0
sfm         0
mode        0
centroid    0
meanfun     0
minfun      0
maxfun      0
meandom     0
mindom      0
maxdom      0
dfrange     0
modindx     0
label       0
dtype: int64

In [12]:
#检查数据是否存在categories型数据
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3168 entries, 0 to 3167
Data columns (total 21 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   meanfreq  3168 non-null   float64
 1   sd        3168 non-null   float64
 2   median    3168 non-null   float64
 3   Q25       3168 non-null   float64
 4   Q75       3168 non-null   float64
 5   IQR       3168 non-null   float64
 6   skew      3168 non-null   float64
 7   kurt      3168 non-null   float64
 8   sp.ent    3168 non-null   float64
 9   sfm       3168 non-null   float64
 10  mode      3168 non-null   float64
 11  centroid  3168 non-null   float64
 12  meanfun   3168 non-null   float64
 13  minfun    3168 non-null   float64
 14  maxfun    3168 non-null   float64
 15  meandom   3168 non-null   float64
 16  mindom    3168 non-null   float64
 17  maxdom    3168 non-null   float64
 18  dfrange   3168 non-null   float64
 19  modindx   3168 non-null   float64
 20  label     3168 non-null   obje

In [13]:
y = data['label']
X = data.drop(columns = ['label'])

In [14]:
#对文本标签进行0，1编码
le = LabelEncoder()
y = le.fit_transform(y)

In [15]:
stander = StandardScaler()
X = stander.fit_transform(X)

In [16]:
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size = 0.3,random_state = 2021)

In [18]:
#将逻辑回归作为第一个分类器
clf_1 = LogisticRegression()
clf_1.fit(X_train,y_train) #利用训练集训练分类器
clf_1_y_pred = clf_1.predict(X_test) #对测试集做预测
clf_1_train_score = clf_1.score(X_train,y_train) #获取在训练集的ACC分数
clf_1_test_score = clf_1.score(X_test,y_test) #获取在测试集的ACC分数
print(f'训练集分数为{clf_1_train_score}')
print(f'测试集分数为{clf_1_test_score}')

训练集分数为0.972936400541272
测试集分数为0.9779179810725552


In [19]:
#将SVC作为第二个分类器
clf_2 = SVC(probability=True)
clf_2.fit(X_train,y_train) #利用训练集训练分类器
clf_2_y_pred = clf_2.predict(X_test) #对测试集做预测
clf_2_train_score = clf_2.score(X_train,y_train) #获取在训练集的ACC分数
clf_2_test_score = clf_2.score(X_test,y_test) #获取在测试集的ACC分数
print(f'训练集分数为{clf_2_train_score}')
print(f'测试集分数为{clf_2_test_score}')

训练集分数为0.9833107803337844
测试集分数为0.9863301787592008


In [20]:
#将LinearSVC作为第三个分类器
clf_3 = LinearSVC()
#利用训练集训练分类器
clf_3.fit(X_train,y_train)
clf_3_y_pred = clf_3.predict(X_test)
clf_3_train_score = clf_3.score(X_train,y_train)
clf_3_test_score = clf_3.score(X_test,y_test)
print(f'训练集分数为{clf_3_train_score}')
print(f'测试集分数为{clf_3_test_score}')

训练集分数为0.9742895805142084
测试集分数为0.9726603575184016




In [21]:
#将KNeighborsClassifier作为第四个分类器
clf_4 = KNeighborsClassifier(algorithm='ball_tree')
#利用训练集训练分类器
clf_4.fit(X_train,y_train)
clf_4_y_pred = clf_4.predict(X_test)
clf_4_train_score = clf_4.score(X_train,y_train)
clf_4_test_score = clf_4.score(X_test,y_test)
print(f'训练集分数为{clf_4_train_score}')
print(f'测试集分数为{clf_4_test_score}')

训练集分数为0.979702300405954
测试集分数为0.9737118822292324


In [333]:
import time
start = time.time()
#将XGBoost作为第五个分类器
clf_5 = XGBClassifier(max_depth=6, learning_rate=0.05, n_estimators=2000, 
            objective='binary:logistic', 
            subsample=0.8, colsample_bytree=0.8, 
            min_child_samples=3, eval_metric='auc', reg_lambda=0.5
)
#利用训练集训练分类器
clf_5.fit(X_train,y_train)
clf_5_y_pred = clf_5.predict(X_test)
clf_5_train_score = clf_5.score(X_train,y_train)
clf_5_test_score = clf_5.score(X_test,y_test)
print(f'训练集分数为{clf_5_train_score}')
print(f'测试集分数为{clf_5_test_score}')
print(f'XGB所花费的时间为{time.time() - start}')



Parameters: { min_child_samples } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


训练集分数为1.0
测试集分数为0.9842271293375394
XGB所花费的时间为0.9241983890533447


In [334]:
import time
start = time.time()
#将lGBM作为第六个分类器
clf_6 = LGBMClassifier(num_leaves=2**5-1, reg_alpha=0.25, reg_lambda=0.25, objective='binary',
            max_depth=-1, learning_rate=0.005, min_child_samples=3, random_state=2021,
            n_estimators=2000, subsample=1, colsample_bytree=1,
)
#利用训练集训练分类器
clf_6.fit(X_train,y_train)
clf_6_y_pred = clf_6.predict(X_test)
clf_6_train_score = clf_6.score(X_train,y_train)
clf_6_val_score = clf_6.score(X_test,y_test)
print(f'训练集分数为{clf_6_train_score}')
print(f'测试集分数为{clf_6_val_score}')
print(f'LGB所花费的时间为{time.time() - start}')

训练集分数为1.0
测试集分数为0.982124079915878
LGB所花费的时间为1.6535413265228271
