In [1]:
import pandas as pd
from sklearn.model_selection import StratifiedKFold
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, ExtraTreesClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.feature_selection import RFECV, RFE
from sklearn.model_selection import train_test_split

In [2]:
audio_data = pd.read_csv("OpenSMILE_all_features.csv")
y = audio_data['label']
columns_to_drop = ['label', 'relative_path', 'file', 'speaker', 'gender']
X = audio_data.drop(columns=columns_to_drop, axis=1)
Strat_kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
X.head()


Unnamed: 0,F0semitoneFrom27.5Hz_sma3nz_amean,F0semitoneFrom27.5Hz_sma3nz_stddevNorm,F0semitoneFrom27.5Hz_sma3nz_percentile20.0,F0semitoneFrom27.5Hz_sma3nz_percentile50.0,F0semitoneFrom27.5Hz_sma3nz_percentile80.0,F0semitoneFrom27.5Hz_sma3nz_pctlrange0-2,F0semitoneFrom27.5Hz_sma3nz_meanRisingSlope,F0semitoneFrom27.5Hz_sma3nz_stddevRisingSlope,F0semitoneFrom27.5Hz_sma3nz_meanFallingSlope,F0semitoneFrom27.5Hz_sma3nz_stddevFallingSlope,...,slopeUV0-500_sma3nz_amean,slopeUV500-1500_sma3nz_amean,spectralFluxUV_sma3nz_amean,loudnessPeaksPerSec,VoicedSegmentsPerSec,MeanVoicedSegmentLengthSec,StddevVoicedSegmentLengthSec,MeanUnvoicedSegmentLength,StddevUnvoicedSegmentLength,equivalentSoundLevel_dBp
0,40.3484,0.053911,39.33356,40.792065,41.64678,2.313217,57.45576,24.437008,24.974644,19.391956,...,0.019291,-0.008166,0.209579,4.276316,3.67893,0.171818,0.117072,0.0725,0.067593,-17.525782
1,39.34742,0.041715,38.04548,39.04827,41.210155,3.164677,30.927359,12.514604,16.857141,11.613185,...,-0.004749,0.002656,0.098442,4.867257,1.809955,0.395,0.195,0.1325,0.103773,-16.51213
2,39.067787,0.055762,36.569603,39.28283,41.49211,4.922508,21.31611,15.289312,9.042774,3.699957,...,-0.000383,0.002512,0.102126,4.232804,2.717392,0.216,0.225973,0.108333,0.067185,-17.319422
3,39.115086,0.040424,38.25024,39.439083,40.236988,1.986748,126.51777,47.699543,10.574628,5.34894,...,0.003987,0.007763,0.115426,4.184101,1.709402,0.4075,0.168133,0.1525,0.07293,-16.916288
4,39.199295,0.047696,37.388203,39.238194,41.075527,3.687324,96.39738,99.2372,22.159834,15.850734,...,-0.000803,0.006013,0.161132,4.508197,2.92887,0.2,0.174602,0.136667,0.119117,-18.429873


In [3]:
#X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.20, random_state=42)

In [4]:
model_rf = RandomForestClassifier(random_state=42)
rfecv_rf  = RFECV(estimator= model_rf, cv= Strat_kfold, scoring="accuracy", n_jobs=-1) # Instantiate the RFECV and its parameters, n_jobs=-1 means that use all the processors
model_rf_fit = rfecv_rf.fit(X, y)
print("Optimal number of features for Random Forest Classifier: %d" % model_rf_fit.n_features_)

Optimal number of features for Random Forest Classifier: 3


In [5]:
model_rf = RandomForestClassifier(random_state=42)
rfe_rf = RFE(estimator=model_rf, n_features_to_select=rfecv_rf.n_features_)
model_rf_fit_rfe = rfe_rf.fit(X, y)

In [6]:
feature_ranking_RF = pd.DataFrame({
    'Feature': X.columns,
    'Rank': model_rf_fit_rfe.ranking_
})
feature_ranking = feature_ranking_RF.sort_values(by='Rank', ascending=True)
feature_ranking.head(rfecv_rf.n_features_)

Unnamed: 0,Feature,Rank
86,StddevUnvoicedSegmentLength,1
81,loudnessPeaksPerSec,1
85,MeanUnvoicedSegmentLength,1


In [7]:
model_dt = DecisionTreeClassifier(random_state=42)
rfecv_dt  = RFECV(estimator= model_dt, cv= Strat_kfold, scoring="accuracy", n_jobs=-1)
model_dt_fit = rfecv_dt.fit(X, y)
print("Optimal number of features for Decision Tree Classifier: %d" % rfecv_dt.n_features_)

Optimal number of features for Decision Tree Classifier: 2


In [8]:
model_dt = DecisionTreeClassifier(random_state=42)
rfe_dt = RFE(estimator=model_dt, n_features_to_select=rfecv_dt.n_features_)
model_dt_fit_rfe = rfe_dt.fit(X, y)

In [9]:
feature_ranking_dt = pd.DataFrame({
    'Feature': X.columns,
    'Rank': model_dt_fit_rfe.ranking_
})
feature_ranking = feature_ranking_dt.sort_values(by='Rank', ascending=True)
feature_ranking.head(rfecv_dt.n_features_)

Unnamed: 0,Feature,Rank
86,StddevUnvoicedSegmentLength,1
85,MeanUnvoicedSegmentLength,1


In [10]:
model_gbc = GradientBoostingClassifier(random_state=42)
rfecv_gbc  = RFECV(estimator= model_gbc, cv= Strat_kfold, scoring="accuracy", n_jobs=-1)
model_gbc_fit = rfecv_gbc.fit(X, y)
print("Optimal number of features for Gradient Boosting Classifier: %d" % rfecv_gbc.n_features_)

Optimal number of features for Gradient Boosting Classifier: 2


In [11]:
model_gbc = GradientBoostingClassifier(random_state=42)
rfe_gbc = RFE(estimator=model_gbc, n_features_to_select=rfecv_gbc.n_features_)
model_gbc_fit_rfe = rfe_gbc.fit(X, y)

In [12]:
feature_ranking_gbc = pd.DataFrame({
    'Feature': X.columns,
    'Rank': model_gbc_fit_rfe.ranking_
})
feature_ranking = feature_ranking_gbc.sort_values(by='Rank', ascending=True)
feature_ranking.head(rfecv_gbc.n_features_)

Unnamed: 0,Feature,Rank
86,StddevUnvoicedSegmentLength,1
85,MeanUnvoicedSegmentLength,1


In [13]:
model_etc = ExtraTreesClassifier(random_state=42)
rfecv_etc  = RFECV(estimator= model_etc, cv= Strat_kfold, scoring="accuracy", n_jobs=-1)
model_etc_fit = rfecv_etc.fit(X, y)
print("Optimal number of features for Gradient Boosting Classifier: %d" % rfecv_etc.n_features_)

Optimal number of features for Gradient Boosting Classifier: 3


In [14]:
model_etc = ExtraTreesClassifier(random_state=42)
rfe_etc = RFE(estimator=model_gbc, n_features_to_select=rfecv_etc.n_features_)
model_etc_fit_rfe = rfe_etc.fit(X, y)

In [15]:
feature_ranking_etc = pd.DataFrame({
    'Feature': X.columns,
    'Rank': model_etc_fit_rfe.ranking_
})
feature_ranking = feature_ranking_gbc.sort_values(by='Rank', ascending=True)
feature_ranking.head(rfecv_etc.n_features_)

Unnamed: 0,Feature,Rank
86,StddevUnvoicedSegmentLength,1
85,MeanUnvoicedSegmentLength,1
79,slopeUV500-1500_sma3nz_amean,2
