In [4]:
import pandas as pd
from sklearn.model_selection import StratifiedKFold
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, ExtraTreesClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.feature_selection import RFECV, RFE
from sklearn.model_selection import train_test_split

In [5]:
audio_data = pd.read_csv("OpenSMILE_all_features.csv")
y = audio_data['label']
columns_to_drop = ['label', 'relative_path', 'file', 'speaker', 'gender']
X = audio_data.drop(columns=columns_to_drop, axis=1)
Strat_kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
X.head()


Unnamed: 0,F0semitoneFrom27.5Hz_sma3nz_amean,F0semitoneFrom27.5Hz_sma3nz_stddevNorm,F0semitoneFrom27.5Hz_sma3nz_percentile20.0,F0semitoneFrom27.5Hz_sma3nz_percentile50.0,F0semitoneFrom27.5Hz_sma3nz_percentile80.0,F0semitoneFrom27.5Hz_sma3nz_pctlrange0-2,F0semitoneFrom27.5Hz_sma3nz_meanRisingSlope,F0semitoneFrom27.5Hz_sma3nz_stddevRisingSlope,F0semitoneFrom27.5Hz_sma3nz_meanFallingSlope,F0semitoneFrom27.5Hz_sma3nz_stddevFallingSlope,...,slopeUV0-500_sma3nz_amean,slopeUV500-1500_sma3nz_amean,spectralFluxUV_sma3nz_amean,loudnessPeaksPerSec,VoicedSegmentsPerSec,MeanVoicedSegmentLengthSec,StddevVoicedSegmentLengthSec,MeanUnvoicedSegmentLength,StddevUnvoicedSegmentLength,equivalentSoundLevel_dBp
0,40.3484,0.053911,39.33356,40.792065,41.64678,2.313217,57.45576,24.437008,24.974644,19.391956,...,0.019291,-0.008166,0.209579,4.276316,3.67893,0.171818,0.117072,0.0725,0.067593,-17.525782
1,41.504837,0.054985,39.371902,41.949738,43.53197,4.160069,30.881502,14.1197,22.640627,15.947425,...,-0.001012,0.001218,0.159562,4.580153,2.33463,0.293333,0.200222,0.111667,0.076902,-17.938622
2,37.71248,0.055386,35.20089,37.791447,39.904694,4.703804,69.88926,43.19079,16.530645,13.809689,...,-0.014329,-0.003311,0.141658,3.703704,2.717392,0.216,0.113947,0.108333,0.080087,-17.706347
3,41.00152,0.070778,38.991745,41.111237,43.52884,4.537094,152.84485,105.83014,17.925686,6.202608,...,0.006954,0.005658,0.151966,5.058366,1.984127,0.338,0.474822,0.12,0.071414,-16.72362
4,36.614265,0.027685,35.62853,36.464733,37.71804,2.089512,23.40975,16.36406,12.33885,10.502349,...,-0.00306,-0.000126,0.070482,4.830918,1.980198,0.315,0.163936,0.134,0.090907,-26.597572


In [6]:
#X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.20, random_state=42)

In [7]:
model_rf = RandomForestClassifier(random_state=42)
rfecv_rf  = RFECV(estimator= model_rf, cv= Strat_kfold, scoring="accuracy", n_jobs=-1) # Instantiate the RFECV and its parameters, n_jobs=-1 means that use all the processors
model_rf_fit = rfecv_rf.fit(X, y)
print("Optimal number of features for Random Forest Classifier: %d" % model_rf_fit.n_features_)

Optimal number of features for Random Forest Classifier: 3


In [8]:
model_rf = RandomForestClassifier(random_state=42)
rfe_rf = RFE(estimator=model_rf, n_features_to_select=rfecv_rf.n_features_)
model_rf_fit_rfe = rfe_rf.fit(X, y)

In [9]:
feature_ranking_RF = pd.DataFrame({
    'Feature': X.columns,
    'Rank': model_rf_fit_rfe.ranking_
})
feature_ranking = feature_ranking_RF.sort_values(by='Rank', ascending=True)
feature_ranking.head(rfecv_rf.n_features_)

Unnamed: 0,Feature,Rank
86,StddevUnvoicedSegmentLength,1
81,loudnessPeaksPerSec,1
85,MeanUnvoicedSegmentLength,1


In [10]:
model_dt = DecisionTreeClassifier(random_state=42)
rfecv_dt  = RFECV(estimator= model_dt, cv= Strat_kfold, scoring="accuracy", n_jobs=-1)
model_dt_fit = rfecv_dt.fit(X, y)
print("Optimal number of features for Decision Tree Classifier: %d" % rfecv_dt.n_features_)

Optimal number of features for Decision Tree Classifier: 2


In [11]:
model_dt = DecisionTreeClassifier(random_state=42)
rfe_dt = RFE(estimator=model_dt, n_features_to_select=rfecv_dt.n_features_)
model_dt_fit_rfe = rfe_dt.fit(X, y)

In [12]:
feature_ranking_dt = pd.DataFrame({
    'Feature': X.columns,
    'Rank': model_dt_fit_rfe.ranking_
})
feature_ranking = feature_ranking_dt.sort_values(by='Rank', ascending=True)
feature_ranking.head(rfecv_dt.n_features_)

Unnamed: 0,Feature,Rank
86,StddevUnvoicedSegmentLength,1
85,MeanUnvoicedSegmentLength,1


In [13]:
model_gbc = GradientBoostingClassifier(random_state=42)
rfecv_gbc  = RFECV(estimator= model_gbc, cv= Strat_kfold, scoring="accuracy", n_jobs=-1)
model_gbc_fit = rfecv_gbc.fit(X, y)
print("Optimal number of features for Gradient Boosting Classifier: %d" % rfecv_gbc.n_features_)

Optimal number of features for Gradient Boosting Classifier: 2


In [14]:
model_gbc = GradientBoostingClassifier(random_state=42)
rfe_gbc = RFE(estimator=model_gbc, n_features_to_select=rfecv_gbc.n_features_)
model_gbc_fit_rfe = rfe_gbc.fit(X, y)

In [15]:
feature_ranking_gbc = pd.DataFrame({
    'Feature': X.columns,
    'Rank': model_gbc_fit_rfe.ranking_
})
feature_ranking = feature_ranking_gbc.sort_values(by='Rank', ascending=True)
feature_ranking.head(rfecv_gbc.n_features_)

Unnamed: 0,Feature,Rank
86,StddevUnvoicedSegmentLength,1
85,MeanUnvoicedSegmentLength,1


In [16]:
model_etc = ExtraTreesClassifier(random_state=42)
rfecv_etc  = RFECV(estimator= model_etc, cv= Strat_kfold, scoring="accuracy", n_jobs=-1)
model_etc_fit = rfecv_etc.fit(X, y)
print("Optimal number of features for Gradient Boosting Classifier: %d" % rfecv_etc.n_features_)

Optimal number of features for Gradient Boosting Classifier: 3


In [17]:
model_etc = ExtraTreesClassifier(random_state=42)
rfe_etc = RFE(estimator=model_gbc, n_features_to_select=rfecv_etc.n_features_)
model_etc_fit_rfe = rfe_etc.fit(X, y)

In [18]:
feature_ranking_etc = pd.DataFrame({
    'Feature': X.columns,
    'Rank': model_etc_fit_rfe.ranking_
})
feature_ranking = feature_ranking_gbc.sort_values(by='Rank', ascending=True)
feature_ranking.head(rfecv_etc.n_features_)

Unnamed: 0,Feature,Rank
86,StddevUnvoicedSegmentLength,1
85,MeanUnvoicedSegmentLength,1
79,slopeUV500-1500_sma3nz_amean,2
