# eGeMAPSv02特征处理

折中的特征选择方案：对数据进行增强后，去除方差小于0.01的特征。
最终数据矩阵为 420×52。

后续实验发现：在特征重要性分析中，几个方差不大的特征反而重要性很高。故改为只去除方差小于0.001的特征；而方差小于0.001的特征其实只有两个，且这两个特征的方差都为0，所以这一步处理与去除0方差的特征无异。

In [1]:
import opensmile
import os
from tqdm import tqdm
import pandas as pd
import librosa
import numpy as np
from sklearn.preprocessing import MinMaxScaler

# Location of the raw recordings and of the CSV that lists them.
self_folder = "../data/self_control/"
df = pd.read_csv("../data/self_control.csv")

# Shuffle the rows once with a fixed seed, then renumber them from 0.
shuffled_df = df.sample(frac=1, random_state=42)
shuffled_df = shuffled_df.reset_index(drop=True)

# 'name' holds the audio file names, 'class' the target label.
X, y = shuffled_df['name'], shuffled_df['class']

# openSMILE extractor configured for the eGeMAPSv02 set at the Functionals level.
smile = opensmile.Smile(
    feature_level=opensmile.FeatureLevel.Functionals,
    feature_set=opensmile.FeatureSet.eGeMAPSv02,
)

使用self_control和group_control测试后（不进行数据增强），原始特征矩阵为60×89，
发现两者都有60个eGeMAPSv02特征为全0或方差为0。
但是经过数据增强后，方差为0的特征只有2个


折中方案：保留方差大于0.01的特征，最后特征矩阵为420×52（包括一列label）。注意：下方代码最终采用的阈值是0.001（原因见文首说明），因此实际保留的列数为87。

In [2]:
def enhance_data(file, *, noise_level=0.05, n_steps=2, stretch_rate=2,
                 gain=2, rng=None):
    """Load one audio file and build its augmented variants.

    The augmentation strengths used to be hard-coded; they are now
    keyword-only parameters whose defaults reproduce the previous behaviour.

    Args:
        file: Path of the audio file, passed to ``librosa.load``.
        noise_level: Scale applied to the standard-normal noise that is added
            to the signal.
        n_steps: Pitch shift handed to ``librosa.effects.pitch_shift``.
        stretch_rate: Rate handed to ``librosa.effects.time_stretch``.
        gain: Factor the samples are multiplied by for the volume variant.
        rng: Optional ``numpy.random.Generator`` used to draw the noise so
            that a run can be reproduced. When omitted, the global NumPy
            random state is used, exactly as before.

    Returns:
        A tuple ``(group, sampling_rate)``. ``group`` is the list
        ``[original, noisy, pitch-shifted, time-stretched, louder,
        first half, second half]``; ``sampling_rate`` is the rate returned
        by ``librosa.load`` (called with ``sr=None``).
    """
    data, sampling_rate = librosa.load(file, sr=None)

    # Additive random noise.
    if rng is None:
        noise = np.random.randn(len(data))
    else:
        noise = rng.standard_normal(len(data))
    noises = data + noise_level * noise

    # Change the pitch.
    pitches = librosa.effects.pitch_shift(data, sr=sampling_rate, n_steps=n_steps)
    # Change the playback speed.
    stretches = librosa.effects.time_stretch(data, rate=stretch_rate)
    # Change the volume.
    volumes = data * gain

    # Split the recording into two halves at its midpoint.
    mid_index = len(data) // 2
    cut1 = data[:mid_index]
    cut2 = data[mid_index:]

    group = [data, noises, pitches, stretches, volumes, cut1, cut2]
    return group, sampling_rate

def process_group(group, sampling_rate, label):
    """Extract a labelled feature frame for every signal in ``group``.

    Args:
        group: Iterable of audio signals.
        sampling_rate: Sampling rate passed to ``smile.process_signal`` for
            every signal.
        label: Value written into the ``'label'`` column of each frame.

    Returns:
        A list with one frame per signal, in the order of ``group``.
    """
    def _labelled_features(signal):
        # Uses the module-level ``smile`` extractor.
        frame = smile.process_signal(signal, sampling_rate)
        frame['label'] = label
        return frame

    return [_labelled_features(signal) for signal in group]


# Index every file under self_folder once, instead of re-walking the whole
# directory tree for each row of the CSV. A name maps to a list because the
# same file name may occur in several sub-folders; every occurrence is
# processed, in walk order.
paths_by_name = {}
for root, dirs, files in os.walk(self_folder):
    for name in files:
        paths_by_name.setdefault(name, []).append(os.path.join(root, name))

features_lst = []   # one feature frame per (file, augmentation variant)
missing_files = []  # names listed in the CSV but absent from self_folder
for filename, label in tqdm(zip(X, y), total=len(X)):
    matches = paths_by_name.get(filename)
    if not matches:
        missing_files.append(filename)
        continue
    for full_path in matches:
        group, sampling_rate = enhance_data(full_path)
        features_eGeMAPS = process_group(group, sampling_rate, label)
        features_lst.extend(features_eGeMAPS)

# Files that cannot be found are still skipped, but no longer silently.
if missing_files:
    print(f"{len(missing_files)} file(s) listed in the CSV were not found under {self_folder}: {missing_files}")



100%|██████████| 60/60 [10:19<00:00, 10.33s/it]


## minmax归一化

In [9]:
# Stack the per-signal feature frames into one table with a fresh 0..n-1 index.
features_df = pd.concat(features_lst).reset_index(drop=True)

# Min-max scaling of the feature columns. The label is the target, not a
# feature, so it is kept out of the scaler; the columns are selected by name
# rather than by assuming 'label' is the last column.
feature_columns = [col for col in features_df.columns if col != 'label']
minmax_transfer = MinMaxScaler()
features = minmax_transfer.fit_transform(features_df[feature_columns])

df_features = pd.DataFrame(features, columns=feature_columns)
df_features['label'] = features_df['label']

# Variance filter on the scaled features only. Previously the label column
# took part in the filter and survived only because its own variance happened
# to exceed the threshold; it would be dropped for a single-class set, and
# non-numeric labels would make var() fail on recent pandas.
column_variances = df_features[feature_columns].var()

selected_columns = column_variances[column_variances > 0.001].index.tolist()
selected_columns.append('label')  # the label is always kept, as the last column
df_features_filtered = df_features[selected_columns]
len(selected_columns)


87

## 输出过滤的特征

In [7]:
# Report the features whose variance after min-max scaling is <= 0.001.
# The label column is excluded from the check: it is the target, not a feature.
feature_variances = df_features.drop(columns=['label']).var()
filter_f = df_features.loc[:, feature_variances[feature_variances <= 0.001].index]

# One feature name per line; the encoding is fixed so the file does not depend
# on the platform default.
with open("../result/01preprocess/varLessthan0.001.txt", 'w', encoding="utf-8") as f:
    f.writelines(f"{name}\n" for name in filter_f.columns)

## 特征名称映射

In [10]:
# Optional step (currently disabled): replace the long feature names with
# shorter display names before saving. Uncomment the block below to enable it.
# original_feature_names = df_features_filtered.columns
# print(original_feature_names) # shape=52

# features_name_mapping = {
#  'loudness_sma3_amean' :'ld-sma3(amean)',
#  'loudness_sma3_stddevNorm':'ld-sma3(stddevNorm)',
#  'loudness_sma3_percentile20.0':'ld-sma3(pctl20)',
#  'loudness_sma3_percentile50.0':'ld-sma3(pctl50)',
#  'loudness_sma3_percentile80.0':'ld-sma3(pctl80)',
#  'loudness_sma3_pctlrange0-2':'ld-sma3(pctl0-2)',
#  'loudness_sma3_meanRisingSlope':'ld-sma3(meanRS)',
#  'loudness_sma3_stddevRisingSlope':'ld-sma3(stdRS)',
#  'loudness_sma3_meanFallingSlope':'ld-sma3(meanFS)',
#  'loudness_sma3_stddevFallingSlope':'ld-sma3(stdFS)',
#  'spectralFlux_sma3_amean':'sf-sma3(amean)',
#  'spectralFlux_sma3_stddevNorm':'sf-sma3(stdNorm)',
#  'mfcc1_sma3_amean':'mfcc1-sma3(amean)',
#  'mfcc1_sma3_stddevNorm':'mfcc1-sma3(stdNorm)',
#  'mfcc2_sma3_amean':'mfcc2-sma3(amean)',
#  'mfcc2_sma3_stddevNorm':'mfcc2-sma3(stdNorm)',
#  'mfcc3_sma3_amean':'mfcc3-sma3(amean)',
#  'mfcc3_sma3_stddevNorm':'mfcc3-sma3(stdNorm)',
#  'mfcc4_sma3_amean':'mfcc4-sma3(amean)',
#  'mfcc4_sma3_stddevNorm':'mfcc4-sma3(stdNorm)',
#  'alphaRatioUV_sma3nz_amean':'ar-sma3nz(amean)',
#  'hammarbergIndexUV_sma3nz_amean':'hi-sma3nz(amean)',
#  'slopeUV0-500_sma3nz_amean':'sl-sma3nz(0-500)(amean)',
#  'slopeUV500-1500_sma3nz_amean':'sl-sma3nz(500-1500)(amean)',
#  'spectralFluxUV_sma3nz_amean':'sf-sma3nz(amean)',
#  'loudnessPeaksPerSec':'loudpeaks(persec)',
#  'MeanUnvoicedSegmentLength':'MUSL(sec)',
#  'equivalentSoundLevel_dBp':'esl(dBp)',
# }
# modified_feature_names = [
#     features_name_mapping.get(name, name)  # use the mapped name when one exists, otherwise keep the original
#     for name in original_feature_names
# ]

# df_features_filtered.columns = modified_feature_names

# Save the filtered feature table as CSV, without the index column.
# NOTE(review): the file name says "drop0.01var", but the variance filter that
# builds df_features_filtered uses a 0.001 threshold — confirm the name is
# still the intended one before renaming, since other steps may read this path.
df_features_filtered.to_csv('../result/01preprocess/01features_eGeMAPS_minmax_drop0.01var.csv', index=False)

# 备用代码

In [None]:
# Backup (disabled): extract the features straight from each audio file with
# smile.process_file, i.e. one frame per file and no augmented variants.
# features_lst = []
# for index in tqdm(range(len(X))) :
#     filename = X.iloc[index]
#     label = y.iloc[index]
#     for root, dirs, files in os.walk(self_folder):
#         if filename in files:
#             full_path = os.path.join(root, filename)
#             features_eGeMAPS = smile.process_file(full_path)
#             features_eGeMAPS['label'] = label
#             features_lst.append(features_eGeMAPS)
