<a href="https://www.kaggle.com/code/olgatasenko/acoustic-features-statistics?scriptVersionId=167847079" target="_blank"><img align="left" alt="Kaggle" title="Open in Kaggle" src="https://kaggle.com/static/images/open-in-kaggle.svg"></a>

In [1]:
!pip install opensmile

Collecting opensmile
  Downloading opensmile-2.5.0-py3-none-manylinux_2_17_x86_64.whl.metadata (15 kB)
Collecting audobject>=0.6.1 (from opensmile)
  Downloading audobject-0.7.11-py3-none-any.whl.metadata (2.6 kB)
Collecting audinterface>=0.7.0 (from opensmile)
  Downloading audinterface-1.1.0-py3-none-any.whl.metadata (4.2 kB)
Collecting audformat<2.0.0,>=1.0.1 (from audinterface>=0.7.0->opensmile)
  Downloading audformat-1.1.2-py3-none-any.whl.metadata (4.6 kB)
Collecting audiofile>=1.3.0 (from audinterface>=0.7.0->opensmile)
  Downloading audiofile-1.4.0-py3-none-any.whl.metadata (4.9 kB)
Collecting audmath>=1.3.0 (from audinterface>=0.7.0->opensmile)
  Downloading audmath-1.4.0-py3-none-any.whl.metadata (3.7 kB)
Collecting audresample<2.0.0,>=1.1.0 (from audinterface>=0.7.0->opensmile)
  Downloading audresample-1.3.3-py3-none-manylinux_2_17_x86_64.whl.metadata (4.4 kB)
Collecting audeer>=1.18.0 (from audobject>=0.6.1->opensmile)
  Downloading audeer-2.0.0-py3-none-an

In [2]:
import os
import time

import numpy as np
import pandas as pd

import audiofile
import opensmile
from pathlib import Path
from scipy. stats import ttest_ind, mannwhitneyu
from numpy.random import seed
from numpy.random import randn
from scipy.stats import shapiro
from statistics import mean 

In [3]:
# Shared openSMILE extractor, configured with the eGeMAPSv02 feature set at the
# Functionals feature level; get_features() below runs every file through it.
smile = opensmile.Smile(
    feature_set=opensmile.FeatureSet.eGeMAPSv02,
    feature_level=opensmile.FeatureLevel.Functionals,
)
# Feature names reported by the extractor; used later as DataFrame column labels.
features_names = smile.feature_names

In [4]:
def get_features(file_path):
    """Extract openSMILE features from a single audio file.

    The file is read with ``audiofile.read(..., always_2d=True)`` and passed,
    together with its sampling rate, to the module-level ``smile`` extractor.

    Args:
        file_path: Path of the audio file to read.

    Returns:
        list: The values of the first row of the extractor's result.
    """
    audio, rate = audiofile.read(file_path, always_2d=True)
    extracted = smile.process_signal(audio, rate)
    first_row = extracted.iloc[0]
    return list(first_row)

In [5]:
p = Path("/kaggle/input/control-preprocessed")
control_features = []

# Path.iterdir() yields entries in arbitrary order; sorting keeps the row order
# (and therefore any file exported from these rows) reproducible across runs.
for x in sorted(p.iterdir()):
    result = get_features(str(x))
    control_features.append(result)

In [6]:
p = Path("/kaggle/input/patients-prepocessed")
patients_features = []

# Path.iterdir() yields entries in arbitrary order; sorting keeps the row order
# (and therefore any file exported from these rows) reproducible across runs.
for x in sorted(p.iterdir()):
    result = get_features(str(x))
    patients_features.append(result)

In [7]:
# One row per processed input file, one column per openSMILE feature name;
# all values are stored as floats.
patient_df = pd.DataFrame(data=patients_features, columns=features_names, dtype=float)
control_df = pd.DataFrame(data=control_features, columns=features_names, dtype=float)

In [8]:
# Columns kept for the statistical comparison. Both groups are restricted to the
# same list, in the same order, so their tables stay column-aligned.
selected_features = [
    'F0semitoneFrom27.5Hz_sma3nz_amean',
    'F0semitoneFrom27.5Hz_sma3nz_stddevNorm',
    'loudness_sma3_amean',
    'loudness_sma3_stddevNorm',
    'spectralFlux_sma3_amean',
    'spectralFlux_sma3_stddevNorm',
    'mfcc1_sma3_amean',
    'mfcc1_sma3_stddevNorm',
    'mfcc2_sma3_amean',
    'mfcc2_sma3_stddevNorm',
    'mfcc3_sma3_amean',
    'mfcc3_sma3_stddevNorm',
    'jitterLocal_sma3nz_amean',
    'jitterLocal_sma3nz_stddevNorm',
    'shimmerLocaldB_sma3nz_amean',
    'shimmerLocaldB_sma3nz_stddevNorm',
    'F1frequency_sma3nz_amean',
    'F1frequency_sma3nz_stddevNorm',
    'F1bandwidth_sma3nz_amean',
    'F1bandwidth_sma3nz_stddevNorm',
    'F2frequency_sma3nz_amean',
    'F2frequency_sma3nz_stddevNorm',
    'F2bandwidth_sma3nz_amean',
    'F2bandwidth_sma3nz_stddevNorm',
    'F3frequency_sma3nz_amean',
    'F3frequency_sma3nz_stddevNorm',
    'F3bandwidth_sma3nz_amean',
    'F3bandwidth_sma3nz_stddevNorm',
]

patient_df = patient_df[selected_features]

control_df = control_df[selected_features]

In [9]:
def get_shapiro(df):
    """Run the Shapiro-Wilk normality test on every column of ``df`` and print the result.

    For each column one line is printed containing the column name, the test
    result and a verdict: 'abnormal distribution' when the p-value is below
    0.05, otherwise 'normal distribution'.

    Args:
        df: DataFrame whose columns are tested one by one.

    Returns:
        None. The report is written to standard output.
    """
    # Iterate over the columns of the frame that was passed in, not over a global
    # frame, so the function works for any DataFrame.
    for column in df.columns:
        # Run the test once and reuse the result for the verdict and the report.
        result = shapiro(df[column])
        if result.pvalue < 0.05:
            distr = 'abnormal distribution'
        else:
            distr = 'normal distribution'
        print(f'feature-{column}, {result}, {distr}')

In [10]:
# Shapiro-Wilk normality report for every feature of the control group.
print("Control group")
get_shapiro(control_df)

Control group
feature-F0semitoneFrom27.5Hz_sma3nz_amean, ShapiroResult(statistic=0.8793644905090332, pvalue=6.682593367557388e-10), abnormal distribution
feature-F0semitoneFrom27.5Hz_sma3nz_stddevNorm, ShapiroResult(statistic=0.9666851758956909, pvalue=0.0008425374398939312), abnormal distribution
feature-loudness_sma3_amean, ShapiroResult(statistic=0.9652227163314819, pvalue=0.0006011640070937574), abnormal distribution
feature-loudness_sma3_stddevNorm, ShapiroResult(statistic=0.9895291924476624, pvalue=0.3042009770870209), normal distribution
feature-spectralFlux_sma3_amean, ShapiroResult(statistic=0.9556195139884949, pvalue=7.470006676157936e-05), abnormal distribution
feature-spectralFlux_sma3_stddevNorm, ShapiroResult(statistic=0.9777666330337524, pvalue=0.013128820806741714), abnormal distribution
feature-mfcc1_sma3_amean, ShapiroResult(statistic=0.991223156452179, pvalue=0.45557141304016113), normal distribution
feature-mfcc1_sma3_stddevNorm, ShapiroResult(statistic=0.9479124546

In [11]:
# Shapiro-Wilk normality report for every feature of the patient group.
print("Patient group")
get_shapiro(patient_df)

Patient group
feature-F0semitoneFrom27.5Hz_sma3nz_amean, ShapiroResult(statistic=0.8216565251350403, pvalue=1.8168646925484366e-12), abnormal distribution
feature-F0semitoneFrom27.5Hz_sma3nz_stddevNorm, ShapiroResult(statistic=0.8727807402610779, pvalue=3.1324323690462563e-10), abnormal distribution
feature-loudness_sma3_amean, ShapiroResult(statistic=0.9283051490783691, pvalue=5.270752012620505e-07), abnormal distribution
feature-loudness_sma3_stddevNorm, ShapiroResult(statistic=0.9806365370750427, pvalue=0.02816202864050865), abnormal distribution
feature-spectralFlux_sma3_amean, ShapiroResult(statistic=0.9821790456771851, pvalue=0.042715828865766525), abnormal distribution
feature-spectralFlux_sma3_stddevNorm, ShapiroResult(statistic=0.9902067184448242, pvalue=0.35925811529159546), normal distribution
feature-mfcc1_sma3_amean, ShapiroResult(statistic=0.9898251295089722, pvalue=0.32735055685043335), normal distribution
feature-mfcc1_sma3_stddevNorm, ShapiroResult(statistic=0.91074585

# Независимый двухвыборочный t-тест для нормального распределения

In [12]:
def t_test(name, col_name):
    """Compare one feature between patients and controls with an independent t-test.

    Reads column ``col_name`` from the module-level ``patient_df`` and
    ``control_df`` and prints one line containing ``name``, the test result and
    a verdict: 'mean is different' when the p-value is below 0.05, otherwise
    'no differences'.

    Args:
        name: Label printed at the start of the report line.
        col_name: Column to compare; must exist in both data frames.

    Returns:
        None. The report is written to standard output.
    """
    # Run the test once and reuse the result for the verdict and the report.
    result = ttest_ind(patient_df[col_name], control_df[col_name])
    if result.pvalue < 0.05:
        diff = 'mean is different'
    else:
        diff = 'no differences'

    print(f'{name}: {result}, {diff}')

In [13]:
# (report label, column name) pairs compared with the independent t-test,
# in the order in which they are reported.
for name, col_name in (
    ('mfcc1_mean', 'mfcc1_sma3_amean'),
    ('F2frequency_mean', 'F2frequency_sma3nz_amean'),
    ('F2frequency_sd', 'F2frequency_sma3nz_stddevNorm'),
    ('F2bandwidth_sd', 'F2bandwidth_sma3nz_stddevNorm'),
    ('F3frequency_mean', 'F3frequency_sma3nz_amean'),
    ('F3bandwidth_mean', 'F3bandwidth_sma3nz_amean'),
):
    t_test(name, col_name)

mfcc1_mean: TtestResult(statistic=-3.801112872702837, pvalue=0.00017362142128705596, df=308.0), mean is different
F2frequency_mean: TtestResult(statistic=3.8698244490715417, pvalue=0.00013296862154480368, df=308.0), mean is different
F2frequency_sd: TtestResult(statistic=-1.3399271785406877, pvalue=0.1812567245501101, df=308.0), no differences
F2bandwidth_sd: TtestResult(statistic=1.7082996661609482, pvalue=0.08858826323877239, df=308.0), no differences
F3frequency_mean: TtestResult(statistic=5.268842666745581, pvalue=2.5850299201933537e-07, df=308.0), mean is different
F3bandwidth_mean: TtestResult(statistic=-2.31341206130135, pvalue=0.02135831993949111, df=308.0), mean is different


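Для признаков с p-значением меньше 0,05 (mfcc1_mean, F2frequency_mean, F3frequency_mean, F3bandwidth_mean) мы отклоняем нулевую гипотезу t-критерия и делаем вывод, что имеется достаточно доказательств того, что средние значения этих признаков в группе пациентов и в контрольной группе различаются. Для F2frequency_sd и F2bandwidth_sd p-значение больше 0,05, поэтому оснований отклонить нулевую гипотезу нет.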

# U-тест Манна-Уитни для ненормального распределения

In [14]:
# The six columns already compared with the t-test; they are removed from both
# groups so that the remaining columns can be handled separately.
t_tested_columns = [
    'mfcc1_sma3_amean',
    'F2frequency_sma3nz_amean',
    'F2frequency_sma3nz_stddevNorm',
    'F2bandwidth_sma3nz_stddevNorm',
    'F3frequency_sma3nz_amean',
    'F3bandwidth_sma3nz_amean',
]

control_new = control_df.drop(columns=t_tested_columns)

patients_new = patient_df.drop(columns=t_tested_columns)

In [15]:
# Save both reduced tables (t-tested columns removed) as CSV files; the relative
# file names are resolved against the current working directory.
control_new.to_csv('control_new.csv')
patients_new.to_csv('patients_new.csv')

In [16]:
def u_test(col_name):
    """Compare one feature between patients and controls with the Mann-Whitney U test.

    Reads column ``col_name`` from the module-level ``patient_df`` and
    ``control_df`` and prints one line containing the column name, the test
    result and a verdict: 'distribution is different' when the p-value is below
    0.05, otherwise 'no differences'.

    Args:
        col_name: Column to compare; must exist in both data frames.

    Returns:
        None. The report is written to standard output.
    """
    # Run the test once and reuse the result for the verdict and the report.
    result = mannwhitneyu(patient_df[col_name], control_df[col_name])
    if result.pvalue < 0.05:
        # The U test is rank-based and does not compare means, so a significant
        # result is reported as a difference between the distributions.
        diff = 'distribution is different'
    else:
        diff = 'no differences'

    print(f'{col_name}: {result}, {diff}')

In [17]:
# Run the Mann-Whitney U test on every column kept in control_new
# (u_test itself reads the values from patient_df and control_df).
for column in control_new.columns:
    u_test(column)

F0semitoneFrom27.5Hz_sma3nz_amean: MannwhitneyuResult(statistic=14845.0, pvalue=0.00033196440226642055), mean is different
F0semitoneFrom27.5Hz_sma3nz_stddevNorm: MannwhitneyuResult(statistic=6864.0, pvalue=6.843396336087512e-11), mean is different
loudness_sma3_amean: MannwhitneyuResult(statistic=8710.0, pvalue=2.8564627601884714e-05), mean is different
loudness_sma3_stddevNorm: MannwhitneyuResult(statistic=19396.0, pvalue=8.245240847394584e-21), mean is different
spectralFlux_sma3_amean: MannwhitneyuResult(statistic=10227.0, pvalue=0.023689568358288258), mean is different
spectralFlux_sma3_stddevNorm: MannwhitneyuResult(statistic=17521.0, pvalue=2.94598470467322e-12), mean is different
mfcc1_sma3_stddevNorm: MannwhitneyuResult(statistic=17332.0, pvalue=1.5756270598083485e-11), mean is different
mfcc2_sma3_amean: MannwhitneyuResult(statistic=15612.0, pvalue=5.090929565030815e-06), mean is different
mfcc2_sma3_stddevNorm: MannwhitneyuResult(statistic=11980.0, pvalue=0.9676518279842155)

# Some descriptive statistics (max, min, mean) for each feature

In [18]:
# Print the maximum, minimum and mean of every control-group feature.
for column in control_df.columns:
    values = control_df[column]
    print(f"""Control:{column}, max={max(values)}, min={min(values)}, mean={mean(values)}""")

Control:F0semitoneFrom27.5Hz_sma3nz_amean, max=39.55133056640625, min=22.928234100341797, mean=33.47360676180932
Control:F0semitoneFrom27.5Hz_sma3nz_stddevNorm, max=0.38527029752731323, min=0.1363452970981598, mean=0.22726975121805745
Control:loudness_sma3_amean, max=1.8605456352233887, min=0.4378761053085327, mean=0.9920047342777252
Control:loudness_sma3_stddevNorm, max=1.2191081047058105, min=0.5916155576705933, mean=0.853443782560287
Control:spectralFlux_sma3_amean, max=2.15478253364563, min=0.2584802210330963, mean=0.917884825506518
Control:spectralFlux_sma3_stddevNorm, max=1.4397013187408447, min=0.7677875757217407, mean=1.095979489818696
Control:mfcc1_sma3_amean, max=32.53364562988281, min=12.12855052947998, mean=22.04869991425545
Control:mfcc1_sma3_stddevNorm, max=1.7172499895095825, min=0.5512177348136902, mean=0.9651531050282139
Control:mfcc2_sma3_amean, max=26.535226821899414, min=-12.596908569335938, mean=7.127539436951761
Control:mfcc2_sma3_stddevNorm, max=69.19572448730469

In [19]:
# Print the maximum, minimum and mean of every patient-group feature.
for column in patient_df.columns:
    values = patient_df[column]
    print(f"""Patient:{column}, max={max(values)}, min={min(values)}, mean={mean(values)}""")

Patient:F0semitoneFrom27.5Hz_sma3nz_amean, max=41.8543701171875, min=20.99978256225586, mean=34.50545594000047
Patient:F0semitoneFrom27.5Hz_sma3nz_stddevNorm, max=0.3797951638698578, min=0.08663035929203033, mean=0.19465882393621628
Patient:loudness_sma3_amean, max=1.9355812072753906, min=0.34227150678634644, mean=0.8458803697939842
Patient:loudness_sma3_stddevNorm, max=1.4434866905212402, min=0.6841607689857483, mean=1.0171214918936453
Patient:spectralFlux_sma3_amean, max=1.513218641281128, min=0.26061683893203735, mean=0.7844948205255693
Patient:spectralFlux_sma3_stddevNorm, max=1.811147689819336, min=0.779425859451294, mean=1.2324210636077388
Patient:mfcc1_sma3_amean, max=35.336700439453125, min=9.755379676818848, mean=20.19762268066406
Patient:mfcc1_sma3_stddevNorm, max=2.421015977859497, min=0.6279579401016235, mean=1.1836319046635781
Patient:mfcc2_sma3_amean, max=22.119691848754883, min=-0.8351825475692749, mean=11.150012541586353
Patient:mfcc2_sma3_stddevNorm, max=39.16627883911