In [1]:
#r "nuget: MathNet.Numerics"
#r "nuget: Microsoft.Data.Analysis"
#r "nuget: NAudio"
#r "nuget: NumSharp"
#r "nuget: NWaves"
#r "nuget: Plotly.NET"
#r "nuget: Plotly.NET.CSharp"
#r "nuget: Microsoft.ML"

Loading extensions from `C:\Users\Denis\.nuget\packages\microsoft.data.analysis\0.23.0\interactive-extensions\dotnet\Microsoft.Data.Analysis.Interactive.dll`

In [3]:
using System;
using System.IO;
using System.Linq;
using System.Globalization;
using System.Collections.Generic;
using Microsoft.Data.Analysis;
using NAudio.Wave;
using NWaves.Signals;
using NWaves.Transforms;
using NWaves.FeatureExtractors;
using NWaves.FeatureExtractors.Options;
using Plotly.NET;

using Microsoft.FSharp.Core;
using Chart = Plotly.NET.CSharp.Chart;

In [2]:
string baseRoot = Directory.GetCurrentDirectory();
string dataRoot = Path.Combine(baseRoot, "data", "edinbourg");

// папки с аудио
string noisyTrain = Path.Combine(dataRoot, "noisy_trainset_28spk_wav");
string noisyTest  = Path.Combine(dataRoot, "noisy_testset_wav");
string cleanTrain = Path.Combine(dataRoot, "clean_trainset_28spk_wav");
string cleanTest  = Path.Combine(dataRoot, "clean_testset_wav");

// папки с расшифровками
string trainTxtFolder = Path.Combine(dataRoot, "trainset_28spk_txt");
string testTxtFolder  = Path.Combine(dataRoot, "testset_txt");

// лог-файлы
string logFolder = Path.Combine(dataRoot, "logfiles");
string logTrain = Path.Combine(logFolder, "log_trainset_28spk.txt");
string logTest  = Path.Combine(logFolder, "log_testset.txt");

# Подготовка и чтение данных

Для обучения модели буду использовать данные университета Эдинбурга, которые размечены на 
train / test, а также транскрибированы.

На данном этапе мне нужно их считать с диска и создать исходный датафрейм, сохранить его в csv для дальнейшей загрузки

In [None]:
string baseRoot = Directory.GetCurrentDirectory();
string dataRoot = Path.Combine(baseRoot, "data", "edinbourg");

// папки с аудио
string noisyTrain = Path.Combine(dataRoot, "noisy_trainset_28spk_wav");
string noisyTest  = Path.Combine(dataRoot, "noisy_testset_wav");
string cleanTrain = Path.Combine(dataRoot, "clean_trainset_28spk_wav");
string cleanTest  = Path.Combine(dataRoot, "clean_testset_wav");

// папки с расшифровками
string trainTxtFolder = Path.Combine(dataRoot, "trainset_28spk_txt");
string testTxtFolder  = Path.Combine(dataRoot, "testset_txt");

// лог-файлы
string logFolder = Path.Combine(dataRoot, "logfiles");
string logTrain = Path.Combine(logFolder, "log_trainset_28spk.txt");
string logTest  = Path.Combine(logFolder, "log_testset.txt");

In [None]:
Dictionary<string, (string noise, double? snr)> ParseLogFile(string logPath)
{
    var dict = new Dictionary<string, (string, double?)>(StringComparer.OrdinalIgnoreCase);

    if (!File.Exists(logPath)) 
        return dict;

    foreach (var line in File.ReadLines(logPath))
    {
        var s = line.Trim();
        if (string.IsNullOrWhiteSpace(s)) 
            continue;
        
        // ожидается формат: "<id> <noise> <snr>"
        // пример: "p257_001 bus 1.750000e+01"
        var parts = s.Split(new[]{' ', '\t'}, StringSplitOptions.RemoveEmptyEntries);
        
        if (parts.Length < 2) 
            continue;
        
        var id = parts[0];
        string noise = parts.Length >= 2 ? parts[1] : "unknown";
        double? snr = null;
        
        if (parts.Length >= 3)
        {
            if (double.TryParse(parts[2], NumberStyles.Float, CultureInfo.InvariantCulture, out double val))
                snr = val;
        }
        dict[id] = (noise, snr);
    }
    return dict;
}

In [None]:
var trainLog = ParseLogFile(logTrain);
var testLog  = ParseLogFile(logTest);

trainLog

key,value
,
,
,
,
,
,
,
,
,
,

Unnamed: 0,Unnamed: 1
Item1,babble
Item2,15

Unnamed: 0,Unnamed: 1
Item1,babble
Item2,10

Unnamed: 0,Unnamed: 1
Item1,babble
Item2,5

Unnamed: 0,Unnamed: 1
Item1,babble
Item2,0

Unnamed: 0,Unnamed: 1
Item1,cafeteria
Item2,15

Unnamed: 0,Unnamed: 1
Item1,cafeteria
Item2,10

Unnamed: 0,Unnamed: 1
Item1,cafeteria
Item2,5

Unnamed: 0,Unnamed: 1
Item1,cafeteria
Item2,0

Unnamed: 0,Unnamed: 1
Item1,car
Item2,15

Unnamed: 0,Unnamed: 1
Item1,car
Item2,10

Unnamed: 0,Unnamed: 1
Item1,car
Item2,5

Unnamed: 0,Unnamed: 1
Item1,car
Item2,0

Unnamed: 0,Unnamed: 1
Item1,kitchen
Item2,15

Unnamed: 0,Unnamed: 1
Item1,kitchen
Item2,10

Unnamed: 0,Unnamed: 1
Item1,kitchen
Item2,5

Unnamed: 0,Unnamed: 1
Item1,kitchen
Item2,0

Unnamed: 0,Unnamed: 1
Item1,meeting
Item2,15

Unnamed: 0,Unnamed: 1
Item1,meeting
Item2,10

Unnamed: 0,Unnamed: 1
Item1,meeting
Item2,5

Unnamed: 0,Unnamed: 1
Item1,meeting
Item2,0


In [None]:
enum SampleType
{
    Test,
    Train
}

struct DatasetFile
{
    public string filePath;
    public string setName;
    public SampleType sampleType;

    public DatasetFile(string filePath, string setName, SampleType sampleType)
    {
        this.filePath = filePath;
        this.setName = setName;
        this.sampleType = sampleType;
    }

    public override string ToString(){
        return $"{this.filePath} ({this.sampleType})";
    }
}

In [None]:
List<DatasetFile> EnumerateDatasetFiles()
{
    var datasetFiles = new List<DatasetFile>();
    
    if (Directory.Exists(noisyTrain)) 
        datasetFiles.AddRange(Directory.EnumerateFiles(noisyTrain, "*.wav", SearchOption.AllDirectories)
            .Select(p => new DatasetFile(p, "noisy", SampleType.Train)));
    
    if (Directory.Exists(noisyTest))  
        datasetFiles.AddRange(Directory.EnumerateFiles(noisyTest, "*.wav", SearchOption.AllDirectories)
            .Select(p => new DatasetFile(p, "noisy", SampleType.Test)));
    
    if (Directory.Exists(cleanTrain)) 
        datasetFiles.AddRange(Directory.EnumerateFiles(cleanTrain, "*.wav", SearchOption.AllDirectories)
            .Select(p => new DatasetFile(p, "clean", SampleType.Train)));
    
    if (Directory.Exists(cleanTest))  
        datasetFiles.AddRange(Directory.EnumerateFiles(cleanTest, "*.wav", SearchOption.AllDirectories)
            .Select(p => new DatasetFile(p, "clean", SampleType.Test)));

    return datasetFiles;
}

In [None]:
var files = EnumerateDatasetFiles();
Console.WriteLine($"Found {files.Count} audio files.");

Found 24792 audio files.


In [None]:
files[0]

Unnamed: 0,Unnamed: 1
filePath,z:\Code\Python\ipynb\NoiseReduction\data\edinbourg\noisy_trainset_28spk_wav\p226_001.wav
setName,noisy
sampleType,Train


In [None]:
var ids = new List<string>(files.Count);
var filenames = new List<string>(files.Count);
var sets = new List<string>(files.Count);
var kinds = new List<SampleType>(files.Count);       // noisy / clean
var paths = new List<string>(files.Count);
var noiseTypes = new List<string>(files.Count);
var snrVals = new List<double?>(files.Count);
var filesizeBytes = new List<long?>(files.Count);

In [None]:
string IdFromPath(string p) => Path.GetFileNameWithoutExtension(p);

In [None]:
foreach (var datasetFile in files)
{
    var id = IdFromPath(datasetFile.filePath);
    ids.Add(id);
    filenames.Add(Path.GetFileName(datasetFile.filePath));
    sets.Add(datasetFile.setName);
    kinds.Add(datasetFile.sampleType);
    paths.Add(datasetFile.filePath);
    filesizeBytes.Add(new FileInfo(datasetFile.filePath).Length);

    // лог: если noisy -> смотрим в соответствующем логе (trainLog или testLog)
    (string noise, double? snr) meta = ("clean", null);
    if (datasetFile.setName.Equals("noisy", StringComparison.OrdinalIgnoreCase))
    {
        if (datasetFile.sampleType == SampleType.Train && trainLog.TryGetValue(id, out var tmeta)) 
            meta = tmeta;
        else if (datasetFile.sampleType == SampleType.Test && testLog.TryGetValue(id, out var tmeta2)) 
            meta = tmeta2;
        else
            meta = ("unknown", null);
    }

    else
    {
        // clean files — noise = "clean", snr = null
        meta = ("clean", null);
    }
    noiseTypes.Add(meta.noise);
    snrVals.Add(meta.snr);
}

In [None]:
var df = new DataFrame();
df.Columns.Add(DataFrameColumn.Create("id", ids));
df.Columns.Add(DataFrameColumn.Create("filename", filenames));
df.Columns.Add(DataFrameColumn.Create("set", sets));
df.Columns.Add(DataFrameColumn.Create("kind", kinds));
df.Columns.Add(DataFrameColumn.Create("path", paths));
df.Columns.Add(DataFrameColumn.Create("noise_type", noiseTypes));
df.Columns.Add(DataFrameColumn.Create("snr_db", snrVals));
df.Columns.Add(DataFrameColumn.Create("filesize_bytes", filesizeBytes));

In [None]:
df.Head(5)

index,id,filename,set,kind,path,noise_type,snr_db,filesize_bytes
0,p226_001,p226_001.wav,noisy,Train,z:\Code\Python\ipynb\NoiseReduction\data\edinbourg\noisy_trainset_28spk_wav\p226_001.wav,babble,15,218924
1,p226_002,p226_002.wav,noisy,Train,z:\Code\Python\ipynb\NoiseReduction\data\edinbourg\noisy_trainset_28spk_wav\p226_002.wav,babble,10,374444
2,p226_003,p226_003.wav,noisy,Train,z:\Code\Python\ipynb\NoiseReduction\data\edinbourg\noisy_trainset_28spk_wav\p226_003.wav,babble,5,746378
3,p226_004,p226_004.wav,noisy,Train,z:\Code\Python\ipynb\NoiseReduction\data\edinbourg\noisy_trainset_28spk_wav\p226_004.wav,babble,0,498284
4,p226_005,p226_005.wav,noisy,Train,z:\Code\Python\ipynb\NoiseReduction\data\edinbourg\noisy_trainset_28spk_wav\p226_005.wav,cafeteria,15,712456


In [None]:
string outCsv = Path.Combine(dataRoot, "indexed_metadata.csv");
DataFrame.SaveCsv(df, outCsv, ',',
    header: true,
    encoding: System.Text.Encoding.UTF8,
    cultureInfo: CultureInfo.InvariantCulture);

# Feature extraction 
Из полученных данных нужно сформировать фичи

In [None]:
string indexedCsvPath = Path.Combine(dataRoot, "indexed_metadata.csv");
var df = DataFrame.LoadCsv(indexedCsvPath, separator: ',', header: true);

In [None]:
DataFrame grouped = df.GroupBy("set").Count();
Console.WriteLine(grouped);

set            id             filename       kind           path           noise_type     snr_db         filesize_bytes 
noisy          12396          12396          12396          12396          12396          12396          12396          
clean          12396          12396          12396          12396          12396          0              12396          



In [None]:
int nrows = (int)df.Rows.Count;
Console.WriteLine($"Loaded indexed metadata: {nrows} rows");

Loaded indexed metadata: 24792 rows


In [None]:
// STFT / MFCC параметры
const int fftSize = 512;      // FFT size (power of two)
const int hopSize = 256;      // hop (overlap = fftSize - hopSize)
const int mfccCount = 13;     // number of MFCC coefficients
const int filterBankSize = 26;
const double preEmphasis = 0.97;
const double clipThreshold = 0.999;  // clip detection threshold

In [None]:
var rmsCol = new DoubleDataFrameColumn("rms", nrows);
var peakCol = new DoubleDataFrameColumn("peak", nrows);
var clipFracCol = new DoubleDataFrameColumn("clip_fraction", nrows);
var zcrCol = new DoubleDataFrameColumn("zero_crossing_rate", nrows);
var durationCol = new DoubleDataFrameColumn("duration_s", nrows);
var sampleRateCol = new PrimitiveDataFrameColumn<int>("sample_rate", nrows);
var channelsCol = new PrimitiveDataFrameColumn<int>("channels", nrows);
var specCentCol = new DoubleDataFrameColumn("spectral_centroid", nrows);
var specBwCol = new DoubleDataFrameColumn("spectral_bandwidth", nrows);
var specRolloffCol = new DoubleDataFrameColumn("spectral_rolloff", nrows);
var specFlatCol = new DoubleDataFrameColumn("spectral_flatness", nrows);

var mfccMeanCols = new List<DoubleDataFrameColumn>();
var mfccStdCols = new List<DoubleDataFrameColumn>();
for (int k = 0; k < mfccCount; k++)
{
    mfccMeanCols.Add(new DoubleDataFrameColumn($"mfcc_mean_{k+1}", nrows));
    mfccStdCols.Add(new DoubleDataFrameColumn($"mfcc_std_{k+1}", nrows));
}

df.Columns.Add(durationCol);
df.Columns.Add(sampleRateCol);
df.Columns.Add(channelsCol);
df.Columns.Add(rmsCol);
df.Columns.Add(peakCol);
df.Columns.Add(clipFracCol);
df.Columns.Add(zcrCol);
df.Columns.Add(specCentCol);
df.Columns.Add(specBwCol);
df.Columns.Add(specRolloffCol);
df.Columns.Add(specFlatCol);
foreach (var c in mfccMeanCols) df.Columns.Add(c);
foreach (var c in mfccStdCols) df.Columns.Add(c);

In [None]:
const int barWidth = 40;


In [None]:
double ComputeRms(ReadOnlySpan<float> s)
{
    if (s.Length == 0) return 0.0;
    double sum = 0.0;
    for (int i = 0; i < s.Length; i++) { double v = s[i]; sum += v * v; }
    return Math.Sqrt(sum / s.Length);
}
double ComputePeak(ReadOnlySpan<float> s)
{
    if (s.Length == 0) return 0.0;
    float m = 0f;
    for (int i = 0; i < s.Length; i++) { var a = Math.Abs(s[i]); if (a > m) m = a; }
    return m;
}
double ComputeClipFrac(ReadOnlySpan<float> s, double threshold = 0.999)
{
    if (s.Length == 0) return 0.0;
    int cnt = 0;
    for (int i = 0; i < s.Length; i++) if (Math.Abs(s[i]) >= threshold) cnt++;
    return (double)cnt / s.Length;
}
double ComputeZcr(ReadOnlySpan<float> s)
{
    if (s.Length < 2) return 0.0;
    int z = 0;
    for (int i = 1; i < s.Length; i++) if ((s[i-1] >= 0f) ^ (s[i] >= 0f)) z++;
    return (double)z / s.Length;
}

// spectral features from power spectra (each frame is float[] of magnitudes/power)
void SpectralFrameStats(IList<float[]> spectrogram, int sampleRate, int fftSize, out double meanCentroidHz, out double meanBandwidthHz, out double meanRolloffHz, out double meanFlatness)
{
    if (spectrogram == null || spectrogram.Count == 0)
    {
        meanCentroidHz = meanBandwidthHz = meanRolloffHz = meanFlatness = 0.0;
        return;
    }

    double sumCent = 0.0, sumBw = 0.0, sumRoll = 0.0, sumFlat = 0.0;
    int frames = spectrogram.Count;
    int bins = spectrogram[0].Length; // should be fftSize/2 + 1

    for (int f = 0; f < frames; f++)
    {
        var mag = spectrogram[f];
        double energySum = 1e-12;
        for (int b = 0; b < bins; b++) energySum += mag[b];

        // centroid
        double centroid = 0.0;
        for (int b = 0; b < bins; b++)
        {
            double freq = (double)b * sampleRate / fftSize;
            centroid += freq * mag[b];
        }
        centroid /= energySum;
        sumCent += centroid;

        // bandwidth (std dev around centroid)
        double var = 0.0;
        for (int b = 0; b < bins; b++)
        {
            double freq = (double)b * sampleRate / fftSize;
            var += mag[b] * (freq - centroid) * (freq - centroid);
        }
        var /= energySum;
        sumBw += Math.Sqrt(var);

        // rolloff 85%
        double csum = 0.0;
        double thresh = 0.85 * energySum;
        int idx = 0;
        while (idx < bins && csum < thresh) { csum += mag[idx]; idx++; }
        double roll = (double)idx * sampleRate / fftSize;
        sumRoll += roll;

        // spectral flatness: geometric mean / arithmetic mean of power spectrum (avoid zeros)
        double geoLogSum = 0.0;
        double arithSum = 1e-12;
        for (int b = 0; b < bins; b++)
        {
            double v = Math.Max(1e-12, mag[b]);
            geoLogSum += Math.Log(v);
            arithSum += v;
        }
        double geoMean = Math.Exp(geoLogSum / bins);
        double flatness = geoMean / (arithSum / bins);
        sumFlat += flatness;
    }

    meanCentroidHz = sumCent / frames;
    meanBandwidthHz = sumBw / frames;
    meanRolloffHz = sumRoll / frames;
    meanFlatness = sumFlat / frames;
}

In [None]:
for (int i = 0; i < nrows; i++)
{
    var pathObj = df.Columns["path"][i];
    if (pathObj == null || string.IsNullOrWhiteSpace(pathObj.ToString()))
        continue;

    string path = pathObj.ToString();
    if (!File.Exists(path))
    {
        Console.WriteLine($"Warning: audio file not found: {path}");
        continue;
    }


    try
    {
        // читаем файл через NAudio.AudioFileReader (возвращает float samples in [-1,1])
        using (var afr = new AudioFileReader(path))
        {
            int sr = afr.WaveFormat.SampleRate;
            int channels = afr.WaveFormat.Channels;
            long totalSamplesEstimate = afr.Length / (afr.WaveFormat.BitsPerSample / 8);
            // читаем блоками; аудио читается как interleaved floats (AudioFileReader.Read)
            var sampleList = new List<float>();
            int bufferSize = sr * channels; // ~1 sec buffer
            float[] buffer = new float[bufferSize];
            int read;

            while ((read = afr.Read(buffer, 0, buffer.Length)) > 0)
            {
                if (channels == 1)
                {
                    for (int t = 0; t < read; t++) sampleList.Add(buffer[t]);
                }
                else
                {
                    // mixdown stereo -> mono by averaging channels
                    for (int t = 0; t < read; t += channels)
                    {
                        float acc = 0f;
                        int limit = Math.Min(channels, read - t);
                        for (int c = 0; c < limit; c++) acc += buffer[t + c];
                        sampleList.Add(acc / limit);
                    }
                }
            }

            // теперь у нас массив мономонных samples
            float[] samples = sampleList.ToArray();
            ReadOnlySpan<float> sspan = samples;

            // duration
            double durationSec = samples.Length / (double)sr;
            df.Columns["duration_s"][i] = durationSec;
            df.Columns["sample_rate"][i] = sr;
            df.Columns["channels"][i] = channels;

            // базовые признаки
            double rms = ComputeRms(sspan);
            double peak = ComputePeak(sspan);
            double clipFrac = ComputeClipFrac(sspan, clipThreshold);
            double zcr = ComputeZcr(sspan);

            rmsCol[i] = rms;
            peakCol[i] = peak;
            clipFracCol[i] = clipFrac;
            zcrCol[i] = zcr;

            // STFT / spectrogram (NWaves)
            var signal = new DiscreteSignal(sr, samples);
            var stft = new Stft(fftSize, hopSize);
            // spectrogram: List<float[]> where each float[] is power spectrum (length fftSize/2 + 1)
            var spectrogram = stft.Spectrogram(signal, normalize: false); // normalize:false -> raw power spectra

            // spectral stats
            SpectralFrameStats(spectrogram, sr, fftSize, out double cent, out double bw, out double rolloff, out double flatness);
            specCentCol[i] = cent;
            specBwCol[i] = bw;
            specRolloffCol[i] = rolloff;
            specFlatCol[i] = flatness;

            // MFCC (NWaves)
            var mfccOpts = new MfccOptions
            {
                SamplingRate = sr,
                FeatureCount = mfccCount,
                FrameDuration = (double)fftSize / sr,   // approx frame length in seconds
                HopDuration = (double)hopSize / sr,
                FilterBankSize = filterBankSize,
                PreEmphasis = preEmphasis,
                FftSize = fftSize
            };
            var mfccExtractor = new MfccExtractor(mfccOpts);
            // ComputeFrom(DiscreteSignal) -> IList<float[]> (each frame -> array of mfccCount)
            var mfccVectors = mfccExtractor.ComputeFrom(signal);

            // compute per-coefficient mean/std across frames
            if (mfccVectors != null && mfccVectors.Count > 0)
            {
                int frames = mfccVectors.Count;
                for (int k = 0; k < mfccCount; k++)
                {
                    double sum = 0.0;
                    for (int f = 0; f < frames; f++) sum += mfccVectors[f][k];
                    double mean = sum / frames;
                    double var = 0.0;
                    for (int f = 0; f < frames; f++)
                    {
                        double d = mfccVectors[f][k] - mean;
                        var += d * d;
                    }
                    double std = Math.Sqrt(var / frames);
                    mfccMeanCols[k][i] = mean;
                    mfccStdCols[k][i]  = std;
                }
            }
            else
            {
                // no MFCC frames -> fill zeros
                for (int k = 0; k < mfccCount; k++)
                {
                    mfccMeanCols[k][i] = 0.0;
                    mfccStdCols[k][i] = 0.0;
                }
            }
        } // using afr
    }
    catch (Exception ex)
    {
        Console.WriteLine($"Error processing '{path}': {ex.Message}");
    }

    if (i % 100 == 0)
        Console.WriteLine($"Processed {i}/{nrows} files...");

}

Processed 0/24792 files...
Processed 100/24792 files...
Processed 200/24792 files...
Processed 300/24792 files...
Processed 400/24792 files...
Processed 500/24792 files...
Processed 600/24792 files...
Processed 700/24792 files...
Processed 800/24792 files...
Processed 900/24792 files...
Processed 1000/24792 files...
Processed 1100/24792 files...
Processed 1200/24792 files...
Processed 1300/24792 files...
Processed 1400/24792 files...
Processed 1500/24792 files...
Processed 1600/24792 files...
Processed 1700/24792 files...
Processed 1800/24792 files...
Processed 1900/24792 files...
Processed 2000/24792 files...
Processed 2100/24792 files...
Processed 2200/24792 files...
Processed 2300/24792 files...
Processed 2400/24792 files...
Processed 2500/24792 files...
Processed 2600/24792 files...
Processed 2700/24792 files...
Processed 2800/24792 files...
Processed 2900/24792 files...
Processed 3000/24792 files...
Processed 3100/24792 files...
Processed 3200/24792 files...
Processed 3300/24792 f

In [None]:
// ---------- Сохраняем DataFrame в CSV ----------
string outCsvPath = Path.Combine(dataRoot, "indexed_metadata_with_feats.csv");
DataFrame.SaveCsv(df, outCsvPath, ',', header: true, encoding: System.Text.Encoding.UTF8, cultureInfo: CultureInfo.InvariantCulture);
Console.WriteLine($"Saved extended metadata with features to: {outCsvPath}");

Saved extended metadata with features to: z:\Code\Python\ipynb\NoiseReduction\data\edinbourg\indexed_metadata_with_feats.csv


# EDA
Теперь можно рассмотреть сгенерированные фичи и провести удаление лишних

| Признак                    | Что показывает                                 | Зачем считать                                                                                                                                           |
| -------------------------- | ---------------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------- |
| `duration_s`               | Длительность аудио в секундах                  | Очень короткие фрагменты (например, <0.3–0.5s) могут быть шумом или неполными словами. Их можно убрать, чтобы модель не обучалась на «шумных» примерах. |
| `rms` (Root Mean Square)   | Среднеквадратичная амплитуда сигнала           | Показывает общую энергию аудио. Низкая энергия может означать тихие или почти пустые файлы. Высокая RMS — сильный сигнал.                               |
| `peak`                     | Максимальная амплитуда                         | Помогает найти клиппированные аудио (например, >0.99 в нормализованном виде).                                                                           |
| `clip_fraction`            | Доля сэмплов, достигающих пиковой амплитуды    | Файлы с клиппингом искажают спектр.
| `zero_crossing_rate` (ZCR) | Количество пересечений нуля на единицу времени | Указывает на частоту колебаний сигнала. Очень высокий ZCR может указывать на шум или скрежет, низкий ZCR — на тишину/постоянный тон.                    |
| `spectral_centroid`  | Центр тяжести спектра (Hz)                             | Высокие значения → звук высокочастотный, низкие → басовые. Можно увидеть необычные спектры, которые выбиваются из нормальных голосов. |
| `spectral_bandwidth` | Ширина спектра                                         | Показывает разброс частот. Очень широкие → шум, узкие → тонкие сигналы.                                                               |
| `spectral_rolloff`   | Частота, ниже которой находится, например, 85% энергии | Помогает отделить «шумные» сигналы с большим количеством высоких частот.                                                              |
| `spectral_flatness`  | Мера «плоскости» спектра                               | Плоский спектр → шум; высокий пик → тон или голос.                                                                                    |


**MFCC (Mel-Frequency Cepstral Coefficients)**

Что такое: MFCC — это параметры, которые описывают спектр аудио с точки зрения восприятия человека. Обычно берут 13 коэффициентов.
- mean → среднее значение коэффициента во всём файле (общая «тональность» / тембр)
- std → разброс во времени (динамика звука)

MFCC является основной фичей для распознавания речи и анализа голоса. Позволяет фильтровать аномальные файлы: например, если mfcc_mean или mfcc_std выходят за нормальные диапазоны по корпусу, это может быть пустой или слишком шумный файл.

In [None]:
string outCsvPath = Path.Combine(dataRoot, "indexed_metadata_with_feats.csv");
var df = DataFrame.LoadCsv(outCsvPath, separator: ',', header: true);

In [None]:
df.Head(5)

index,id,filename,set,kind,path,noise_type,snr_db,filesize_bytes,duration_s,sample_rate,channels,rms,peak,clip_fraction,zero_crossing_rate,spectral_centroid,spectral_bandwidth,spectral_rolloff,spectral_flatness,mfcc_mean_1,mfcc_mean_2,mfcc_mean_3,mfcc_mean_4,mfcc_mean_5,mfcc_mean_6,mfcc_mean_7,mfcc_mean_8,mfcc_mean_9,mfcc_mean_10,mfcc_mean_11,mfcc_mean_12,mfcc_mean_13,mfcc_std_1,mfcc_std_2,mfcc_std_3,mfcc_std_4,mfcc_std_5,mfcc_std_6,mfcc_std_7,mfcc_std_8,mfcc_std_9,mfcc_std_10,mfcc_std_11,mfcc_std_12,mfcc_std_13
0,p226_001,p226_001.wav,noisy,Train,z:\Code\Python\ipynb\NoiseReduction\data\edinbourg\noisy_trainset_28spk_wav\p226_001.wav,babble,15,218924,2.28,48000,1,0.06707462,0.5036011,0,0.058470394,976.3967,1189.206,1607.3624,0.013145446,-10.384128,-0.98033506,-0.021782808,0.32324696,0.2429036,0.13629863,-0.3488801,-0.044110995,-0.18484814,-0.16092633,-0.22139503,-0.09813762,0.15506506,3.4309747,2.326395,1.0515844,1.2266172,0.92109823,0.69662714,0.44195265,0.60270494,0.4986998,0.4354456,0.4176389,0.36165,0.3302366
1,p226_002,p226_002.wav,noisy,Train,z:\Code\Python\ipynb\NoiseReduction\data\edinbourg\noisy_trainset_28spk_wav\p226_002.wav,babble,10,374444,3.9,48000,1,0.049257405,0.51260376,0,0.052911326,851.1434,1177.1655,1510.2599,0.00973583,-10.78378,0.040354215,-0.23455612,0.1306125,0.39881825,0.48707452,-0.5051819,-0.20962666,-0.059476364,-0.06041285,-0.3093314,-0.014944172,0.024915088,3.771162,2.4374876,1.1156713,1.134089,0.97345454,0.9818608,0.62296593,0.63001263,0.5767586,0.4728368,0.5102789,0.41236618,0.4334304
2,p226_003,p226_003.wav,noisy,Train,z:\Code\Python\ipynb\NoiseReduction\data\edinbourg\noisy_trainset_28spk_wav\p226_003.wav,babble,5,746378,7.7743125,48000,1,0.056875337,0.4954834,0,0.06189186,1072.3827,1236.5487,1935.4839,0.015855892,-8.8147955,0.02235686,-0.33204758,0.04918779,0.3412993,0.42244703,-0.54328513,-0.17840445,-0.21467881,-0.19004622,-0.33301586,0.039367486,0.009494899,2.7336206,2.746311,1.0762771,1.3061794,0.88410014,0.94357824,0.5803549,0.5855494,0.5928827,0.48015955,0.49156275,0.41243398,0.36181173
3,p226_004,p226_004.wav,noisy,Train,z:\Code\Python\ipynb\NoiseReduction\data\edinbourg\noisy_trainset_28spk_wav\p226_004.wav,babble,0,498284,5.19,48000,1,0.08473252,0.565979,0,0.04403099,657.4803,1029.9865,1216.823,0.0064087547,-6.488204,0.13705643,-0.041589238,0.27549487,0.2780855,0.3695626,-0.5107953,-0.20507029,-0.22200343,-0.09211039,-0.3249578,-0.0957229,-0.031364948,2.5021393,2.1014152,1.0663952,1.1773847,1.0710347,0.9106924,0.58795553,0.62504596,0.56913567,0.5431859,0.47902545,0.39824066,0.3914114
4,p226_005,p226_005.wav,noisy,Train,z:\Code\Python\ipynb\NoiseReduction\data\edinbourg\noisy_trainset_28spk_wav\p226_005.wav,cafeteria,15,712456,7.4209585,48000,1,0.056232303,0.5054016,0,0.05713267,913.1614,1257.1747,1540.6407,0.014083369,-10.520058,-1.0326481,-0.019842476,0.26742902,0.25042817,0.8912508,-0.33428255,-0.39493865,-0.16539182,-0.31315827,-0.08798532,-0.15483733,0.07737549,2.9474547,2.4083557,1.3010606,1.0804851,0.8114594,0.98351234,0.51869476,0.6524494,0.4379639,0.4624834,0.4816264,0.4007784,0.37780747


In [None]:
void ShowFeatureHistograms(DataFrame df, int cols = 3, int width = 1400, int heightPerRow = 350)
{
    var charts = new List<GenericChart>();

    var numericCols = df.Columns
        .Where(c => c.DataType == typeof(double)
                 || c.DataType == typeof(float)
                 || c.DataType == typeof(int)
                 || c.DataType == typeof(long)
                 || c.DataType == typeof(short)
                 || c.DataType == typeof(decimal))
        .Select(c => c.Name)
        .ToList();

    Console.WriteLine($"Found {numericCols.Count} numeric columns.");

    foreach (var colName in numericCols)
    {
        var col = df.Columns[colName];

        var values = new List<double>((int)col.Length);
        for (long i = 0; i < col.Length; i++)
        {
            var obj = col[i];
            if (obj == null || obj == DBNull.Value) continue;
            try
            {
                double v = Convert.ToDouble(obj);
                if (!double.IsNaN(v) && !double.IsInfinity(v))
                    values.Add(v);
            }
            catch
            {
                // пропустить нечисловые/непреобразуемые значения
            }
        }

        if (values.Count == 0) continue;

        var hist = Chart.Histogram<double, double, string>(values)
                        .WithTitle(colName)
                        .WithXAxisStyle(Title.init(Text: colName))
                        .WithYAxisStyle(Title.init(Text: "Count"));

        charts.Add(hist);
    }

    if (charts.Count == 0)
    {
        Console.WriteLine("No numeric charts to display.");
        return;
    }

    int rows = (int)Math.Ceiling(charts.Count / (double)cols);
    int height = Math.Max(600, rows * heightPerRow);

    var layout = Layout.init<double>(
        Width: FSharpOption<int>.Some(width),
        Height: FSharpOption<int>.Some(height),
        ShowLegend: FSharpOption<bool>.Some(false)
    );

    var grid = Chart.Grid(charts.ToArray(), rows, cols)
                    .WithLayout(layout)
                    .WithTitle("Распределение фичей");

    grid.Show();
}


In [None]:
ShowFeatureHistograms(df)

Found 39 numeric columns.


Задача - применить интерквантильный размах для нормализации распределения

In [None]:
double Percentile(List<double> sortedData, double percentile)
{
    double position = (sortedData.Count + 1) * percentile;
    int left = (int)Math.Floor(position) - 1;
    int right = left + 1;

    if (left < 0) return sortedData[0];
    if (right >= sortedData.Count) return sortedData[^1];

    double fraction = position - Math.Floor(position);
    return sortedData[left] + fraction * (sortedData[right] - sortedData[left]);
}

In [None]:
string[] colsToFilter = new[]
{
    "filesize_bytes",
    "duration_s",
    "zero_crossing_rate",
    "spectral_centroid",
    "spectral_bandwidth",
    "spectral_rolloff",
    "spectral_flatness"
};

foreach (var colName in colsToFilter)
{
    Console.WriteLine($"--- {colName} ---");

    var col = df[colName];

    // Собираем значения в double
    List<double> values = new((int)col.Length);
    for (long i = 0; i < col.Length; i++)
    {
        var obj = col[i];
        if (obj == null || obj == DBNull.Value) continue;
        values.Add(Convert.ToDouble(obj));
    }

    if (values.Count == 0) continue;

    // сортируем
    values.Sort();

    double Q1 = Percentile(values, 0.25);
    double Q3 = Percentile(values, 0.75);
    double IQR = Q3 - Q1;

    double upperBound = Q3 + 1.5 * IQR;

    Console.WriteLine($"Q1={Q1}, Q3={Q3}, IQR={IQR}, upperBound={upperBound}");

    // получаем булеву маску
    var mask = df[colName].ElementwiseLessThanOrEqual(upperBound);

    // применяем фильтрацию
    df = df.Filter(mask);
}


--- filesize_bytes ---
Q1=212746.5, Q3=318511.5, IQR=105765, upperBound=477159
--- duration_s ---
Q1=2.1968958377838135, Q3=3.2261667251586914, IQR=1.029270887374878, upperBound=4.770073056221008
--- zero_crossing_rate ---
Q1=0.03728971816599369, Q3=0.06342710182070732, IQR=0.02613738365471363, upperBound=0.10263317730277777
--- spectral_centroid ---
Q1=592.9073181152344, Q3=1035.387939453125, IQR=442.4806213378906, upperBound=1699.108871459961
--- spectral_bandwidth ---
Q1=913.0142822265625, Q3=1306.9985961914062, IQR=393.98431396484375, upperBound=1897.9750671386719
--- spectral_rolloff ---
Q1=990.1805725097656, Q3=1630.553466796875, IQR=640.3728942871094, upperBound=2591.112808227539
--- spectral_flatness ---
Q1=0.007010923931375146, Q3=0.01546744629740715, IQR=0.008456522366032004, upperBound=0.028152229846455157


In [None]:
ShowFeatureHistograms(df)

Found 39 numeric columns.


In [None]:
string outCsvPath = Path.Combine(dataRoot, "indexed_metadata_with_feats_clean.csv");
DataFrame.SaveCsv(df, outCsvPath, ',', header: true, encoding: System.Text.Encoding.UTF8, cultureInfo: CultureInfo.InvariantCulture);

# Собираем в два датасета и группируем по парам

In [4]:
using Microsoft.ML;
using Microsoft.ML.Data;

In [5]:
public class RawAudioRow
{
    [LoadColumn(0)] public string Id { get; set; } = string.Empty;
    [LoadColumn(1)] public string FileName { get; set; } = string.Empty;
    [LoadColumn(2)] public string Set { get; set; } = string.Empty;   // "noisy" или "clean"
    [LoadColumn(3)] public string Kind { get; set; } = string.Empty;  // "Train" или "Test"
    [LoadColumn(4)] public string Path { get; set; } = string.Empty;
    [LoadColumn(8)] public float Duration_s { get; set; } = 0f;
}


In [17]:
public class PairAudioRow
{
    public string Id { get; set; } = string.Empty;
    public string FileName { get; set; } = string.Empty;
    public string CleanPath { get; set; } = string.Empty;
    public string NoisyPath { get; set; } = string.Empty;
    public float CleanDuration_s { get; set; }
    public float NoisyDuration_s { get; set; }
    public string Kind { get; set; } = string.Empty; // Train / Test
}

In [26]:
string inputCsv = @"Z:\Code\Python\ipynb\NoiseReduction\data\edinbourg\indexed_metadata_with_feats_clean.csv";
string outputTrainCsv = "train_audio.csv";
string outputTestCsv = "test_audio.csv"; // как вы попросили

var mlContext = new MLContext();

// Загружаем CSV в IDataView
var dataView = mlContext.Data.LoadFromTextFile<RawAudioRow>(
    path: inputCsv,
    hasHeader: true,
    separatorChar: ','
);

In [27]:
var rows = mlContext.Data.CreateEnumerable<RawAudioRow>(dataView, reuseRowObject: false).ToList();

var grouped = rows.GroupBy(r => r.Id);

var pairs = new List<PairAudioRow>();

foreach (var g in grouped)
{
    var clean = g.FirstOrDefault(r => string.Equals(r.Set, "clean", StringComparison.OrdinalIgnoreCase));
    var noisy = g.FirstOrDefault(r => string.Equals(r.Set, "noisy", StringComparison.OrdinalIgnoreCase));

    if (clean == null || noisy == null)
    {
        // Пропускаем неполные пары
        continue;
    }

    var pair = new PairAudioRow
    {
        Id = g.Key,
        FileName = clean.FileName ?? noisy.FileName,
        CleanPath = clean.Path,
        NoisyPath = noisy.Path,
        CleanDuration_s = clean.Duration_s,
        NoisyDuration_s = noisy.Duration_s,
        Kind = clean.Kind ?? noisy.Kind
    };

    pairs.Add(pair);
}

// Разделяем на Train / Test
var train = pairs.Where(p => string.Equals(p.Kind, "Train", StringComparison.OrdinalIgnoreCase)).ToList();
var test = pairs.Where(p => string.Equals(p.Kind, "Test", StringComparison.OrdinalIgnoreCase)).ToList();

In [28]:
train[0]

Unnamed: 0,Unnamed: 1
Id,p226_002
FileName,p226_002.wav
CleanPath,z:\Code\Python\ipynb\NoiseReduction\data\edinbourg\clean_trainset_28spk_wav\p226_002.wav
NoisyPath,z:\Code\Python\ipynb\NoiseReduction\data\edinbourg\noisy_trainset_28spk_wav\p226_002.wav
CleanDuration_s,3.9
NoisyDuration_s,3.9
Kind,Train


In [29]:
var trainData = mlContext.Data.LoadFromEnumerable<PairAudioRow>(train);
var testData = mlContext.Data.LoadFromEnumerable<PairAudioRow>(test);

In [32]:
using (var stream = File.Create(outputTrainCsv))
{
    mlContext.Data.SaveAsText(
        trainData,
        stream,
        separatorChar: ',',
        schema: false
    );
}

In [33]:
using (var stream = File.Create(outputTestCsv))
{
    mlContext.Data.SaveAsText(
        testData,
        stream,
        separatorChar: ',',
        schema: false
    );
}