In [181]:
import os
import numpy as np 
import tensorflow as tf
import matplotlib.pyplot as plt
import pathlib
import librosa.display
from tqdm import tqdm
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split
import torch
import torchaudio
import torchaudio.functional as F
import torchaudio.transforms as T

import librosa

In [182]:
# Get the data directories
data_dir = "./"
#os.listdir(data_dir)

In [183]:
# get wav paths
def get_wav_paths(speaker):
    speaker_path = data_dir + speaker
    all_paths = [item for item in os.listdir(speaker_path)]
    return all_paths

In [184]:
Putin_paths = get_wav_paths("Putin")
Elcin_paths = get_wav_paths("Elcin")
#print(Elcin_paths)


In [185]:
# load the data
def load_wav(wav_path, speaker):
    with tf.compat.v1.Session(graph=tf.compat.v1.Graph()) as sess:
        wav_path = data_dir + speaker + "/" + wav_path
        wav_filename_placeholder = tf.compat.v1.placeholder(tf.compat.v1.string, [])
        wav_loader = tf.io.read_file(wav_filename_placeholder)
        wav_decoder = tf.audio.decode_wav(wav_loader, desired_channels=1)
        #wav_data = sess.run(
        #    wav_decoder, feed_dict={
        #        wav_filename_placeholder: wav_path
        #    }).audio.flatten().reshape((1, 16000))
        wav_data = sess.run(
            wav_decoder, feed_dict={
                wav_filename_placeholder: wav_path
            }).audio.flatten().reshape((1, 16000))
        sess.close()
    return wav_data

In [186]:
# create training data
def generate_training_data(speaker_paths, speaker, label):
    wavs, labels = [], []
    for i in tqdm(speaker_paths):
        wav = load_wav(i, speaker)
        wavs.append(wav)
        labels.append(label)
    return wavs, labels

In [187]:
Putin_wavs, Putin_labels = generate_training_data(Putin_paths, "Putin", 0) 
Elcin_wavs, Elcin_labels = generate_training_data(Elcin_paths, "Elcin", 1) 

100%|█████████████████████████████████████████████████████████████████████████████| 1652/1652 [00:06<00:00, 262.82it/s]
100%|█████████████████████████████████████████████████████████████████████████████| 1386/1386 [00:05<00:00, 261.59it/s]


In [188]:
#Не нужно для нейросети, просто информация о данных
def print_stats(waveform, sample_rate=None, src=None):
  if src:
    print("-" * 10)
    print("Source:", src)
    print("-" * 10)
  if sample_rate:
    print("Sample Rate:", sample_rate)
  print("Shape:", tuple(waveform.shape))
  print("Dtype:", waveform.dtype)
  print(f" - Max:     {waveform.max().item():6.3f}")
  print(f" - Min:     {waveform.min().item():6.3f}")
  print(f" - Mean:    {waveform.mean().item():6.3f}")
  print(f" - Std Dev: {waveform.std().item():6.3f}")
  print()
  print(waveform)
  print()
def print_metadata(metadata, src=None):
  if src:
    print("-" * 10)
    print("Source:", src)
    print("-" * 10)
  print(" - sample_rate:", metadata.sample_rate)
  print(" - num_channels:", metadata.num_channels)
  print(" - num_frames:", metadata.num_frames)
  print(" - bits_per_sample:", metadata.bits_per_sample)
  print(" - encoding:", metadata.encoding)
  print()

waveform, sample_rate = torchaudio.load('./Elcin/0.wav')
print_stats(waveform, sample_rate=sample_rate)

metadata = torchaudio.info('./Elcin/0.wav')
print_metadata(metadata, src='./Elcin/0.wav')

metadata = torchaudio.info('./Putin/0.wav')
print_metadata(metadata, src='./Putin/0.wav')

Sample Rate: 16000
Shape: (1, 16000)
Dtype: torch.float32
 - Max:      0.616
 - Min:     -0.601
 - Mean:     0.000
 - Std Dev:  0.108

tensor([[-3.0518e-05, -3.9673e-04, -3.3569e-04,  ...,  9.7717e-02,
          8.1818e-02,  6.1951e-02]])

----------
Source: ./Elcin/0.wav
----------
 - sample_rate: 16000
 - num_channels: 1
 - num_frames: 16000
 - bits_per_sample: 16
 - encoding: PCM_S

----------
Source: ./Putin/0.wav
----------
 - sample_rate: 16000
 - num_channels: 1
 - num_frames: 16000
 - bits_per_sample: 16
 - encoding: PCM_S



In [189]:
all_wavs = Putin_wavs + Elcin_wavs
all_labels = Putin_labels + Elcin_labels

In [190]:

final_wavs = np.array(all_wavs)
final_labels = np.array(all_labels)

print(final_wavs.shape, final_labels.shape)

(3038, 1, 16000) (3038,)


In [191]:
train_wavs, test_wavs, train_labels, test_labels = train_test_split(final_wavs, final_labels, test_size=0.1)

In [192]:
train_x, train_y = np.array(train_wavs), np.array(train_labels)
test_x, test_y = np.array(test_wavs), np.array(test_labels)

In [193]:
train_y = tf.keras.utils.to_categorical(train_y)
test_y = tf.keras.utils.to_categorical(test_y)

In [196]:
train_x_new = []
test_x_new = []
INPUT_SHAPE = (126,40)

train_x_new = np.zeros((train_x.shape[0], INPUT_SHAPE[0], INPUT_SHAPE[1]), dtype=np.float64)

count = 0
for sample in train_x:
    mfcc = librosa.feature.mfcc(y=sample, sr=16000, hop_length=128, n_fft=256, n_mfcc=20)
    mfcc_delta = librosa.feature.delta(mfcc)[:10, :]
    mfcc_double_delta = librosa.feature.delta(mfcc, order=2)[:10, :]
    train_x_new[count, :, :20] = mfcc.T
    train_x_new[count, :, 20:30] = mfcc_delta.T
    train_x_new[count, :, 30:] = mfcc_double_delta.T
    count += 1
    if count%500 == 0:
        print('Train', count)
        
test_x_new = np.zeros((test_x.shape[0], INPUT_SHAPE[0], INPUT_SHAPE[1]), dtype=np.float64)

count = 0
for sample in test_x:
    mfcc = librosa.feature.mfcc(y=sample, sr=16000, hop_length=128, n_fft=256, n_mfcc=20)
    mfcc_delta = librosa.feature.delta(mfcc)[:10, :]
    mfcc_double_delta = librosa.feature.delta(mfcc, order=2)[:10, :]
    test_x_new[count, :, :20] = mfcc.T
    test_x_new[count, :, 20:30] = mfcc_delta.T
    test_x_new[count, :, 30:] = mfcc_double_delta.T
    count += 1
    if count%500 == 0:
        print('Test', count)

ParameterError: Invalid shape for monophonic audio: ndim=2, shape=(1, 16000)