In [25]:
# ---------------------------------------------------------------------------
# Model architecture.
# ---------------------------------------------------------------------------
# Each input patch is a mel-spectrogram of NUM_FRAMES frames by NUM_BANDS
# frequency bands.
NUM_FRAMES = 96
NUM_BANDS = 64
# Size of the embedding layer.
EMBEDDING_SIZE = 128

# ---------------------------------------------------------------------------
# Feature and example generation.
# ---------------------------------------------------------------------------
# Target audio sample rate; audio at any other rate is resampled to this.
SAMPLE_RATE = 16000
# STFT analysis window and hop, in seconds.
STFT_WINDOW_LENGTH_SECONDS = 0.025
STFT_HOP_LENGTH_SECONDS = 0.010
# The mel filterbank produces one bin per input frequency band.
NUM_MEL_BINS = NUM_BANDS
# Lower and upper edges of the mel filterbank, in Hz.
MEL_MIN_HZ = 125
MEL_MAX_HZ = 7500
# Offset used for the stabilized log of the input mel-spectrogram.
LOG_OFFSET = 0.01
# Each example contains 96 frames of 10 ms, and consecutive examples do not
# overlap (hop equals window).
EXAMPLE_WINDOW_SECONDS = 0.96
EXAMPLE_HOP_SECONDS = 0.96

# ---------------------------------------------------------------------------
# Embedding postprocessing.
# ---------------------------------------------------------------------------
PCA_EIGEN_VECTORS_NAME = 'pca_eigen_vectors'
PCA_MEANS_NAME = 'pca_means'
QUANTIZE_MIN_VAL = -2.0
QUANTIZE_MAX_VAL = 2.0

# ---------------------------------------------------------------------------
# Training.
# ---------------------------------------------------------------------------
# Standard deviation used to initialize weights.
INIT_STDDEV = 0.01
# Learning rate and epsilon for the Adam optimizer.
LEARNING_RATE = 1e-4
ADAM_EPSILON = 1e-8

# ---------------------------------------------------------------------------
# Names of ops, tensors, and features.
# ---------------------------------------------------------------------------
INPUT_OP_NAME = 'vggish/input_features'
OUTPUT_OP_NAME = 'vggish/embedding'
# A tensor name is the op name followed by the output index ':0'.
INPUT_TENSOR_NAME = '{}:0'.format(INPUT_OP_NAME)
OUTPUT_TENSOR_NAME = '{}:0'.format(OUTPUT_OP_NAME)
AUDIO_EMBEDDING_FEATURE_NAME = 'audio_embedding'

In [4]:
from tqdm import tqdm
import torch
import os
import cv2

# Directory the input videos are read from.
video_path = '../dataset/videos/video_5k/train_5k/'
# os.listdir returns entries in arbitrary order; sort them so that index-based
# access (e.g. file_names[0]) picks the same file on every run and machine.
file_names = sorted(os.listdir(video_path))
# Output directory. NOTE(review): presumably for extracted frames, judging by
# the path; it is not used in the cells visible here -- confirm.
save_path = '../dataset/eff_frames/train_5k/'

In [9]:
import subprocess

# Use the first listed video as the worked example.
example_file = file_names[0]
# Swap only the file extension for '.wav'. str.replace would rewrite every
# '.mp4' occurrence in the name and leave non-.mp4 names unchanged.
output_audio = os.path.splitext(example_file)[0] + '.wav'

# Pass ffmpeg an argument list rather than a shell string so file names with
# spaces or shell metacharacters are handled safely.
# '-y' overwrites an existing output so re-running the cell is idempotent.
command = [
    'ffmpeg', '-y', '-loglevel', 'error',
    '-i', os.path.join(video_path, example_file),
    output_audio,
]
# check=True raises CalledProcessError if ffmpeg fails, instead of silently
# continuing with a missing or stale WAV file. The cell still displays the
# exit status (0 on success).
subprocess.run(command, check=True).returncode

0

In [15]:
import soundfile as sf
import resampy
# Load the extracted WAV file as 16-bit integer samples; `sr` receives the
# file's own sample rate, which is compared against SAMPLE_RATE further down.
# NOTE(review): for multi-channel audio wav_data is presumably shaped
# (num_samples, num_channels) -- confirm against the soundfile docs.
wav_data, sr = sf.read(output_audio, dtype='int16')

In [21]:
# Normalize: the samples were read as int16, so dividing by 32768 (2**15)
# scales them to floats in [-1.0, 1.0).
samples = wav_data / 32768.0

In [28]:
import numpy as np
# Downmix multi-channel audio to a single channel by averaging across the
# channel axis. Mono files are read as 1-D arrays, for which axis=1 does not
# exist (np.mean would raise), so they are passed through unchanged.
data = np.mean(samples, axis=1) if samples.ndim > 1 else samples

In [29]:
# Bring the waveform to the rate the feature pipeline expects; audio that is
# already at SAMPLE_RATE is left untouched.
# NOTE(review): this overwrites `data` in place, so re-running this cell alone
# would resample already-resampled audio -- re-run from the downmix cell.
if sr != SAMPLE_RATE:
    data = resampy.resample(data, sr, SAMPLE_RATE)

In [31]:
from utils.audio import mel_features 

In [32]:
# Compute the log-mel spectrogram of the waveform, with every setting taken
# from the constants defined at the top of the notebook.
log_mel = mel_features.log_mel_spectrogram(
    data,
    audio_sample_rate=SAMPLE_RATE,
    # STFT framing, in seconds.
    window_length_secs=STFT_WINDOW_LENGTH_SECONDS,
    hop_length_secs=STFT_HOP_LENGTH_SECONDS,
    # Mel filterbank layout.
    num_mel_bins=NUM_MEL_BINS,
    lower_edge_hertz=MEL_MIN_HZ,
    upper_edge_hertz=MEL_MAX_HZ,
    # Offset for the stabilized log.
    log_offset=LOG_OFFSET,
)

In [34]:
# Inspect the spectrogram dimensions; the second axis should equal
# NUM_MEL_BINS (one column per mel bin).
log_mel.shape

(5991, 64)

In [35]:
# Number of spectrogram frames per second (one frame per STFT hop).
features_sample_rate = 1.0 / STFT_HOP_LENGTH_SECONDS

# Example window and hop converted from seconds to whole spectrogram frames.
example_window_length = int(round(features_sample_rate * EXAMPLE_WINDOW_SECONDS))
example_hop_length = int(round(features_sample_rate * EXAMPLE_HOP_SECONDS))

# Slice the spectrogram into fixed-length examples along the frame axis.
log_mel_examples = mel_features.frame(
    log_mel,
    window_length=example_window_length,
    hop_length=example_hop_length,
)

In [37]:
# Inspect the framed examples; expected layout is
# (num_examples, frames per example, mel bins).
log_mel_examples.shape

(62, 96, 64)