In [1]:
# All the audio processing tools are wrapped in the Python Module called AudioPipe
import AudioPipe.speaker.recognition as SR # Speaker Recognition Module
import AudioPipe.fingerprint.panako as FP # Acoustic Fingerprinting Module
from AudioPipe.speaker.silence import remove_silence # tool for removing silence from the audio (not needed)
import numpy as np
from AudioPipe.features import mfcc # Feature Extraction Module, part of the shared preprocessing
import scipy.io.wavfile as wav 
from AudioPipe.speaker.rec import dia2spk, getspk # Speaker Recognition using diarization results
from AudioPipe.utils.utils import video2audio # Format converting module, part of the shared preprocessing
import commands, os
from AudioPipe.diarization.diarization import Diarization # Speaker Diarization Module
import AudioPipe.data.manage as DM # Data Management Module

In [2]:
# Basename (no file extension) of the recording; it is passed to every
# pipeline step below.
name = '2015-08-07_0050_US_FOX-News_US_Presidential_Politics'
# Node for the input videos: Data/Video/ with the .mp4 extension.
Video_node = DM.Node('Data/Video/', '.mp4')

(0, '')


In [3]:
# Select the file holding the meta information for this recording
# (a .seg file under Data/RedHen/, picked by the recording's basename).
Meta_node = DM.Node('Data/RedHen/', '.seg')
meta = Meta_node.Pick(name)

(0, '')


In [4]:
# Store the acoustic fingerprint of the video under Data/Fingerprint/.
# NOTE: this is the only step that unpacks the Flow result into three values
# (output, err, exitcode); the other steps keep it as a single value.
FP_node = DM.Node('Data/Fingerprint/')
output, err, exitcode = Video_node.Flow(FP.Store, name, FP_node, [])

(0, '')


In [5]:
# Convert the video to audio; the target node is Data/Audio/ with the .wav
# extension, and that extension is handed to video2audio as its only extra argument.
Audio_node = DM.Node('Data/Audio/', '.wav')
audio = Video_node.Flow(
    video2audio,
    name,
    Audio_node,
    [Audio_node.ext],
)

(0, '')


In [6]:
# Speaker diarization of the extracted audio; the target node is
# Data/Diarization/ with the .rttm extension.
Dia_node = DM.Node('Data/Diarization/', '.rttm')
# Options forwarded to Diarization.
# NOTE(review): init_cluster is presumably the initial number of clusters, and
# dest_mfcc / dest_cfg the output locations for MFCC features and the
# diarization config -- confirm against AudioPipe.diarization.
args = {
    'init_cluster': 20,
    'dest_mfcc': 'Data/MFCC',
    'dest_cfg': 'Data/Model/DiaCfg',
}
dia = Audio_node.Flow(Diarization, name, Dia_node, args)

(0, '')
(0, '')
(0, '')
(0, '')
(0, '')
(0, '')
Now start extracting mfcc features
(0, "ffmpeg version 2.8.2 Copyright (c) 2000-2015 the FFmpeg developers\n  built with gcc 4.9.3 (GCC)\n  configuration: --prefix=/usr/local/ffmpeg/2.8.2 --enable-shared\n  libavutil      54. 31.100 / 54. 31.100\n  libavcodec     56. 60.100 / 56. 60.100\n  libavformat    56. 40.101 / 56. 40.101\n  libavdevice    56.  4.100 / 56.  4.100\n  libavfilter     5. 40.101 /  5. 40.101\n  libswscale      3.  1.101 /  3.  1.101\n  libswresample   1.  2.101 /  1.  2.101\nGuessed Channel Layout for  Input Stream #0.0 : stereo\nInput #0, wav, from 'Data/Audio/2015-08-07_0050_US_FOX-News_US_Presidential_Politics.wav':\n  Metadata:\n    encoder         : Lavf56.40.101\n  Duration: 02:09:53.41, bitrate: 1411 kb/s\n    Stream #0:0: Audio: pcm_s16le ([1][0][0][0] / 0x0001), 44100 Hz, 2 channels, s16, 1411 kb/s\nOutput #0, wav, to '/home/hxx124/Pipeline/Audio/Pipeline/AudioPipe/diarization/temp_audio/2015-08-07_0050_US_FOX-

In [7]:
# Example of training a fresh gender model; training is skipped when the
# model file already exists on disk.
# Make sure the sample rate of the training data matches that of the testing data.
model_gender_dir = 'Data/Model/Gender/'
model_gender_nm = 'gender.model'
model_gender = os.path.join(model_gender_dir, model_gender_nm)
if not os.path.isfile(model_gender):
    Gender = SR.GMMRec()  # new, untrained recognizer

    # Enroll one training recording per label.
    female_fn = os.path.join(model_gender_dir, 'female.wav')
    Gender.enroll_file('Female', female_fn)
    male_fn = os.path.join(model_gender_dir, 'male.wav')
    Gender.enroll_file('Male', male_fn)

    Gender.train()  # train the GMMs once all training data is enrolled
    Gender.dump(model_gender)  # save the trained model to model_gender for future use

In [8]:
# Gender identification based on the speaker diarization result (dia);
# the target node is Data/Gender/ with the .gen extension.
Gen_node = DM.Node('Data/Gender/', '.gen')
gen = Audio_node.Flow(
    dia2spk,
    name,
    Gen_node,
    [model_gender, dia, meta, Gen_node.ext],
)

(0, '')
(0, '')


In [9]:
# Gender identification without speaker diarization; the target node is
# Data/Gender/ with the .genr extension.
# The result is bound to `genr` (mirroring `spkr` in the speaker-recognition
# step) so that it does not overwrite `gen`, the diarization-based result
# computed in the previous step.
Genr_node = DM.Node("Data/Gender/", ".genr")
genr = Audio_node.Flow(getspk, name, Genr_node, [model_gender, meta, Genr_node.ext])

(0, '')
(0, '')


In [10]:
# Example of training a fresh speaker model; training is skipped when the
# model file already exists on disk.
# The training recordings must already be present in model_speaker_dir.
model_speaker_dir = 'Data/Model/Speaker/'
model_speaker_nm = 'speaker.model'
model_speaker = os.path.join(model_speaker_dir, model_speaker_nm)
if not os.path.isfile(model_speaker):
    Speaker = SR.GMMRec()  # new, untrained recognizer

    # Enroll one training recording per label.
    other_fn = os.path.join(model_speaker_dir, 'Imposter.wav')  # training file for the 'Other' label
    Speaker.enroll_file('Other', other_fn)
    trump_fn = os.path.join(model_speaker_dir, 'Trump.wav')  # training file for the 'Trump' label
    Speaker.enroll_file('Trump', trump_fn)
    # More speakers can be enrolled here with the same two-line pattern.

    Speaker.train()  # train the GMMs once all training data is enrolled
    Speaker.dump(model_speaker)  # save the trained model to model_speaker for future use

In [11]:
# Speaker recognition based on the speaker diarization result (dia);
# the target node is Data/Speaker/ with the .spk extension.
Spk_node = DM.Node('Data/Speaker/', '.spk')
spk = Audio_node.Flow(
    dia2spk,
    name,
    Spk_node,
    [model_speaker, dia, meta, Spk_node.ext],
)

(0, '')
(0, '')


In [12]:
# Speaker recognition without speaker diarization;
# the target node is Data/Speaker/ with the .spkr extension.
Spkr_node = DM.Node('Data/Speaker/', '.spkr')
spkr = Audio_node.Flow(
    getspk,
    name,
    Spkr_node,
    [model_speaker, meta, Spkr_node.ext],
)

(0, '')
(0, '')
