In [1]:
import sys
sys.path.insert(0, '/home/ons21553/wspace/interview-transcripts/src')
from common import *

%load_ext autoreload
%autoreload 2

In [16]:
import pandas as pd
import numpy as np
from IPython.display import display

## Load some data

In [3]:
recording = 'bbc-interview'
n_speakers = 2

audio_fpath = from_data_root(f'recordings/{recording}/audio.wav')
transcript = from_data_root(f'recordings/{recording}/transcript.txt')

##### Check out the audio

In [4]:
from pydub import AudioSegment
audio = AudioSegment.from_wav(audio_fpath)
audio

In [5]:
audio[:59000].export(from_data_root(f'recordings/{recording}/audio_1m.wav'), format='wav')

<_io.BufferedRandom name='/home/ons21553/wspace/interview-transcripts/src/../data/recordings/bbc-interview/audio_1m.wav'>

In [21]:
audio_fpath = '/home/ons21553/wspace/interview-transcripts/pyAudioAnalysis/pyAudioAnalysis/data/diarizationExample.wav'

## Inspect the audio - basic properties

In [7]:
import librosa

In [8]:
# * Waveform is merely a graph that displays amplitude or level changes over time
# * Sampling rate: Audio CDs, for example, have a sample rate of 44.1kHz, 
# which means that the analog signal is sampled 44,100 times per second

audio_waveform, sampling_rate = librosa.load(audio_fpath)

In [9]:
print('Audio as waveform')
print(f'Shape: {audio_waveform.shape}')
print(f'Glimpse at few values: {audio_waveform[:10]}')

print()
print(f'Sampling rate: {sampling_rate}')

print()
sec_len = len(audio_waveform) / sampling_rate
print(f'Duration is thus {sec_len} sec: {sec_len // 60} minutes and {sec_len % 60:.2f} seconds')

Audio as waveform
Shape: (6055982,)
Glimpse at few values: [0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]

Sampling rate: 22050

Duration is thus 274.6477097505669 sec: 4.0 minutes and 34.65 seconds


## Segmentation

In [10]:
from inaSpeechSegmenter import Segmenter, seg2csv

Using TensorFlow backend.
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


In [11]:
seg = Segmenter()

W0815 14:52:20.854848 140638076045120 deprecation_wrapper.py:119] From /home/ons21553/.local/share/virtualenvs/src-ppTQaxQI/lib/python3.6/site-packages/keras/backend/tensorflow_backend.py:74: The name tf.get_default_graph is deprecated. Please use tf.compat.v1.get_default_graph instead.

W0815 14:52:20.876023 140638076045120 deprecation_wrapper.py:119] From /home/ons21553/.local/share/virtualenvs/src-ppTQaxQI/lib/python3.6/site-packages/keras/backend/tensorflow_backend.py:517: The name tf.placeholder is deprecated. Please use tf.compat.v1.placeholder instead.

W0815 14:52:20.891334 140638076045120 deprecation_wrapper.py:119] From /home/ons21553/.local/share/virtualenvs/src-ppTQaxQI/lib/python3.6/site-packages/keras/backend/tensorflow_backend.py:4138: The name tf.random_uniform is deprecated. Please use tf.random.uniform instead.

W0815 14:52:20.928360 140638076045120 deprecation_wrapper.py:119] From /home/ons21553/.local/share/virtualenvs/src-ppTQaxQI/lib/python3.6/site-packages/keras/

In [23]:
segmentation = seg(audio_fpath)

  for e, c in six.moves.zip(emission.T, consecutive)
  for e, c in six.moves.zip(constraint.T, consecutive)


In [24]:
segments = [
    {
        'type': s[0],
        'start_s': s[1],
        'end_s': s[2],
        'audio': audio[s[1]*1000:s[2]*1000]
    } for s in segmentation
]

for s in segments[:10]:
    print({k:v for k, v in s.items() if k != 'audio'})
    display(s['audio'])

{'type': 'Male', 'start_s': 0.0, 'end_s': 18.26}


{'type': 'Female', 'start_s': 18.26, 'end_s': 27.400000000000002}


{'type': 'Male', 'start_s': 27.400000000000002, 'end_s': 41.9}


## Speaker diarization

In [249]:
from pyAudioAnalysis import audioSegmentation as audio_seg

In [282]:
# returns a numpy array with the speaker ID (or zero, if no one speaking) for each
# mid-term window (default 2s wide, step size 0.2s)

mt_size = 1
mt_step = 0.1

win2speaker_id = audio_seg.speakerDiarization(
    audio_fpath, 
    n_speakers=2,
    mt_size=mt_size,
    mt_step=mt_step,
    lda_dim=0
)
win2speaker_id

array([1., 1., 1., ..., 1., 1., 1.])

In [283]:
df_diary = pd.DataFrame({
    'speaker_id': win2speaker_id,
    'time_s': [mt_step*i for i in range(len(win2speaker_id))],
}, index=range(len(win2speaker_id)))

print(f'Shape: {df_diary.shape}')
print(f'No. unique speakers: {df_diary["speaker_id"].nunique()}')
print('Speaker ID value counts:\n{}'.format(df_diary['speaker_id'].value_counts()))
df_diary.head()

Shape: (2748, 2)
No. unique speakers: 2
Speaker ID value counts:
1.0    1696
0.0    1052
Name: speaker_id, dtype: int64


Unnamed: 0,speaker_id,time_s
0,1.0,0.0
1,1.0,0.1
2,1.0,0.2
3,1.0,0.3
4,1.0,0.4


In [285]:
# compute segments - intervals when one speaker was speaking
df_diary['prev_speaker_id'] = df_diary.shift()['speaker_id']
df_segments = df_diary[df_diary['speaker_id'] != df_diary['prev_speaker_id']]
df_segments['end_time_s'] = df_segments.shift(-1)['time_s'].fillna(audio.duration_seconds)
df_segments = df_segments.rename(columns={'time_s': 'start_time_s'})
df_segments = df_segments[['speaker_id', 'start_time_s', 'end_time_s']]
print(df_segments.shape)
df_segments.head()

(87, 3)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  after removing the cwd from sys.path.


Unnamed: 0,speaker_id,start_time_s,end_time_s
0,1.0,0.0,10.4
104,0.0,10.4,11.4
114,1.0,11.4,17.3
173,0.0,17.3,18.7
187,1.0,18.7,22.2


In [293]:
for i in range(0, 5):
    speaker_id, start_s, end_s = df_segments.iloc[i][
        ['speaker_id', 'start_time_s', 'end_time_s']
    ]
    start_ms, end_ms = start_s*1000, end_s*1000
    print(f'Speaker {speaker_id:.0f}: {start_s}...{end_s}')
    display(audio[start_ms:end_ms])

Speaker 1: 0.0...10.4


Speaker 0: 10.4...11.4


Speaker 1: 11.4...17.3


Speaker 0: 17.3...18.7


Speaker 1: 18.7...22.200000000000003
