In [14]:
import os
import textgrid
import numpy as np
import librosa
import webrtcvad
import collections
import contextlib
import wave
import sys
from pydub import AudioSegment
from keras.models import Sequential
from keras.layers import Dense, LSTM, Dropout, BatchNormalization, GRU
from keras.optimizers import Adam
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split

def load_textgrid(path):
    tg = textgrid.TextGrid.fromFile(path)
    segments = []
    for tier in tg.tiers:
        if tier.name == 'trans':
            for interval in tier.intervals:
                segments.append((interval.minTime, interval.maxTime, interval.mark))
    return segments

def convert_wav(input_path: str, output_path: str, target_sample_rate=16000):
    audio = AudioSegment.from_wav(input_path)
    audio = audio.set_frame_rate(target_sample_rate)
    audio = audio.set_channels(1)
    audio.export(output_path, format="wav")

def load_wav(path, sample_rate=16000):
    y, sr = librosa.load(path, sr=sample_rate)
    return y, sr

def extract_segments(wav_path, tg_path, segment_length=16000):
    y, sr = load_wav(wav_path)
    segments = load_textgrid(tg_path)
    data = []
    labels = []
    for start, end, label in segments:
        start_sample = int(start * sr)
        end_sample = int(end * sr)
        segment = y[start_sample:end_sample]
        if len(segment) < segment_length:
            segment = np.pad(segment, (0, max(0, segment_length - len(segment))), 'constant')
        else:
            segment = segment[:segment_length]
        data.append(segment)
        labels.append(1 if label != '#' else 0)  # 1 pour parole, 0 pour silence
    return data, labels

def extract_features(segment, sr=16000, n_mfcc=13):
    n_fft = min(len(segment), 512)
    hop_length = n_fft // 2
    
    mfccs = librosa.feature.mfcc(y=segment, sr=sr, n_mfcc=n_mfcc, n_fft=n_fft, hop_length=hop_length)
    chroma = librosa.feature.chroma_stft(y=segment, sr=sr, n_fft=n_fft, hop_length=hop_length)
    mel = librosa.feature.melspectrogram(y=segment, sr=sr, n_fft=n_fft, hop_length=hop_length)
    contrast = librosa.feature.spectral_contrast(y=segment, sr=sr, n_fft=n_fft, hop_length=hop_length)
    tonnetz = librosa.feature.tonnetz(y=librosa.effects.harmonic(segment), sr=sr)
    
    features = np.hstack((np.mean(mfccs.T, axis=0),
                          np.mean(chroma.T, axis=0),
                          np.mean(mel.T, axis=0),
                          np.mean(contrast.T, axis=0),
                          np.mean(tonnetz.T, axis=0)))
    return features

def frame_generator(frame_duration_ms, audio, sample_rate):
    n = int(sample_rate * (frame_duration_ms / 1000.0) * 2)
    offset = 0
    timestamp = 0.0
    duration = (float(n) / sample_rate) / 2.0
    while offset + n < len(audio):
        yield Frame(audio[offset:offset + n], timestamp, duration)
        timestamp += duration
        offset += n

class Frame(object):
    def __init__(self, bytes, timestamp, duration):
        self.bytes = bytes
        self.timestamp = timestamp
        self.duration = duration

def read_wave(path):
    with contextlib.closing(wave.open(path, 'rb')) as wf:
        num_channels = wf.getnchannels()
        assert num_channels == 1, "Le fichier doit être mono"
        sample_width = wf.getsampwidth()
        assert sample_width == 2, "La largeur d'échantillon doit être de 2 octets"
        sample_rate = wf.getframerate()
        assert sample_rate in (8000, 16000, 32000, 48000), f"Taux d'échantillonnage non supporté: {sample_rate}"
        pcm_data = wf.readframes(wf.getnframes())
        return pcm_data, sample_rate

def vad_collector(sample_rate, frame_duration_ms, padding_duration_ms, vad, frames):
    num_padding_frames = int(padding_duration_ms / frame_duration_ms)
    ring_buffer = collections.deque(maxlen=num_padding_frames)
    triggered = False

    voiced_frames = []
    for frame in frames:
        is_speech = vad.is_speech(frame.bytes, sample_rate)

        sys.stdout.write('1' if is_speech else '0')
        if not triggered:
            ring_buffer.append((frame, is_speech))
            num_voiced = len([f for f, speech in ring_buffer if speech])
            if num_voiced > 0.9 * ring_buffer.maxlen:
                triggered = True
                sys.stdout.write('+(%s)' % (ring_buffer[0][0].timestamp,))
                for f, s in ring_buffer:
                    voiced_frames.append(f)
                ring_buffer.clear()
        else:
            voiced_frames.append(frame)
            ring_buffer.append((frame, is_speech))
            num_unvoiced = len([f for f, speech in ring_buffer if not speech])
            if num_unvoiced > 0.9 * ring_buffer.maxlen:
                sys.stdout.write('-(%s)' % (frame.timestamp + frame.duration))
                triggered = False
                yield b''.join([f.bytes for f in voiced_frames])
                ring_buffer.clear()
                voiced_frames = []
    if triggered:
        sys.stdout.write('-(%s)' % (frame.timestamp + frame.duration))
    sys.stdout.write('\n')
    if voiced_frames:
        yield b''.join([f.bytes for f in voiced_frames])

tg_wav_dir = '../../TEXTGRID_WAV/'
data = []
labels = []
converted_wav_files = []

for folder in os.listdir(tg_wav_dir):
    if os.path.isdir(os.path.join(tg_wav_dir, folder)):
        for file in os.listdir(os.path.join(tg_wav_dir, folder)):
            if file.endswith('_MG.TextGrid'):
                tg_path = os.path.join(tg_wav_dir, folder, file)
                wav_path = tg_path.replace('_MG.TextGrid', '_MG.wav')
                converted_wav_path = wav_path.replace('.wav', '_16000Hz.wav')
                convert_wav(wav_path, converted_wav_path)
                d, l = extract_segments(converted_wav_path, tg_path)
                data.extend(d)
                labels.extend(l)
                converted_wav_files.append(converted_wav_path)

data = np.array(data)
labels = np.array(labels)

print("Extraction des caractéristiques...")
features = np.array([extract_features(segment) for segment in data])

print("Séparation des données en ensembles d'entraînement et de test...")
X_train, X_test, y_train, y_test = train_test_split(features, labels, test_size=0.2, random_state=42)

X_train = X_train.reshape((X_train.shape[0], 1, X_train.shape[1]))
X_test = X_test.reshape((X_test.shape[0], 1, X_test.shape[1]))

# Créer un modèle RNN plus complexe
print("Création du modèle...")
model = Sequential()
print("Ajout des couches...")
model.add(GRU(256, return_sequences=True, input_shape=(1, X_train.shape[2])))
model.add(Dropout(0.5))
model.add(BatchNormalization())
model.add(GRU(128))
model.add(Dropout(0.5))
model.add(Dense(1, activation='sigmoid'))

# Compiler le modèle avec un taux d'apprentissage ajusté
optimizer = Adam(learning_rate=0.0001)
print("Compilation du modèle...")
model.compile(loss='binary_crossentropy', optimizer=optimizer, metrics=['accuracy'])

model.fit(X_train, y_train, epochs=30, batch_size=64, validation_data=(X_test, y_test))

predictions = model.predict(X_test)
predictions = (predictions > 0.5).astype(int)

print(classification_report(y_test, predictions))

def save_textgrid(segments, output_path, wav_path):
    y, sr = librosa.load(wav_path, sr=None)
    max_time = len(y) / sr

    new_tg = textgrid.TextGrid()
    interval_tier = textgrid.IntervalTier(name="IPUs", minTime=0, maxTime=max_time)
    new_tg.append(interval_tier)

    for i, (start, end) in enumerate(segments):
        if start < end:
            interval_tier.add(start, end, f"IPUs_{i}")

    new_tg.write(output_path)
    print(f"Nouveau fichier TextGrid sauvegardé: {output_path}")

def predict_segments(wav_path, model, sr=16000, frame_size=2048, hop_length=512):
    y, _ = load_wav(wav_path, sr)
    frames = librosa.util.frame(y, frame_length=frame_size, hop_length=hop_length)
    frames = frames.T
    features = np.array([extract_features(frame, sr) for frame in frames])
    features = features.reshape((features.shape[0], 1, features.shape[1]))
    predictions = model.predict(features)
    return predictions

# Utilisation de WebRTC VAD pour détecter les segments de parole et de silence
def detect_silences_vad(wav_path, aggressiveness=3, frame_duration_ms=10, padding_duration_ms=100):
    audio, sample_rate = read_wave(wav_path)
    vad = webrtcvad.Vad(aggressiveness)
    frames = frame_generator(frame_duration_ms, audio, sample_rate)
    frames = list(frames)
    segments = vad_collector(sample_rate, frame_duration_ms, padding_duration_ms, vad, frames)
    detected_segments = []
    for segment in segments:
        segment_length = len(segment) / 2 # Durée en secondes
        start_time = (audio.index(segment) / 2) / sample_rate # Début en secondes
        end_time = start_time + (segment_length / sample_rate) # Fin en secondes
        detected_segments.append((start_time, end_time)) # Ajouter le segment détecté

    # Fusionner les segments chevauchants ou adjacents
    merged_segments = []
    for start, end in detected_segments:
        if not merged_segments:
            merged_segments.append([start, end])
        else:
            prev_start, prev_end = merged_segments[-1]
            if start < prev_end:
                merged_segments[-1][1] = max(prev_end, end)
            else:
                merged_segments.append([start, end])

    return merged_segments

wav_path = converted_wav_files[0]
print(f"Chemin du fichier wav: {wav_path}")

if not os.path.exists(wav_path):
    print(f"Le fichier {wav_path} n'existe pas. Veuillez fournir un chemin valide.")
else:
    print("Prédiction des segments...")
    predictions = predict_segments(wav_path, model)

    frame_duration = 2048 / 16000
    detected_segments = []
    for i, pred in enumerate(predictions):
        if pred > 0.5:
            start_time = i * frame_duration
            end_time = start_time + frame_duration
            detected_segments.append((start_time, end_time))

    y, sr = librosa.load(wav_path, sr=None)
    max_time = len(y) / sr

    merged_segments = []
    for start, end in detected_segments:
        if start > max_time:
            break
        end = min(end, max_time)
        if not merged_segments:
            merged_segments.append([start, end])
        else:
            prev_start, prev_end = merged_segments[-1]
            if start <= prev_end:
                merged_segments[-1][1] = max(prev_end, end)
            else:
                merged_segments.append([start, end])

    print("Segments fusionnés:", merged_segments)

    output_textgrid_path = tg_wav_dir + "nouveau_fichier_detected.TextGrid"
    save_textgrid(merged_segments, output_textgrid_path, wav_path)


Extraction des caractéristiques...


  return pitch_tuning(


Séparation des données en ensembles d'entraînement et de test...
Création du modèle...
Ajout des couches...
Compilation du modèle...
Epoch 1/30


  super().__init__(**kwargs)


[1m274/274[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 8ms/step - accuracy: 0.7552 - loss: 0.5051 - val_accuracy: 0.9356 - val_loss: 0.1689
Epoch 2/30
[1m274/274[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 7ms/step - accuracy: 0.9409 - loss: 0.1807 - val_accuracy: 0.9580 - val_loss: 0.1259
Epoch 3/30
[1m274/274[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 6ms/step - accuracy: 0.9539 - loss: 0.1375 - val_accuracy: 0.9614 - val_loss: 0.1117
Epoch 4/30
[1m274/274[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 6ms/step - accuracy: 0.9551 - loss: 0.1276 - val_accuracy: 0.9623 - val_loss: 0.1109
Epoch 5/30
[1m274/274[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 7ms/step - accuracy: 0.9617 - loss: 0.1168 - val_accuracy: 0.9637 - val_loss: 0.1069
Epoch 6/30
[1m274/274[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 7ms/step - accuracy: 0.9639 - loss: 0.1139 - val_accuracy: 0.9649 - val_loss: 0.1005
Epoch 7/30
[1m274/274[0m [32m━━━━━━━



[1m356/356[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step
Segments fusionnés: [[8.832, 9.344000000000001], [10.624, 12.032], [12.544, 15.360000000000001], [15.488, 16.768], [17.792, 20.48], [20.608, 21.504], [21.76, 24.832], [25.6, 25.856], [26.112000000000002, 29.952], [30.080000000000002, 30.848], [31.36, 37.12], [37.888, 39.04], [39.296, 48.64], [48.896, 56.704], [56.832, 58.496], [59.52, 62.08], [62.208, 64.512], [64.896, 73.60000000000001], [74.368, 76.8], [76.928, 82.432], [83.072, 90.24], [90.88, 95.744], [96.0, 97.28], [97.536, 97.792], [98.048, 100.096], [100.48, 101.248], [101.376, 105.72800000000001], [106.24000000000001, 106.88], [107.136, 110.592], [110.848, 128.128], [128.12800000000001, 128.384], [128.38400000000001, 128.64], [128.64000000000001, 128.896], [129.536, 129.664], [129.66400000000002, 129.92], [129.92000000000002, 130.176], [130.17600000000002, 130.432], [130.43200000000002, 130.688], [130.68800000000002, 130.944], [130.94400000000002, 131.32