In [1]:
from pydub import AudioSegment, silence
from mutagen.mp3 import MP3
from pathlib import Path
import pandas as pd
from utils import *
from audio_utils import *
import logging
import shutil
import re

logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')


# Lower-case phrases searched for (as whole words) in transcription segments to
# detect the end / the start of a conversation.
END_OF_CONV_WORDS = ["au revoir", "bon courage", "bonne soirée", "bonne journée", "bye", "goodbye"]
START_OF_CONV_WORDS = ["bonjour", "bon jour", "hello", "hi", "good morning"]
# Root folder of the dataset handled by this notebook.
DATA_FOLDER = Path("../audio_database/ch30_test/")

# Folder scanned for transcription JSON files when looking for merged recordings.
transcription_folder = DATA_FOLDER / 'raw_transcriptions'

In [None]:
import librosa
import noisereduce as nr
import numpy as np

def load_audio(file_path, sr=None):
    """Load an audio file with librosa.

    Args:
        file_path: Location of the audio file (str or Path).
        sr: Sample rate forwarded to ``librosa.load``. The default ``None``
            keeps the file's native sample rate, which is what the original
            one-argument version always did.

    Returns:
        tuple: ``(audio, sr)`` exactly as returned by ``librosa.load``.
    """
    # Accepting ``sr`` keeps callers that pass ``sr=None`` explicitly working.
    return librosa.load(str(file_path), sr=sr)

def reduce_noise(audio, sr):
    """Denoise ``audio`` by delegating to ``noisereduce.reduce_noise``.

    Args:
        audio: Signal samples, passed as ``y``.
        sr: Sample rate of ``audio``.

    Returns:
        Whatever ``nr.reduce_noise`` returns for the given signal.
    """
    denoised = nr.reduce_noise(y=audio, sr=sr)
    return denoised

def normalize_volume(audio):
    """Scale ``audio`` so that its root-mean-square amplitude equals 1.

    Args:
        audio: Array-like of samples.

    Returns:
        numpy.ndarray: ``audio / rms``. Empty or completely silent input
        (RMS of 0 or a non-finite RMS) is returned unscaled instead of being
        turned into NaN/inf by a division by zero.
    """
    audio = np.asarray(audio)
    if audio.size == 0:
        return audio

    rms = np.sqrt(np.mean(audio**2))
    # A silent recording has rms == 0; dividing would fill the output with NaN.
    if not np.isfinite(rms) or rms == 0:
        return audio
    return audio / rms

def segment_audio(audio, sr, segment_length=5):
    """Split ``audio`` into consecutive fixed-length segments.

    Args:
        audio: Sliceable sequence of samples.
        sr: Sample rate, in samples per second.
        segment_length: Segment duration in seconds (int or float). Default 5.

    Returns:
        list: Slices of ``audio``; the last one may be shorter than the others.

    Raises:
        ValueError: If ``segment_length * sr`` does not amount to at least one sample.
    """
    # range() only accepts integers, so a fractional duration must be converted
    # to a whole number of samples first.
    buffer = int(round(segment_length * sr))
    if buffer <= 0:
        raise ValueError("segment_length * sr must be a positive number of samples")
    return [audio[i:i + buffer] for i in range(0, len(audio), buffer)]

def resample_audio(audio, sr, target_sr=16000):
    """Resample ``audio`` from ``sr`` to ``target_sr`` using ``librosa.resample``.

    Args:
        audio: Signal samples.
        sr: Current sample rate of ``audio``.
        target_sr: Desired sample rate. Default 16000.

    Returns:
        The resampled signal as returned by ``librosa.resample``.
    """
    resampled = librosa.resample(audio, orig_sr=sr, target_sr=target_sr)
    return resampled

def final_preprocessing(audio, sr):
    """Placeholder for a last preprocessing stage.

    Currently a no-op: ``audio`` is handed back untouched and ``sr`` is
    accepted but not used. Format conversion for Faster Whisper could be
    added here later.
    """
    processed = audio
    return processed

def preprocess_audio(file_path, output_file_path, target_sr=16000):
    """Run the preprocessing chain on one audio file and save the result.

    Steps: load -> noise reduction -> volume normalisation -> resampling ->
    final (placeholder) preprocessing -> save.

    Args:
        file_path: Audio file to read.
        output_file_path: Where the processed audio is written.
        target_sr: Sample rate the audio is resampled to and saved with.
            Default 16000.

    Returns:
        The processed (resampled) audio samples.
    """
    audio, sr = load_audio(file_path)
    audio = reduce_noise(audio, sr)
    audio = normalize_volume(audio)
    # Segmentation is intentionally skipped; call segment_audio(audio, sr) here
    # if fixed-length chunks are ever needed.
    audio = resample_audio(audio, sr, target_sr=target_sr)
    # From here on the samples are at target_sr, so that rate (not the source
    # rate) must be passed along; saving with the old rate would change the
    # playback speed and pitch of the file.
    audio = final_preprocessing(audio, target_sr)
    save_audio_segment(output_file_path, audio, target_sr)
    return audio


# Batch-preprocess every MP3 found directly in the test folder.
# NOTE(review): the same path is passed as input and output, so each file is
# presumably overwritten in place — re-running this cell would then process
# already-processed audio again. Confirm this is intended.
data_path = Path('../audio_database/test_preprocess/preprocessed_first_transcription/')
for file in data_path.glob('*.mp3'):
    preprocess_audio(file, file)

In [None]:
def remove_silence_with_crossfade(audio_path, output_path, silence_thresh=-50, min_silence_len=1000, silence_chunk_len=500, crossfade_len=100):
    """
    Removes or shortens silence from an audio file with crossfade for smoother transitions.

    Every non-silent chunk is kept and followed by ``silence_chunk_len`` ms of
    silence (including after the last chunk); consecutive pieces are joined
    with a crossfade.

    :param audio_path: Path to the input audio file.
    :param output_path: Path to save the modified audio file (exported as MP3).
    :param silence_thresh: The threshold in dBFS considered as silence. Default is -50 dBFS.
    :param min_silence_len: Minimum length of silence to be considered for removal in milliseconds. Default is 1000 ms.
    :param silence_chunk_len: Length to which silence should be reduced in milliseconds. Default is 500 ms.
    :param crossfade_len: Length of the crossfade in milliseconds. Default is 100 ms.
        It is shortened automatically when one of the two pieces being joined is shorter.
    """
    audio = AudioSegment.from_file(audio_path)

    # Detect non-silent chunks as [start_ms, end_ms] pairs
    nonsilent_chunks = silence.detect_nonsilent(
        audio, min_silence_len=min_silence_len, silence_thresh=silence_thresh
    )

    # Rebuild the recording chunk by chunk
    processed_audio = AudioSegment.silent(duration=0)
    for start_i, end_i in nonsilent_chunks:
        chunk = audio[start_i:end_i]
        if len(processed_audio) > 0:
            # AudioSegment.append raises ValueError when the crossfade is longer
            # than either segment, so clamp it to the shorter of the two.
            fade = min(crossfade_len, len(processed_audio), len(chunk))
            processed_audio = processed_audio.append(chunk, crossfade=fade)
        else:
            processed_audio = chunk
        processed_audio += AudioSegment.silent(duration=silence_chunk_len)

    # Export the processed audio
    processed_audio.export(output_path, format='mp3')


### First identify merged files

In [None]:
def identify_merged_files(folder_path, gap_threshold=15, start_word_cooldown=15):
    """
    Find transcriptions that look like several conversations recorded into one file.

    Every ``*.json`` file below ``folder_path`` is read with
    ``read_transcription_data``. A segment is recorded as a cut point when it
    starts at 2 seconds or later, contains a start-of-conversation word, the
    most recent end-of-conversation word was heard at most ``gap_threshold``
    seconds earlier, and the previously recorded cut point (if any) is more
    than ``start_word_cooldown`` seconds old.

    The parent folder of each flagged file is copied into a ``merged_files``
    folder created next to ``folder_path`` (existing copies are left alone).

    Args:
    folder_path (str): Folder searched recursively for transcription JSON files.
    gap_threshold (int): Maximum delay in seconds between an end word and the following start word.
    start_word_cooldown (int): Minimum delay in seconds between two recorded cut points.

    Returns:
    list of tuples: ``(json_path_as_str, [(segment_start, start_word), ...])`` for each flagged file.
    """

    def _compile(words):
        # Whole-word pattern for each phrase, paired with the phrase itself.
        return [(word, re.compile(r'\b' + re.escape(word) + r'\b')) for word in words]

    end_patterns = _compile(END_OF_CONV_WORDS)
    start_patterns = _compile(START_OF_CONV_WORDS)
    never = float('inf')

    folder_path = Path(folder_path)
    merged_files = []

    for audio_file in folder_path.rglob('*.json'):
        cut_points = []
        previous_end_time = None
        previous_cut_time = None

        for segment in read_transcription_data(audio_file):
            text = segment['text'].lower().strip()
            start = float(segment['start'])

            # Remember when a farewell was last heard (checked before the greeting test).
            if any(pattern.search(text) for _, pattern in end_patterns):
                previous_end_time = start

            # A greeting within the first 2 seconds is the normal start of the recording.
            if start < 2:
                continue

            since_end = never if previous_end_time is None else start - previous_end_time
            since_cut = never if previous_cut_time is None else start - previous_cut_time
            if not (since_end <= gap_threshold and since_cut > start_word_cooldown):
                continue

            greeting = next((word for word, pattern in start_patterns if pattern.search(text)), None)
            if greeting is None:
                continue

            print(audio_file, since_end, since_cut)
            cut_points.append((start, greeting))
            previous_cut_time = start

        if cut_points:
            merged_files.append((str(audio_file), list(cut_points)))

    if merged_files:
        merged_files_folder = folder_path.parent / "merged_files"
        merged_files_folder.mkdir(exist_ok=True)
        for flagged_path, _ in merged_files:
            source_path = Path(flagged_path).parent
            dest_path = merged_files_folder / source_path.name
            if not dest_path.exists():
                shutil.copytree(source_path, dest_path)

    return merged_files

In [None]:
# Scan the transcription folder for merged recordings, print how many were
# found, then display the list itself as the cell output.
merged_files = identify_merged_files(transcription_folder)
print(len(merged_files))
merged_files

### Cut and save audio files

In [None]:
def _audio_path_for(file_path):
    """Map a ``<stem>_segments_data.json`` transcription path to its sibling ``<stem>.mp3``.

    Paths that are not JSON files, or whose sibling MP3 does not exist, are
    returned unchanged.
    """
    file_path = Path(file_path)
    if file_path.suffix.lower() != '.json':
        return file_path
    stem = file_path.stem
    marker = '_segments_data'
    if stem.endswith(marker):
        stem = stem[:-len(marker)]
    candidate = file_path.with_name(stem + '.mp3')
    return candidate if candidate.exists() else file_path


def cut_and_save_audio_files(splited_files_folder, merged_files):
    """
    Process each audio file to cut and save segments based on specified cutting times.

    Args:
    splited_files_folder (str or Path): Folder where the cut segments are written (created if missing).
    merged_files (list of tuples): Each tuple contains the file path and a list of
        ``(cut_time_in_seconds, start_word)`` pairs. A path pointing to a
        ``*_segments_data.json`` transcription is mapped to the MP3 next to it.

    This function processes each specified audio file, cutting it into segments at the specified times, and saves these segments as new audio files.
    Each segment gets a title of the form "start - end", where the timestamps are
    derived from the original file's title and the cut offsets.
    """
    splited_files_folder = Path(splited_files_folder)
    splited_files_folder.mkdir(exist_ok=True, parents=True)

    for file_path, cutting_times in merged_files:
        # identify_merged_files reports the transcription JSON, not the audio file.
        file_path = _audio_path_for(file_path)
        # load_audio already loads at the native sample rate.
        y, sr = load_audio(file_path)
        original_stem = file_path.stem

        # The source metadata does not change between segments: read it once.
        # This also keeps the timestamps defined when cutting_times is empty.
        audio = MP3(file_path)
        title = audio['TIT2'][0] if 'TIT2' in audio else 'Unknown'
        start_timestamp, end_timestamp = parse_timestamp_from_title(title)

        # Segment boundaries in samples; the final boundary is the end of the file.
        boundaries = [int(cut_time * sr) for cut_time, _ in cutting_times]
        boundaries.append(len(y))

        prev_cut_sample = 0
        last_index = len(boundaries) - 1
        for index, cut_sample in enumerate(boundaries):
            audio_segment = split_audio(file_path, prev_cut_sample, cut_sample, sr)

            # Determine new file name and title
            new_start_timestamp = start_timestamp + pd.to_timedelta(prev_cut_sample / sr, unit='s')
            if index == last_index:
                # The last segment ends where the original recording ends.
                new_end_timestamp = end_timestamp
            else:
                new_end_timestamp = start_timestamp + pd.to_timedelta(cut_sample / sr, unit='s')
            new_file_name = format_filename(new_start_timestamp, original_stem)
            new_title = f"{new_start_timestamp.strftime('%d/%m/%Y %H:%M:%S')} - {new_end_timestamp.strftime('%d/%m/%Y %H:%M:%S')}"

            # Save the audio segment
            segment_file_path = splited_files_folder / new_file_name
            save_audio_segment(segment_file_path, audio_segment, sr)

            # Update metadata for the segment
            copy_metadata(file_path, segment_file_path, new_title)

            prev_cut_sample = cut_sample

        # Optionally delete the original file's transcription folder
        # shutil.rmtree(file_path.parent)

In [None]:
# Cut every recording flagged above and write the pieces under DATA_FOLDER / 'splited_files'.
splited_folder = DATA_FOLDER / 'splited_files'
cut_and_save_audio_files(splited_folder, merged_files)

### Now we need to rerun transcribe_all on those new files

In [None]:
!python .\transcribe_all.py --source_folder "..\splited_files" --dest_folder "..\splited_files" --move

We now have a `splited_files` folder containing the unmerged files. The next step is to identify the files that belong to the same conversation and fuse them.

### Then create the audio DataFrame

In [4]:
def check_for_problematic_end(transcription_data, words, end_time, density_threshold=0.2, diversity_threshold=0.5):
    """Tell whether the speech following the last end-of-conversation word looks suspicious.

    Args:
        transcription_data: Sequence of segments, each a mapping with ``'text'``,
            ``'start'`` and ``'end'`` entries. Timestamps may be numbers or
            numeric strings.
        words: Phrases (lower-case) marking the end of a conversation.
        end_time: Total duration of the recording, in seconds.
        density_threshold: Words-per-second rate above which the tail is flagged.
        diversity_threshold: Unique/total word ratio below which the tail is flagged.

    Returns:
        bool: ``False`` when no end word is found; otherwise ``True`` when the
        segments starting at or after the end of the last end-word segment are
        too dense or too repetitive.
    """
    patterns = [re.compile(r'\b' + re.escape(word) + r'\b') for word in words]

    # Find the time of the last occurrence of an end word
    last_end_word_time = None
    for segment in reversed(transcription_data):
        text = segment['text'].lower()
        if any(pattern.search(text) for pattern in patterns):
            # Cast like the other readers of this data do, so string timestamps
            # do not break the comparisons and the subtraction below.
            last_end_word_time = float(segment['end'])
            break

    if last_end_word_time is None:
        return False

    # Analyze word density and diversity after the last end word
    remaining_words = [
        word
        for seg in transcription_data
        if float(seg['start']) >= last_end_word_time
        for word in seg['text'].split()
    ]
    total_words = len(remaining_words)
    unique_words = len(set(remaining_words))

    remaining_time = float(end_time) - last_end_word_time
    word_density = total_words / remaining_time if remaining_time > 0 else 0
    word_diversity = unique_words / total_words if total_words > 0 else 1

    return word_density > density_threshold or word_diversity < diversity_threshold

def create_audio_database(audio_files):
    """
    Create a database of audio files with metadata and transcription data.

    Args:
    audio_files (iterable of str or Path): Audio (MP3) files to describe. Next to
        ``<stem>.mp3`` the function reads ``<stem>_segments_data.json`` and looks
        for ``<stem>_transcription.txt``.

    Returns:
    pandas.DataFrame: One row per successfully processed file. The columns are
        always present, even when no file could be processed. Files that raise
        an error are reported on stdout and skipped.
    """
    columns = [
        "File Name", "File Path", "Transcription Path", "Start Timestamp",
        "End Timestamp", "Audio Length", "Is End File", "Is Start File",
        "Is Complete", "Precedent File", "Next File",
    ]
    data = []

    for audio_file in audio_files:
        try:
            audio_file = Path(audio_file)
            audio = MP3(audio_file)
            title = audio['TIT2'][0] if 'TIT2' in audio else 'Unknown'
            start_timestamp, end_timestamp = parse_timestamp_from_title(title)
            audio_length = audio.info.length

            transcription_path = audio_file.with_name(audio_file.stem + '_transcription.txt')
            transcription_data_path = audio_file.with_name(audio_file.stem + '_segments_data.json')
            transcription_data = read_transcription_data(transcription_data_path)

            # Flags are forced to real booleans so the resulting columns can be
            # used directly as boolean masks.
            is_start_file = bool(check_word_in_timeframe(transcription_data, START_OF_CONV_WORDS, 0, 10))

            is_end_file = bool(check_word_in_timeframe(transcription_data, END_OF_CONV_WORDS, max(0, audio_length - 10), audio_length, is_end_segment=True))
            is_problematic_end = bool(check_for_problematic_end(transcription_data, END_OF_CONV_WORDS, audio_length, density_threshold=0.25, diversity_threshold=0.5))
            is_end_file = is_end_file or is_problematic_end

            is_complete = is_start_file and is_end_file

            # A complete file is neither a pure "start" nor a pure "end" file
            if is_complete:
                is_start_file = False
                is_end_file = False

            data.append({
                "File Name": audio_file.stem,
                "File Path": str(audio_file),
                "Transcription Path": str(transcription_path) if transcription_path.exists() else None,
                "Start Timestamp": start_timestamp,
                "End Timestamp": end_timestamp,
                "Audio Length": audio_length,
                "Is End File": is_end_file,
                "Is Start File": is_start_file,
                "Is Complete": is_complete,
                "Precedent File": None,
                "Next File": None
            })
        except Exception as e:
            # Best effort: one unreadable file must not abort the whole scan.
            print(f"Error processing file {audio_file}: {e}")

    # Explicit columns keep the schema stable when `data` is empty.
    return pd.DataFrame(data, columns=columns)


In [5]:
# Usage

# Alternative input selection, kept for reference: take every MP3 of the
# transcription folder except the recordings flagged as merged, plus the
# pieces produced by the splitting step.
# if merged_files:
#     merged_files_path_list = [Path(file_path).parent for file_path, _ in merged_files]
#     audio_files = [audio_file for audio_file in transcription_folder.rglob('*.mp3') if audio_file.parent not in merged_files_path_list] \
#                 + [audio_file for audio_file in splited_folder.rglob('*.mp3')]    
# else:
#     audio_files = [audio_file for audio_file in transcription_folder.rglob('*.mp3') ] \
#                 + [audio_file for audio_file in splited_folder.rglob('*.mp3')]

# Currently the database is built from a separate, already processed test folder.
folder = Path("../audio_database/test_audio_processing/processed/")
audio_files = [audio_file for audio_file in folder.rglob('*.mp3') ]

# Build the metadata table and display it as the cell output.
df = create_audio_database(audio_files)
df

Unnamed: 0,File Name,File Path,Transcription Path,Start Timestamp,End Timestamp,Audio Length,Is End File,Is Start File,Is Complete,Precedent File,Next File
0,2023_1_15_20_27_15_ch30,..\audio_database\test_audio_processing\proces...,..\audio_database\test_audio_processing\proces...,2023-01-15 20:27:15,2023-01-15 20:27:37,22.716,False,True,False,,
1,2023_1_15_20_27_38_ch30,..\audio_database\test_audio_processing\proces...,..\audio_database\test_audio_processing\proces...,2023-01-15 20:27:38,2023-01-15 20:28:18,40.104,False,False,False,,
2,2023_1_15_20_58_32_ch30,..\audio_database\test_audio_processing\proces...,..\audio_database\test_audio_processing\proces...,2023-01-15 20:58:32,2023-01-15 21:00:33,120.816,False,True,False,,
3,2023_1_15_21_0_34_ch30,..\audio_database\test_audio_processing\proces...,..\audio_database\test_audio_processing\proces...,2023-01-15 21:00:34,2023-01-15 21:02:41,126.972,True,False,False,,
4,2023_1_15_21_34_9_ch30,..\audio_database\test_audio_processing\proces...,..\audio_database\test_audio_processing\proces...,2023-01-15 21:34:09,2023-01-15 21:35:17,67.572,False,False,False,,
5,2023_1_15_21_35_18_ch30,..\audio_database\test_audio_processing\proces...,..\audio_database\test_audio_processing\proces...,2023-01-15 21:35:18,2023-01-15 21:35:24,6.36,True,False,False,,
6,2023_1_15_21_35_24_ch30,..\audio_database\test_audio_processing\proces...,..\audio_database\test_audio_processing\proces...,2023-01-15 21:35:24,2023-01-15 21:37:50,146.23,False,False,True,,
7,2023_1_15_21_37_50_ch30,..\audio_database\test_audio_processing\proces...,..\audio_database\test_audio_processing\proces...,2023-01-15 21:37:50,2023-01-15 21:39:00,70.21,False,False,True,,
8,2023_1_15_21_39_0_ch30,..\audio_database\test_audio_processing\proces...,..\audio_database\test_audio_processing\proces...,2023-01-15 21:39:00,2023-01-15 21:40:12,71.248,False,False,True,,


In [None]:
# Inspect two specific recordings by file name.
# NOTE(review): these names (2022, ch25) do not appear in the ch30 table shown
# above, so this filter presumably targets another dataset — confirm.
df[(df['File Name'] == "2022_1_4_0_2_37_ch25") | (df['File Name'] == "2022_1_4_0_4_15_ch25")]

### Regroup associated files

In [None]:
def _channel_of(row):
    """Return the last underscore-separated token of the row's file name (e.g. ``ch30``)."""
    return row['File Name'].split('_')[-1]


def _first_following(candidates, channel, reference_time, time_delta):
    """Return the first row of ``candidates`` on ``channel`` that starts 0..time_delta seconds after ``reference_time``, else ``None``."""
    if pd.isnull(reference_time):
        return None
    for _, row in candidates.iterrows():
        if _channel_of(row) != channel:
            continue
        start_time = row['Start Timestamp']
        if pd.isnull(start_time):
            continue
        if 0 <= (start_time - reference_time).total_seconds() <= time_delta:
            return row
    return None


def find_associated_files(df, time_delta=20):
    """
    Identify groups of audio files that are associated based on their timestamps and the status of the file (start of conversation, end of conversation, neither).

    For every "start" file the function looks for an intermediate file (neither
    start, end nor complete) that begins within ``time_delta`` seconds after it,
    then for an "end" file that begins within ``time_delta`` seconds after that
    intermediate file. Without an intermediate file, an "end" file directly
    following the start file is searched instead. Only files whose name ends
    with the same channel token as the start file are considered.

    Args:
    df (pandas.DataFrame): A DataFrame containing audio file metadata, including the
        'File Name', 'File Path', 'Start Timestamp', 'End Timestamp', 'Is Start File',
        'Is End File' and 'Is Complete' columns.
    time_delta (int): The maximum time difference in seconds between the end of one file and the start of another for them to be considered associated.

    Returns:
    list of tuples: Each tuple contains file paths of associated audio files
        (start, end), (start, intermediate) or (start, intermediate, end). The files in each tuple are considered part of the same conversation or related sequence.
    """
    associated_files = []

    # Filter DataFrame for start, other, and end files
    start_files = df[df['Is Start File']]
    other_files = df[~df['Is End File'] & ~df['Is Start File'] & ~df['Is Complete']]
    end_files = df[df['Is End File']]

    for _, start_row in start_files.iterrows():
        end_time_beginning = start_row['End Timestamp']
        if pd.isnull(end_time_beginning):
            continue

        # The channel is read from the named column rather than from column
        # position 0, so an extra leading column cannot break the lookup.
        channel = _channel_of(start_row)

        other_file_found = _first_following(other_files, channel, end_time_beginning, time_delta)

        if other_file_found is not None:
            # Chain start -> intermediate (-> end)
            end_file_found = _first_following(end_files, channel, other_file_found['End Timestamp'], time_delta)
            if end_file_found is not None:
                associated_files.append((start_row['File Path'], other_file_found['File Path'], end_file_found['File Path']))
            else:
                associated_files.append((start_row['File Path'], other_file_found['File Path']))
        else:
            # No intermediate file: look for an end file right after the start file
            end_file_found = _first_following(end_files, channel, end_time_beginning, time_delta)
            if end_file_found is not None:
                associated_files.append((start_row['File Path'], end_file_found['File Path']))

    return associated_files

# Group the files of `df` into conversations and print each group followed by
# the number of groups found.
associated_files = find_associated_files(df)
for files in associated_files:
    print(f"Associated files: {', '.join(files)}")
print(len(associated_files))

In [None]:
import os
from shutil import copy2

def merge_audio_segments(audio_tuples, output_folder, segment_length=5):
    """Write, for each pair of consecutive files of a group, a short clip spanning their junction.

    For every tuple a ``merged_audio_<index>`` subfolder is created in
    ``output_folder``. For each pair of consecutive files the last
    ``segment_length`` seconds of the first and the first ``segment_length``
    seconds of the second are concatenated and saved as
    ``merged_segment_<i>.wav``; both source files are copied next to it.

    Args:
        audio_tuples: Iterable of tuples of audio file paths.
        output_folder: Folder receiving the subfolders (created if missing).
        segment_length: Seconds taken on each side of the junction (int or float). Default 5.

    Raises:
        ValueError: If two consecutive files have different sample rates.
    """
    for tuple_index, audio_tuple in enumerate(audio_tuples):
        # Create a subfolder for each merged audio
        subfolder_path = Path(output_folder) / f"merged_audio_{tuple_index}"
        subfolder_path.mkdir(exist_ok=True, parents=True)

        for i in range(len(audio_tuple) - 1):
            audio_file_path = Path(audio_tuple[i])
            next_audio_file_path = Path(audio_tuple[i + 1])

            if not audio_file_path.exists() or not next_audio_file_path.exists():
                print(f"File not found: {audio_file_path} or {next_audio_file_path}")
                continue

            # Load the current and next audio files using librosa
            current_audio, sr_current = librosa.load(audio_file_path, sr=None)
            next_audio, sr_next = librosa.load(next_audio_file_path, sr=None)

            # Ensure both files have the same sample rate
            if sr_current != sr_next:
                raise ValueError("Sample rates do not match")

            # Convert segment length from seconds to a whole number of samples
            # (slice bounds must be integers).
            segment_length_samples = max(0, int(round(segment_length * sr_current)))

            # Extract segments. The tail is sliced from an explicit start index
            # because `x[-0:]` would return the whole array instead of nothing.
            tail_length = min(segment_length_samples, len(current_audio))
            current_audio_segment = current_audio[len(current_audio) - tail_length:]
            next_audio_segment = next_audio[:min(segment_length_samples, len(next_audio))]

            # Merge the segments
            merged_segment = np.concatenate((current_audio_segment, next_audio_segment))

            copy2(audio_tuple[i], subfolder_path)
            copy2(audio_tuple[i + 1], subfolder_path)

            # Save the merged segment in the subfolder using soundfile
            # NOTE(review): `sf` is not imported explicitly in this notebook; it
            # presumably comes from one of the star imports — confirm.
            merged_output_path = os.path.join(subfolder_path, f"merged_segment_{i}.wav")
            sf.write(merged_output_path, merged_segment, sr_current)


# Write the junction clips of every associated group under DATA_FOLDER / 'test'.
merge_audio_segments(associated_files, DATA_FOLDER / 'test')

### Copy grouped files and merge the audio

In [None]:
def merge_associated_files(associated_files, df, fused_audio_folder):
    """
    Fuse every group of associated audio files into one file stored in ``fused_audio_folder``.

    Args:
    associated_files (list of tuples): Each tuple contains file paths of associated audio files.
    df (pandas.DataFrame): A DataFrame containing audio file metadata; each path must appear in its 'File Path' column.
    fused_audio_folder (Path): Folder receiving the fused files (created if missing).

    For each group the audio is merged with ``merge_audios`` and saved under the
    name of the group's first file. The metadata of the first file is copied to
    the result with a title "start - end" built from the earliest start and the
    latest end timestamp found in the group.
    """

    fused_audio_folder.mkdir(exist_ok=True)

    title_format = "%d/%m/%Y %H:%M:%S"

    for group in associated_files:
        members = []
        earliest = None
        latest = None

        for member_path in group:
            members.append(member_path)

            # Widen the [earliest, latest] window with this file's timestamps
            row = df[df['File Path'] == member_path].iloc[0]
            candidate_start = row['Start Timestamp']
            candidate_end = row['End Timestamp']
            if earliest is None or candidate_start < earliest:
                earliest = candidate_start
            if latest is None or candidate_end > latest:
                latest = candidate_end

            # Deleting the original audio is intentionally left disabled:
            # original_audio = raw_audio_folder / Path(member_path).name
            # original_audio.unlink()

        # Fuse the audio and store it under the first member's file name
        fused_audio, sample_rate = merge_audios(members)
        fused_file_path = fused_audio_folder / Path(members[0]).name
        save_audio_segment(fused_file_path, fused_audio, sample_rate)

        # Carry over the first file's metadata with the new time span as title
        new_title = f"{earliest.strftime(title_format)} - {latest.strftime(title_format)}"
        copy_metadata(group[0], fused_file_path, new_title)

        # Removing the old transcription folders is intentionally left disabled:
        # for file_path in group:
        #     shutil.rmtree(Path(file_path).parent)

In [None]:
# Usage
# Fuse each group of associated files; results go to DATA_FOLDER / "fused_files".
fused_audio_folder = DATA_FOLDER / "fused_files/"
merge_associated_files(associated_files, df, fused_audio_folder)

### Rerun again transcribe_all on the final new files

In [None]:
# No trailing backslash before the closing quote: on Windows `\"` is parsed as an
# escaped quote, which would glue the following arguments onto the folder path.
# NOTE(review): these are machine-specific absolute paths — consider deriving them from DATA_FOLDER.
!python .\transcribe_all.py --source_folder "C:\Users\Thib\PycharmProjects\ECHO\audio_database\raw_mp3_test" --dest_folder "C:\Users\Thib\PycharmProjects\ECHO\audio_database\raw_mp3_test_transcription"

In [None]:
## Copy to processed folder
# Gather the final data set: fused conversations first, then the split pieces
# and the original transcription folders that were not fused into a group.

processed_files = DATA_FOLDER / "processed_files"
processed_files.mkdir(exist_ok=True)

associated_files_list = [Path(path).stem for path_tuple in associated_files for path in path_tuple]
transcription_folder_list = list(transcription_folder.iterdir())
fused_folder_list = list(fused_audio_folder.iterdir())
splited_folder_list = list(splited_folder.iterdir())

# dirs_exist_ok=True makes the cell safe to re-run: an existing destination is
# refreshed instead of raising FileExistsError.
for folder in fused_folder_list:
    shutil.copytree(folder, processed_files / folder.name, dirs_exist_ok=True)

for folder in splited_folder_list:
    if folder.name not in associated_files_list:
        shutil.copytree(folder, processed_files / folder.name, dirs_exist_ok=True)

# Original transcription folders never replace something already copied above.
for folder in transcription_folder_list:
    if folder.name not in associated_files_list:
        destination = processed_files / folder.name
        if destination.exists():
            continue
        shutil.copytree(folder, destination)
