In [1]:
from mutagen.mp3 import MP3
from pathlib import Path
import soundfile as sf
import pandas as pd
from utils import *
import librosa
import shutil
import json
import re

### First identify merged files

In [28]:
def read_transcription_data(file_path):
    """Load the segment data stored alongside an audio file.

    The data is looked up in a sibling file named `<stem>_segments_data.json`,
    where `<stem>` is the stem of `file_path`.

    Args:
        file_path: Path (str or Path) of the audio file.

    Returns:
        The parsed JSON content, or an empty list when the sibling file
        does not exist.
    """
    audio_path = Path(file_path)
    segments_name = audio_path.stem + '_segments_data'
    segments_path = audio_path.with_stem(segments_name).with_suffix('.json')

    if not segments_path.exists():
        return []

    # NOTE(review): opened with the platform default encoding — confirm this
    # matches how the JSON files are written (accented French text is expected).
    with open(segments_path, 'r') as handle:
        return json.load(handle)

def identify_merged_files(folder_path, gap_threshold=30):
    """Find recordings whose transcription suggests several conversations in one file.

    Every `*.mp3` under `folder_path` (searched recursively) is checked against
    its segment data as returned by `read_transcription_data`. A segment
    containing a greeting ("bonjour" / "bon jour") is recorded as a cut point when:

    * it starts after the first 10 seconds of the file, or a closing formula
      was heard since the previous greeting, and
    * it starts more than `gap_threshold` seconds after the previously
      recorded cut point.

    Args:
        folder_path: Root folder to scan (str or Path).
        gap_threshold: Minimum spacing, in seconds, between two recorded cut points.

    Returns:
        A list of `(audio_file_path_as_str, [cut_start_times])` tuples, one per
        file with at least one cut point.
    """
    folder_path = Path(folder_path)
    # Closing formulas that mark the end of a conversation
    # (same list as create_audio_database).
    end_conversation_words = ["au revoir", "bon courage", "bonne soirée", "bonne journée"]
    # One whole-word alternation, compiled once instead of once per segment.
    end_conversation_pattern = re.compile(
        r'\b(?:' + '|'.join(re.escape(end_word) for end_word in end_conversation_words) + r')\b'
    )
    greetings = ('bonjour', "bon jour")
    merged_files = []

    for audio_file in folder_path.rglob('*.mp3'):
        merged_timestamps = []
        # Start one full gap before 0 so the first greeting is never rejected
        # by the spacing rule.
        last_bonjour_time = -gap_threshold
        end_conversation_detected = False

        for segment in read_transcription_data(audio_file):
            segment_text = segment['text'].lower().strip()
            segment_start = float(segment['start'])

            if end_conversation_pattern.search(segment_text):
                end_conversation_detected = True

            if any(greeting in segment_text for greeting in greetings):
                # A greeting in the first 10 seconds is the normal opening of
                # the file unless a closing formula preceded it.
                looks_like_new_conversation = segment_start > 10 or end_conversation_detected
                far_enough = segment_start - last_bonjour_time > gap_threshold
                if looks_like_new_conversation and far_enough:
                    merged_timestamps.append(segment_start)
                    last_bonjour_time = segment_start
                # Any greeting consumes the pending end-of-conversation marker.
                end_conversation_detected = False

        if merged_timestamps:
            merged_files.append((str(audio_file), merged_timestamps))

    return merged_files

In [29]:
# Scan the transcribed test recordings for files that appear to contain
# more than one conversation.
folder_path = "../audio_database/raw_mp3_test_transcription/"
merged_files = identify_merged_files(folder_path)
# Bare expression so the notebook displays the (file path, cut times) pairs.
merged_files

[('..\\audio_database\\raw_mp3_test_transcription\\2023_1_15_21_35_18_ch30\\2023_1_15_21_35_18_ch30.mp3',
  [6.36, 152.59, 222.8]),
 ('..\\audio_database\\raw_mp3_test_transcription\\2023_1_9_7_27_59_ch30\\2023_1_9_7_27_59_ch30.mp3',
  [107.23])]

### Cut and save audio files

In [42]:
def split_audio(file_path, start_sample, end_sample, sr):
    """Load the part of an audio file between two sample positions.

    The sample positions are converted to seconds with `sr` and handed to
    `librosa.load` as `offset` / `duration`; the file is loaded with `sr=None`.

    Args:
        file_path: Audio file to read.
        start_sample: Sample position where the excerpt begins.
        end_sample: Sample position where the excerpt ends.
        sr: Sample rate used to convert sample positions to seconds.

    Returns:
        The audio data returned by `librosa.load` for that excerpt.
    """
    offset_seconds = start_sample / sr
    duration_seconds = (end_sample - start_sample) / sr
    samples, _loaded_rate = librosa.load(
        file_path,
        sr=None,
        offset=offset_seconds,
        duration=duration_seconds,
    )
    return samples

def format_filename(timestamp, original_stem):
    """Build the `.mp3` file name for a segment starting at `timestamp`.

    The name is `<year>_<month>_<day>_<hour>_<minute>_<second>_<suffix>.mp3`,
    with the date/time fields written without zero padding and `<suffix>`
    being the part of `original_stem` after its last underscore.

    Args:
        timestamp: Object exposing year, month, day, hour, minute and second.
        original_stem: Stem of the file the segment was cut from.

    Returns:
        The file name as a string.
    """
    date_fields = (
        timestamp.year,
        timestamp.month,
        timestamp.day,
        timestamp.hour,
        timestamp.minute,
        timestamp.second,
    )
    stem_suffix = original_stem.rsplit('_', 1)[-1]

    parts = [str(field) for field in date_fields]
    parts.append(stem_suffix)
    return '_'.join(parts) + ".mp3"

def cut_and_save_audio_files(raw_audio_folder, merged_files):
    """Split merged recordings at their cut times and save each piece.

    For every `(file_path, cutting_times)` pair the recording is decoded once
    and sliced into consecutive segments: start of file -> first cut, cut ->
    next cut, ..., last cut -> end of file. Each segment is written to
    `raw_audio_folder` under the name given by `format_filename` and receives
    the source file's metadata with a new title of the form
    `dd/mm/YYYY HH:MM:SS - dd/mm/YYYY HH:MM:SS`. Segment start/end times are
    derived from the start timestamp parsed from the source title; the final
    segment ends at the end timestamp parsed from the source title.

    WARNING: after a file has been split, the folder containing it
    (`file_path.parent`) is deleted with everything inside.

    Args:
        raw_audio_folder: Destination folder for the segments (created if missing).
        merged_files: Iterable of `(file_path, cutting_times)` pairs, with
            `cutting_times` in seconds from the start of the file, as produced
            by `identify_merged_files`.
    """
    raw_audio_folder = Path(raw_audio_folder)
    raw_audio_folder.mkdir(parents=True, exist_ok=True)
    title_format = '%d/%m/%Y %H:%M:%S'

    for file_path, cutting_times in merged_files:
        file_path = Path(file_path)
        original_stem = file_path.stem

        # The source title does not change between segments: read it once.
        audio = MP3(file_path)
        title = audio['TIT2'][0] if 'TIT2' in audio else 'Unknown'
        start_timestamp, end_timestamp = parse_timestamp_from_title(title)
        # NOTE(review): assumes parse_timestamp_from_title yields null values
        # (None/NaT) for a title it cannot parse — confirm against utils.
        if pd.isnull(start_timestamp) or pd.isnull(end_timestamp):
            # Skip before writing anything so no partial output is left behind
            # and the source folder is kept.
            print(f"Skipping {file_path}: no usable timestamps in title {title!r}")
            continue

        # Decode once and slice in memory instead of re-reading per segment.
        y, sr = librosa.load(file_path, sr=None)
        boundaries = [0] + [int(cut_time * sr) for cut_time in cutting_times] + [len(y)]
        last_index = len(boundaries) - 2

        for index, (segment_start, segment_end) in enumerate(zip(boundaries, boundaries[1:])):
            new_start_timestamp = start_timestamp + pd.to_timedelta(segment_start / sr, unit='s')
            if index == last_index:
                # The final segment keeps the end time announced by the source title.
                new_end_timestamp = end_timestamp
            else:
                new_end_timestamp = start_timestamp + pd.to_timedelta(segment_end / sr, unit='s')

            new_file_name = format_filename(new_start_timestamp, original_stem)
            new_title = f"{new_start_timestamp.strftime(title_format)} - {new_end_timestamp.strftime(title_format)}"

            # Save the audio segment, then carry over the metadata with the new title.
            segment_file_path = raw_audio_folder / new_file_name
            sf.write(segment_file_path, y[segment_start:segment_end], sr)
            copy_metadata(file_path, segment_file_path, new_title)

        # The transcription of the merged file is obsolete: drop its folder.
        shutil.rmtree(file_path.parent)

In [43]:
# Split every flagged recording at its cut times and write the pieces to the
# raw test folder. Destructive: the folder containing each source recording
# is removed once its segments are written.
raw_audio_folder = "../audio_database/raw_mp3_test/"
cut_and_save_audio_files(raw_audio_folder, merged_files)

### Now we need to rerun transcribe_all on those new files

In [None]:
!python .\transcribe_all.py --source_folder "C:\Users\Thib\PycharmProjects\ECHO\audio_database\raw_mp3_test\" --dest_folder "C:\Users\Thib\PycharmProjects\ECHO\audio_database\raw_mp3_test_transcription\"

We now have a raw_mp3_database containing only unmerged files. The next step is to identify the files that belong to the same conversation and fuse them.

### Then create the audio DataFrame

In [53]:
def check_word_in_timeframe(transcription_data, words, start_time, end_time):
    """Tell whether any of `words` is spoken in a segment starting within a time window.

    A segment is considered when `start_time <= segment['start'] <= end_time`.
    Its text is lower-cased and searched for each entry of `words` as a whole
    word/phrase (regex word boundaries on both sides); `words` themselves are
    used as given, so pass them in lower case.

    Args:
        transcription_data: Iterable of segments, each a mapping with a
            'text' string and a 'start' time (number or numeric string).
        words: A phrase, or an iterable of phrases, to look for.
        start_time: Inclusive lower bound of the window, in seconds.
        end_time: Inclusive upper bound of the window, in seconds.

    Returns:
        True if a matching segment exists, False otherwise (including when
        `words` is empty).
    """
    # A bare string is one phrase, not a sequence of letters.
    if isinstance(words, str):
        words = [words]
    words = list(words)
    if not words:
        # An empty alternation would match at every word boundary.
        return False

    # One whole-word alternation, compiled once instead of once per segment.
    pattern = re.compile(r'\b(?:' + '|'.join(re.escape(word) for word in words) + r')\b')

    for segment in transcription_data:
        # Cast like identify_merged_files does, so numeric strings are accepted.
        segment_start = float(segment['start'])
        if start_time <= segment_start <= end_time and pattern.search(segment['text'].lower()):
            return True
    return False

# def create_audio_database(folder_path):
#     folder_path = Path(folder_path)
#     data = []

#     for audio_file in folder_path.rglob('*.mp3'):
#         try:
#             audio = MP3(audio_file)
#             title = audio['TIT2'][0] if 'TIT2' in audio else 'Unknown'
#             start_timestamp, end_timestamp = parse_timestamp_from_title(title)
#             audio_length = audio.info.length

#             transcription_path = audio_file.with_stem(audio_file.stem + '_transcription').with_suffix('.txt')
#             # words_found = check_words_in_transcription(transcription_path, ["bonjour", "au revoir"])

#             transcription_data = read_transcription_data(audio_file)

#             is_start_file = check_word_in_timeframe(transcription_data, "bonjour", 0, 10)
#             is_end_file = check_word_in_timeframe(transcription_data, "au revoir", max(0, audio_length - 10), audio_length)
#             is_complete = is_start_file and is_end_file

#             data.append({
#                 "File Name": audio_file.stem,
#                 "File Path": str(audio_file),
#                 "Transcription Path": str(transcription_path) if transcription_path.exists() else None,
#                 "Start Timestamp": start_timestamp,
#                 "End Timestamp": end_timestamp,
#                 "Audio Length": audio_length,
#                 "Is End File": is_end_file,
#                 "Is Start File": is_start_file,
#                 "Is Complete": is_complete,
#                 "Precedent File": None,
#                 "Next File": None
#             })
#         except Exception as e:
#             print(f"Error processing file {audio_file}: {e}")

#     return pd.DataFrame(data)

def create_audio_database(folder_path, edge_window=10):
    """Build a table describing every `*.mp3` found under `folder_path`.

    For each file the MP3 title is parsed into start/end timestamps and the
    segment data is inspected to decide whether the file opens with a greeting
    (within the first `edge_window` seconds) and/or closes with a farewell
    (segment starting within the last `edge_window` seconds). A file that does
    both is marked "Is Complete" and its start/end flags are reset to False.

    Files that raise during processing are reported with `print` and skipped.

    Args:
        folder_path: Root folder to scan recursively (str or Path).
        edge_window: Size, in seconds, of the opening and closing windows.

    Returns:
        A DataFrame with one row per processed file. The columns are always
        present, even when no file was processed, and the three "Is ..."
        columns have boolean dtype.
    """
    folder_path = Path(folder_path)
    columns = [
        "File Name",
        "File Path",
        "Transcription Path",
        "Start Timestamp",
        "End Timestamp",
        "Audio Length",
        "Is End File",
        "Is Start File",
        "Is Complete",
        "Precedent File",
        "Next File",
    ]
    flag_columns = ["Is End File", "Is Start File", "Is Complete"]
    start_conversation_words = ["bonjour", "bon jour"]
    end_conversation_words = ["au revoir", "bon courage", "bonne soirée", "bonne journée"]
    data = []

    for audio_file in folder_path.rglob('*.mp3'):
        try:
            audio = MP3(audio_file)
            title = audio['TIT2'][0] if 'TIT2' in audio else 'Unknown'
            start_timestamp, end_timestamp = parse_timestamp_from_title(title)
            audio_length = audio.info.length

            transcription_path = audio_file.with_stem(audio_file.stem + '_transcription').with_suffix('.txt')
            transcription_data = read_transcription_data(audio_file)

            is_start_file = check_word_in_timeframe(
                transcription_data, start_conversation_words, 0, edge_window)
            is_end_file = check_word_in_timeframe(
                transcription_data, end_conversation_words,
                max(0, audio_length - edge_window), audio_length)
            is_complete = is_start_file and is_end_file

            # A complete conversation is neither a dangling start nor a dangling end.
            if is_complete:
                is_start_file = False
                is_end_file = False

            data.append({
                "File Name": audio_file.stem,
                "File Path": str(audio_file),
                "Transcription Path": str(transcription_path) if transcription_path.exists() else None,
                "Start Timestamp": start_timestamp,
                "End Timestamp": end_timestamp,
                "Audio Length": audio_length,
                "Is End File": is_end_file,
                "Is Start File": is_start_file,
                "Is Complete": is_complete,
                "Precedent File": None,
                "Next File": None
            })
        except Exception as e:
            # Best effort: one unreadable file must not abort the whole scan.
            print(f"Error processing file {audio_file}: {e}")

    # Explicit columns keep the schema stable when nothing was processed, so
    # code indexing e.g. df['Is Start File'] does not fail on an empty result.
    database = pd.DataFrame(data, columns=columns)
    return database.astype({column: bool for column in flag_columns})


In [54]:
# Usage: build the per-file table from the transcribed recordings and preview
# its first rows.
folder_path = "../audio_database/raw_mp3_test_transcription/"
df = create_audio_database(folder_path)
df.head()

Unnamed: 0,File Name,File Path,Transcription Path,Start Timestamp,End Timestamp,Audio Length,Is End File,Is Start File,Is Complete,Precedent File,Next File
0,2023_1_15_10_5_41_ch30,..\audio_database\raw_mp3_test_transcription\2...,..\audio_database\raw_mp3_test_transcription\2...,2023-01-15 10:05:41,2023-01-15 10:09:41,240.408,False,False,True,,
1,2023_1_15_18_40_13_ch30,..\audio_database\raw_mp3_test_transcription\2...,..\audio_database\raw_mp3_test_transcription\2...,2023-01-15 18:40:13,2023-01-15 18:41:13,59.76,False,True,False,,
2,2023_1_15_18_59_42_ch30,..\audio_database\raw_mp3_test_transcription\2...,..\audio_database\raw_mp3_test_transcription\2...,2023-01-15 18:59:42,2023-01-15 19:02:05,141.66,False,False,True,,
3,2023_1_15_20_11_51_ch30,..\audio_database\raw_mp3_test_transcription\2...,..\audio_database\raw_mp3_test_transcription\2...,2023-01-15 20:11:51,2023-01-15 20:14:19,147.486,False,False,True,,
4,2023_1_15_20_14_19_ch30,..\audio_database\raw_mp3_test_transcription\2...,..\audio_database\raw_mp3_test_transcription\2...,2023-01-15 20:14:19,2023-01-15 20:14:55,35.142,False,True,False,,


### Regroup associated files

In [55]:
def _first_file_starting_after(candidates, reference_time, time_delta):
    """Return the first row of `candidates` starting 0..`time_delta` s after `reference_time`, else None."""
    if pd.isnull(reference_time):
        return None
    for _, row in candidates.iterrows():
        candidate_start = row['Start Timestamp']
        if pd.isnull(candidate_start):
            continue
        if 0 <= (candidate_start - reference_time).total_seconds() <= time_delta:
            return row
    return None


def find_associated_files(df, time_delta=20):
    """Group files that look like consecutive pieces of one conversation.

    For every row flagged "Is Start File", the function looks for a follow-up
    file whose start timestamp lies between 0 and `time_delta` seconds after
    the previous file's end timestamp:

    1. first among the "middle" files (neither start nor end, and not a
       complete conversation), then an end file following that middle file;
    2. otherwise directly among the "Is End File" rows.

    Candidates are tested in DataFrame order and the first match is used.

    Args:
        df: DataFrame with 'File Path', 'Start Timestamp', 'End Timestamp',
            'Is Start File' and 'Is End File' columns; an 'Is Complete'
            column is honoured when present.
        time_delta: Maximum gap, in seconds, between one file's end and the
            next file's start.

    Returns:
        A list of tuples of file paths: `(start, middle, end)`,
        `(start, middle)` or `(start, end)`. Start files without any
        follow-up produce no entry.
    """
    is_start = df['Is Start File']
    is_end = df['Is End File']
    is_middle = ~is_end & ~is_start
    if 'Is Complete' in df.columns:
        # Complete conversations also carry start/end flags set to False;
        # they stand on their own and must not be used as a middle piece.
        is_middle = is_middle & ~df['Is Complete']

    start_files = df[is_start]
    other_files = df[is_middle]
    end_files = df[is_end]

    associated_files = []
    for _, start_row in start_files.iterrows():
        end_time_beginning = start_row['End Timestamp']
        if pd.isnull(end_time_beginning):
            continue

        other_row = _first_file_starting_after(other_files, end_time_beginning, time_delta)
        if other_row is not None:
            end_row = _first_file_starting_after(end_files, other_row['End Timestamp'], time_delta)
            if end_row is not None:
                associated_files.append(
                    (start_row['File Path'], other_row['File Path'], end_row['File Path']))
            else:
                associated_files.append((start_row['File Path'], other_row['File Path']))
            continue

        # No middle file: look for an end file directly after the start file.
        end_row = _first_file_starting_after(end_files, end_time_beginning, time_delta)
        if end_row is not None:
            associated_files.append((start_row['File Path'], end_row['File Path']))

    return associated_files

# Link the recordings that belong to the same conversation and list each group.
associated_files = find_associated_files(df)
for files in associated_files:
    # Each group is a tuple of file paths, printed comma-separated.
    print(f"Associated files: {', '.join(files)}")

### Copy grouped files and merge the audio

In [13]:
def copy_associated_files_to_group_folder(associated_files, df, raw_audio_folder):
    """Merge each group of associated recordings into a single file.

    For every group the audio files are merged with `merge_audios` and written
    to `raw_audio_folder` under the file name of the group's first member.
    The merged file receives the first member's metadata with a title of the
    form `dd/mm/YYYY HH:MM:SS - dd/mm/YYYY HH:MM:SS`, spanning the earliest
    start and latest end timestamp of the group as found in `df`.

    Only after the merged file has been written are the superseded files
    removed: the other members' same-named files in `raw_audio_folder`, and
    the folder containing each member's path (deleted with all its content).

    Args:
        associated_files: Iterable of groups, each a sequence of file paths
            matching the 'File Path' column of `df`.
        df: DataFrame with 'File Path', 'Start Timestamp' and 'End Timestamp'.
        raw_audio_folder: Folder holding the raw audio files (str or Path).

    Raises:
        IndexError: If a path of a group is not present in `df`.
    """
    raw_audio_folder = Path(raw_audio_folder)
    title_format = "%d/%m/%Y %H:%M:%S"

    for group in associated_files:
        audio_files = list(group)
        if not audio_files:
            continue

        # Time span covered by the whole group, for the merged file's title.
        file_infos = [df[df['File Path'] == file_path].iloc[0] for file_path in audio_files]
        start_timestamp = min(info['Start Timestamp'] for info in file_infos)
        end_timestamp = max(info['End Timestamp'] for info in file_infos)
        new_title = f"{start_timestamp.strftime(title_format)} - {end_timestamp.strftime(title_format)}"

        # Merge and save first: if this fails, every original is still on disk.
        merged_audio, sample_rate = merge_audios(audio_files)
        merged_file_path = raw_audio_folder / Path(audio_files[0]).name
        sf.write(merged_file_path, merged_audio, sample_rate)

        # Copy metadata from the first file of the group to the merged file.
        copy_metadata(audio_files[0], merged_file_path, new_title)

        # Remove the raw files replaced by the merged one (the first member's
        # raw file has just been overwritten by it and must be kept).
        for file_path in audio_files:
            original_audio = raw_audio_folder / Path(file_path).name
            if original_audio != merged_file_path:
                original_audio.unlink(missing_ok=True)

        # Delete the old transcription folders, each one only once.
        for transcription_folder in dict.fromkeys(Path(file_path).parent for file_path in audio_files):
            shutil.rmtree(transcription_folder)

In [14]:
# Usage: merge each group of associated recordings inside the raw test folder.
# Destructive: superseded raw files and the groups' transcription folders are deleted.
raw_audio_folder = "../audio_database/raw_mp3_test/"
copy_associated_files_to_group_folder(associated_files, df, raw_audio_folder)

### Rerun transcribe_all on the final merged files

In [None]:
!!python .\transcribe_all.py --source_folder "C:\Users\Thib\PycharmProjects\ECHO\audio_database\raw_mp3_test\" --dest_folder "C:\Users\Thib\PycharmProjects\ECHO\audio_database\raw_mp3_test_transcription\"