In [1]:
from mutagen.mp3 import MP3
from pathlib import Path
import soundfile as sf
import pandas as pd
from utils import *
import shutil

### Create audio DataFrame

In [2]:
def create_audio_database(folder_path):
    """Catalogue every ``.mp3`` file found (recursively) under ``folder_path``.

    For each audio file the function reads its ID3 title (``TIT2`` frame, or
    ``'Unknown'`` when absent), hands that title to ``parse_timestamp_from_title``
    to obtain a (start, end) pair, and looks in the sibling ``.txt`` file (same
    name, ``.txt`` suffix) for the words "bonjour" and "au revoir" through
    ``check_words_in_transcription``.

    Args:
        folder_path: Directory to scan; anything accepted by ``pathlib.Path``.

    Returns:
        pandas.DataFrame with one row per successfully processed file and the
        columns listed in ``columns`` below. The columns are always present,
        even when no file was found or every file failed, so downstream code
        can index them without a KeyError.

    Notes:
        A file that raises during processing is reported with ``print`` and
        skipped; it never aborts the scan.
    """
    columns = [
        "File Name",
        "File Path",
        "Transcription Path",
        "Start Timestamp",
        "End Timestamp",
        "Audio Length",
        "Is End File",
        "Is Start File",
        "Is Complete",
        "Precedent File",
        "Next File",
    ]
    folder_path = Path(folder_path)
    data = []

    # rglob yields entries in filesystem order, which is arbitrary; sorting keeps
    # the row order (and any group numbering derived from it) stable across runs.
    for audio_file in sorted(folder_path.rglob('*.mp3')):
        try:
            audio = MP3(audio_file)
            title = audio['TIT2'][0] if 'TIT2' in audio else 'Unknown'
            start_timestamp, end_timestamp = parse_timestamp_from_title(title)
            audio_length = audio.info.length

            # The transcription is expected next to the audio, same stem, .txt suffix.
            transcription_path = audio_file.with_suffix('.txt')
            # NOTE(review): the helper is called even when the .txt file is missing;
            # presumably it copes with that (or raises and the file is skipped) - confirm.
            words_found = check_words_in_transcription(transcription_path, ["bonjour", "au revoir"])

            has_greeting = "bonjour" in words_found
            has_farewell = "au revoir" in words_found

            # Exactly one of the two words marks a partial recording;
            # both together mark a complete one.
            is_end_file = has_farewell and not has_greeting
            is_start_file = has_greeting and not has_farewell
            is_complete = has_greeting and has_farewell

            data.append({
                "File Name": audio_file.stem,
                "File Path": str(audio_file),
                "Transcription Path": str(transcription_path) if transcription_path.exists() else None,
                "Start Timestamp": start_timestamp,
                "End Timestamp": end_timestamp,
                "Audio Length": audio_length,
                "Is End File": is_end_file,
                "Is Start File": is_start_file,
                "Is Complete": is_complete,
                "Precedent File": None,
                "Next File": None
            })
        except Exception as e:
            # Deliberate best-effort: report the problem and keep scanning.
            print(f"Error processing file {audio_file}: {e}")

    # Explicit columns keep the schema intact when `data` is empty.
    return pd.DataFrame(data, columns=columns)

In [3]:
# Usage
# Scan the raw recordings folder (the path is relative to the notebook's working
# directory) and preview the first rows of the resulting table.
folder_path = "../audio_database/raw_mp3_with_transcription/"
df = create_audio_database(folder_path)
df.head()

Unnamed: 0,File Name,File Path,Transcription Path,Start Timestamp,End Timestamp,Audio Length,Is End File,Is Start File,Is Complete,Precedent File,Next File
0,2023_1_15_10_4_22_ch30,..\audio_database\raw_mp3_with_transcription\2...,..\audio_database\raw_mp3_with_transcription\2...,2023-01-15 10:04:22,2023-01-15 10:05:40,78.012,False,False,True,,
1,2023_1_15_10_5_41_ch30,..\audio_database\raw_mp3_with_transcription\2...,..\audio_database\raw_mp3_with_transcription\2...,2023-01-15 10:05:41,2023-01-15 10:08:23,161.964,False,True,False,,
2,2023_1_15_10_8_23_ch30,..\audio_database\raw_mp3_with_transcription\2...,..\audio_database\raw_mp3_with_transcription\2...,2023-01-15 10:08:23,2023-01-15 10:09:41,78.444,True,False,False,,
3,2023_1_15_18_40_13_ch30,..\audio_database\raw_mp3_with_transcription\2...,..\audio_database\raw_mp3_with_transcription\2...,2023-01-15 18:40:13,2023-01-15 18:41:01,48.312,False,True,False,,
4,2023_1_15_18_41_2_ch30,..\audio_database\raw_mp3_with_transcription\2...,..\audio_database\raw_mp3_with_transcription\2...,2023-01-15 18:41:02,2023-01-15 18:41:13,11.448,False,False,False,,


### Regroup associated files

In [4]:
def find_associated_files(df, time_delta=20):
    """Chain recordings that follow each other closely in time.

    Every row flagged ``Is Start File`` is used as the head of a potential
    group. The function first looks for a row that is neither a start nor an
    end file and begins within ``time_delta`` seconds after the head finishes;
    if one exists, it then looks for an end file beginning within
    ``time_delta`` seconds after that middle row finishes. When no middle row
    exists, an end file is searched directly after the head.

    Args:
        df: DataFrame with the columns 'Is Start File', 'Is End File',
            'Start Timestamp', 'End Timestamp' and 'File Path'.
        time_delta: Largest accepted gap, in seconds, between the end of one
            file and the start of the next (a gap of 0 is accepted).

    Returns:
        List of tuples of file paths: (start, middle, end), (start, middle)
        or (start, end). A start file with no follower yields no tuple.
    """

    def first_follower(candidates, reference_time):
        # First candidate (in frame order) whose start lies 0..time_delta seconds
        # after reference_time; None when there is no such row.
        for _, candidate in candidates.iterrows():
            candidate_start = candidate['Start Timestamp']
            if pd.isnull(candidate_start):
                continue
            gap = (candidate_start - reference_time).total_seconds()
            if 0 <= gap <= time_delta:
                return candidate
        return None

    is_start = df['Is Start File']
    is_end = df['Is End File']

    openers = df[is_start]
    middles = df[~is_end & ~is_start]
    closers = df[is_end]

    groups = []
    for _, opener in openers.iterrows():
        opener_end = opener['End Timestamp']
        if pd.isnull(opener_end):
            continue

        chain = [opener['File Path']]
        middle = first_follower(middles, opener_end)

        if middle is None:
            # No intermediate file: the head must be followed directly by an end file,
            # otherwise nothing is recorded for it.
            closer = first_follower(closers, opener_end)
            if closer is None:
                continue
        else:
            # An intermediate file was found: it is kept even if no end file follows it.
            chain.append(middle['File Path'])
            closer = first_follower(closers, middle['End Timestamp'])

        if closer is not None:
            chain.append(closer['File Path'])
        groups.append(tuple(chain))

    return groups

# Group the catalogued files using the default gap tolerance (time_delta=20 seconds)
# and print one line per group.
associated_files = find_associated_files(df)
for files in associated_files:
    # Each group is a tuple of file paths, ordered start -> (middle) -> end.
    print(f"Associated files: {', '.join(files)}")

Associated files: ..\audio_database\raw_mp3_with_transcription\2023_1_15_10_5_41_ch30\2023_1_15_10_5_41_ch30.mp3, ..\audio_database\raw_mp3_with_transcription\2023_1_15_10_8_23_ch30\2023_1_15_10_8_23_ch30.mp3
Associated files: ..\audio_database\raw_mp3_with_transcription\2023_1_15_18_40_13_ch30\2023_1_15_18_40_13_ch30.mp3, ..\audio_database\raw_mp3_with_transcription\2023_1_15_18_41_2_ch30\2023_1_15_18_41_2_ch30.mp3
Associated files: ..\audio_database\raw_mp3_with_transcription\2023_1_15_18_59_42_ch30\2023_1_15_18_59_42_ch30.mp3, ..\audio_database\raw_mp3_with_transcription\2023_1_15_19_1_32_ch30\2023_1_15_19_1_32_ch30.mp3
Associated files: ..\audio_database\raw_mp3_with_transcription\2023_1_15_20_11_51_ch30\2023_1_15_20_11_51_ch30.mp3, ..\audio_database\raw_mp3_with_transcription\2023_1_15_20_12_51_ch30\2023_1_15_20_12_51_ch30.mp3
Associated files: ..\audio_database\raw_mp3_with_transcription\2023_1_15_20_24_16_ch30\2023_1_15_20_24_16_ch30.mp3, ..\audio_database\raw_mp3_with_transcrip

### Copy grouped files and merge the audio

In [5]:
def copy_associated_files_to_group_folder(associated_files, df, base_folder):
    """Materialise each group of files on disk and write a merged recording.

    For the n-th group (counting from 1) a folder ``group_<n>`` is created
    under ``base_folder``. Every file of the group is copied into it, the
    audio of all members is combined through ``merge_audios`` and written as
    ``<stem of first file>_merged.mp3`` in the same folder. The merged file
    then receives a title of the form
    ``"dd/mm/YYYY HH:MM:SS - dd/mm/YYYY HH:MM:SS"`` spanning the earliest start
    and latest end timestamp of the group, via ``copy_metadata``.

    Args:
        associated_files: Iterable of groups; each group is a sequence of file
            paths that appear in ``df['File Path']``.
        df: DataFrame providing 'File Path', 'Start Timestamp' and
            'End Timestamp' for every listed file.
        base_folder: Directory under which the group folders are created.

    Returns:
        None. The function only has filesystem side effects.
    """
    output_root = Path(base_folder)
    title_time_format = "%d/%m/%Y %H:%M:%S"

    for index, members in enumerate(associated_files, start=1):
        # One dedicated folder per group; reruns reuse an existing folder.
        target_dir = output_root / f"group_{index}"
        target_dir.mkdir(parents=True, exist_ok=True)

        sources = []
        earliest = None
        latest = None

        for source in members:
            # Keep an untouched copy of the original next to the merged result.
            shutil.copy(source, target_dir / Path(source).name)
            sources.append(source)

            # Widen the [earliest, latest] window with this member's timestamps.
            record = df[df['File Path'] == source].iloc[0]
            if earliest is None or record['Start Timestamp'] < earliest:
                earliest = record['Start Timestamp']
            if latest is None or record['End Timestamp'] > latest:
                latest = record['End Timestamp']

        # merge_audios is expected to return (samples, sample_rate) - it is unpacked as such.
        merged_audio, sample_rate = merge_audios(sources)
        merged_path = target_dir / (Path(members[0]).stem + "_merged.mp3")
        sf.write(merged_path, merged_audio, sample_rate)

        # The first member is passed as the metadata source; only the title is new.
        earliest_text = earliest.strftime(title_time_format)
        latest_text = latest.strftime(title_time_format)
        copy_metadata(members[0], merged_path, f"{earliest_text} - {latest_text}")

# Usage
# Create one "group_<n>" folder per detected group under base_folder (the path is
# relative to the notebook's working directory), with copies and a merged file.
base_folder = "./audio_regrouping/"
copy_associated_files_to_group_folder(associated_files, df, base_folder)