In [40]:
from mutagen.mp3 import MP3
from pathlib import Path
import soundfile as sf
import pandas as pd
from utils import *
import librosa
import shutil
import json

### First identify merged files

In [41]:
def read_transcription_data(file_path):
    """Load the segment transcription stored next to an audio file.

    The sidecar file lives in the same directory as ``file_path`` and is
    named ``<stem>_segments_data.json``.

    Args:
        file_path: Location of the audio file (``str`` or ``Path``).

    Returns:
        The parsed JSON content of the sidecar file, or an empty list when
        the sidecar file does not exist.
    """
    audio_path = Path(file_path)
    sidecar_stem = audio_path.stem + '_segments_data'
    sidecar_path = audio_path.with_stem(sidecar_stem).with_suffix('.json')

    # A missing sidecar is not an error: the caller just gets no segments.
    if not sidecar_path.exists():
        return []

    with open(sidecar_path, 'r') as handle:
        return json.load(handle)

def find_word_start_time(transcription_data, word, lead_in=1.0):
    """Return the time at which the first segment containing ``word`` begins.

    The match is a case-insensitive substring test against each segment's
    ``'text'``. The returned time is the segment's ``'start'`` moved earlier
    by ``lead_in`` seconds and never goes below zero, so it can be used
    directly as a cut position.

    Args:
        transcription_data: Iterable of segment dicts with ``'text'`` and
            ``'start'`` keys (``'start'`` may be a number or a numeric string).
        word: Text to look for.
        lead_in: Seconds to subtract from the segment start. Defaults to 1,
            the margin that was previously hard-coded.

    Returns:
        The adjusted start time in seconds as a ``float``, or ``None`` when
        no segment contains ``word``.
    """
    # Segment text is lowercased below, so the needle must be lowercased too.
    needle = word.lower()
    for segment in transcription_data:
        if needle in segment['text'].lower():
            # Clamp at 0: a word in the first second must not give a negative time.
            return max(0.0, float(segment['start']) - lead_in)
    return None

def identify_merged_files(folder_path):
    """List the MP3 files whose transcription places "bonjour" late in the file.

    Every ``*.mp3`` under ``folder_path`` (searched recursively) is checked.
    A file is reported when ``find_word_start_time`` returns a time for
    "bonjour" that is greater than 10.

    Args:
        folder_path: Root folder to search (``str`` or ``Path``).

    Returns:
        A list of ``(file path as str, bonjour time)`` tuples, one per
        reported file.
    """
    root = Path(folder_path)
    flagged = []

    for mp3_path in root.rglob('*.mp3'):
        segments = read_transcription_data(mp3_path)
        greeting_time = find_word_start_time(segments, "bonjour")

        # Keep only files where the word was found and it comes after 10.
        if greeting_time is None or not greeting_time > 10:
            continue

        flagged.append((str(mp3_path), greeting_time))

    return flagged

In [42]:
# Scan the test transcription folder for recordings flagged as merged.
folder_path = "../audio_database/raw_mp3_test_transcription/"
merged_files = identify_merged_files(folder_path)
# Bare last expression so the notebook displays the (path, time) pairs.
merged_files

[('..\\audio_database\\raw_mp3_test_transcription\\2023_1_15_20_12_51_ch30\\2023_1_15_20_12_51_ch30.mp3',
  88.95),
 ('..\\audio_database\\raw_mp3_test_transcription\\2023_1_15_20_24_48_ch30\\2023_1_15_20_24_48_ch30.mp3',
  83.91)]

### Cut and save audio files

In [63]:
def split_audio(file_path, split_time):
    """Load an audio file and cut it in two at ``split_time``.

    The file is loaded with ``sr=None`` and the cut position is
    ``int(split_time * sample_rate)`` samples.

    Args:
        file_path: Audio file to load.
        split_time: Cut position in seconds.

    Returns:
        A tuple ``(before, after, sample_rate)`` where ``before`` holds the
        samples up to the cut and ``after`` the samples from the cut onwards.
    """
    samples, sample_rate = librosa.load(file_path, sr=None)
    cut_index = int(split_time * sample_rate)
    before = samples[:cut_index]
    after = samples[cut_index:]
    return before, after, sample_rate

def _save_audio_part(source_path, destination, samples, sample_rate, starts_at, ends_at):
    """Write one part of a split recording and title it "start - end"."""
    timestamp_format = "%d/%m/%Y %H:%M:%S"
    new_title = f"{starts_at.strftime(timestamp_format)} - {ends_at.strftime(timestamp_format)}"
    sf.write(destination, samples, sample_rate)
    copy_metadata(source_path, destination, new_title)


def cut_and_save_audio_files(merged_files):
    """Split each merged recording in two and save both parts.

    For every ``(file_path, cutting_time)`` pair:

    * the start/end timestamps are parsed from the file's ``TIT2`` tag
      (``'Unknown'`` is used as the title when the tag is absent);
    * the audio is cut at ``cutting_time`` seconds;
    * both parts are written into the parent of the folder that contains
      the source file. The first part keeps the source file name. The
      second is named ``<year>_<month>_<day>_<hour>_<minute>_<second>_<tag>``
      from the cut timestamp, where ``<tag>`` is the last ``_``-separated
      token of the source stem;
    * each part gets a ``"dd/mm/YYYY HH:MM:SS - dd/mm/YYYY HH:MM:SS"`` title
      through ``copy_metadata``.

    Files whose title yields no usable timestamps are reported and skipped
    instead of aborting the whole batch.

    Args:
        merged_files: Iterable of ``(file_path, cutting_time)`` pairs, with
            ``cutting_time`` in seconds.

    Returns:
        None. The two output paths are printed for every file that is split.
    """
    for file_path, cutting_time in merged_files:
        source_path = Path(file_path)

        audio = MP3(file_path)
        title = audio['TIT2'][0] if 'TIT2' in audio else 'Unknown'
        start_timestamp, end_timestamp = parse_timestamp_from_title(title)
        # NOTE(review): assumes parse_timestamp_from_title yields None/NaT for
        # an unparseable title — confirm against utils.
        if pd.isnull(start_timestamp) or pd.isnull(end_timestamp):
            print(f"Skipping {file_path}: no usable timestamps in title {title!r}")
            continue
        cut_timestamp = start_timestamp + pd.to_timedelta(cutting_time, unit='s')

        audio_1, audio_2, sr = split_audio(source_path, cutting_time)
        output_dir = source_path.parent.parent

        # First part: same file name as the source, one directory level up.
        file_path_1 = output_dir / source_path.name
        _save_audio_part(source_path, file_path_1, audio_1, sr, start_timestamp, cut_timestamp)

        # Second part: named after the moment of the cut, keeping the source's
        # trailing name token.
        file_name_components = [
            str(cut_timestamp.year),
            str(cut_timestamp.month),
            str(cut_timestamp.day),
            str(cut_timestamp.hour),
            str(cut_timestamp.minute),
            str(cut_timestamp.second),
            source_path.stem.split('_')[-1],
        ]
        file_name_2 = '_'.join(file_name_components)
        file_path_2 = output_dir / f"{file_name_2}{source_path.suffix}"
        _save_audio_part(source_path, file_path_2, audio_2, sr, cut_timestamp, end_timestamp)

        print(file_path_1, file_path_2)

In [64]:
# Split every file flagged as merged; the two output paths are printed per file.
cut_and_save_audio_files(merged_files)

..\audio_database\raw_mp3_test_transcription\2023_1_15_20_12_51_ch30.mp3 ..\audio_database\raw_mp3_test_transcription\2023_1_15_20_14_19_ch30.mp3
..\audio_database\raw_mp3_test_transcription\2023_1_15_20_24_48_ch30.mp3 ..\audio_database\raw_mp3_test_transcription\2023_1_15_20_26_11_ch30.mp3


### Next, rerun `transcribe_all` on those new files

### Then create the audio DataFrame

In [66]:

def check_word_in_timeframe(transcription_data, word, start_time, end_time):
    """Tell whether ``word`` occurs in a segment that starts inside a time window.

    Only the segment's ``'start'`` is compared with the window; where the
    segment ends is not taken into account. The text match is a
    case-insensitive substring test.

    Args:
        transcription_data: Iterable of segment dicts with ``'text'`` and
            ``'start'`` keys (``'start'`` may be a number or a numeric string).
        word: Text to look for.
        start_time: Lower bound of the window in seconds (inclusive).
        end_time: Upper bound of the window in seconds (inclusive).

    Returns:
        ``True`` if at least one segment starting within the window contains
        ``word``, otherwise ``False``.
    """
    needle = word.lower()
    return any(
        needle in segment['text'].lower()
        for segment in transcription_data
        # float() mirrors find_word_start_time, so string starts also work.
        if start_time <= float(segment['start']) <= end_time
    )

def create_audio_database(folder_path):
    """Build a DataFrame with one row per MP3 file found under ``folder_path``.

    For each file (searched recursively) the row records:

    * ``File Name`` / ``File Path``: the stem and the full path;
    * ``Transcription Path``: ``<stem>_transcription.txt`` next to the audio
      file, or ``None`` when that file does not exist;
    * ``Start Timestamp`` / ``End Timestamp``: parsed from the ``TIT2`` tag
      (``'Unknown'`` is used as the title when the tag is absent);
    * ``Audio Length``: ``audio.info.length`` as reported by mutagen;
    * ``Is Start File``: "bonjour" occurs in a segment starting in the first
      10 seconds;
    * ``Is End File``: "au revoir" occurs in a segment starting in the last
      10 seconds;
    * ``Is Complete``: both of the above;
    * ``Precedent File`` / ``Next File``: placeholders, always ``None`` here.

    A file that raises while being processed is reported on stdout and left
    out of the table.

    Args:
        folder_path: Root folder to search (``str`` or ``Path``).

    Returns:
        A ``pandas.DataFrame`` with the columns listed above. The columns are
        present even when no file could be processed.
    """
    columns = [
        "File Name",
        "File Path",
        "Transcription Path",
        "Start Timestamp",
        "End Timestamp",
        "Audio Length",
        "Is End File",
        "Is Start File",
        "Is Complete",
        "Precedent File",
        "Next File",
    ]

    folder_path = Path(folder_path)
    data = []

    for audio_file in folder_path.rglob('*.mp3'):
        try:
            audio = MP3(audio_file)
            title = audio['TIT2'][0] if 'TIT2' in audio else 'Unknown'
            start_timestamp, end_timestamp = parse_timestamp_from_title(title)
            audio_length = audio.info.length

            transcription_path = audio_file.with_stem(audio_file.stem + '_transcription').with_suffix('.txt')
            transcription_data = read_transcription_data(audio_file)

            is_start_file = check_word_in_timeframe(transcription_data, "bonjour", 0, 10)
            # max(0, ...) keeps the window valid for files shorter than 10 seconds.
            is_end_file = check_word_in_timeframe(transcription_data, "au revoir", max(0, audio_length - 10), audio_length)
            is_complete = is_start_file and is_end_file

            data.append({
                "File Name": audio_file.stem,
                "File Path": str(audio_file),
                "Transcription Path": str(transcription_path) if transcription_path.exists() else None,
                "Start Timestamp": start_timestamp,
                "End Timestamp": end_timestamp,
                "Audio Length": audio_length,
                "Is End File": is_end_file,
                "Is Start File": is_start_file,
                "Is Complete": is_complete,
                "Precedent File": None,
                "Next File": None
            })
        except Exception as e:
            # Best effort: one unreadable file must not stop the whole scan.
            print(f"Error processing file {audio_file}: {e}")

    # Explicit columns so an empty result still has the expected schema.
    return pd.DataFrame(data, columns=columns)

In [67]:
# Usage
# Build the per-file table for the test folder and preview its first rows.
folder_path = "../audio_database/raw_mp3_test_transcription/"
df = create_audio_database(folder_path)
df.head()

Unnamed: 0,File Name,File Path,Transcription Path,Start Timestamp,End Timestamp,Audio Length,Is End File,Is Start File,Is Complete,Precedent File,Next File
0,2023_1_15_10_5_41_ch30,..\audio_database\raw_mp3_test_transcription\2...,..\audio_database\raw_mp3_test_transcription\2...,2023-01-15 10:05:41,2023-01-15 10:08:23,161.964,False,True,False,,
1,2023_1_15_10_8_23_ch30,..\audio_database\raw_mp3_test_transcription\2...,..\audio_database\raw_mp3_test_transcription\2...,2023-01-15 10:08:23,2023-01-15 10:09:41,78.444,True,False,False,,
2,2023_1_15_18_40_13_ch30,..\audio_database\raw_mp3_test_transcription\2...,..\audio_database\raw_mp3_test_transcription\2...,2023-01-15 18:40:13,2023-01-15 18:41:01,48.312,False,True,False,,
3,2023_1_15_18_41_2_ch30,..\audio_database\raw_mp3_test_transcription\2...,..\audio_database\raw_mp3_test_transcription\2...,2023-01-15 18:41:02,2023-01-15 18:41:13,11.448,False,False,False,,
4,2023_1_15_18_59_42_ch30,..\audio_database\raw_mp3_test_transcription\2...,..\audio_database\raw_mp3_test_transcription\2...,2023-01-15 18:59:42,2023-01-15 19:01:31,108.72,False,True,False,,


### Regroup associated files

In [68]:
def _first_file_starting_after(candidates, reference_time, time_delta):
    """Return the first candidate row starting 0..time_delta seconds after reference_time."""
    for _, row in candidates.iterrows():
        begins_at = row['Start Timestamp']
        if pd.isnull(begins_at):
            continue

        gap = (begins_at - reference_time).total_seconds()
        if 0 <= gap <= time_delta:
            return row
    return None


def find_associated_files(df, time_delta=20):
    """Chain recordings that follow each other closely into groups.

    Starting from every row flagged ``Is Start File``, the function looks for
    a file whose ``Start Timestamp`` falls between 0 and ``time_delta``
    seconds after the previous file's ``End Timestamp``:

    1. first among the files that are neither start nor end files;
    2. if one is found, an end file is then searched after that file, giving
       ``(start, other, end)``, or ``(start, other)`` when there is none;
    3. if none is found, an end file is searched directly after the start
       file, giving ``(start, end)``; nothing is recorded when that fails too.

    Start rows without an ``End Timestamp`` are ignored, and in every search
    the first matching row in DataFrame order wins.

    Args:
        df: DataFrame with ``File Path``, ``Start Timestamp``,
            ``End Timestamp``, ``Is Start File`` and ``Is End File`` columns.
        time_delta: Largest accepted gap between two files, in seconds.

    Returns:
        A list of tuples of file paths, each tuple holding 2 or 3 paths.
    """
    groups = []

    # Split the table into the three roles a file can play in a chain.
    start_files = df[df['Is Start File']]
    other_files = df[~df['Is End File'] & ~df['Is Start File']]
    end_files = df[df['Is End File']]

    for _, opener in start_files.iterrows():
        opener_end = opener['End Timestamp']
        if pd.isnull(opener_end):
            continue

        middle = _first_file_starting_after(other_files, opener_end, time_delta)

        if middle is None:
            # No intermediate file: try to attach an end file straight away.
            closer = _first_file_starting_after(end_files, opener_end, time_delta)
            if closer is not None:
                groups.append((opener['File Path'], closer['File Path']))
            continue

        # An intermediate file exists: look for the end file that follows it.
        closer = _first_file_starting_after(end_files, middle['End Timestamp'], time_delta)
        if closer is None:
            groups.append((opener['File Path'], middle['File Path']))
        else:
            groups.append((opener['File Path'], middle['File Path'], closer['File Path']))

    return groups

# Group the recordings with the default 20-second gap, then list each group.
associated_files = find_associated_files(df)
for files in associated_files:
    print(f"Associated files: {', '.join(files)}")

Associated files: ..\audio_database\raw_mp3_test_transcription\2023_1_15_10_5_41_ch30\2023_1_15_10_5_41_ch30.mp3, ..\audio_database\raw_mp3_test_transcription\2023_1_15_10_8_23_ch30\2023_1_15_10_8_23_ch30.mp3
Associated files: ..\audio_database\raw_mp3_test_transcription\2023_1_15_18_40_13_ch30\2023_1_15_18_40_13_ch30.mp3, ..\audio_database\raw_mp3_test_transcription\2023_1_15_18_41_2_ch30\2023_1_15_18_41_2_ch30.mp3
Associated files: ..\audio_database\raw_mp3_test_transcription\2023_1_15_18_59_42_ch30\2023_1_15_18_59_42_ch30.mp3, ..\audio_database\raw_mp3_test_transcription\2023_1_15_19_1_32_ch30\2023_1_15_19_1_32_ch30.mp3
Associated files: ..\audio_database\raw_mp3_test_transcription\2023_1_15_20_11_51_ch30\2023_1_15_20_11_51_ch30.mp3, ..\audio_database\raw_mp3_test_transcription\2023_1_15_20_12_51_ch30\2023_1_15_20_12_51_ch30.mp3
Associated files: ..\audio_database\raw_mp3_test_transcription\2023_1_15_20_24_16_ch30\2023_1_15_20_24_16_ch30.mp3, ..\audio_database\raw_mp3_test_transcrip

### Copy grouped files and merge the audio

In [71]:
def copy_associated_files_to_group_folder(associated_files, df, base_folder):
    """Merge each group of associated recordings into one file in ``base_folder``.

    Despite the name, the individual files are not copied: each group is
    concatenated with ``merge_audios`` and written to ``base_folder`` under
    the file name of the group's first member. The merged file is then given
    the title ``"dd/mm/YYYY HH:MM:SS - dd/mm/YYYY HH:MM:SS"``, spanning the
    earliest ``Start Timestamp`` to the latest ``End Timestamp`` of the
    group, through ``copy_metadata`` with the first member as the source.

    ``base_folder`` is created when it does not exist, and empty groups are
    skipped.

    Args:
        associated_files: Iterable of groups, each a sequence of file paths
            that appear in ``df['File Path']``.
        df: DataFrame with ``File Path``, ``Start Timestamp`` and
            ``End Timestamp`` columns.
        base_folder: Directory receiving the merged files (``str`` or ``Path``).

    Returns:
        None.
    """
    timestamp_format = "%d/%m/%Y %H:%M:%S"

    base_folder = Path(base_folder)
    # Make sure the output directory exists before anything is written to it.
    base_folder.mkdir(parents=True, exist_ok=True)

    for group in associated_files:
        # Nothing to merge, and audio_files[0] below would fail.
        if not group:
            continue

        audio_files = list(group)
        start_timestamp = None
        end_timestamp = None

        # Widen the [start, end] span over every member of the group.
        for file_path in audio_files:
            file_info = df[df['File Path'] == file_path].iloc[0]
            if start_timestamp is None or file_info['Start Timestamp'] < start_timestamp:
                start_timestamp = file_info['Start Timestamp']
            if end_timestamp is None or file_info['End Timestamp'] > end_timestamp:
                end_timestamp = file_info['End Timestamp']

        # Merge and save audio.
        # NOTE(review): if the first member already sits directly in
        # base_folder, this path is the source file itself and gets overwritten.
        merged_audio, sample_rate = merge_audios(audio_files)
        merged_file_path = base_folder / Path(audio_files[0]).name
        sf.write(merged_file_path, merged_audio, sample_rate)

        # Title the merged file with the span covered by the whole group.
        new_title = f"{start_timestamp.strftime(timestamp_format)} - {end_timestamp.strftime(timestamp_format)}"
        copy_metadata(group[0], merged_file_path, new_title)

In [72]:
# Usage
# Merge every group in associated_files; merged files are written to base_folder.
base_folder = "../audio_database/raw_mp3_test_transcription/"
copy_associated_files_to_group_folder(associated_files, df, base_folder)