# Audio files

## Splitting

In [1]:
from pydub import AudioSegment
import os
# Work from inside the `long_files` audio folder: the example cell below lists
# the current directory with os.listdir() and writes its output to "../".
os.chdir('../code/said_main/data/audio/FaceTalk_Romain_Darous_val/long_files')

In [2]:
def split_wav(input_files, output_folder, duration=10000):
    """Split WAV files into consecutive segments of a fixed length.

    Every input file is cut into chunks of ``duration`` milliseconds that are
    written to ``output_folder`` as ``segment_<file>_<chunk>.wav``. ``<file>``
    is the 1-based position of the file in ``input_files`` and ``<chunk>`` the
    0-based chunk number, both zero-padded to two digits. The audio left over
    after the last full chunk is written as one final, shorter segment; no
    file is written when there is no leftover audio.

    Args:
        input_files: Iterable of paths to the WAV files to split.
        output_folder: Directory receiving the segments; created if missing.
        duration: Segment length in milliseconds.
    """
    # Create the output folder if it doesn't exist
    os.makedirs(output_folder, exist_ok=True)

    for idx, input_file in enumerate(input_files, start=1):
        # Load the audio file
        sound = AudioSegment.from_file(input_file, format="wav", frame_rate=16_000)

        # Number of full-length segments (len() of an AudioSegment is in ms)
        num_segments = len(sound) // duration

        for i in range(num_segments):
            segment = sound[i * duration:(i + 1) * duration]
            segment.export(
                os.path.join(output_folder, f"segment_{idx:02}_{i:02}.wav"),
                format="wav",
            )

        # The trailing segment is numbered from num_segments rather than from
        # the loop variable: the loop variable does not exist when the file is
        # shorter than one segment, and would otherwise be left over from the
        # previous file.
        remainder = sound[num_segments * duration:]
        if len(remainder) > 0:
            remainder.export(
                os.path.join(output_folder, f"segment_{idx:02}_{num_segments:02}.wav"),
                format="wav",
            )

# Example usage: split every WAV file of the current directory into the parent folder.
# Only .wav entries are kept, and the listing is sorted because os.listdir()
# returns names in arbitrary order while the position of a file in this list
# becomes the file index used in its segment names.
input_files = sorted(path for path in os.listdir() if path.endswith('.wav'))
output_folder = "../"
split_wav(input_files, output_folder)


## Resampling

# Blendshapes

In [1]:
import pandas as pd
import os
from pydub import AudioSegment
import os
import datetime
import wave
import contextlib

In [2]:
# Work from the data folder: the cells below address their inputs and outputs
# with paths relative to it ('./audio/...', './blendshape_coeffs/...').
os.chdir('../code/said_main/data')


# Rename files

### CSV

In [10]:
# Disabled cell: the code is wrapped in a string literal so that it is not executed.
# If re-enabled, it would read every CSV of raw_segment/ in sorted order and
# write it back (without the index column) as sequence0001.csv, sequence0002.csv, ...
# in the parent folder.
"""blend_path = './blendshape_coeffs/FaceTalk_Romain_Darous_train/raw_segment/'
output_folder = './blendshape_coeffs/FaceTalk_Romain_Darous_train/'
files = sorted([path for path in os.listdir(blend_path) if path.endswith('.csv')])

for i, file in enumerate(files) :
    df = pd.read_csv(blend_path + file)
    df.to_csv(os.path.join(output_folder, f'sequence{i+1:04}.csv'), index=False)
"""

### WAV

In [12]:
# Rename every WAV file of the folder to sequence0001.wav, sequence0002.wav, ...
# NOTE: despite its name, blend_path points at the audio folder in this cell.
blend_path = './audio/FaceTalk_Romain_Darous_train/'

# Sorted so that the numbering follows the alphabetical order of the current names.
files = sorted([path for path in os.listdir(blend_path) if path.endswith('.wav')])

for i, file in enumerate(files):
    source = os.path.join(blend_path, file)
    target = os.path.join(blend_path, f'sequence{i+1:04}.wav')
    if source == target:
        # Already carries its final name (e.g. the cell was run twice).
        continue
    # os.rename can silently replace an existing file, which would destroy a
    # recording when the folder mixes renamed and not-yet-renamed files.
    if os.path.exists(target):
        raise FileExistsError(f"Refusing to overwrite existing file: {target}")
    os.rename(source, target)

# Timecode processing

In [3]:
import re
import numpy as np

def parse_time_string(time_str):
    """Break a ``HH:mm:ss:ff.mmm`` timecode into its five numeric fields.

    The pattern is matched at the start of ``time_str``; anything following
    the matched timecode is ignored.

    Args:
        time_str: Timecode text, e.g. ``"01:02:03:04.005"``.

    Returns:
        A tuple ``(hours, minutes, seconds, frames, milliseconds)`` of ints.

    Raises:
        ValueError: If ``time_str`` does not start with a valid timecode.
    """
    timecode_pattern = r'(\d{2}):(\d{2}):(\d{2}):(\d{2})\.(\d{3})'
    found = re.match(timecode_pattern, time_str)

    if found is None:
        raise ValueError("Time format should be HH:mm:ss:ff.mmm")

    return tuple(int(field) for field in found.groups())

def time_to_milliseconds(hours, minutes, seconds, frames, milliseconds,
                         frames_per_second=30):
    """Convert the fields of a timecode into a total number of milliseconds.

    Args:
        hours: Hours field.
        minutes: Minutes field.
        seconds: Seconds field.
        frames: Frame number within the second.
        milliseconds: Trailing ``.mmm`` field, added as plain milliseconds.
            NOTE(review): confirm this matches the meaning of that field in
            the tool that produced the timecodes.
        frames_per_second: Frame rate used to convert ``frames`` to time.
            Defaults to 30, the value that was previously hard-coded.

    Returns:
        The total duration in milliseconds, as a float (the frame
        contribution is computed with true division).

    Raises:
        ValueError: If ``frames_per_second`` is not strictly positive.
    """
    if frames_per_second <= 0:
        raise ValueError("frames_per_second must be strictly positive")

    total_milliseconds = (
        hours * 3600 * 1000 +
        minutes * 60 * 1000 +
        seconds * 1000 +
        (frames * 1000 / frames_per_second) +
        milliseconds
    )
    return total_milliseconds

def diff(time, past_time):
    """Return how many milliseconds separate ``past_time`` from ``time``.

    Both arguments are timecode strings accepted by ``parse_time_string``.
    The result is positive when ``time`` is later than ``past_time``.
    """
    current_ms = time_to_milliseconds(*parse_time_string(time))
    previous_ms = time_to_milliseconds(*parse_time_string(past_time))
    return current_ms - previous_ms

# Splitting files

In [6]:
# Version using time_code
def split_csv(input_files, output_folder, audio_path, blend_path):
    """Cut blendshape CSV recordings into segments aligned with split audio files.

    For each CSV in ``input_files``, the WAV files of ``audio_path`` whose
    name has the form ``<prefix>_<seq>_<sub_seq>.wav`` and whose ``<seq>``
    equals the number formed by the last two characters of the CSV name
    (before its first ``.``) are processed in increasing ``<sub_seq>`` order.
    Rows are consumed from the CSV until the elapsed time given by the
    ``Timecode`` column reaches the duration of the current WAV file, and are
    written to ``segment_<seq>_<sub_seq>.csv`` in ``output_folder`` without
    the first two columns. The segment of the last WAV file receives all the
    remaining rows.

    Args:
        input_files: Names of the CSV files, relative to ``blend_path``.
        output_folder: Directory receiving the segments; created if missing.
        audio_path: Directory containing the split WAV files.
        blend_path: Directory containing the CSV files.
    """
    for input_file in input_files:
        # Load the CSV file
        df = pd.read_csv(os.path.join(blend_path, input_file))

        # The rows are consumed sequentially, so the audio files must be
        # visited in sub-sequence order; os.listdir() gives no ordering
        # guarantee, hence the explicit numeric sort.
        audio_files = sorted(
            (
                path for path in os.listdir(audio_path)
                if path.endswith('.wav')
                and int(path.split('_')[1]) == int(input_file.split('.')[0][-2:])
            ),
            key=lambda path: int(path.split('_')[2].split('.')[0]),
        )

        # Create the output folder if it doesn't exist
        os.makedirs(output_folder, exist_ok=True)

        index = 0          # row currently reached in df
        start_index = 0    # first row of the segment being built
        start_time = df['Timecode'][0]
        time = df.iloc[0]['Timecode']
        delta = diff(time, start_time)  # ms elapsed since the segment start
        acc_delta = 0      # accumulated overshoot of the segments over the audio

        last_position = len(audio_files) - 1
        for position, audio in enumerate(audio_files):

            # Duration of the audio segment, in seconds
            with contextlib.closing(wave.open(os.path.join(audio_path, audio), 'r')) as wf:
                frames = wf.getnframes()
                rate = wf.getframerate()
                duration = frames / float(rate)

            seq = int(audio.split('_')[1])
            sub_seq = int(audio.split('_')[2].split('.')[0])
            segment_path = os.path.join(output_folder, f"segment_{seq:02}_{sub_seq:02}.csv")

            if position == last_position:
                # The last segment takes every remaining row
                segment_df = pd.DataFrame(df.iloc[start_index:]).drop(df.columns[:2], axis=1)
                segment_df.to_csv(segment_path, index=False)
                break

            # Gathering the subset of blendshapes: advance until the elapsed
            # time covers the audio duration (or the CSV is exhausted)
            while delta < duration*1000 and index < len(df) - 1:
                index += 1
                time = df.iloc[index]['Timecode']
                delta = diff(time, start_time)

            # Track by how much the rows overshoot the audio so far
            acc_delta += delta - duration*1000

            # Storing the subset of blendshapes. While the accumulated
            # overshoot stays under the threshold (in ms) the row that crossed
            # the boundary is kept; otherwise it is left for the next segment
            # and the accumulator is reset.
            # NOTE(review): the 22 ms threshold is unexplained in this file —
            # confirm how it was chosen.
            if acc_delta < 22:
                segment_df = pd.DataFrame(df.iloc[start_index:index + 1]).drop(df.columns[:2], axis=1)
            else:
                segment_df = pd.DataFrame(df.iloc[start_index:index]).drop(df.columns[:2], axis=1)
                index -= 1
                acc_delta = 0

            # Write the current segment to a CSV file
            segment_df.to_csv(segment_path, index=False)

            # Resetting parameters: the next segment starts at the row reached
            start_index = index
            time = df.iloc[index]['Timecode']
            start_time = time
            delta = 0


            
# Example usage: split every CSV of the long-files folder, using the audio
# segments found in audio_path to decide where to cut.
audio_path = './audio/FaceTalk_Romain_Darous_val/'
blend_path = './blendshape_coeffs/FaceTalk_Romain_Darous_val/long_files_old/'
output_folder = "./blendshape_coeffs/FaceTalk_Romain_Darous_val/"
input_files = list(filter(lambda name: name.endswith('.csv'), os.listdir(blend_path)))
split_csv(input_files, output_folder, audio_path, blend_path)


## Former splitting method

In [None]:
# Nice version with frame counting, but still some delay (small but it's there)
# NOTE: running this cell rebinds the name `split_csv` to this two-argument version.
def split_csv(input_files, output_folder):
    """Split CSV files into 10-second segments by counting elapsed seconds.

    A second is counted each time the last ':'-separated field of the
    ``Timecode`` column (its integer part) comes back to the value it had on
    the first row of the file. Every time 10 seconds have been counted, the
    rows gathered so far are written to ``segment_<idx>.csv`` (4-digit
    zero-padded) in ``output_folder`` without the first two columns. ``idx``
    starts at 1 and keeps increasing across input files. Rows following the
    last complete group of 10 seconds are not written.

    Args:
        input_files: Paths of the CSV files to split.
        output_folder: Directory receiving the segments; created if missing.
    """
    # Split the CSV files into segments based on time code
    idx = 1
    for input_file in input_files:
        # Load the CSV file
        df = pd.read_csv(input_file)


        # Create the output folder if it doesn't exist
        os.makedirs(output_folder, exist_ok=True)

        # Reference value: integer part of the field after the last ':' on the
        # first row. With the HH:mm:ss:ff.mmm format used elsewhere in this
        # file that field is the frame number, despite the `sec` naming.
        # It is never updated afterwards.
        sec_ref = int(df['Timecode'][0].split(':')[-1].split('.')[0])
        total_sec = 0     # seconds counted in the segment being built
        start_index = 0   # first row of the segment being built

        for index, row in df.iterrows():
            # Retrieve the time code from the row
            time_code = row['Timecode']  # The time code column is named 'Timecode'

            # Extract the same field as sec_ref for the current row
            seconds = int(time_code.split(':')[-1].split('.')[0])

            # One more second has elapsed when the field matches the reference
            # again (the first row of a segment is not counted)
            if seconds ==  sec_ref and index != start_index : total_sec += 1
            elif start_index > 0 :
                # From the second segment on, also count a second when the
                # reference value was skipped between the previous row and this one
                prev_time_code = df.iloc[index - 1]['Timecode']
                prev_seconds = int(prev_time_code.split(':')[-1].split('.')[0])
                if prev_seconds < sec_ref and seconds > sec_ref : total_sec += 1

            if total_sec == 10 :
                # Write the completed segment (rows start_index..index-1) to a CSV file
                segment_df = pd.DataFrame(df.iloc[start_index:index]).drop(df.columns[:2], axis=1)
                segment_df.to_csv(os.path.join(output_folder, f"segment_{idx:04}.csv"), index=False)
                idx += 1
                total_sec = 0
                # The current row becomes the first row of the next segment
                start_index = index

# Example usage: run the splitter on every CSV file of the current directory
# and write the resulting segments to the parent folder.
output_folder = "../"
input_files = list(filter(lambda name: name.endswith('.csv'), os.listdir()))
split_csv(input_files, output_folder)
