In [None]:
import os
import subprocess
import cv2
from tqdm import tqdm
import random

In [None]:
# ==============================================================================
# 1. CONFIGURATION
# ==============================================================================
# All machine-specific locations live here; the rest of the script only reads
# these constants.

# --- Path to the cloned Wav2Lip repository ---
WAV2LIP_ROOT = "/Users/natalyagrokh/AI/ml_expressions/img_datasets/synthetic_speech_imgs/Wav2Lip"

# --- Path to the pre-trained model checkpoint ---
# The checkpoint file lives in the 'checkpoints' folder inside the Wav2Lip
# repository. It is built from relative components on purpose: passing an
# absolute path as a later argument makes os.path.join() silently discard
# WAV2LIP_ROOT, so the root would have to be kept in sync in two places.
WAV2LIP_CHECKPOINT = os.path.join(WAV2LIP_ROOT, "checkpoints", "Wav2Lip-SD-GAN.pt")

# --- Source DIRECTORIES ---
# Folder of source images with pure emotional expressions (e.g., anger,
# happiness). Every file in it with a recognized image extension is processed.
SOURCE_IMAGE_DIR = "/path/to/your/source_emotion_images" # e.g., a folder with hundreds of .png files

# Folder of speech audio clips; one clip is picked at random for each image.
SOURCE_AUDIO_DIR = "/Users/natalyagrokh/AI/ml_expressions/img_expressions/neutral_speech"

# --- Path where final output will be saved ---
# One sub-folder per source image is created underneath this directory.
OUTPUT_DIR = "/Users/natalyagrokh/AI/ml_expressions/img_datasets"

In [None]:
# ==============================================================================
# 2. SCRIPT LOGIC (Helper Functions)
# ==============================================================================
# This section contains the core functions that handle video creation,
# running the AI model, and extracting the final image frames.

def create_static_video_from_image(image_path, audio_path, output_path):
    """
    Create a still-image .mp4 whose length matches an audio clip.

    The image is looped by 'ffmpeg' and muxed with the audio; '-shortest'
    ends the output when the shorter input (the audio) ends. The resulting
    file is what this script hands to Wav2Lip as the face input.

    Args:
        image_path: Path to the source image.
        audio_path: Path to the audio clip that determines the duration.
        output_path: Destination .mp4 path (overwritten if it exists).

    Returns:
        True if ffmpeg exited successfully, False otherwise. Failures are
        reported on stdout and never raised to the caller.
    """
    command = [
        'ffmpeg', '-y', # -y overwrites the output file without asking
        '-loop', '1', '-i', image_path, '-i', audio_path,
        # libx264 with yuv420p rejects frames whose width or height is odd.
        # Padding each dimension up to the next even number is a no-op for
        # even-sized images and keeps odd-sized ones from failing outright.
        '-vf', 'pad=ceil(iw/2)*2:ceil(ih/2)*2',
        '-c:v', 'libx264', '-tune', 'stillimage', '-c:a', 'aac', '-b:a', '192k',
        '-pix_fmt', 'yuv420p', '-shortest', output_path
    ]
    try:
        # check=True raises on a non-zero exit status; capture_output=True
        # keeps ffmpeg's verbose log off the console unless it is needed.
        subprocess.run(command, check=True, capture_output=True, text=True)
    except OSError as e:
        # Raised when the ffmpeg executable cannot be launched at all (for
        # example, it is not installed or not on PATH). Report it through the
        # same True/False contract as any other failure.
        print(f"   - ERROR: could not launch ffmpeg for {os.path.basename(image_path)}.")
        print(f"     - Reason: {e}")
        return False
    except subprocess.CalledProcessError as e:
        # ffmpeg ran but reported an error; surface its stderr for debugging.
        print(f"   - ‚ùå ERROR: ffmpeg failed to create static video for {os.path.basename(image_path)}.")
        print(f"     - Stderr: {e.stderr.strip()}")
        return False
    return True

def run_wav2lip_inference(face_video_path, audio_path, checkpoint_path, output_path):
    """
    Run the Wav2Lip 'inference.py' script as a subprocess.

    The script is located under WAV2LIP_ROOT and is executed with that
    directory as the working directory, so any relative path passed here is
    resolved by the child process relative to WAV2LIP_ROOT.

    Args:
        face_video_path: Video passed to Wav2Lip as '--face'.
        audio_path: Audio clip passed as '--audio'.
        checkpoint_path: Model checkpoint passed as '--checkpoint_path'.
        output_path: Destination video passed as '--outfile'.

    Returns:
        True if the script exited successfully, False otherwise. Failures are
        reported on stdout and never raised to the caller.
    """
    wav2lip_script = os.path.join(WAV2LIP_ROOT, "inference.py")
    command = [
        'python', wav2lip_script, '--checkpoint_path', checkpoint_path,
        '--face', face_video_path, '--audio', audio_path, '--outfile', output_path,
        '--pads', '0', '10', '0', '0' # Defines padding around the detected face (Top, Bottom, Left, Right).
    ]
    try:
        # Run from inside the Wav2Lip directory so the script can find the
        # helper files it loads relative to its own location.
        subprocess.run(command, check=True, cwd=WAV2LIP_ROOT, capture_output=True, text=True)
    except OSError as e:
        # Raised before the script even starts: 'python' is not on PATH, or
        # WAV2LIP_ROOT does not exist and cannot be used as the working
        # directory. Report it instead of breaking the True/False contract.
        print(f"   - ERROR: could not launch Wav2Lip inference for {os.path.basename(face_video_path)}.")
        print(f"     - Reason: {e}")
        return False
    except subprocess.CalledProcessError as e:
        # The script ran but exited with an error; show what it wrote to stderr.
        print(f"   - ‚ùå ERROR: Wav2Lip inference failed for {os.path.basename(face_video_path)}.")
        print(f"     - Stderr: {e.stderr.strip()}")
        return False
    return True

def extract_frames_from_video(video_path, output_frame_dir):
    """
    Save every frame of a video as 'frame_NNNN.png' in a directory.

    Frames are numbered from 0 in decode order. The output directory is
    created if it does not exist; existing files with the same names are
    overwritten.

    Args:
        video_path: Path to the video to read.
        output_frame_dir: Directory that receives the .png frames.

    Returns:
        The number of frames successfully written (0 if the video could not
        be opened).
    """
    # exist_ok avoids the check-then-create race of a separate exists() test.
    os.makedirs(output_frame_dir, exist_ok=True)

    video_capture = cv2.VideoCapture(video_path)
    frame_index = 0
    frames_saved = 0
    try:
        if not video_capture.isOpened():
            # Without this, an unreadable video would silently yield nothing.
            print(f"   - ERROR: could not open video for frame extraction: {video_path}")
            return 0

        while True:
            ret, frame = video_capture.read()
            if not ret: # 'ret' is false when there are no more frames.
                break
            frame_filename = os.path.join(output_frame_dir, f"frame_{frame_index:04d}.png")
            # imwrite reports failure through its return value; keep the
            # index advancing so file names still match frame positions.
            if cv2.imwrite(frame_filename, frame):
                frames_saved += 1
            else:
                print(f"   - WARNING: failed to write {frame_filename}")
            frame_index += 1
    finally:
        # Always release the handle, even if reading or writing raised.
        video_capture.release()
    return frames_saved

In [None]:
# ==============================================================================
# 3. MAIN EXECUTION BLOCK
# ==============================================================================
# This is the main part of the script that orchestrates the entire process.
if __name__ == '__main__':
    # --- Step 1: Verify all initial paths and settings are correct ---
    # Collect every problem first so the user sees all of them in one run.
    invalid_paths = []
    if not os.path.exists(WAV2LIP_CHECKPOINT):
        invalid_paths.append(f"   - Wav2Lip Checkpoint: {WAV2LIP_CHECKPOINT}")
    if not os.path.isdir(SOURCE_IMAGE_DIR):
        invalid_paths.append(f"   - Source Image Directory: {SOURCE_IMAGE_DIR}")
    if not os.path.isdir(SOURCE_AUDIO_DIR):
        invalid_paths.append(f"   - Source Audio Directory: {SOURCE_AUDIO_DIR}")
    if invalid_paths:
        print("‚ùå CRITICAL ERROR: A required path is invalid. Please check:")
        for line in invalid_paths:
            print(line)
        # SystemExit(1) signals failure to the caller; the bare exit() helper
        # comes from the 'site' module and would have reported success (0).
        raise SystemExit(1)

    # --- Step 2: Discover all valid image and audio files ---
    valid_image_extensions = ('.png', '.jpg', '.jpeg', '.tiff', '.bmp')
    valid_audio_extensions = ('.wav', '.mp3', '.m4a', '.flac')

    # os.listdir() order is arbitrary; sorting makes the processing order
    # (and therefore the progress bar and logs) the same on every run.
    source_images = sorted(
        f for f in os.listdir(SOURCE_IMAGE_DIR) if f.lower().endswith(valid_image_extensions)
    )
    source_audios = sorted(
        f for f in os.listdir(SOURCE_AUDIO_DIR) if f.lower().endswith(valid_audio_extensions)
    )

    if not source_images or not source_audios:
        print("‚ùå CRITICAL ERROR: Source image or audio directory is empty.")
        raise SystemExit(1)

    print(f"‚úÖ Found {len(source_images)} source images and {len(source_audios)} audio files.")

    # --- Step 3: Main processing loop to iterate through all images ---
    # The 'tqdm' library creates a progress bar for the loop.
    completed = 0
    for image_name in tqdm(source_images, desc="Processing Emotion Images"):
        temp_video_path = None
        try:
            current_image_path = os.path.join(SOURCE_IMAGE_DIR, image_name)

            # --- For each image, randomly select one audio file ---
            selected_audio_name = random.choice(source_audios)
            current_audio_path = os.path.join(SOURCE_AUDIO_DIR, selected_audio_name)

            # --- Setup unique output paths for the current image ---
            # Outputs go in a subfolder named after the image (without its
            # extension) to keep each image's results together.
            source_image_basename = os.path.splitext(image_name)[0]
            session_output_dir = os.path.join(OUTPUT_DIR, source_image_basename)
            os.makedirs(session_output_dir, exist_ok=True)

            temp_video_path = os.path.join(session_output_dir, "temp_face_video.mp4")
            generated_video_path = os.path.join(session_output_dir, "generated_speech.mp4")
            output_frames_dir = os.path.join(session_output_dir, "synthetic_speech_action_frames")

            # --- Run the 3-step pipeline for the current image ---
            # Each step only runs if the previous one reported success.
            if create_static_video_from_image(current_image_path, current_audio_path, temp_video_path):
                if run_wav2lip_inference(temp_video_path, current_audio_path, WAV2LIP_CHECKPOINT, generated_video_path):
                    extract_frames_from_video(generated_video_path, output_frames_dir)
                    completed += 1

        except Exception as e:
            # Batch boundary: one bad image must not abort the whole run, so
            # the error is reported and the loop moves on to the next image.
            print(f"\n--- ‚ö†Ô∏è An unexpected error occurred while processing {image_name}. Skipping. ---")
            print(f"   - Error: {e}\n")
        finally:
            # The static face video is only an intermediate input for Wav2Lip;
            # remove it so it does not pile up next to every image's output.
            if temp_video_path and os.path.exists(temp_video_path):
                try:
                    os.remove(temp_video_path)
                except OSError:
                    pass  # best-effort cleanup; a leftover temp file is harmless

    print("\nüéâ Batch processing complete.")
    print(f"   - Pipeline finished for {completed} of {len(source_images)} images.")