# Quran Video Generation Notebook

This notebook generates videos for Quran verses by combining:
- Audio recitation with precise word-level timestamps
- Background video clips selected based on verse content
- Text overlays showing Arabic text and English translations

The process involves:
1. Loading Quran text, timestamps, and translations
2. Generating background video suggestions using AI
3. Synchronizing audio, video, and text elements
4. Creating the final video output


In [1]:
import utils
from pydub import AudioSegment
from pydub.silence import detect_silence
from typing import List, Dict, Union, Tuple, Optional
import utils
from quran_utils import Reciter

In [2]:
import os

# Keys are read from the environment so real credentials never have to be
# saved inside the notebook. The empty-string fallbacks keep the previous
# behaviour when the variables are not set.
PEXELS_API_KEY = os.environ.get('PEXELS_API_KEY', '')  # Add your Pexels API key here
OPENAI_API_KEY = os.environ.get('OPENAI_API_KEY', '')  # Add your OpenAI API key here

In [3]:
# Verse selection. With aya_start == aya_end a single aya is processed
# (surah 4, aya 134 here).
surah_number = 4
aya_start = 134
aya_end = 134
# Recitation to use; reciter.name also ends up in the output file name.
reciter = Reciter.MAHMOUD_KHALIL_AL_HUSARY
# Font file for the text overlay; it is turned into an absolute path when the
# video is rendered.
font_path = "data/fonts/Rakkas-Regular.ttf"

In [4]:
# Fetch the per-word timing records for the selected verses and reciter.
words = utils.get_words_with_timestamps(surah_number, aya_start, aya_end, reciter)

# Rescale both boundaries of every word by 1/1000 so the rest of the notebook
# works in seconds (assumes the loader reports milliseconds).
for word in words:
    word.update(start=word["start"] / 1000, end=word["end"] / 1000)

In [5]:
# Close small gaps between consecutive words:
#   * a first word that starts within 0.5 s of the beginning is pulled back to 0;
#   * a word that starts less than 1 s after the previous word ends (or overlaps
#     it) is made to start exactly where the previous word ends.
#
# The adjustment runs on per-word copies. A plain `words.copy()` only copies the
# list, so the dicts would be shared with `words` and the comparison below could
# never report a difference.
original_words = words
new_words = [word.copy() for word in words]

if len(new_words) > 0 and new_words[0]["start"] < 0.5:
    new_words[0]["start"] = 0

for i in range(len(new_words) - 1):
    if new_words[i + 1]["start"] - new_words[i]["end"] < 1:
        new_words[i + 1]["start"] = new_words[i]["end"]

# Outline the difference between new_words and words
print("Differences between new_words and words:")
print("-" * 50)
print(f"Total words: {len(original_words)}")

differences_found = False
for i, (original, adjusted) in enumerate(zip(original_words, new_words)):
    # Check if there are differences in start or end times
    if original["start"] != adjusted["start"] or original["end"] != adjusted["end"]:
        differences_found = True
        print(f"Word {i+1}: '{original['word']}'")
        print(f"  Original: start={original['start']:.2f}s, end={original['end']:.2f}s")
        print(f"  Adjusted: start={adjusted['start']:.2f}s, end={adjusted['end']:.2f}s")
        print(f"  Difference: start={adjusted['start']-original['start']:.2f}s, end={adjusted['end']-original['end']:.2f}s")
        print()

if not differences_found:
    print("No differences found between words and new_words.")

# The following cells read `words`, so hand them the adjusted timings.
words = new_words
utils.display_words_table(words)

Differences between new_words and words:
--------------------------------------------------
Total words: 14
No differences found between words and new_words.


Word,Translation,Start Time (s),End Time (s),Duration (s),Aya,Word Position
مَنْ,Whoever,0.0,0.07,0.07,134,1
كَانَ,[is],1.89,1.99,0.1,134,2
يُرِيدُ,desires,3.13,3.23,0.1,134,3
ثَوَابَ,reward,4.4,5.64,1.24,134,4
الدُّنْيَا,(of) the world,6.89,6.99,0.1,134,5
فَعِنْدَ,then with,6.99,9.24,2.25,134,6
اللَّهِ,Allah,9.24,10.6,1.36,134,7
ثَوَابُ,(is the) reward,10.6,12.2,1.6,134,8
الدُّنْيَا,(of) the world,13.37,13.47,0.1,134,9
وَالْآخِرَةِ,and the Hereafter,13.47,16.6,3.13,134,10


In [6]:

def adjust_word_timestamps(
    words: List[Dict[str, Union[str, float]]],
    audio_path: str,
    min_silence_len: int = 100,  # minimum silence length in ms
    silence_thresh: int = -40,   # silence threshold in dB
    seek_step: int = 1,          # step size for silence detection in ms
    padding: int = 50            # padding around silence in ms
) -> List[Dict[str, Union[str, float]]]:
    """
    Adjust word timestamps based on silence detection between words.

    For every pair of neighbouring words the silent ranges that lie completely
    inside the gap between them are looked up:

    * if there are any, the current word ends at the start of the first one
      minus ``padding`` (never earlier than its own end) and the next word
      starts at the end of the last one plus ``padding`` (never later than its
      own start);
    * otherwise the current word is extended up to the start of the next word.

    The last word is extended to the start of the last detected silent range
    when it ends at or before that point.

    Args:
        words: List of dictionaries with keys 'word', 'start', 'end'
            ('start' and 'end' in seconds)
        audio_path: Path to the audio file
        min_silence_len: Minimum length of silence to be considered (in ms)
        silence_thresh: Silence threshold in dB
        seek_step: Step size for silence detection (in ms)
        padding: Padding around silence (in ms)

    Returns:
        A new list of dictionaries with adjusted timestamps in seconds. The
        input dictionaries are copied, not modified.
    """
    # Load audio file
    audio = AudioSegment.from_file(audio_path)

    # Convert timestamps to whole milliseconds. round() is used instead of int()
    # because a value such as 1.005 s becomes 1004.9999999999999 after the
    # multiplication, and truncating it would shift the boundary by 1 ms.
    words_ms = []
    for word in words:
        temp_word = word.copy()
        temp_word['start'] = round(word['start'] * 1000)
        temp_word['end'] = round(word['end'] * 1000)
        words_ms.append(temp_word)

    # Detect silence in the audio
    silent_ranges = detect_silence(
        audio,
        min_silence_len=min_silence_len,
        silence_thresh=silence_thresh,
        seek_step=seek_step
    )

    # Adjust timestamps based on silence detection
    adjusted_words = []
    last_index = len(words_ms) - 1
    for i, word_ms in enumerate(words_ms):
        # Copy taken here so that a start time moved while processing the
        # previous word is carried over.
        current_word = word_ms.copy()

        if i < last_index:
            next_word = words_ms[i + 1]

            # Find silence in the gap between words
            silence_in_gap = [
                (start, end) for start, end in silent_ranges
                if start >= current_word['end'] and end <= next_word['start']
            ]

            if silence_in_gap:
                # Span from the first silent range in the gap to the last one
                silence_start = silence_in_gap[0][0]
                silence_end = silence_in_gap[-1][1]

                # Apply padding, clamped to the original word boundaries
                silence_start = max(current_word['end'], silence_start - padding)
                silence_end = min(next_word['start'], silence_end + padding)

                current_word['end'] = silence_start
                # Written into words_ms so the next iteration picks it up
                next_word['start'] = silence_end
            else:
                # If no silence, extend current word to start of next word
                current_word['end'] = next_word['start']
        elif len(silent_ranges) > 0 and current_word['end'] <= silent_ranges[-1][0]:
            # Last word: stretch it up to the start of the last silent range
            current_word['end'] = silent_ranges[-1][0]

        # Convert back to seconds
        current_word['start'] = current_word['start'] / 1000
        current_word['end'] = current_word['end'] / 1000
        adjusted_words.append(current_word)

    return adjusted_words



In [7]:
# Re-time the words against the silences found in the combined audio file.
new_words = adjust_word_timestamps(words, "temp/audio/combined_audio.mp3")

# Per-word durations before and after the adjustment, in matching order.
original_durations = [before['end'] - before['start'] for before in words]
adjusted_durations = [after['end'] - after['start'] for after in new_words]

avg_original_duration = sum(original_durations) / len(original_durations)
avg_adjusted_duration = sum(adjusted_durations) / len(adjusted_durations)

print(f"\nAverage word duration (original): {avg_original_duration:.2f} seconds")
print(f"Average word duration (adjusted): {avg_adjusted_duration:.2f} seconds")
print(f"Average duration change: {avg_adjusted_duration - avg_original_duration:.2f} seconds")

# A word counts as significantly changed when its duration moved by more
# than the threshold (200 ms).
threshold = 0.2
significant_changes = sum(
    abs(adjusted - original) > threshold
    for original, adjusted in zip(original_durations, adjusted_durations)
)

print(f"\nWords with significant timing changes: {significant_changes} out of {len(words)} ({significant_changes/len(words)*100:.1f}%)")

# Later cells continue with the adjusted timings.
words = new_words


Average word duration (original): 1.07 seconds
Average word duration (adjusted): 1.83 seconds
Average duration change: 0.76 seconds

Words with significant timing changes: 9 out of 14 (64.3%)


In [8]:
# Merge words with short durations (less than 0.5 seconds) with the next word.
# A short word is only merged when the next word starts where it ends; the last
# word is never merged because there is nothing after it.
#
# The starting count is taken from `words` itself instead of `new_words` (a
# variable owned by an earlier cell), so the statistics printed below describe
# this run even when the cell is executed on its own.
original_count = len(words)
merged_words = []
i = 0
while i < len(words):
    current_word = words[i]
    next_word = words[i + 1] if i + 1 < len(words) else None

    is_short = (current_word['end'] - current_word['start']) < 0.5
    should_merge = (
        next_word is not None
        and is_short
        # Small threshold for floating point comparison
        and abs(current_word['end'] - next_word['start']) < 0.01
    )

    if not should_merge:
        # Long enough, last in the list, or not adjacent: keep the word as is
        merged_words.append(current_word)
        i += 1
        continue

    # Clone the current word and then modify it
    merged_word = current_word.copy()
    merged_word['word'] = current_word['word'] + ' ' + next_word['word']

    # Combine translations from both words
    if 'translation' in current_word and 'translation' in next_word:
        current_translation = current_word['translation']
        next_translation = next_word['translation']
        # dict.fromkeys keeps first-seen order, so the merged keys come out in
        # the same order on every run.
        all_keys = dict.fromkeys([*current_translation, *next_translation])
        merged_word['translation'] = {
            key: f"{current_translation.get(key, '')} {next_translation.get(key, '')}".strip()
            for key in all_keys
        }

    merged_word['end'] = next_word['end']
    # Preserve aya number if it exists
    merged_word['aya'] = current_word.get('aya', next_word.get('aya', None))
    merged_words.append(merged_word)
    i += 2  # Skip the next word since we merged it

# Replace the original words list with the merged one
words = merged_words

# Print statistics about the merging
merged_count = len(words)
print(f"\nMerged {original_count - merged_count} words with duration < 0.5s")
print(f"Original word count: {original_count}")
print(f"New word count after merging: {merged_count}")
utils.display_words_table(words)


Merged 2 words with duration < 0.5s
Original word count: 14
New word count after merging: 12


Word,Translation,Start Time (s),End Time (s),Duration (s),Aya,Word Position
مَنْ,Whoever,0.0,1.89,1.89,134,1
كَانَ,[is],1.89,3.13,1.24,134,2
يُرِيدُ,desires,3.13,4.4,1.27,134,3
ثَوَابَ,reward,4.4,6.89,2.49,134,4
الدُّنْيَا فَعِنْدَ,(of) the world then with,6.89,9.24,2.35,134,5
اللَّهِ,Allah,9.24,10.6,1.36,134,7
ثَوَابُ,(is the) reward,10.6,13.37,2.77,134,8
الدُّنْيَا وَالْآخِرَةِ,(of) the world and the Hereafter,13.37,16.959,3.59,134,9
وَكَانَ,And is,18.647,20.01,1.36,134,11
اللَّهُ,Allah,20.01,21.28,1.27,134,12


In [9]:
import LLM_utils

# Request background-video suggestions for the timed words from LLM_utils
# (presumably an OpenAI call, given the key that is passed in - confirm in LLM_utils).
suggestions = LLM_utils.get_video_suggestions(words, OPENAI_API_KEY)
# Print each suggestion with its index
for i, suggestion in enumerate(suggestions):
    # Each suggestion carries search keywords and the time span they apply to.
    print(f"[{i}] {suggestion.keywords} ({suggestion.start_time}s - {suggestion.end_time}s)")

[0] earth landscape (0.0s - 6.89s)
[1] mosque islam (6.89s - 10.6s)
[2] heavenly sky (10.6s - 16.959s)
[3] crescent moon night sky (18.647s - 21.28s)
[4] open sky clouds (21.28s - 27.36s)


In [10]:
from pexel_utils import select_and_download_video, VideoOrientation, VideoQuality

# Background clips collected so far: one dict per successfully downloaded video.
background_videos = []

# Walk through the LLM suggestions and try to fetch one clip for each.
for suggestion in suggestions:
    search_term, start_time, end_time = (
        suggestion.keywords,
        suggestion.start_time,
        suggestion.end_time,
    )

    # The length of the time span is passed along as the requested duration.
    video_path = select_and_download_video(
        api_key=PEXELS_API_KEY,
        query=search_term,
        orientation=VideoOrientation.LANDSCAPE,
        size=VideoQuality.HD,
        selection_method="best",
        output_dir="temp/video",
        duration=end_time - start_time,
    )

    # Nothing usable came back: report it and move on to the next suggestion.
    if not video_path:
        print(f"Failed to download video for '{search_term}' ({start_time:.2f}s - {end_time:.2f}s)")
        continue

    # Remember where the clip is stored and which part of the timeline it covers.
    background_videos.append(dict(file_path=video_path, start=start_time, end=end_time))
    print(f"Downloaded video for '{search_term}' ({start_time:.2f}s - {end_time:.2f}s): {video_path}")


Downloaded video for 'earth landscape' (0.00s - 6.89s): temp/video/pexels_video_32157404.mp4
Downloaded video for 'mosque islam' (6.89s - 10.60s): temp/video/pexels_video_5798349.mp4
Downloaded video for 'heavenly sky' (10.60s - 16.96s): temp/video/pexels_video_32227116.mp4
Downloaded video for 'crescent moon night sky' (18.65s - 21.28s): temp/video/pexels_video_905250.mp4
Downloaded video for 'open sky clouds' (21.28s - 27.36s): temp/video/pexels_video_32125176.mp4


# ⚠️ **IMPORTANT DISCLAIMER** ⚠️

Please carefully review the generated video output before use. The background videos are sourced from external APIs 
and may occasionally contain content that is not Islamically compliant. While we apply filters to avoid inappropriate 
content, manual review is recommended to ensure the final video meets Islamic guidelines and standards.

Always verify that the background imagery aligns with your intended use and religious requirements.


In [11]:
import os
from moviepy import AudioFileClip, CompositeVideoClip,ColorClip
from moviepy.video.io.VideoFileClip import VideoFileClip



def create_word_timed_video(
    words: List[Dict[str, Union[str, float]]],
    audio_path: str,
    output_path: str = "output_video.mp4",
    width: int = 1080,
    height: int = 1920,
    bg_color: Tuple[int, int, int] = (0, 0, 0),
    text_color: Tuple[int, int, int] = (255, 255, 255),
    font_path: Optional[str] = None,
    font_size: int = 80,
    background_videos: Optional[List[Dict[str, Union[str, float]]]] = None
) -> str:
    """
    Create a 9:16 video with words displayed at specific time ranges.

    Layers, bottom to top: a solid colour background lasting as long as the
    audio, the background videos, a black overlay at 0.4 opacity, and one text
    clip per word. The audio file is used as the soundtrack.

    Args:
        words: List of dictionaries with keys 'word', 'start', 'end' and
            'translation' (a dict whose 'en' entry is shown with the word)
        audio_path: Path to the audio file
        output_path: Path to save the output video
        width: Video width (default: 1080 for 9:16 ratio)
        height: Video height (default: 1920 for 9:16 ratio)
        bg_color: Background color as RGB tuple
        text_color: Text color as RGB tuple
        font_path: Path to custom font file (optional)
        font_size: Font size for the text
        background_videos: List of dictionaries with keys 'file_path', 'start', 'end'.
            Entries whose file does not exist are skipped. A file shorter than
            its slot is shown for its own length only.

    Returns:
        Path to the created video file
    """
    # Every clip that gets opened is registered here so the finally block can
    # release it even when loading or encoding fails part-way.
    opened_clips = []
    try:
        # Load audio to get duration
        audio = AudioFileClip(audio_path)
        opened_clips.append(audio)
        duration = audio.duration

        default_background = ColorClip(duration=duration, size=(width, height), color=bg_color)
        default_background.audio = audio
        opened_clips.append(default_background)

        # Darkening layer placed between the background videos and the text
        overlay = ColorClip(size=(width, height), color=(0, 0, 0), duration=duration)
        overlay = overlay.with_opacity(0.4)
        opened_clips.append(overlay)

        # Load background videos if provided
        bg_video_clips = []
        for bg_video in background_videos or []:
            file_path = bg_video['file_path']
            start_time = bg_video['start']
            end_time = bg_video['end']

            if not os.path.exists(file_path):
                continue

            # Load the video clip
            with utils.nostdout():
                source_clip = VideoFileClip(file_path)
            opened_clips.append(source_clip)

            # without_audio() returns a new clip, so the result has to be kept
            clip = source_clip.without_audio()

            # Resize to match dimensions
            clip = clip.resized(width=width)

            # Never ask for more footage than the file contains
            segment = end_time - start_time
            if clip.duration is not None:
                segment = min(segment, clip.duration)
            clip = clip.subclipped(0, segment)
            clip = clip.with_position('center')

            # Place the segment on the final timeline (end = start + duration)
            clip = clip.with_start(start_time)

            bg_video_clips.append(clip)

        text_clips = []
        for word in words:
            text_clip = utils.create_text_image(word['word'], word['start'], word['end'], width, height, font_path, font_size, word['translation']['en'], text_color)
            opened_clips.append(text_clip)
            text_clips.append(text_clip)

        # Text goes last so it is drawn on top of everything else
        all_clips = [default_background] + bg_video_clips + [overlay] + text_clips
        final_video = CompositeVideoClip(all_clips, size=(width, height))
        opened_clips.append(final_video)
        final_video.audio = audio

        # Write output file
        final_video.write_videofile(
            output_path,
            fps=24,
            codec='libx264',
            audio_codec='aac',
            preset='ultrafast',
            threads=4
        )
    finally:
        # Release in reverse order of creation: the composite first, sources last
        for opened in reversed(opened_clips):
            opened.close()

    return output_path


# Render the final video; the file name is built from the surah number, the aya
# range and the reciter name. The returned path is shown as the cell output.
create_word_timed_video(
    words=words,
    audio_path="temp/audio/combined_audio.mp3",
    output_path=f"{surah_number}-{aya_start}-{aya_end}-{reciter.name}.mp4",
    background_videos=background_videos,
    font_path=os.path.abspath(font_path),
)

MoviePy - Building video 4-134-134-MAHMOUD_KHALIL_AL_HUSARY.mp4.
MoviePy - Writing audio in 4-134-134-MAHMOUD_KHALIL_AL_HUSARYTEMP_MPY_wvf_snd.mp4


                                                                    

MoviePy - Done.
MoviePy - Writing video 4-134-134-MAHMOUD_KHALIL_AL_HUSARY.mp4



                                                                        

MoviePy - Done !
MoviePy - video ready 4-134-134-MAHMOUD_KHALIL_AL_HUSARY.mp4


'4-134-134-MAHMOUD_KHALIL_AL_HUSARY.mp4'

In [12]:
# Clean up temporary files
# NOTE: this cell uses `os`, which is imported by an earlier cell.
import shutil

try:
    # Nothing is printed when no 'temp' path exists.
    if os.path.exists('temp'):
        # Removes the whole 'temp' tree, including downloaded audio and video.
        shutil.rmtree('temp')
        print("Temporary files cleaned up successfully")
except Exception as e:
    # Best-effort cleanup: report the problem instead of failing the notebook.
    print(f"Error cleaning up temporary files: {e}")


Temporary files cleaned up successfully
