In [1]:
# !pip install youtube-transcript-api
# !pip install groq
# !pip install boto3
# !pip install SpeechRecognition pydub moviepy


Collecting youtube-transcript-api
  Downloading youtube_transcript_api-0.6.2-py3-none-any.whl.metadata (15 kB)
Downloading youtube_transcript_api-0.6.2-py3-none-any.whl (24 kB)
Installing collected packages: youtube-transcript-api
Successfully installed youtube-transcript-api-0.6.2
Collecting boto3
  Downloading boto3-1.35.0-py3-none-any.whl.metadata (6.6 kB)
Collecting botocore<1.36.0,>=1.35.0 (from boto3)
  Downloading botocore-1.35.0-py3-none-any.whl.metadata (5.7 kB)
Collecting jmespath<2.0.0,>=0.7.1 (from boto3)
  Downloading jmespath-1.0.1-py3-none-any.whl.metadata (7.6 kB)
Collecting s3transfer<0.11.0,>=0.10.0 (from boto3)
  Downloading s3transfer-0.10.2-py3-none-any.whl.metadata (1.7 kB)
Collecting urllib3!=2.2.0,<3,>=1.25.4 (from botocore<1.36.0,>=1.35.0->boto3)
  Using cached urllib3-2.2.2-py3-none-any.whl.metadata (6.4 kB)
Downloading boto3-1.35.0-py3-none-any.whl (139 kB)
Downloading botocore-1.35.0-py3-none-any.whl (12.5 MB)
   ---------------------------------------- 0.0/

In [None]:
import os

# Export the AWS and Groq credentials as environment variables for this kernel.
# Replace the placeholder strings with real values before running, and never
# commit a notebook that contains real secrets.
os.environ.update({
    'AWS_ACCESS_KEY_ID': 'Your AWS_ACCESS_KEY_ID',
    'AWS_SECRET_ACCESS_KEY': 'Your AWS_SECRET_ACCESS_KEY',
    'AWS_DEFAULT_REGION': 'Your AWS_DEFAULT_REGION',
    "GROQ_API_KEY": "Your GROQ_API_KEY",
})

In [None]:
from youtube_transcript_api import YouTubeTranscriptApi
from groq import Groq

# Groq API client; the key is read from the GROQ_API_KEY environment variable.
client = Groq(api_key=os.getenv("GROQ_API_KEY"))

def get_transcript_summary(video_id="eo1VG3ZyTRA", model="llama3-8b-8192"):
    """Fetch a YouTube transcript and return an LLM-written summary of it.

    Args:
        video_id: ID of the YouTube video whose transcript is summarized.
            Defaults to the video originally hard-coded in this notebook.
        model: Name of the Groq chat model used for summarization.

    Returns:
        The summary text from the first completion choice.

    Raises:
        ValueError: If the fetched transcript contains no text.
    """
    transcript = YouTubeTranscriptApi.get_transcript(video_id)

    # Each transcript segment is a dict; only its 'text' field is needed here.
    full_transcript = ' '.join(segment['text'] for segment in transcript)
    if not full_transcript.strip():
        # Without this guard the model would be asked to summarize nothing.
        raise ValueError(f"Transcript for video {video_id!r} is empty")

    summarization_prompt = f"Please summarize the following transcript:\n\n{full_transcript}"
    chat_completion = client.chat.completions.create(
        messages=[
            {
                "role": "user",
                "content": summarization_prompt,
            }
        ],
        model=model,
    )

    return chat_completion.choices[0].message.content

# Generate the summary once and keep it in `summary` so it can be reused
# without calling the transcript and LLM services again.
summary = get_transcript_summary()

# Show the generated summary text in the cell output.
print(summary)


Here is a summary of the transcript:

The speaker shares a 1-minute story called "A Foolish Rabbit". The story goes as follows: a rabbit is sleeping under a tree when a stone falls on its head, causing it to believe that the sky is falling down. The rabbit starts running and shouting to alert the other animals and birds. A butterfly sitting next to the rabbit laughs and says don't worry, it's just a stone that fell on the rabbit's head, not the sky falling. The moral of the story is to not believe everything others say until you see it for yourself.


In [None]:
import boto3
import IPython.display as ipd

# Initialize the Polly client
polly_client = boto3.client('polly')

# Input text you want to convert to speech.
# Reuse the already-generated `summary` rather than calling
# get_transcript_summary() again: a second LLM call costs another round-trip
# and may return different wording, so the audio would no longer match the
# text used for the subtitles and slides.
text_to_speak = summary

# Request speech synthesis
response = polly_client.synthesize_speech(
    Text=text_to_speak,
    OutputFormat='mp3',
    VoiceId='Joanna'
)

# Save the audio to a file, and always release the HTTP stream afterwards
audio_stream = response['AudioStream']
try:
    with open("speech.mp3", "wb") as file:
        file.write(audio_stream.read())
finally:
    audio_stream.close()

# Play the audio in Colab (must stay the last expression so it is displayed)
ipd.Audio("speech.mp3")

In [None]:
from pydub import AudioSegment
import textwrap
import json

def format_time(milliseconds):
    """Format a duration given in milliseconds as ``HH:MM:SS,mmm``.

    Args:
        milliseconds: Duration in milliseconds (int or float).

    Returns:
        A zero-padded timestamp string such as ``"00:01:01,500"``.
    """
    # Round to whole milliseconds first. Truncating each component separately
    # turns float artefacts such as 1999.9999999 into "00:00:01,999" instead
    # of "00:00:02,000".
    total_ms = int(round(milliseconds))
    hours, remainder = divmod(total_ms, 3600000)
    minutes, remainder = divmod(remainder, 60000)
    seconds, millis = divmod(remainder, 1000)
    return f"{hours:02}:{minutes:02}:{seconds:02},{millis:03}"

def create_json_from_summary(mp3_path, json_path, summary):
    """Write evenly timed subtitle entries for ``summary`` to a JSON file.

    The summary is wrapped into lines of at most 50 characters and the
    duration of the MP3 is divided equally between those lines.

    Args:
        mp3_path: Path of the MP3 file whose length defines the total duration.
        json_path: Path of the JSON file to write.
        summary: Text to split into subtitle lines.

    Raises:
        ValueError: If ``summary`` is empty or contains only whitespace.
    """
    # Validate the text before decoding the audio: an empty summary yields no
    # lines and would otherwise fail later with a ZeroDivisionError.
    lines = textwrap.wrap(summary or "", width=50)
    if not lines:
        raise ValueError("summary is empty; there are no subtitle lines to write")

    # len() of a pydub AudioSegment is its duration in milliseconds
    audio = AudioSegment.from_mp3(mp3_path)
    duration_ms = len(audio)

    # Every subtitle gets the same share of the audio duration
    subtitle_duration = duration_ms / len(lines)

    json_content = [
        {
            "start_time": format_time(index * subtitle_duration),
            "end_time": format_time((index + 1) * subtitle_duration),
            "text": line,
        }
        for index, line in enumerate(lines)
    ]

    # Write the JSON file (UTF-8, non-ASCII characters kept readable)
    with open(json_path, "w", encoding="utf-8") as file:
        json.dump(json_content, file, ensure_ascii=False, indent=2)

# Build the subtitle JSON for the synthesized speech from the summary text
mp3_path, json_path = "speech.mp3", "output.json"

create_json_from_summary(mp3_path, json_path, summary)

In [11]:
from pydub import AudioSegment
import textwrap
import json
from moviepy.editor import *
from PIL import Image, ImageDraw, ImageFont
import numpy as np

def format_time(milliseconds):
    """Return ``milliseconds`` rendered as a ``HH:MM:SS,mmm`` timestamp.

    Args:
        milliseconds: Duration in milliseconds (int or float).

    Returns:
        A zero-padded timestamp string such as ``"01:01:01,001"``.
    """
    # Round once up front so float noise (e.g. 2999.9999999) is not truncated
    # to the previous millisecond when the components are split.
    total_ms = int(round(milliseconds))
    hours, remainder = divmod(total_ms, 3600000)
    minutes, remainder = divmod(remainder, 60000)
    seconds, millis = divmod(remainder, 1000)
    return f"{hours:02}:{minutes:02}:{seconds:02},{millis:03}"

def create_slide(text, size=(1280, 720), font_size=40):
    """Render ``text`` centered in black on a white slide.

    Args:
        text: Text to draw; it is re-wrapped at 40 characters per line.
        size: ``(width, height)`` of the slide in pixels.
        font_size: Requested size of the default font. It is honored when the
            installed Pillow can scale its default font; otherwise the small
            fixed-size default font is used.

    Returns:
        The slide as a NumPy array built from the RGB image.
    """
    img = Image.new('RGB', size, color='white')
    draw = ImageDraw.Draw(img)

    # Use the default font at the requested size. Older Pillow releases do
    # not accept ``size`` (TypeError), and the scalable default needs FreeType
    # support; fall back to the fixed-size default in both cases.
    try:
        font = ImageFont.load_default(size=font_size)
    except (TypeError, ImportError):
        font = ImageFont.load_default()

    wrapped_text = textwrap.fill(text, width=40)

    # Measure with the same alignment used for drawing, then center on the
    # real width/height of the box and compensate for its left/top offset.
    left, top, right, bottom = draw.multiline_textbbox(
        (0, 0), wrapped_text, font=font, align='center'
    )
    text_position = (
        (size[0] - (right - left)) / 2 - left,
        (size[1] - (bottom - top)) / 2 - top,
    )

    draw.multiline_text(text_position, wrapped_text, font=font, fill='black', align='center')
    return np.array(img)

def create_video_from_summary(mp3_path, output_path, summary):
    """Create a slideshow video of ``summary`` with the MP3 as its soundtrack.

    The summary is wrapped into lines of at most 50 characters. Each line
    becomes one slide, and the slides share the audio duration equally.

    Args:
        mp3_path: Path of the MP3 file used for timing and as the audio track.
        output_path: Path of the video file to write.
        summary: Text to show on the slides.

    Raises:
        ValueError: If ``summary`` is empty or contains only whitespace.
    """
    # Validate the text before touching any media: an empty summary yields no
    # lines and would otherwise fail later with a ZeroDivisionError.
    lines = textwrap.wrap(summary or "", width=50)
    if not lines:
        raise ValueError("summary is empty; there are no slides to render")

    # len() of a pydub AudioSegment is its duration in milliseconds
    audio = AudioSegment.from_mp3(mp3_path)
    duration_ms = len(audio)

    # Every slide gets the same share of the audio duration
    subtitle_duration = duration_ms / len(lines)
    duration = subtitle_duration / 1000  # Convert to seconds

    clips = []
    for i, line in enumerate(lines):
        start_time = i * subtitle_duration / 1000  # Convert to seconds
        clip = ImageClip(create_slide(line)).set_duration(duration)
        clips.append(clip.set_start(start_time))

    audio_clip = AudioFileClip(mp3_path)
    try:
        # Create the final video and attach the narration
        final_clip = CompositeVideoClip(clips, size=(1280, 720))
        final_clip = final_clip.set_audio(audio_clip)

        # Extend the duration of the final clip to match the audio
        final_clip = final_clip.set_duration(audio_clip.duration)

        # Write the video file
        final_clip.write_videofile(output_path, fps=24)
    finally:
        # Release the audio reader even if rendering fails
        audio_clip.close()

# Render the slideshow video for the synthesized speech from the summary text
mp3_path, output_path = "speech.mp3", "output_video.mp4"

create_video_from_summary(mp3_path, output_path, summary)

Moviepy - Building video output_video.mp4.
MoviePy - Writing audio in output_videoTEMP_MPY_wvf_snd.mp3




MoviePy - Done.
Moviepy - Writing video output_video.mp4





Moviepy - Done !
Moviepy - video ready output_video.mp4
