<a href="https://colab.research.google.com/github/PiyushChall/Youtube_Summarizer/blob/main/Youtube_Summarizer.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **Semantic Chunking of a Youtube Video**

# Installing modules

Installing Faster-Whisper as it allows:
1. GPU acceleration
2. Ease of use
3. Directly calculates chunks of the video/audio
4. High accuracy and precision
5. Supports various languages

In [1]:
!pip install faster-whisper

Collecting faster-whisper
  Downloading faster_whisper-1.0.2-py3-none-any.whl (1.5 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.5/1.5 MB[0m [31m12.7 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting av<13,>=11.0 (from faster-whisper)
  Downloading av-12.0.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (33.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m33.8/33.8 MB[0m [31m44.3 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting ctranslate2<5,>=4.0 (from faster-whisper)
  Downloading ctranslate2-4.3.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (192.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m192.3/192.3 MB[0m [31m6.8 MB/s[0m eta [36m0:00:00[0m
Collecting onnxruntime<2,>=1.14 (from faster-whisper)
  Downloading onnxruntime-1.18.0-cp310-cp310-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl (6.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.8/6.8 MB[0m [31m77.1 

Installing Pytube, as it allows downloading high-resolution YouTube videos directly from Python code

In [2]:
!pip install pytube

Collecting pytube
  Downloading pytube-15.0.0-py3-none-any.whl (57 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/57.6 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m57.6/57.6 kB[0m [31m2.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pytube
Successfully installed pytube-15.0.0


Using Gemini AI for summarization

In [3]:
!pip install -q -U google-generativeai

# Importing modules

In [4]:
from pytube import YouTube

In [5]:
from faster_whisper import WhisperModel

In [6]:
from moviepy.editor import VideoFileClip

In [7]:
import torch

In [8]:
from os import stat

In [9]:
import google.generativeai as genai

In [10]:
from tqdm import tqdm

In [11]:
# Used to securely store your API key
from google.colab import userdata

In [12]:
# Read the Gemini API key from the Colab secret named "GOOGLE_API_KEY"
# and hand it to the google-generativeai client.
GOOGLE_API_KEY = userdata.get("GOOGLE_API_KEY")
genai.configure(api_key=GOOGLE_API_KEY)

# Selecting Device

This block of code directly identifies the connected device and assigns it to a variable for later use

In [13]:
# The device string is passed straight to WhisperModel (faster-whisper / CTranslate2),
# which only accepts "cpu", "cuda" or "auto". Apple "mps" is therefore not offered
# here: selecting it would make model loading fail, so non-CUDA machines use the CPU.
processing_device = "cuda" if torch.cuda.is_available() else "cpu"

This block of code automatically assigns compute_type according to the connected device

In [14]:
# Full precision on the CPU, half precision on any other device.
computing_type = "float32" if processing_device == "cpu" else "float16"

# Getting Video-Link from the user

In [15]:
def user_input():
  """Ask the user for the video URL and the desired download format.

  Returns:
      A ``(link, format)`` tuple. An empty answer to the format prompt
      is replaced by ``"highest"``.
  """
  link = input("Enter the YouTube link: ")
  chosen_format = input("Enter download format (highest or progressive) [highest]: ")
  if not chosen_format:
    # Nothing typed: fall back to the default advertised in the prompt.
    chosen_format = "highest"
  return link, chosen_format

# Video Downloading

Function for downloading youtube video through our python code

In [16]:
def download_video(link, resolution="highest"):
    """Download a YouTube video with pytube.

    Args:
        link: URL of the YouTube video.
        resolution: ``"highest"`` selects the stream returned by
            ``get_highest_resolution()``; any other value selects the first
            progressive stream after ordering by resolution, descending.

    Returns:
        Whatever the selected stream's ``download()`` returns (used by the
        caller as the path of the saved video), or ``None`` if anything
        went wrong. Errors are reported on stdout rather than raised.
    """
    try:
        youtube_object = YouTube(link)
        if resolution == "highest":
            video = youtube_object.streams.get_highest_resolution()
        else:
            # Filter progressive downloads by descending resolution
            video = (
                youtube_object.streams
                .filter(progressive=True)
                .order_by('resolution')
                .desc()
                .first()
            )

        if video is None:
            # Report a readable reason instead of an AttributeError on None.
            print("Something went wrong: no matching stream found for this video")
            return None

        file_path = video.download()
        # Report success before returning (this line used to sit after the
        # return statement and never ran).
        print(f"Downloaded: {youtube_object.title}")
        return file_path
    except Exception as e:
        print(f"Something went wrong: {e}")
        return None

# Audio Extraction

Function for extracting the audio from our downloaded video

In [17]:
def video2audio(v_path, audio_path="extracted_audio.wav"):
  """Extract the audio track of a video file and save it to disk.

  Args:
      v_path: Path of the video file to read.
      audio_path: Where to write the audio; defaults to
          ``"extracted_audio.wav"`` in the current working directory.

  Returns:
      ``audio_path``, i.e. the path the audio was written to.

  Raises:
      ValueError: If the video has no audio track.
  """
  video_clip = VideoFileClip(v_path)
  try:
    audio_clip = video_clip.audio
    if audio_clip is None:
      raise ValueError(f"No audio track found in {v_path!r}")

    # Save the extracted audio
    audio_clip.write_audiofile(audio_path)
  finally:
    # Always release the clip's underlying readers, even when extraction fails.
    video_clip.close()
  return audio_path

# Model selection

**Use the matching line in the code below**, depending on whether you have a GPU (faster processing) or only a CPU (slower processing).

(**FASTER**) [Suggested] For **GPU**:  
model = WhisperModel(model_size, device="cuda", compute_type="float16")

(**SLOWER**) For **CPU**:  
model = WhisperModel(model_size, device="cpu", compute_type="float32")


In [18]:
def model_select(processing_device, computing_type):
  """Create the transcription model and the summarization model.

  Args:
      processing_device: Device string forwarded to ``WhisperModel``.
      computing_type: Compute type string forwarded to ``WhisperModel``.

  Returns:
      A ``(whisper_model, gemini_model)`` tuple: a ``"large-v3"``
      ``WhisperModel`` and a ``'gemini-pro'`` ``GenerativeModel``.
  """
  whisper = WhisperModel(
      "large-v3",
      device=processing_device,
      compute_type=computing_type,
  )
  return whisper, genai.GenerativeModel('gemini-pro')

# Transcribing Audio

Lists to hold all the information generated from the transcription of the audio

In [19]:
# Per-segment transcription results, filled by transcribe(); position i in
# each of the four lists refers to the same segment.
start_time, end_time, chunk_length, text = [], [], [], []

Function for extracting text from our extracted audio file

In [20]:
def transcribe(a_input, model):
  """Transcribe an audio file and record every segment in the module-level lists.

  For each segment yielded by ``model.transcribe`` its start, end, duration
  (end - start) and text are stored in ``start_time``, ``end_time``,
  ``chunk_length`` and ``text``. The lists are emptied first, so calling
  this function again does not duplicate earlier results.

  Args:
      a_input: Audio input handed to ``model.transcribe``.
      model: Object whose ``transcribe(a_input, beam_size=...)`` returns
          ``(segments, info)``.

  Returns:
      The whole transcript as one string, with a line break inserted after
      sentence-ending punctuation.
  """
  # Start from a clean slate: these lists live at module level and would
  # otherwise keep growing across repeated runs in the same kernel.
  for bucket in (start_time, end_time, chunk_length, text):
    bucket.clear()

  segments, info = model.transcribe(a_input, beam_size=10)
  print("Detected language '%s' with probability %f" % (info.language, info.language_probability))
  for segment in segments:
    start_time.append(segment.start)
    end_time.append(segment.end)
    chunk_length.append(segment.end - segment.start)
    text.append(segment.text)

  # Build the paragraph once, after all segments are collected, instead of
  # re-joining the full list on every iteration.
  paragraph = " ".join(text)

  # Add proper paragraph breaks
  paragraph = paragraph.replace(". ", ".\n")
  paragraph = paragraph.replace("! ", "! \n")
  paragraph = paragraph.replace("? ", "? \n")
  return paragraph

In [21]:
def summarize_and_extract_key_info(gemini_model, paragraph, max_tokens=100):
  """
  Summarizes transcript text using the Gemini API.

  Args:
      gemini_model: Object exposing ``generate_content(prompt)`` whose
          result has a ``text`` attribute.
      paragraph: Text to summarize. Either a string, or a list/tuple of
          text pieces, which are stripped and joined with single spaces.
      max_tokens: Accepted for backward compatibility only; it is not
          forwarded to the API and does not limit the summary length.

  Returns:
      The summary text with leading/trailing whitespace removed.
  """
  # Callers may pass the raw list of segment texts; join it so the prompt
  # contains plain prose rather than a Python list representation.
  if isinstance(paragraph, (list, tuple)):
    paragraph = " ".join(str(piece).strip() for piece in paragraph)

  # Prompt for conversation summarization
  prompt = f"Summarize the following paragraph, Try to make it easy to understand:\n{paragraph}"

  # Send request to Gemini API
  response = gemini_model.generate_content(prompt)

  # Return the conversation summary
  return response.text.strip()

# Printing all values

This function prints all the data in a readable format

In [22]:
def output(text, chunk_length, end_time, start_time, gemini_model):
  """Print every transcribed chunk, then a summary of the whole transcript.

  Args:
      text: List of chunk texts.
      chunk_length: List of chunk durations, in seconds.
      end_time: List of chunk end times, in seconds.
      start_time: List of chunk start times, in seconds.
      gemini_model: Model passed on to ``summarize_and_extract_key_info``.
  """
  for a, b, c, d in zip(text, chunk_length, end_time, start_time):
    print(f"Chunk Start Time: {d}seconds, Chunk End Time: {c}seconds, Chunk Length: {b}seconds,")
    print(f"Chunk text: \033[1m{a}\033[0m") # "\033[1m{a}\033[0m" will make the text bold in the terminal and easier to read

  # Summarize the joined transcript, not the list object itself (which
  # would be rendered into the prompt as a Python list representation).
  full_transcript = " ".join(piece.strip() for piece in text)
  if not full_transcript:
    # Nothing was transcribed, so there is nothing to send to the API.
    print("No transcript text to summarize.")
    return
  conversation_summary = summarize_and_extract_key_info(gemini_model, full_transcript)
  print(f"Summary: \033[1m{conversation_summary}\033[0m")

# Main()

In [23]:
def main():
  """Run the whole pipeline: prompt, download, extract audio, transcribe, print.

  Stops right after reporting the problem when the download fails, since
  there is no video to extract audio from.
  """
  user_link, user_format = user_input()
  video_path = download_video(user_link, user_format.lower())
  if video_path is None:
    print("Something went wrong!")
    # Without a downloaded file the remaining steps cannot run.
    return

  audio_input = video2audio(video_path)
  model, gemini_model = model_select(processing_device, computing_type)
  transcribe(audio_input, model)
  output(text, chunk_length, end_time, start_time, gemini_model)

In [None]:

# Entry point: run the full pipeline when this code is executed as the main module.
if __name__ == "__main__":
  main()