<a href="https://colab.research.google.com/github/OVolkova/everyday_life_googlecolab_notebooks/blob/main/Split_video_on_sections.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Install the third-party packages imported in the next code cell.
# NOTE(review): versions are not pinned, so a newer release of any of these
# packages may change or remove the APIs this notebook relies on.
!pip install youtube-transcript-api
!pip install openai
!pip install langchain

# Get sections of a YouTube video
You need the id of the YouTube video and an OpenAI API key to run this notebook.

In [4]:
import os

from getpass import getpass
from typing import List

import openai
import pandas as pd

from IPython.display import display, HTML

from langchain.chat_models import ChatOpenAI
from langchain.output_parsers.openai_functions import JsonOutputFunctionsParser
from langchain.prompts import ChatPromptTemplate
from langchain.utils.openai_functions import convert_pydantic_to_openai_function

from pydantic import BaseModel, Field

from youtube_transcript_api import YouTubeTranscriptApi

Set the OpenAI API key

In [5]:
# Never paste the key into the notebook source: reuse OPENAI_API_KEY when the
# environment already provides it, otherwise ask for it interactively without
# echoing it, so the secret does not end up in the saved notebook.
openai.api_key = os.environ.get('OPENAI_API_KEY') or getpass('OpenAI API key: ')
# Export the key as well so that code reading the environment variable sees it.
os.environ['OPENAI_API_KEY'] = openai.api_key

Define the pydantic classes that describe the output the model must return

In [6]:
class Section(BaseModel):
    """Section of video."""
    # NOTE(review): the class docstring and the Field descriptions below are
    # presumably forwarded to the model inside the function schema built by
    # convert_pydantic_to_openai_function, so treat them as prompt text.

    # Short title of the section.
    name: str = Field(description="Provide a short maximum 5 words name of the section")
    # One-sentence summary of what the section covers.
    summary: str = Field(description="Provide a concise  1 sentence summary of the content.")
    # Both times are parsed the same way downstream (':' replaced by '.', then
    # converted to float), so both descriptions request the same float format.
    start_time: str = Field(description="Provide the start time of section. Use float format with '.' as separator")
    end_time: str = Field(description="Provide the end time of section. Use float format with '.' as separator")

class Info(BaseModel):
    """information to extract"""
    # Top-level container the model is asked to fill in. The chain is bound with
    # function_call={"name": "Info"}, and the parsed output is read as
    # r['sections'] by get_video_plan_from_subtitles.
    # NOTE(review): the docstring above is presumably sent to the model as the
    # function description, so it is deliberately left unchanged.
    sections: List[Section]

Set prompt

In [7]:
# System instructions for the section extraction. They live in their own
# constant so the chat template below stays short; the text is sent as written.
_SYSTEM_MESSAGE = """
    It is a video transcript from the middle of lecture.
    Think carefully, and then extract list sections of the video as instructed.
    Rules to follow:
    1. There is no introdution in the lecture.
    2. Never name section Introduction
    3. Each Section should be 3-10 minutes.
    4. Start time of next section must be equal to the end time of previous.
    """

# Two-message chat template: the fixed system instructions followed by the
# transcript chunk, which is supplied at call time through the "input" variable.
prompt = ChatPromptTemplate.from_messages(
    [
        ("system", _SYSTEM_MESSAGE),
        ("user", "{input}"),
    ]
)

Build the chain that calls the model with the tool

In [37]:
# Chat model used for the extraction: 'gpt-4-1106-preview' with temperature=0.
model = ChatOpenAI(model='gpt-4-1106-preview', temperature=0)

# Function schema derived from the Info pydantic class (a one-element list).
overview_tagging_function = [convert_pydantic_to_openai_function(Info)]

# Attach the schema to the model and name "Info" as the function to call.
tagging_model = model.bind(
    function_call={"name": "Info"},
    functions=overview_tagging_function,
)

# Full chain: prompt -> model bound to the function -> parsed JSON arguments.
tagging_chain = prompt | tagging_model | JsonOutputFunctionsParser()

Helper functions that implement the logic

In [32]:
def get_youtube_subtitles(video_id: str)-> pd.DataFrame:
  """
  Download the transcript of a video and merge captions by rounded start time.

  Every caption start is divided by 60 (presumably seconds to minutes) and
  rounded to one decimal. Captions that share the same rounded start are joined
  with a single space into one row, in the order they first appear.

  Returns a frame with the columns 'text' and 'start' and a fresh 0..n-1 index.
  """
  captions = pd.DataFrame(YouTubeTranscriptApi.get_transcript(video_id))
  rounded_start = (captions['start'] / 60).round(1)

  merged = (
      captions
      .assign(start=rounded_start)
      # sort=False keeps the groups in order of first appearance
      .groupby('start', sort=False)['text']
      .agg(' '.join)
      .reset_index()
  )
  return merged[['text', 'start']]


def format_model_output(result: dict) -> pd.DataFrame:
  """
  Turn the section records produced by the model into a display-ready table.

  'start_time' and 'end_time' arrive as text and are converted to floats, with
  ':' treated like the decimal point. Two columns are appended: 'duration'
  (end minus start) and 'progress %' (end time as a whole-number percentage of
  the largest end time). Each summary is re-wrapped with split_on_lines.
  """
  def to_number(times: pd.Series) -> pd.Series:
    # '1:5' and '1.5' are read as the same value
    return times.str.replace(':', '.').astype(float)

  table = pd.DataFrame(result)
  end_time = to_number(table['end_time'])
  start_time = to_number(table['start_time'])

  # overwrite the text columns in place, then append the derived ones
  table = table.assign(end_time=end_time, start_time=start_time)
  table['duration'] = end_time - start_time
  table['progress %'] = (end_time / end_time.max() * 100).round(0).astype(int)

  table['summary'] = table['summary'].apply(split_on_lines)
  return table


def split_on_lines(text: str, total_in_line: int=50) -> str:
  """
  Insert line-break markers into a long summary for pretty print.

  The text is split on any whitespace, so a newline inside the summary
  separates words instead of gluing them together. Words are emitted one by
  one, each followed by a space. Before a word that would bring the number of
  word characters on the current line to total_in_line or more, the
  two-character marker backslash + 'n' is inserted (pretty_print turns that
  marker into <br>). A marker is never inserted at the start of a line, so a
  single over-long word does not produce a leading or empty line.

  Returns the re-wrapped text; an empty or whitespace-only input gives ''.
  """
  pieces = []
  line_length = 0  # word characters on the current line (spaces are not counted)
  for word in text.split():
    # only break when the current line already holds something
    if line_length > 0 and line_length + len(word) >= total_in_line:
      pieces.append('\\n')
      line_length = 0
    pieces.append(word + ' ')
    line_length += len(word)
  return ''.join(pieces)


def get_video_plan_from_subtitles(subtitles: pd.DataFrame, tagging_chain, chunk_size: int=13000) -> List[dict]:
  """
  Send subtitles to the llm-chain to extract sections.

  The subtitles are serialised to CSV and, because they can be long, sent in
  chunks of at most chunk_size characters. For every chunk except the last one
  the final section returned by the model is discarded (it may be cut off by
  the chunk boundary) and the next chunk starts at the subtitle row where that
  section begins. All sections of the last chunk are kept.

  Parameters:
    subtitles: frame with a 'start' column; rows are addressed by position.
      The fallback lookup assumes 'start' is ascending - TODO confirm.
    tagging_chain: object whose invoke({"input": text}) returns a mapping with
      a 'sections' list; each section has a 'start_time' string.
    chunk_size: maximum number of CSV characters sent per call.

  Returns the list of section dicts in the order they were produced.

  Raises:
    IndexError: a non-final chunk produced no sections, or the restart time
      lies after every subtitle row.
    RuntimeError: the restart row does not move forward, which would otherwise
      repeat the same request forever.
  """
  sections = []
  starts = subtitles['start']
  position = 0      # positional row of the first subtitle in the current chunk
  chunk_number = 0

  while True:
    print('processing chunk ', chunk_number)
    chunk_number += 1

    remaining_csv = subtitles.iloc[position:].to_csv(index=False)
    # Decide before calling the model: a transcript that fits in one chunk
    # needs exactly one request and keeps every section.
    is_last_chunk = len(remaining_csv) <= chunk_size
    found = tagging_chain.invoke({"input": remaining_csv[:chunk_size]})['sections']

    if is_last_chunk:
      sections.extend(found)
      break

    if not found:
      raise IndexError('the model returned no sections for a non-final chunk')

    restart_time = float(found[-1]['start_time'].replace(':', '.'))
    # Prefer the row whose start equals the reported time; if the model's time
    # matches no row exactly, fall back to the first row at or after it.
    candidates = starts == restart_time
    if not candidates.any():
      candidates = starts >= restart_time
    if not candidates.any():
      raise IndexError(
          'no subtitle row starts at or after the reported section start '
          + str(restart_time)
      )
    next_position = int(candidates.to_numpy().argmax())

    if next_position <= position:
      raise RuntimeError(
          'the model did not advance past the start of the current chunk; '
          'try a larger chunk_size'
      )

    sections.extend(found[:-1])
    position = next_position

  return sections


def pretty_print(df: pd.DataFrame):
  """
  Show a frame as an HTML table, with every literal backslash-n marker in the
  generated markup replaced by a <br> tag.

  Returns whatever display() returns.
  """
  markup = df.to_html()
  markup_with_breaks = markup.replace("\\n", "<br>")
  rendered = HTML(markup_with_breaks)
  return display(rendered)


def pipeline(video_id: str, tagging_chain, chunk_size: int=13000) -> pd.DataFrame:
  """
  Run the three steps end to end for one video: load the subtitles, extract
  the sections chunk by chunk with tagging_chain, and format the result.

  chunk_size is forwarded unchanged to get_video_plan_from_subtitles.
  Returns the value produced by format_model_output.
  """
  return format_model_output(
      get_video_plan_from_subtitles(
          get_youtube_subtitles(video_id),
          tagging_chain,
          chunk_size=chunk_size,
      )
  )

Split the video

In [38]:
# Run the whole pipeline for one video id. Every request sent to the chain
# prints a "processing chunk" line, so the output below shows the call count.
sections = pipeline(video_id = "plIJYzVKfdI", tagging_chain=tagging_chain)

processing chunk  0
processing chunk  1
processing chunk  2
processing chunk  3
processing chunk  4
processing chunk  5
processing chunk  6
processing chunk  7


In [39]:
# Render the formatted sections table as HTML (line-break markers in the
# summaries become <br>).
pretty_print(sections)

Unnamed: 0,name,summary,start_time,end_time,duration,progress %
0,Statistical Learning Basics,"The lecture begins with an introduction to the basic elements of statistical learning, including data, models, error metrics, and estimation algorithms.",0.7,2.9,2.2,4
1,Data and Supervised Learning,"The lecturer discusses data in the context of supervised learning, focusing on the input-output relationship and the assumption of a noiseless label generated by an unknown function.",2.9,5.4,2.5,7
2,Model and Complexity,"The concept of a model as a class of parameterized functions is introduced, along with the notion of complexity measures to organize hypotheses.",5.5,10.5,5.0,13
3,Error Metrics Overview,"The lecturer discusses error metrics, focusing on the difference between population loss (test error) and empirical loss (training error), and their relationship.",10.7,12.9,2.2,16
4,Understanding Loss Functions,"The lecturer explains the concept of loss functions, their deterministic and random nature, and the importance of understanding the fluctuations between the empirical and population loss.",13.0,15.9,2.9,20
5,Uniform Bounds and Variance,"The lecturer addresses the need for uniform bounds to control the fluctuation between empirical and population loss across all hypotheses, not just a fixed one.",16.0,18.2,2.2,23
6,Algorithmic Principles,"The lecturer introduces the principles of learning algorithms in supervised learning, focusing on empirical risk minimization and the importance of hypothesis complexity.",18.3,21.5,3.2,27
7,Constraint Optimization and Regularization,The lecture discusses the transformation of constraint optimization problems to unconstrained ones using Lagrange multipliers and introduces regularization as a method to add complexity into the loss function.,21.6,22.5,0.9,28
8,Interpolation and Overparameterization,"The concept of interpolation in machine learning is explained, particularly in the context of neural networks with many parameters, and how it relates to fitting all data points.",22.6,24.2,1.6,30
9,Analyzing ERM Algorithm Properties,The lecture begins to analyze the properties of the Empirical Risk Minimization (ERM) algorithm by considering an arbitrary hypothesis class and comparing hypotheses.,24.3,26.9,2.6,34
