In [1]:
import configparser, os, re
config = configparser.ConfigParser()
config.read('./keys.ini')
os.environ['GOOGLE_API_KEY'] = config['GOOGLE']['GOOGLE_API_KEY']
os.environ['GOOGLE_CSE_ID'] = config['GOOGLE']['GOOGLE_CSE_ID']
openai_api_key = config['OPENAI']['OPENAI_API_KEY']
os.environ['OPENAI_API_KEY'] = openai_api_key

In [18]:
#use one 1 chain to summary the story dialogue from main page content of the URL
def story_summary_stuff(docs, prompt_template = ""):
    #input: docs of the web page
    from langchain.chains.llm import LLMChain
    from langchain.prompts import PromptTemplate
    from langchain.chains.combine_documents.stuff import StuffDocumentsChain
    # Define prompt
    if prompt_template == "":
        prompt_template = """The input below is the SRT subtitle of a video. Who is the main character of this video? 
        List the major sections of the video with time stamp. Format the time stamp to seconds.
        "{text}"
        Output:"""
    prompt = PromptTemplate.from_template(prompt_template)

    # Define LLM chain
    #llm = ChatOpenAI(temperature=0, model_name="gpt-3.5-turbo-16k")
    llm_chain = LLMChain(llm=llm, prompt=prompt)

    # Define StuffDocumentsChain
    stuff_chain = StuffDocumentsChain(
        llm_chain=llm_chain, document_variable_name="text"
    )
    
    return stuff_chain.run(docs)

#youtube loader and return docs for the transcript with time stamps
def loader(link:str, language=["zh"], db_loc = "./cache/YTDBT", overwrite = False):
    #load video info from link
    from langchain.document_loaders import YoutubeLoader
    try:
        loader = YoutubeLoader.from_youtube_url(
            link, add_video_info=True, language=language
        )
        docs = loader.load()
    except Exception as e:
        return(None);
        
    #check whether the vid is already in the DB
    this_id = docs[0].metadata['source']
    from langchain.vectorstores import Chroma
    from langchain.embeddings import OpenAIEmbeddings
    db = Chroma(persist_directory=db_loc, embedding_function=OpenAIEmbeddings())
    tmp = db.get()['ids']
    this_db_list = [x.split("_")[0] for x in tmp]
    this_db_set = set(this_db_list)
    if this_id in this_db_set:
        if overwrite == False:
            return(link+" link already in the db, skip");    
            
    #format the transcript into SRT
    from youtube_transcript_api import YouTubeTranscriptApi
    from youtube_transcript_api.formatters import SRTFormatter
    transcript = YouTubeTranscriptApi.get_transcript(docs[0].metadata['source'],languages=language)
    formatter = SRTFormatter()
    srt_formatted = formatter.format_transcript(transcript)
    with open("./cache/Output.srt", "w") as text_file:
        text_file.write(srt_formatted)
    
    #summarize the SRT with time stamps
    from langchain.chat_models import ChatOpenAI
    llm = ChatOpenAI(temperature=0, model_name="gpt-4-1106-preview")
    from langchain.docstore.document import Document
    output_srt = Document(page_content="Title: "+docs[0].metadata['title']+" \nSRT: "+srt_formatted, metadata=docs[0].metadata);
    res = story_summary_stuff([output_srt])
    
    #write the summary into DB
    from langchain.embeddings import OpenAIEmbeddings
    from langchain.docstore.document import Document
    output_doc = Document(page_content=res, metadata=docs[0].metadata);
    db.add_documents([output_doc], ids = [docs[0].metadata['source']])
    return(output_doc)


In [21]:
from langchain.chat_models import ChatOpenAI
llm = ChatOpenAI(temperature=0, model_name="gpt-4-1106-preview")
res = loader("https://www.youtube.com/watch?v=WUHuxY5NPx4",["en"])
print(res)

KeyboardInterrupt: 

In [15]:
from pytube import Playlist
p = Playlist('https://www.youtube.com/watch?v=whqlwGpMGsA&list=PLn2_heBLYy7-92SSW4nCc3i71-J85Dp5T&ab_channel=TeyvatHistoria')
for url in p.video_urls:
    res = loader(url,["en"])
    print(res.metadata['title'])

Genshin Impact lore - Fischl - Who is the Prinzessin Der Verurteilung?
Genshin Impact - The Origins of Diluc
Genshin Impact lore - The Story of Storm Terror


NoTranscriptFound: 
Could not retrieve a transcript for the video https://www.youtube.com/watch?v=VdoqtjQJcZ0! This is most likely caused by:

No transcripts were found for any of the requested language codes: ['en']

For this video (VdoqtjQJcZ0) transcripts are available in the following languages:

(MANUALLY CREATED)
None

(GENERATED)
 - vi ("Vietnamese (auto-generated)")[TRANSLATABLE]

(TRANSLATION LANGUAGES)
 - af ("Afrikaans")
 - ak ("Akan")
 - sq ("Albanian")
 - am ("Amharic")
 - ar ("Arabic")
 - hy ("Armenian")
 - as ("Assamese")
 - ay ("Aymara")
 - az ("Azerbaijani")
 - bn ("Bangla")
 - eu ("Basque")
 - be ("Belarusian")
 - bho ("Bhojpuri")
 - bs ("Bosnian")
 - bg ("Bulgarian")
 - my ("Burmese")
 - ca ("Catalan")
 - ceb ("Cebuano")
 - zh-Hans ("Chinese (Simplified)")
 - zh-Hant ("Chinese (Traditional)")
 - co ("Corsican")
 - hr ("Croatian")
 - cs ("Czech")
 - da ("Danish")
 - dv ("Divehi")
 - nl ("Dutch")
 - en ("English")
 - eo ("Esperanto")
 - et ("Estonian")
 - ee ("Ewe")
 - fil ("Filipino")
 - fi ("Finnish")
 - fr ("French")
 - gl ("Galician")
 - lg ("Ganda")
 - ka ("Georgian")
 - de ("German")
 - el ("Greek")
 - gn ("Guarani")
 - gu ("Gujarati")
 - ht ("Haitian Creole")
 - ha ("Hausa")
 - haw ("Hawaiian")
 - iw ("Hebrew")
 - hi ("Hindi")
 - hmn ("Hmong")
 - hu ("Hungarian")
 - is ("Icelandic")
 - ig ("Igbo")
 - id ("Indonesian")
 - ga ("Irish")
 - it ("Italian")
 - ja ("Japanese")
 - jv ("Javanese")
 - kn ("Kannada")
 - kk ("Kazakh")
 - km ("Khmer")
 - rw ("Kinyarwanda")
 - ko ("Korean")
 - kri ("Krio")
 - ku ("Kurdish")
 - ky ("Kyrgyz")
 - lo ("Lao")
 - la ("Latin")
 - lv ("Latvian")
 - ln ("Lingala")
 - lt ("Lithuanian")
 - lb ("Luxembourgish")
 - mk ("Macedonian")
 - mg ("Malagasy")
 - ms ("Malay")
 - ml ("Malayalam")
 - mt ("Maltese")
 - mi ("Māori")
 - mr ("Marathi")
 - mn ("Mongolian")
 - ne ("Nepali")
 - nso ("Northern Sotho")
 - no ("Norwegian")
 - ny ("Nyanja")
 - or ("Odia")
 - om ("Oromo")
 - ps ("Pashto")
 - fa ("Persian")
 - pl ("Polish")
 - pt ("Portuguese")
 - pa ("Punjabi")
 - qu ("Quechua")
 - ro ("Romanian")
 - ru ("Russian")
 - sm ("Samoan")
 - sa ("Sanskrit")
 - gd ("Scottish Gaelic")
 - sr ("Serbian")
 - sn ("Shona")
 - sd ("Sindhi")
 - si ("Sinhala")
 - sk ("Slovak")
 - sl ("Slovenian")
 - so ("Somali")
 - st ("Southern Sotho")
 - es ("Spanish")
 - su ("Sundanese")
 - sw ("Swahili")
 - sv ("Swedish")
 - tg ("Tajik")
 - ta ("Tamil")
 - tt ("Tatar")
 - te ("Telugu")
 - th ("Thai")
 - ti ("Tigrinya")
 - ts ("Tsonga")
 - tr ("Turkish")
 - tk ("Turkmen")
 - uk ("Ukrainian")
 - ur ("Urdu")
 - ug ("Uyghur")
 - uz ("Uzbek")
 - vi ("Vietnamese")
 - cy ("Welsh")
 - fy ("Western Frisian")
 - xh ("Xhosa")
 - yi ("Yiddish")
 - yo ("Yoruba")
 - zu ("Zulu")

If you are sure that the described cause is not responsible for this error and that a transcript should be retrievable, please create an issue at https://github.com/jdepoix/youtube-transcript-api/issues. Please add which version of youtube_transcript_api you are using and provide the information needed to replicate the error. Also make sure that there are no open issues which already describe your problem!