In [2]:
from youtube_transcript_api import YouTubeTranscriptApi
import nltk
from nltk.tokenize import sent_tokenize
import fitz
import re

def write_sentences_to_file(sentences, filename="Extract_Content.txt"):
    """Write each sentence to ``filename``, one sentence per line.

    Args:
        sentences: Iterable of strings to write.
        filename: Path of the output file; an existing file is overwritten.

    Failures are reported on stdout instead of being raised, so a failed
    export does not abort the caller (best-effort behaviour).
    """
    try:
        # Explicit UTF-8: the platform default (e.g. cp1252 on Windows)
        # cannot encode many characters that appear in transcripts.
        with open(filename, "w", encoding="utf-8") as file:
            for sentence in sentences:
                file.write(sentence + "\n")
        print(f"Sentences successfully written to {filename}")
    except Exception as e:
        print(f"An error occurred while writing to the file: {e}")

def write_full_transcript_to_file(transcript_data, filename="Extract_Transcript.txt"):
    """Write a timestamped transcript to ``filename``.

    Each entry becomes one line of the form ``[start - end] text`` where
    ``end`` is ``start + duration`` and both are printed with two decimals.

    Args:
        transcript_data: Iterable of mappings with the keys ``'start'``,
            ``'duration'`` and ``'text'``.
        filename: Path of the output file; an existing file is overwritten.

    Failures are reported on stdout instead of being raised, so a failed
    export does not abort the caller (best-effort behaviour).
    """
    try:
        # Explicit UTF-8: the platform default (e.g. cp1252 on Windows)
        # cannot encode many characters that appear in transcripts.
        with open(filename, "w", encoding="utf-8") as file:
            for item in transcript_data:
                start_time = item['start']
                end_time = start_time + item['duration']
                # Keep every entry on a single output line.
                text = item['text'].replace("\n", " ")
                file.write(f"[{start_time:.2f} - {end_time:.2f}] {text}\n")
        print(f"Full transcript successfully written to {filename}")
    except Exception as e:
        print(f"An error occurred while writing to the file: {e}")

def _extract_video_id(video_url):
    """Return the YouTube video id contained in ``video_url``.

    Handles ``watch?v=ID`` (with any extra query parameters in any order),
    ``youtu.be/ID`` and ``/embed/ID``, ``/shorts/ID``, ``/live/ID``, ``/v/ID``.

    Raises:
        ValueError: If no video id can be found in the URL.
    """
    from urllib.parse import parse_qs, urlparse

    url = video_url.strip()
    if "://" not in url:
        # Without a scheme urlparse puts the host into the path.
        url = "https://" + url
    parsed = urlparse(url)

    query_ids = parse_qs(parsed.query).get("v")
    if query_ids:
        return query_ids[0]

    host = (parsed.hostname or "").lower()
    segments = [segment for segment in parsed.path.split("/") if segment]
    if host.endswith("youtu.be") and segments:
        return segments[0]
    if len(segments) >= 2 and segments[0] in {"embed", "shorts", "live", "v"}:
        return segments[1]

    raise ValueError(f"Could not find a YouTube video id in URL: {video_url!r}")


def get_youtube_data(video_url):
    """Fetch a video's English transcript and write it to two text files.

    The manually created English transcript is downloaded, its text is
    joined and split into sentences with NLTK, and the results are written
    via ``write_sentences_to_file`` (sentences only) and
    ``write_full_transcript_to_file`` (timestamped entries), each using its
    default output filename.

    Args:
        video_url: URL of the YouTube video.

    Raises:
        ValueError: If no video id can be extracted from ``video_url``.
        Errors raised by ``youtube_transcript_api`` (for example when no
        manually created English transcript exists) propagate unchanged.
    """
    # Validate the URL first so a bad URL fails before any download starts.
    video_id = _extract_video_id(video_url)

    # Tokenizer model needed by sent_tokenize; a no-op if already present.
    nltk.download('punkt')

    transcript_list = YouTubeTranscriptApi.list_transcripts(video_id)
    transcript = transcript_list.find_manually_created_transcript(['en'])
    transcript_data = transcript.fetch()

    # Caption entries break mid-sentence, so join them before tokenizing.
    combined_text = " ".join(item['text'].replace("\n", " ") for item in transcript_data)
    sentences = sent_tokenize(combined_text)

    write_sentences_to_file(sentences=sentences)
    write_full_transcript_to_file(transcript_data=transcript_data)

def get_pdf_data(pdf_path="Paper.pdf", output_file_path="Extracted_Content_Pdf.txt"):
    """Extract the text of a PDF and write it to a UTF-8 text file.

    For every page, runs of newlines are replaced by a single space and runs
    of two or more dots (e.g. table-of-contents leaders) are removed.
    Everything from the first standalone word ``References`` onwards is
    dropped; if that word does not occur, the full text is kept.

    Args:
        pdf_path: Path of the PDF to read.
        output_file_path: Path of the text file to write (overwritten).
    """
    pdf_document = fitz.open(pdf_path)
    try:
        page_texts = []
        for page_num in range(pdf_document.page_count):
            page_text = pdf_document[page_num].get_text()
            page_text = re.sub(r'\n+', ' ', page_text)
            page_text = re.sub(r'\.{2,}', '', page_text)
            page_texts.append(page_text)
    finally:
        # Release the file handle even if text extraction fails.
        pdf_document.close()
    text = "".join(page_texts)

    words = text.split(" ")
    try:
        references_index = words.index("References")
    except ValueError:
        # No "References" heading found: keep the whole text.
        pass
    else:
        text = " ".join(words[:references_index])

    with open(output_file_path, "w", encoding="utf-8") as output_file:
        output_file.write(text)

# Example usage (PDF):
# pdf_path = input("Enter Path : ")
# get_pdf_data(pdf_path=pdf_path)

# Example usage (YouTube). Guarded so that importing this module does not
# prompt for input or start a download; it still runs as a script or as a
# notebook cell, where __name__ is "__main__".
if __name__ == "__main__":
    # Strip stray whitespace that often comes along with a pasted URL.
    video_url = input("Enter URL : ").strip()
    get_youtube_data(video_url=video_url)

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\siddu\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Sentences successfully written to Extract_Content.txt
Full transcript successfully written to Extract_Transcript.txt
