In [1]:
pip install langchain openai youtube-transcript-api faiss-cpu tiktoken

Collecting youtube-transcript-api
  Downloading youtube_transcript_api-1.0.3-py3-none-any.whl.metadata (23 kB)
Collecting faiss-cpu
  Downloading faiss_cpu-1.10.0-cp311-cp311-manylinux_2_28_x86_64.whl.metadata (4.4 kB)
Collecting tiktoken
  Downloading tiktoken-0.9.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.7 kB)
Downloading youtube_transcript_api-1.0.3-py3-none-any.whl (2.2 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.2/2.2 MB[0m [31m42.8 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading faiss_cpu-1.10.0-cp311-cp311-manylinux_2_28_x86_64.whl (30.7 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m30.7/30.7 MB[0m [31m20.1 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading tiktoken-0.9.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.2 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.2/1.2 MB[0m [31m19.4 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: faiss-cpu, you

In [3]:
pip install langchain openai youtube-transcript-api faiss-cpu tiktoken langchain-community


Collecting langchain-community
  Downloading langchain_community-0.3.21-py3-none-any.whl.metadata (2.4 kB)
Collecting dataclasses-json<0.7,>=0.5.7 (from langchain-community)
  Downloading dataclasses_json-0.6.7-py3-none-any.whl.metadata (25 kB)
Collecting pydantic-settings<3.0.0,>=2.4.0 (from langchain-community)
  Downloading pydantic_settings-2.8.1-py3-none-any.whl.metadata (3.5 kB)
Collecting httpx-sse<1.0.0,>=0.4.0 (from langchain-community)
  Downloading httpx_sse-0.4.0-py3-none-any.whl.metadata (9.0 kB)
Collecting marshmallow<4.0.0,>=3.18.0 (from dataclasses-json<0.7,>=0.5.7->langchain-community)
  Downloading marshmallow-3.26.1-py3-none-any.whl.metadata (7.3 kB)
Collecting typing-inspect<1,>=0.4.0 (from dataclasses-json<0.7,>=0.5.7->langchain-community)
  Downloading typing_inspect-0.9.0-py3-none-any.whl.metadata (1.5 kB)
Collecting python-dotenv>=0.21.0 (from pydantic-settings<3.0.0,>=2.4.0->langchain-community)
  Downloading python_dotenv-1.1.0-py3-none-any.whl.metadata (24 kB

In [10]:
from youtube_transcript_api import YouTubeTranscriptApi, TranscriptsDisabled, NoTranscriptFound
from langchain_community.embeddings.openai import OpenAIEmbeddings
from langchain_community.vectorstores import FAISS
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.chat_models import ChatOpenAI
from langchain.chains import RetrievalQA
from langchain.docstore.document import Document
import openai

openai.api_key = "AIzaSyDdo3MTE27I-jbZ_jMQyPLkXqZ7X2NkBJc"

def extract_transcript(video_url, lang="ta"):
    try:
        video_id = video_url.split("v=")[-1].split("&")[0]
        # Fetch transcript in the required language (Tamil or English, based on availability)
        transcript_obj = YouTubeTranscriptApi.list_transcripts(video_id).find_transcript([lang])

        # If found, fetch the transcript content
        transcript = transcript_obj.fetch()

        # Extract the text from the transcript object (the correct way to do it)
        text = " ".join([t['text'] for t in transcript])
        return text

    except NoTranscriptFound:
        print(f"❌ No transcript found for the video in {lang}.")
    except TranscriptsDisabled:
        print("❌ Transcripts are disabled for this video.")
    except Exception as e:
        print(f"⚠️ Error: {e}")
    return ""

def chunk_transcript(text):
    splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
    docs = splitter.create_documents([text])
    return docs

def create_vectorstore(docs):
    embeddings = OpenAIEmbeddings()
    vectorstore = FAISS.from_documents(docs, embeddings)
    return vectorstore

def build_retriever_qa(vectorstore):
    llm = ChatOpenAI(model_name="gpt-3.5-turbo", temperature=0.3)
    qa_chain = RetrievalQA.from_chain_type(
        llm=llm,
        retriever=vectorstore.as_retriever(),
        chain_type="stuff"
    )
    return qa_chain

def summarize_youtube_video(video_url, transcript_lang="ta"):
    print("Extracting transcript...")
    transcript_text = extract_transcript(video_url, lang=transcript_lang)

    if not transcript_text:
        return "Transcript not available."

    print("Chunking and embedding...")
    docs = chunk_transcript(transcript_text)
    vectorstore = create_vectorstore(docs)

    print("Building QA chain...")
    qa = build_retriever_qa(vectorstore)

    print("Generating summary...")
    query = "Summarize this video content in English."
    summary = qa.run(query)
    return summary

if __name__ == "__main__":
    url = input("Enter YouTube video URL: ")
    summary = summarize_youtube_video(url, transcript_lang="en")  # Change 'ta' if required
    print("\n🎯 Summary:\n", summary)


Enter YouTube video URL: https://www.youtube.com/watch?v=KrwEdjviujI
Extracting transcript...
⚠️ Error: 'FetchedTranscriptSnippet' object is not subscriptable

🎯 Summary:
 Transcript not available.


In [17]:
from youtube_transcript_api import YouTubeTranscriptApi, TranscriptsDisabled, NoTranscriptFound
from langchain_community.embeddings.openai import OpenAIEmbeddings
from langchain_community.vectorstores import FAISS
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.chat_models import ChatOpenAI
from langchain.chains import RetrievalQA
import openai

openai.api_key = "AIzaSyDdo3MTE27I-jbZ_jMQyPLkXqZ7X2NkBJc"

def extract_transcript(video_url, lang="ta"):
    try:
        video_id = video_url.split("v=")[-1].split("&")[0]
        # Fetch transcript in the required language (Tamil or English, based on availability)
        transcript_obj = YouTubeTranscriptApi.list_transcripts(video_id).find_transcript([lang])

        # If found, fetch the transcript content
        transcript = transcript_obj.fetch()

        # Correct handling: Accessing the 'text' attribute from the FetchedTranscriptSnippet
        text = " ".join([t.text for t in transcript])  # Accessing text properly
        return text

    except NoTranscriptFound:
        print(f"❌ No transcript found for the video in {lang}.")
    except TranscriptsDisabled:
        print("❌ Transcripts are disabled for this video.")
    except Exception as e:
        print(f"⚠️ Error: {e}")
    return ""

def chunk_transcript(text):
    splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
    docs = splitter.create_documents([text])
    return docs

def create_vectorstore(docs):
    embeddings = OpenAIEmbeddings()
    vectorstore = FAISS.from_documents(docs, embeddings)
    return vectorstore

def build_retriever_qa(vectorstore):
    llm = ChatOpenAI(model_name="gpt-3.5-turbo", temperature=0.3)
    qa_chain = RetrievalQA.from_chain_type(
        llm=llm,
        retriever=vectorstore.as_retriever(),
        chain_type="stuff"
    )
    return qa_chain

def summarize_youtube_video(video_url, transcript_lang="ta"):
    print("Extracting transcript...")
    transcript_text = extract_transcript(video_url, lang=transcript_lang)

    if not transcript_text:
        return "Transcript not available."

    print("Chunking and embedding...")
    docs = chunk_transcript(transcript_text)
    vectorstore = create_vectorstore(docs)

    print("Building QA chain...")
    qa = build_retriever_qa(vectorstore)

    print("Generating summary...")
    query = "Summarize this video content in English."
    summary = qa.run(query)
    return summary

if __name__ == "__main__":
    url = input("Enter YouTube video URL: ")
    summary = summarize_youtube_video(url, transcript_lang="ta")  # Change 'ta' if required
    print("\n🎯 Summary:\n", summary)


Enter YouTube video URL: https://www.youtube.com/watch?v=KrwEdjviujI
Extracting transcript...
❌ No transcript found for the video in ta.

🎯 Summary:
 Transcript not available.


In [18]:
pip install openai youtube-transcript-api transformers requests




In [23]:
import openai
from youtube_transcript_api import YouTubeTranscriptApi
from youtube_transcript_api.formatters import TextFormatter
from transformers import RagTokenizer, RagRetriever, RagSequenceForGeneration
import requests

# Set up OpenAI API key
openai.api_key = 'AIzaSyDdo3MTE27I-jbZ_jMQyPLkXqZ7X2NkBJc'

# Function to fetch the transcript of the YouTube video
def get_video_transcript(video_id):
    try:
        transcript = YouTubeTranscriptApi.get_transcript(video_id)
        formatter = TextFormatter()
        transcript_text = formatter.format_transcript(transcript)
        return transcript_text
    except Exception as e:
        print(f"Error fetching transcript: {e}")
        return None

# Function to generate summary using GPT (RAG-based)
def generate_summary_rag(video_id):
    transcript_text = get_video_transcript(video_id)

    if not transcript_text:
        return "Sorry, we couldn't retrieve the transcript."

    # Initialize RAG tokenizer and retriever
    tokenizer = RagTokenizer.from_pretrained("facebook/rag-token-nq")
    retriever = RagRetriever.from_pretrained("facebook/rag-token-nq")

    # Tokenize and prepare the inputs for RAG
    inputs = tokenizer(transcript_text, return_tensors="pt", truncation=True, padding=True, max_length=1024)

    # Generate a summary using the model
    model = RagSequenceForGeneration.from_pretrained("facebook/rag-token-nq")
    generated_ids = model.generate(input_ids=inputs['input_ids'], attention_mask=inputs['attention_mask'], num_beams=4, max_length=200)

    # Decode the generated summary
    summary = tokenizer.decode(generated_ids[0], skip_special_tokens=True)
    return summary

# Example Usage
video_id = 'v=Ad_TEk94B9Q'  # Replace with the YouTube video ID
summary = generate_summary_rag(video_id)
print("Summary:\n", summary)


Error fetching transcript: 
Could not retrieve a transcript for the video https://www.youtube.com/watch?v=v=Ad_TEk94B9Q! This is most likely caused by:

The video is no longer available

If you are sure that the described cause is not responsible for this error and that a transcript should be retrievable, please create an issue at https://github.com/jdepoix/youtube-transcript-api/issues. Please add which version of youtube_transcript_api you are using and provide the information needed to replicate the error. Also make sure that there are no open issues which already describe your problem!
Summary:
 Sorry, we couldn't retrieve the transcript.


In [24]:
pip install openai requests




In [1]:
pip install biopython


Collecting biopython
  Downloading biopython-1.85-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (13 kB)
Downloading biopython-1.85-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.3 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.3/3.3 MB[0m [31m18.9 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: biopython
Successfully installed biopython-1.85


In [3]:
import random
from Bio import Entrez

# Function to fetch PubMed articles
def fetch_pubmed_articles(query, max_records=5):
    Entrez.email = "ssiva281205@gmail.com"  # Provide your email address
    handle = Entrez.esearch(db="pubmed", term=query, retmax=max_records)
    record = Entrez.read(handle)
    id_list = record["IdList"]

    # Fetch the details for each article
    handle = Entrez.esummary(db="pubmed", id=",".join(id_list))
    summary = Entrez.read(handle)

    return summary

# Function to generate MCQs from article summaries
def generate_mcqs_from_articles(articles):
    mcqs = []

    for article in articles:
        # Debug print to check available keys in the article dictionary
        print(f"Available keys in article: {article.keys()}")

        title = article.get("Title", "No Title")
        source = article.get("Source", "No Source")
        pub_date = article.get("PubDate", "No Date")
        pmid = article.get("Id", "No PubMedId")
        summary = article.get("SortDate", "No Summary")

        # Extract question and multiple choices
        question = f"What is the primary focus of the article titled '{title}'?"

        # Simulate answers (can be improved with NLP for real answer extraction)
        correct_answer = f"The article discusses {summary}"
        wrong_answers = [
            "The article focuses on a different medical field.",
            "The article is a case study on health economics.",
            "The article reviews recent technological advances in medicine."
        ]

        # Shuffle answers
        answers = [correct_answer] + wrong_answers
        random.shuffle(answers)

        # Create MCQ object
        mcq = {
            "question": question,
            "options": answers,
            "correct_answer": correct_answer
        }
        mcqs.append(mcq)

    return mcqs

# Example function to display MCQs
def display_mcqs(mcqs):
    for idx, mcq in enumerate(mcqs, 1):
        print(f"Q{idx}: {mcq['question']}")
        for i, option in enumerate(mcq['options'], 1):
            print(f"{i}. {option}")
        print(f"Correct Answer: {mcq['correct_answer']}")
        print()

# Main execution
def main():
    query = "cardiology"  # Change to any medical topic of interest
    articles = fetch_pubmed_articles(query)
    mcqs = generate_mcqs_from_articles(articles)
    display_mcqs(mcqs)

if __name__ == "__main__":
    main()


Available keys in article: dict_keys(['Item', 'Id', 'PubDate', 'EPubDate', 'Source', 'AuthorList', 'LastAuthor', 'Title', 'Volume', 'Issue', 'Pages', 'LangList', 'NlmUniqueID', 'ISSN', 'ESSN', 'PubTypeList', 'RecordStatus', 'PubStatus', 'ArticleIds', 'DOI', 'History', 'References', 'HasAbstract', 'PmcRefCount', 'FullJournalName', 'ELocationID', 'SO'])
Available keys in article: dict_keys(['Item', 'Id', 'PubDate', 'EPubDate', 'Source', 'AuthorList', 'LastAuthor', 'Title', 'Volume', 'Issue', 'Pages', 'LangList', 'NlmUniqueID', 'ISSN', 'ESSN', 'PubTypeList', 'RecordStatus', 'PubStatus', 'ArticleIds', 'DOI', 'History', 'References', 'HasAbstract', 'PmcRefCount', 'FullJournalName', 'ELocationID', 'SO'])
Available keys in article: dict_keys(['Item', 'Id', 'PubDate', 'EPubDate', 'Source', 'AuthorList', 'LastAuthor', 'Title', 'Volume', 'Issue', 'Pages', 'LangList', 'NlmUniqueID', 'ISSN', 'ESSN', 'PubTypeList', 'RecordStatus', 'PubStatus', 'ArticleIds', 'DOI', 'History', 'References', 'HasAbstr

In [5]:
!pip install biopython


Collecting biopython
  Downloading biopython-1.85-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (13 kB)
Downloading biopython-1.85-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.3 MB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/3.3 MB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m [32m3.2/3.3 MB[0m [31m94.3 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.3/3.3 MB[0m [31m53.4 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: biopython
Successfully installed biopython-1.85


In [6]:
import random
from Bio import Entrez

# Function to fetch PubMed articles
def fetch_pubmed_articles(query, max_records=5):
    Entrez.email = "your-email@example.com"  # Provide your email address
    handle = Entrez.esearch(db="pubmed", term=query, retmax=max_records)
    record = Entrez.read(handle)
    id_list = record["IdList"]

    # Fetch the details for each article
    handle = Entrez.esummary(db="pubmed", id=",".join(id_list))
    summary = Entrez.read(handle)

    return summary

# Function to generate MCQs from article summaries
def generate_mcqs_from_articles(articles):
    mcqs = []

    for article in articles:
        # Debug print to check available keys in the article dictionary
        print(f"Available keys in article: {article.keys()}")

        title = article.get("Title", "No Title")
        source = article.get("Source", "No Source")
        pub_date = article.get("PubDate", "No Date")
        pmid = article.get("Id", "No PubMedId")
        summary = article.get("SortDate", "No Summary")

        # Extract question and multiple choices
        question = f"What is the primary focus of the article titled '{title}'?"

        # Simulate answers (can be improved with NLP for real answer extraction)
        correct_answer = f"The article discusses {summary}"
        wrong_answers = [
            "The article focuses on a different medical field.",
            "The article is a case study on health economics.",
            "The article reviews recent technological advances in medicine."
        ]

        # Shuffle answers
        answers = [correct_answer] + wrong_answers
        random.shuffle(answers)

        # Create MCQ object
        mcq = {
            "question": question,
            "options": answers,
            "correct_answer": correct_answer
        }
        mcqs.append(mcq)

    return mcqs

# Example function to display MCQs and get feedback
def ask_mcqs(mcqs):
    score = 0

    for idx, mcq in enumerate(mcqs, 1):
        print(f"Q{idx}: {mcq['question']}")

        # Display options
        for i, option in enumerate(mcq['options'], 1):
            print(f"{i}. {option}")

        # Prompt user for an answer
        try:
            user_answer = int(input("Choose the correct option (1-4): "))

            if user_answer < 1 or user_answer > 4:
                print("Invalid option! Please choose a number between 1 and 4.")
                continue

            # Check if the answer is correct
            if mcq['options'][user_answer - 1] == mcq['correct_answer']:
                print("Correct!\n")
                score += 1
            else:
                print(f"Wrong! The correct answer was: {mcq['correct_answer']}\n")
        except ValueError:
            print("Invalid input! Please enter a number between 1 and 4.\n")

    print(f"Your score: {score}/{len(mcqs)}")

# Main execution
def main():
    query = "cardiology"  # Change to any medical topic of interest
    articles = fetch_pubmed_articles(query)
    mcqs = generate_mcqs_from_articles(articles)
    ask_mcqs(mcqs)

if __name__ == "__main__":
    main()
67


Available keys in article: dict_keys(['Item', 'Id', 'PubDate', 'EPubDate', 'Source', 'AuthorList', 'LastAuthor', 'Title', 'Volume', 'Issue', 'Pages', 'LangList', 'NlmUniqueID', 'ISSN', 'ESSN', 'PubTypeList', 'RecordStatus', 'PubStatus', 'ArticleIds', 'DOI', 'History', 'References', 'HasAbstract', 'PmcRefCount', 'FullJournalName', 'ELocationID', 'SO'])
Available keys in article: dict_keys(['Item', 'Id', 'PubDate', 'EPubDate', 'Source', 'AuthorList', 'LastAuthor', 'Title', 'Volume', 'Issue', 'Pages', 'LangList', 'NlmUniqueID', 'ISSN', 'ESSN', 'PubTypeList', 'RecordStatus', 'PubStatus', 'ArticleIds', 'DOI', 'History', 'References', 'HasAbstract', 'PmcRefCount', 'FullJournalName', 'ELocationID', 'SO'])
Available keys in article: dict_keys(['Item', 'Id', 'PubDate', 'EPubDate', 'Source', 'AuthorList', 'LastAuthor', 'Title', 'Volume', 'Issue', 'Pages', 'LangList', 'NlmUniqueID', 'ISSN', 'ESSN', 'PubTypeList', 'RecordStatus', 'PubStatus', 'ArticleIds', 'DOI', 'History', 'References', 'HasAbstr

In [8]:
pip install "pip<24.1"


Collecting pip<24.1
  Downloading pip-24.0-py3-none-any.whl.metadata (3.6 kB)
Downloading pip-24.0-py3-none-any.whl (2.1 MB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/2.1 MB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m [32m2.1/2.1 MB[0m [31m119.7 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.1/2.1 MB[0m [31m57.0 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pip
  Attempting uninstall: pip
    Found existing installation: pip 24.1.2
    Uninstalling pip-24.1.2:
      Successfully uninstalled pip-24.1.2
Successfully installed pip-24.0


In [9]:
pip install textract


Collecting textract
  Using cached textract-1.6.5-py3-none-any.whl.metadata (2.5 kB)
Collecting argcomplete~=1.10.0 (from textract)
  Downloading argcomplete-1.10.3-py2.py3-none-any.whl.metadata (16 kB)
Collecting beautifulsoup4~=4.8.0 (from textract)
  Downloading beautifulsoup4-4.8.2-py3-none-any.whl.metadata (4.1 kB)
Collecting chardet==3.* (from textract)
  Downloading chardet-3.0.4-py2.py3-none-any.whl.metadata (3.2 kB)
Collecting docx2txt~=0.8 (from textract)
  Downloading docx2txt-0.9-py3-none-any.whl.metadata (529 bytes)
Collecting extract-msg<=0.29.* (from textract)
  Downloading extract_msg-0.28.7-py2.py3-none-any.whl.metadata (7.8 kB)
Collecting pdfminer.six==20191110 (from textract)
  Downloading pdfminer.six-20191110-py2.py3-none-any.whl.metadata (1.7 kB)
Collecting python-pptx~=0.6.18 (from textract)
  Downloading python_pptx-0.6.23-py3-none-any.whl.metadata (18 kB)
Collecting six~=1.12.0 (from textract)
  Downloading six-1.12.0-py2.py3-none-any.whl.metadata (1.9 kB)
Coll

In [10]:
pip install python-docx pymupdf


Collecting python-docx
  Downloading python_docx-1.1.2-py3-none-any.whl.metadata (2.0 kB)
Collecting pymupdf
  Downloading pymupdf-1.25.5-cp39-abi3-manylinux2014_x86_64.manylinux_2_17_x86_64.whl.metadata (3.4 kB)
Downloading python_docx-1.1.2-py3-none-any.whl (244 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m244.3/244.3 kB[0m [31m5.5 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pymupdf-1.25.5-cp39-abi3-manylinux2014_x86_64.manylinux_2_17_x86_64.whl (20.0 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m20.0/20.0 MB[0m [31m80.8 MB/s[0m eta [36m0:00:00[0m
[?25h[33mDEPRECATION: textract 1.6.5 has a non-standard dependency specifier extract-msg<=0.29.*. pip 24.1 will enforce this behaviour change. A possible replacement is to upgrade to a newer version of textract or contact the author to suggest that they release a version with a conforming dependency specifiers. Discussion can be found at https://github.com/pypa/pip/issues/12063[0m[33m


In [11]:
def extract_text_from_file(filepath):
    import os
    ext = os.path.splitext(filepath)[1].lower()

    if ext == '.pdf':
        import fitz  # PyMuPDF
        doc = fitz.open(filepath)
        return "\n".join([page.get_text() for page in doc])

    elif ext == '.docx':
        import docx
        doc = docx.Document(filepath)
        return "\n".join([para.text for para in doc.paragraphs])

    elif ext == '.txt':
        with open(filepath, 'r', encoding='utf-8') as f:
            return f.read()

    elif ext in ['.mp3', '.wav', '.m4a']:
        import whisper
        model = whisper.load_model("base")
        result = model.transcribe(filepath)
        return result['text']

    elif ext in ['.mp4', '.mov', '.mkv']:
        from moviepy.editor import VideoFileClip
        audio_path = "temp_audio.wav"
        clip = VideoFileClip(filepath)
        clip.audio.write_audiofile(audio_path, logger=None)
        model = whisper.load_model("base")
        result = model.transcribe(audio_path)
        os.remove(audio_path)
        return result['text']

    else:
        raise ValueError("Unsupported file type.")


In [13]:
!pip install git+https://github.com/openai/whisper.git


Collecting git+https://github.com/openai/whisper.git
  Cloning https://github.com/openai/whisper.git to /tmp/pip-req-build-ggm0ha2_
  Running command git clone --filter=blob:none --quiet https://github.com/openai/whisper.git /tmp/pip-req-build-ggm0ha2_
  Resolved https://github.com/openai/whisper.git to commit 517a43ecd132a2089d85f4ebc044728a71d49f6e
  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Collecting tiktoken (from openai-whisper==20240930)
  Downloading tiktoken-0.9.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.7 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch->openai-whisper==20240930)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch->openai-whisper==20240930)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-

In [14]:
!pip install python-docx pymupdf moviepy openai


[33mDEPRECATION: textract 1.6.5 has a non-standard dependency specifier extract-msg<=0.29.*. pip 24.1 will enforce this behaviour change. A possible replacement is to upgrade to a newer version of textract or contact the author to suggest that they release a version with a conforming dependency specifiers. Discussion can be found at https://github.com/pypa/pip/issues/12063[0m[33m
[0m

In [15]:
!apt install ffmpeg


Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
ffmpeg is already the newest version (7:4.4.2-0ubuntu0.22.04.1).
0 upgraded, 0 newly installed, 0 to remove and 30 not upgraded.


In [16]:
import whisper
import openai
import fitz
import docx
from moviepy.editor import VideoFileClip


  if event.key is 'enter':



In [19]:
!pip install openai whisper git+https://github.com/openai/whisper.git python-docx pymupdf moviepy
!apt install ffmpeg


Collecting git+https://github.com/openai/whisper.git
  Cloning https://github.com/openai/whisper.git to /tmp/pip-req-build-46b4ko2i
  Running command git clone --filter=blob:none --quiet https://github.com/openai/whisper.git /tmp/pip-req-build-46b4ko2i
  Resolved https://github.com/openai/whisper.git to commit 517a43ecd132a2089d85f4ebc044728a71d49f6e
  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Collecting whisper
  Using cached whisper-1.1.10.tar.gz (42 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: whisper
  Building wheel for whisper (setup.py) ... [?25l[?25hdone
  Created wheel for whisper: filename=whisper-1.1.10-py3-none-any.whl size=41120 sha256=b61dd0ae4f0db6e3242520788f4127971e23041acb5268aaa890d7f96b0e0485
  Stored in directory: /root/.cache/pip/wheels/21/65/ee/4e6672aabfa486d3341a39a04f8f87c77e5156149

Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
ffmpeg is already the newest version (7:4.4.2-0ubuntu0.22.04.1).
0 upgraded, 0 newly installed, 0 to remove and 30 not upgraded.


In [20]:
from google.colab import files

uploaded = files.upload()

# Get the file path
filepath = list(uploaded.keys())[0]
print("Uploaded:", filepath)


Saving F3.jpeg to F3.jpeg
Uploaded: F3.jpeg


In [21]:
import os
import fitz  # PyMuPDF
import docx
import whisper
import openai
from moviepy.editor import VideoFileClip

openai.api_key = "AIzaSyDdo3MTE27I-jbZ_jMQyPLkXqZ7X2NkBJc"  # Replace with your API key

def extract_text_from_file(filepath):
    ext = os.path.splitext(filepath)[1].lower()
    print(f"Detected extension: {ext}")

    if ext == '.pdf':
        doc = fitz.open(filepath)
        return "\n".join([page.get_text() for page in doc])

    elif ext == '.docx':
        doc = docx.Document(filepath)
        return "\n".join([para.text for para in doc.paragraphs])

    elif ext == '.txt':
        with open(filepath, 'r', encoding='utf-8') as f:
            return f.read()

    elif ext in ['.mp3', '.wav', '.m4a']:
        model = whisper.load_model("base")
        result = model.transcribe(filepath)
        return result['text']

    elif ext in ['.mp4', '.mov', '.mkv']:
        audio_path = "temp_audio.wav"
        clip = VideoFileClip(filepath)
        clip.audio.write_audiofile(audio_path, logger=None)
        model = whisper.load_model("base")
        result = model.transcribe(audio_path)
        os.remove(audio_path)
        return result['text']

    else:
        print("❌ Unsupported file type:", ext)
        return ""

def ask_question_about_text(text, question):
    prompt = f"Answer the question based on the following content:\n\n{text}\n\nQuestion: {question}"
    response = openai.ChatCompletion.create(
        model="gpt-3.5-turbo",  # Or use gpt-4 if available
        messages=[{"role": "user", "content": prompt}],
        temperature=0.2,
        max_tokens=500
    )
    return response['choices'][0]['message']['content']


In [25]:
!apt install tesseract-ocr
!pip install pytesseract Pillow


Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
tesseract-ocr is already the newest version (4.1.1-2.1build1).
0 upgraded, 0 newly installed, 0 to remove and 30 not upgraded.
Collecting pytesseract
  Downloading pytesseract-0.3.13-py3-none-any.whl.metadata (11 kB)
Downloading pytesseract-0.3.13-py3-none-any.whl (14 kB)
[33mDEPRECATION: textract 1.6.5 has a non-standard dependency specifier extract-msg<=0.29.*. pip 24.1 will enforce this behaviour change. A possible replacement is to upgrade to a newer version of textract or contact the author to suggest that they release a version with a conforming dependency specifiers. Discussion can be found at https://github.com/pypa/pip/issues/12063[0m[33m
[0mInstalling collected packages: pytesseract
Successfully installed pytesseract-0.3.13


In [26]:
def extract_text_from_file(filepath):
    ext = os.path.splitext(filepath)[1].lower()
    print(f"Detected extension: {ext}")

    if ext == '.pdf':
        doc = fitz.open(filepath)
        return "\n".join([page.get_text() for page in doc])

    elif ext == '.docx':
        doc = docx.Document(filepath)
        return "\n".join([para.text for para in doc.paragraphs])

    elif ext == '.txt':
        with open(filepath, 'r', encoding='utf-8') as f:
            return f.read()

    elif ext in ['.mp3', '.wav', '.m4a']:
        model = whisper.load_model("base")
        result = model.transcribe(filepath)
        return result['text']

    elif ext in ['.mp4', '.mov', '.mkv']:
        audio_path = "temp_audio.wav"
        clip = VideoFileClip(filepath)
        clip.audio.write_audiofile(audio_path, logger=None)
        model = whisper.load_model("base")
        result = model.transcribe(audio_path)
        os.remove(audio_path)
        return result['text']

    elif ext in ['.jpg', '.jpeg', '.png']:
        from PIL import Image
        import pytesseract
        image = Image.open(filepath)
        return pytesseract.image_to_string(image)

    else:
        print("❌ Unsupported file type:", ext)
        return ""


In [28]:
from google.colab import files

uploaded = files.upload()

# Get the file path
filepath = list(uploaded.keys())[0]
print("Uploaded:", filepath)


Saving D1.jpeg to D1.jpeg
Uploaded: D1.jpeg


In [29]:
import os
import fitz  # PyMuPDF
import docx
import whisper
import openai
from moviepy.editor import VideoFileClip

openai.api_key = "YOUR_OPENAI_API_KEY"  # Replace with your API key

def extract_text_from_file(filepath):
    ext = os.path.splitext(filepath)[1].lower()
    print(f"Detected extension: {ext}")

    if ext == '.pdf':
        doc = fitz.open(filepath)
        return "\n".join([page.get_text() for page in doc])

    elif ext == '.docx':
        doc = docx.Document(filepath)
        return "\n".join([para.text for para in doc.paragraphs])

    elif ext == '.txt':
        with open(filepath, 'r', encoding='utf-8') as f:
            return f.read()

    elif ext in ['.mp3', '.wav', '.m4a']:
        model = whisper.load_model("base")
        result = model.transcribe(filepath)
        return result['text']

    elif ext in ['.mp4', '.mov', '.mkv']:
        audio_path = "temp_audio.wav"
        clip = VideoFileClip(filepath)
        clip.audio.write_audiofile(audio_path, logger=None)
        model = whisper.load_model("base")
        result = model.transcribe(audio_path)
        os.remove(audio_path)
        return result['text']

    else:
        print("❌ Unsupported file type:", ext)
        return ""

def ask_question_about_text(text, question):
    prompt = f"Answer the question based on the following content:\n\n{text}\n\nQuestion: {question}"
    response = openai.ChatCompletion.create(
        model="gpt-3.5-turbo",  # Or use gpt-4 if available
        messages=[{"role": "user", "content": prompt}],
        temperature=0.2,
        max_tokens=500
    )
    return response['choices'][0]['message']['content']


In [30]:
!apt install tesseract-ocr
!pip install pytesseract Pillow


Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
tesseract-ocr is already the newest version (4.1.1-2.1build1).
0 upgraded, 0 newly installed, 0 to remove and 30 not upgraded.
[33mDEPRECATION: textract 1.6.5 has a non-standard dependency specifier extract-msg<=0.29.*. pip 24.1 will enforce this behaviour change. A possible replacement is to upgrade to a newer version of textract or contact the author to suggest that they release a version with a conforming dependency specifiers. Discussion can be found at https://github.com/pypa/pip/issues/12063[0m[33m
[0m

In [31]:
def extract_text_from_file(filepath):
    ext = os.path.splitext(filepath)[1].lower()
    print(f"Detected extension: {ext}")

    if ext == '.pdf':
        doc = fitz.open(filepath)
        return "\n".join([page.get_text() for page in doc])

    elif ext == '.docx':
        doc = docx.Document(filepath)
        return "\n".join([para.text for para in doc.paragraphs])

    elif ext == '.txt':
        with open(filepath, 'r', encoding='utf-8') as f:
            return f.read()

    elif ext in ['.mp3', '.wav', '.m4a']:
        model = whisper.load_model("base")
        result = model.transcribe(filepath)
        return result['text']

    elif ext in ['.mp4', '.mov', '.mkv']:
        audio_path = "temp_audio.wav"
        clip = VideoFileClip(filepath)
        clip.audio.write_audiofile(audio_path, logger=None)
        model = whisper.load_model("base")
        result = model.transcribe(audio_path)
        os.remove(audio_path)
        return result['text']

    elif ext in ['.jpg', '.jpeg', '.png']:
        from PIL import Image
        import pytesseract
        image = Image.open(filepath)
        return pytesseract.image_to_string(image)

    else:
        print("❌ Unsupported file type:", ext)
        return ""


In [32]:
from google.colab import files

uploaded = files.upload()

# Get the file path
filepath = list(uploaded.keys())[0]
print("Uploaded:", filepath)


Saving D1.jpeg to D1 (1).jpeg
Uploaded: D1 (1).jpeg


In [33]:
# Extract the content
content = extract_text_from_file(filepath)
if content:
    print("✅ Content Extracted.")
else:
    print("⚠️ Could not extract content. Please check the file format.")


Detected extension: .jpeg
✅ Content Extracted.
