In [9]:
import pytesseract
pytesseract.pytesseract.tesseract_cmd = r"C:\Program Files\Tesseract-OCR\tesseract.exe"


In [15]:
from PyPDF2 import PdfReader
from docx import Document
from pptx import Presentation

def extract_from_pdf(file_path):
    reader = PdfReader(file_path)
    return " ".join(page.extract_text() for page in reader.pages if page.extract_text())

def extract_from_docx(file_path):
    doc = Document(file_path)
    return " ".join(p.text for p in doc.paragraphs)

def extract_from_pptx(file_path):
    prs = Presentation(file_path)
    text = []
    for slide in prs.slides:
        for shape in slide.shapes:
            if hasattr(shape, "text"):
                text.append(shape.text)
    return " ".join(text)

def extract_from_txt(file_path):
    with open(file_path, "r", encoding="utf-8") as f:
        return f.read()

def extract_from_md(file_path):
    with open(file_path, "r", encoding="utf-8") as f:
        return f.read()


In [20]:
import pytesseract
from PIL import Image, ImageEnhance, ImageFilter
import ffmpeg
from pydub import AudioSegment
import speech_recognition as sr
from yt_dlp import YoutubeDL
import os

# Extract text from image
def extract_from_image(file_path):
    try:
       img = Image.open(file_path)
        img = img.convert("L")  # grayscale
        img = img.filter(ImageFilter.SHARPEN)
        text = pytesseract.image_to_string(img, config='--psm 6')
        return text.strip()
    except Exception as e:
        return f"Error extracting text from image: {e}"

# Extract text from audio/video
def extract_from_audio_video(file_path):
    try:
        audio_path = "temp_audio.wav"

        # Convert video to audio (if it's a video file)
        if file_path.lower().endswith(('.mp4', '.avi', '.mov', '.mkv')):
            print("Extracting audio from video...")
            (
                ffmpeg
                .input(file_path)
                .output(audio_path, format='wav', acodec='pcm_s16le', ac=1, ar='16000')
                .overwrite_output()
                .run(quiet=True)
            )
            file_path = audio_path  # Update file_path

        # Transcribe audio
        print("Transcribing audio...")
        r = sr.Recognizer()
        with sr.AudioFile(file_path) as source:
            audio = r.record(source)
            text = r.recognize_google(audio)
        return text
    except Exception as e:
        return f"Error transcribing audio/video: {e}"

# Download YouTube video (optional)
def download_youtube_video(url, output_path="downloaded_video.mp4"):
    try:
        ydl_opts = {'outtmpl': output_path, 'format': 'mp4'}
        with YoutubeDL(ydl_opts) as ydl:
            ydl.download([url])
        return output_path
    except Exception as e:
        return f"Error downloading YouTube video: {e}"


In [3]:
# --- FFmpeg sanity check ---
import importlib
import sys
import subprocess

def ensure_ffmpeg_python():
    """
    Ensures that the correct ffmpeg-python library is installed and not the broken 'ffmpeg' one.
    Automatically uninstalls the wrong version and installs the correct one if needed.
    """
    try:
        ffmpeg = importlib.import_module("ffmpeg")
        # Check if this module has the correct 'input' attribute
        if not hasattr(ffmpeg, "input"):
            print("⚠️ Detected incorrect 'ffmpeg' package. Fixing it now...")
            subprocess.check_call([sys.executable, "-m", "pip", "uninstall", "-y", "ffmpeg"])
            subprocess.check_call([sys.executable, "-m", "pip", "install", "ffmpeg-python"])
            print("✅ Installed the correct ffmpeg-python package. Please restart the kernel.")
        else:
            print("✅ ffmpeg-python is correctly installed and ready to use.")
    except ModuleNotFoundError:
        print("⚙️ ffmpeg not found. Installing ffmpeg-python...")
        subprocess.check_call([sys.executable, "-m", "pip", "install", "ffmpeg-python"])
        print("✅ Installed ffmpeg-python successfully. Please restart the kernel.")

# Run the check automatically
ensure_ffmpeg_python()


⚠️ Detected incorrect 'ffmpeg' package. Fixing it now...
✅ Installed the correct ffmpeg-python package. Please restart the kernel.


In [14]:
import os
os.environ["PATH"] += os.pathsep + r"C:\ffmpeg\bin"

from yt_dlp import YoutubeDL
import ffmpeg
from pydub import AudioSegment
import speech_recognition as sr
from PyPDF2 import PdfReader
from docx import Document
from pptx import Presentation
import pytesseract
from PIL import Image, ImageEnhance, ImageFilter

# -------------------------------------------------------------------
# 📘 --- TEXT EXTRACTION FROM DOCUMENTS ---
# -------------------------------------------------------------------
def extract_from_pdf(file_path):
    reader = PdfReader(file_path)
    return " ".join(page.extract_text() for page in reader.pages if page.extract_text())

def extract_from_docx(file_path):
    doc = Document(file_path)
    return " ".join(p.text for p in doc.paragraphs)

def extract_from_pptx(file_path):
    prs = Presentation(file_path)
    text = []
    for slide in prs.slides:
        for shape in slide.shapes:
            if hasattr(shape, "text"):
                text.append(shape.text)
    return " ".join(text)

def extract_from_txt(file_path):
    with open(file_path, "r", encoding="utf-8") as f:
        return f.read()

def extract_from_md(file_path):
    with open(file_path, "r", encoding="utf-8") as f:
        return f.read()

# -------------------------------------------------------------------
# 🖼️ --- IMAGE EXTRACTION ---
# -------------------------------------------------------------------
def extract_from_image(file_path):
    try:
        img = Image.open(file_path)
        img = img.convert("L")  # grayscale
        img = img.filter(ImageFilter.SHARPEN)
        text = pytesseract.image_to_string(img, config='--psm 6')
        return text.strip()
    except Exception as e:
        return f"Error extracting text from image: {e}"

# -------------------------------------------------------------------
# 🎧 --- AUDIO/VIDEO EXTRACTION ---
# -------------------------------------------------------------------
def extract_from_audio_video(file_path):
    """Extracts text from audio or video files using ffmpeg + SpeechRecognition."""
    try:
        audio_path = "temp_audio.wav"

        # Convert MP3 or video to WAV
        if file_path.lower().endswith(('.mp3', '.mp4', '.avi', '.mov', '.mkv')):
            print(f"🎬 Converting {os.path.basename(file_path)} to WAV...")
            (
                ffmpeg
                .input(file_path)
                .output(audio_path, format='wav', acodec='pcm_s16le', ac=1, ar='16000')
                .overwrite_output()
                .run(quiet=True)
            )
            file_path = audio_path

        # Transcribe the converted audio
        print("🎧 Transcribing audio...")
        r = sr.Recognizer()
        with sr.AudioFile(file_path) as source:
            audio = r.record(source)
            text = r.recognize_google(audio)
        return text

    except Exception as e:
        return f"Error transcribing audio/video: {e}"

# -------------------------------------------------------------------
# 🎥 --- YOUTUBE VIDEO HANDLING ---
# -------------------------------------------------------------------
def download_youtube_video(url, output_path="youtube_download.mp4"):
    """Download YouTube video using yt_dlp."""
    try:
        print(f"📥 Downloading YouTube video from: {url}")
        ydl_opts = {
            'outtmpl': output_path,
            'format': 'bestvideo+bestaudio/best',
            'quiet': True
        }
        with YoutubeDL(ydl_opts) as ydl:
            ydl.download([url])
        print("✅ Download completed.")
        return output_path
    except Exception as e:
        return f"Error downloading YouTube video: {e}"

# -------------------------------------------------------------------
# 🔍 --- FILE TYPE DETECTION ---
# -------------------------------------------------------------------
def is_youtube_url(url):
    return "youtube.com" in url or "youtu.be" in url

# -------------------------------------------------------------------
# 🧠 --- UNIVERSAL TEXT EXTRACTOR ---
# -------------------------------------------------------------------
def extract_text_from_file(file_path):
    """Smart extractor for all file types including YouTube URLs."""
    if not os.path.exists(file_path) and not is_youtube_url(file_path):
        return f"Error: File not found at {file_path}"

    file_extension = os.path.splitext(file_path)[1].lower()

    if file_extension == '.pdf':
        return extract_from_pdf(file_path)
    elif file_extension == '.docx':
        return extract_from_docx(file_path)
    elif file_extension == '.pptx':
        return extract_from_pptx(file_path)
    elif file_extension == '.txt':
        return extract_from_txt(file_path)
    elif file_extension == '.md':
        return extract_from_md(file_path)
    elif file_extension in ('.png', '.jpg', '.jpeg'):
        return extract_from_image(file_path)
    elif file_extension in ('.mp3', '.mp4', '.avi', '.mov', '.mkv'):
        return extract_from_audio_video(file_path)
    elif is_youtube_url(file_path):
        downloaded_video = download_youtube_video(file_path)
        if os.path.exists(downloaded_video):
            return extract_from_audio_video(downloaded_video)
        else:
            return f"Error: Could not download or process YouTube video from {file_path}"
    else:
        return f"Error: Unsupported file type: {file_extension}"

# -------------------------------------------------------------------
# 🧩 --- ADD TO KNOWLEDGE BASE ---
# -------------------------------------------------------------------
knowledge_base = {}

def add_to_knowledge_base(file_path):
    text = extract_text_from_file(file_path)
    knowledge_base[file_path] = text
    print(f"\n✅ Added to knowledge base: {os.path.basename(file_path)}")

# -------------------------------------------------------------------
# 🧠 --- EXAMPLES ---
# -------------------------------------------------------------------
add_to_knowledge_base(r"C:\Users\sukanya das\OneDrive\Desktop\CSE Project\Multimodal Data Processing using GenAI\data\GenAI.pdf")

add_to_knowledge_base(r"C:\Users\sukanya das\OneDrive\Desktop\CSE Project\Multimodal Data Processing using GenAI\data\Forest.docx")

add_to_knowledge_base(r"C:\Users\sukanya das\OneDrive\Desktop\CSE Project\Multimodal Data Processing using GenAI\data\Music.txt")

add_to_knowledge_base(r"C:\Users\sukanya das\OneDrive\Desktop\CSE Project\Multimodal Data Processing using GenAI\data\quote.png")

add_to_knowledge_base(r"C:\Users\sukanya das\OneDrive\Desktop\CSE Project\Multimodal Data Processing using GenAI\data\VandeMataram.mp3")

add_to_knowledge_base(r"C:\Users\sukanya das\OneDrive\Desktop\CSE Project\Multimodal Data Processing using GenAI\data\SnowGeese.mp4")

add_to_knowledge_base("https://youtu.be/domCDwp5u3I?si=h23FlnVK6yn_CcTc")



# Inspect the knowledge base after adding files
print("\nCurrent knowledge base:")
print(knowledge_base)

print("\n🧠 Current knowledge base:")
for k, v in knowledge_base.items():
    print(f"\n🔹 {k}:\n{v[:300]}...")



✅ Added to knowledge base: GenAI.pdf

✅ Added to knowledge base: Forest.docx

✅ Added to knowledge base: Music.txt

✅ Added to knowledge base: quote.png
🎬 Converting VandeMataram.mp3 to WAV...

✅ Added to knowledge base: VandeMataram.mp3
🎬 Converting SnowGeese.mp4 to WAV...

✅ Added to knowledge base: SnowGeese.mp4
📥 Downloading YouTube video from: https://youtu.be/domCDwp5u3I?si=h23FlnVK6yn_CcTc


         player = https://www.youtube.com/s/player/87644c66/player_ias.vflset/en_US/base.js
         n = yxgkK2O8ORfV0ZqDayq ; player = https://www.youtube.com/s/player/87644c66/player_ias.vflset/en_US/base.js
         Please report this issue on  https://github.com/yt-dlp/yt-dlp/issues?q= , filling out the appropriate issue template. Confirm you are on the latest version using  yt-dlp -U
ERROR: You have requested merging of multiple formats but ffmpeg is not installed. Aborting due to --abort-on-error



✅ Added to knowledge base: domCDwp5u3I?si=h23FlnVK6yn_CcTc

Current knowledge base:
{'C:\\Users\\sukanya das\\OneDrive\\Desktop\\CSE Project\\Multimodal Data Processing using GenAI\\data\\GenAI.pdf': 'Generative Artificial Intelligence (GenAI)\nGenerative Artificial Intelligence (GenAI) refers to a class of AI systems capable of creating new\ncontent, such as text, images, music, and code, by learning from existing data. These models use\nadvanced machine learning techniques, particularly deep learning, to understand patterns and\nstructures within data and then generate novel outputs.\nHow GenAI Works:\nGenAI models, like GPT (Generative Pre-trained Transformer) and DALL·E, are trained on massive\ndatasets. They use neural networks with billions of parameters to predict and generate data\nsequences. For instance, a text-based model predicts the next word in a sentence, while an image\nmodel generates pixels based on prompts.\nApplications of GenAI:\n- Content creation (articles, repo

In [13]:
print(knowledge_base)

{'C:\\Users\\sukanya das\\OneDrive\\Desktop\\CSE Project\\Multimodal Data Processing using GenAI\\data\\VandeMataram.mp3': "Error transcribing audio/video: module 'ffmpeg' has no attribute 'input'", 'C:\\Users\\sukanya das\\OneDrive\\Desktop\\CSE Project\\Multimodal Data Processing using GenAI\\data\\SnowGeese.mp4': "Error transcribing audio/video: module 'ffmpeg' has no attribute 'input'", 'https://youtu.be/domCDwp5u3I?si=h23FlnVK6yn_CcTc': 'Error: Could not download or process YouTube video from https://youtu.be/domCDwp5u3I?si=h23FlnVK6yn_CcTc'}


In [1]:
import os
import ffmpeg
import pytesseract
from PIL import Image, ImageFilter
from PyPDF2 import PdfReader
from docx import Document
from pptx import Presentation
from pydub import AudioSegment
import speech_recognition as sr
from yt_dlp import YoutubeDL
import shutil

# --- Ensure ffmpeg is available system-wide ---
if not shutil.which("ffmpeg"):
    ffmpeg_path = r"C:\ffmpeg\bin"
    os.environ["PATH"] += os.pathsep + ffmpeg_path
    print(f"⚙️ Added FFmpeg to PATH: {ffmpeg_path}")
else:
    print(f"✅ FFmpeg found at: {shutil.which('ffmpeg')}")

# --- Extraction functions for different file types ---

def extract_from_pdf(file_path):
    try:
        reader = PdfReader(file_path)
        return " ".join(page.extract_text() for page in reader.pages if page.extract_text())
    except Exception as e:
        return f"Error extracting text from PDF: {e}"

def extract_from_docx(file_path):
    try:
        doc = Document(file_path)
        return " ".join(p.text for p in doc.paragraphs)
    except Exception as e:
        return f"Error extracting text from DOCX: {e}"

def extract_from_pptx(file_path):
    try:
        prs = Presentation(file_path)
        text = []
        for slide in prs.slides:
            for shape in slide.shapes:
                if hasattr(shape, "text"):
                    text.append(shape.text)
        return " ".join(text)
    except Exception as e:
        return f"Error extracting text from PPTX: {e}"

def extract_from_txt(file_path):
    try:
        with open(file_path, "r", encoding="utf-8") as f:
            return f.read()
    except Exception as e:
        return f"Error reading TXT file: {e}"

def extract_from_md(file_path):
    try:
        with open(file_path, "r", encoding="utf-8") as f:
            return f.read()
    except Exception as e:
        return f"Error reading MD file: {e}"

def extract_from_image(file_path):
    """Improved OCR with preprocessing."""
    try:
        img = Image.open(file_path).convert("L")
        img = img.filter(ImageFilter.SHARPEN)
        text = pytesseract.image_to_string(img, config='--psm 6')
        return text.strip() if text.strip() else "⚠️ No readable text detected in image."
    except Exception as e:
        return f"Error extracting text from image: {e}"

def extract_from_audio_video(file_path):
    """Handles both audio (.mp3, .wav) and video (.mp4, etc.)"""
    try:
        audio_path = "temp_audio.wav"

        # If it's a video — extract audio first
        if file_path.lower().endswith(('.mp4', '.avi', '.mov', '.mkv')):
            print(f"🎬 Extracting audio from video: {file_path}")
            (
                ffmpeg
                .input(file_path)
                .output(audio_path, format='wav', acodec='pcm_s16le', ac=1, ar='16000')
                .overwrite_output()
                .run(quiet=True)
            )
            file_path = audio_path

        elif file_path.lower().endswith(('.mp3', '.m4a', '.ogg')):
            print(f"🎧 Converting audio to WAV: {file_path}")
            sound = AudioSegment.from_file(file_path)
            sound.export(audio_path, format="wav")
            file_path = audio_path

        # Transcribe the final WAV file
        r = sr.Recognizer()
        with sr.AudioFile(file_path) as source:
            audio = r.record(source)
            text = r.recognize_google(audio)
        return text

    except Exception as e:
        return f"Error transcribing audio/video: {e}"

def download_youtube_video(url, output_path="downloaded_video.mp4"):
    """Downloads and merges YouTube audio+video with proper ffmpeg linkage."""
    try:
        print(f"📥 Downloading YouTube video from: {url}")
        ydl_opts = {
            'outtmpl': output_path,
            'format': 'bestvideo+bestaudio/best',
            'merge_output_format': 'mp4',
            'ffmpeg_location': shutil.which("ffmpeg") or r"C:\ffmpeg\bin",
            'quiet': True
        }
        with YoutubeDL(ydl_opts) as ydl:
            ydl.download([url])
        print(f"✅ Download complete: {output_path}")
        return output_path
    except Exception as e:
        return f"Error downloading YouTube video: {e}"

# --- Helper to detect YouTube URLs ---
def is_youtube_url(url):
    return "youtube.com" in url or "youtu.be" in url

# --- Main extraction dispatcher ---
def extract_text_from_file(file_path):
    if not os.path.exists(file_path) and not is_youtube_url(file_path):
        return f"Error: File not found at {file_path}"

    file_extension = os.path.splitext(file_path)[1].lower()

    if file_extension == '.pdf':
        return extract_from_pdf(file_path)
    elif file_extension == '.docx':
        return extract_from_docx(file_path)
    elif file_extension == '.pptx':
        return extract_from_pptx(file_path)
    elif file_extension == '.txt':
        return extract_from_txt(file_path)
    elif file_extension == '.md':
        return extract_from_md(file_path)
    elif file_extension in ('.png', '.jpg', '.jpeg'):
        return extract_from_image(file_path)
    elif file_extension in ('.mp3', '.mp4', '.avi', '.mov', '.mkv', '.m4a'):
        return extract_from_audio_video(file_path)
    elif is_youtube_url(file_path):
        video_path = download_youtube_video(file_path)
        if os.path.exists(video_path):
            return extract_from_audio_video(video_path)
        return f"Error: Could not download or process YouTube video from {file_path}"
    else:
        return f"Error: Unsupported file type: {file_extension}"




⚙️ Added FFmpeg to PATH: C:\ffmpeg\bin


In [10]:
knowledge_base.clear()

In [2]:
!ffmpeg -version


'ffmpeg' is not recognized as an internal or external command,
operable program or batch file.


In [3]:
import shutil
print(shutil.which("ffmpeg"))


None


In [32]:
!pip uninstall ffmpeg -y
!pip install ffmpeg-python




Defaulting to user installation because normal site-packages is not writeable


## Creating a function that can handle all supported file types:

In [4]:
import os

def extract_text_from_file(file_path):
    """
    Extracts text from various file types.

    Args:
        file_path (str): The path to the input file.

    Returns:
        str: The extracted text, or an error message if extraction fails or the file type is not supported.
    """
    if not os.path.exists(file_path):
        return f"Error: File not found at {file_path}"

    file_extension = os.path.splitext(file_path)[1].lower()

    if file_extension == '.pdf':
        return extract_from_pdf(file_path)
    elif file_extension == '.docx':
        return extract_from_docx(file_path)
    elif file_extension == '.pptx':
        return extract_from_pptx(file_path)
    elif file_extension == '.txt':
        return extract_from_txt(file_path)
    elif file_extension == '.md':
        return extract_from_md(file_path)
    elif file_extension in ('.png', '.jpg', '.jpeg'):
        return extract_from_image(file_path)
    elif file_extension in ('.mp3', '.mp4', '.avi', '.mov', '.mkv'):
        return extract_from_audio_video(file_path)
    # Add handling for YouTube URLs if needed
    elif is_youtube_url(file_path):
        video_url = download_youtube_video(file_path)
        if video_url:
            return extract_from_audio_video(video_url)
        else:
            return f"Error: Could not download YouTube video from {file_path}"
    else:
        return f"Error: Unsupported file type: {file_extension}"

In [5]:
# Add example files to the knowledge base
# Replace these placeholder paths with the actual paths to your files


add_to_knowledge_base(r"C:\Users\sukanya das\OneDrive\Desktop\CSE Project\Multimodal Data Processing using GenAI\data\GenAI.pdf")

add_to_knowledge_base(r"C:\Users\sukanya das\OneDrive\Desktop\CSE Project\Multimodal Data Processing using GenAI\data\Forest.docx")

add_to_knowledge_base(r"C:\Users\sukanya das\OneDrive\Desktop\CSE Project\Multimodal Data Processing using GenAI\data\Music.txt")

add_to_knowledge_base(r"C:\Users\sukanya das\OneDrive\Desktop\CSE Project\Multimodal Data Processing using GenAI\data\quote.png")

add_to_knowledge_base(r"C:\Users\sukanya das\OneDrive\Desktop\CSE Project\Multimodal Data Processing using GenAI\data\VandeMataram.mp3")

add_to_knowledge_base(r"C:\Users\sukanya das\OneDrive\Desktop\CSE Project\Multimodal Data Processing using GenAI\data\SnowGeese.mp4")

add_to_knowledge_base("https://youtu.be/domCDwp5u3I?si=h23FlnVK6yn_CcTc")

print("Example calls to add_to_knowledge_base have been added. Please uncomment and update the file paths to add your files.")

# Inspect the knowledge base after adding files
print("\nCurrent knowledge base:")
print(knowledge_base)

NameError: name 'add_to_knowledge_base' is not defined

## Creating a function that uses the Gemini model to answer questions based on the knowledge base:

In [6]:
from dotenv import load_dotenv
import os, google.generativeai as genai

load_dotenv()
genai.configure(api_key=os.getenv("GOOGLE_API_KEY"))




In [7]:
def answer_question_with_gemini(query):
    """
    Answers a natural language query using the Gemini model and the knowledge base.

    Args:
        query (str): The user's query.

    Returns:
        str: The answer generated by the Gemini model, or an informative message if no relevant information is found.
    """
    # 1. Search the knowledge base for relevant information
    relevant_results = search_knowledge_base(query)

    if not relevant_results:
        return "I couldn't find any relevant information in the knowledge base to answer your question."

    # 2. Prepare the context for the Gemini model
    context = ""
    for file_path, text in relevant_results.items():
        context += f"Information from {file_path}:\n{text}\n\n"

    # 3. Formulate the prompt for the Gemini model
    prompt = f"""Using the following information, answer the user's question.
If you cannot answer the question based on the provided information, please state that you cannot find the answer in the knowledge base.

Information:
{context}

User's question:
{query}

Answer:
"""

    # 4. Get the answer from the Gemini model
    try:
        response = model.generate_content(prompt)
        return response.text
    except Exception as e:
        return f"Error generating response from Gemini model: {e}"

# Example usage (you'll need to have added some files to the knowledge_base first)
# user_question = "What is the main topic of the PDF file?"
# gemini_answer = answer_question_with_gemini(user_question)
# print(gemini_answer)

## Creating a dictionary to store the extracted text and a function for basic searching:

In [8]:
# Dictionary to store extracted text: {file_path: extracted_text}
knowledge_base = {}

def add_to_knowledge_base(file_path):
    """Extracts text from a file and adds it to the knowledge base."""
    extracted_text = extract_text_from_file(file_path)
    knowledge_base[file_path] = extracted_text
    print(f"Processed and added {file_path} to the knowledge base.")

def search_knowledge_base(query):
    """Performs a simple keyword search within the knowledge base."""
    results = {}
    for file_path, text in knowledge_base.items():
        if query.lower() in text.lower():
            results[file_path] = text # Store the whole text for now, can refine later to show snippets
    return results

# Example usage (you'll need to replace 'path/to/your/file.pdf' with actual file paths)
# add_to_knowledge_base('path/to/your/file1.pdf')
# add_to_knowledge_base('path/to/your/file2.txt')
#
# search_query = "your search term"
# search_results = search_knowledge_base(search_query)
#
# if search_results:
#     print(f"Found results for '{search_query}':")
#     for file_path, text in search_results.items():
#         print(f"- {file_path}")
#         # print(f"  Snippet: {text[:200]}...") # Uncomment to show snippets
# else:
#     print(f"No results found for '{search_query}'.")

## Creating a dictionary to store the extracted text and a function for basic searching:

In [9]:
# Dictionary to store extracted text: {file_path: extracted_text}
knowledge_base = {}

def add_to_knowledge_base(file_path):
    """Extracts text from a file and adds it to the knowledge base."""
    extracted_text = extract_text_from_file(file_path)
    knowledge_base[file_path] = extracted_text
    print(f"Processed and added {file_path} to the knowledge base.")

def search_knowledge_base(query):
    """Performs a simple keyword search within the knowledge base."""
    results = {}
    for file_path, text in knowledge_base.items():
        if query.lower() in text.lower():
            results[file_path] = text # Store the whole text for now, can refine later to show snippets
    return results

# Example usage (you'll need to replace 'path/to/your/file.pdf' with actual file paths)
# add_to_knowledge_base('path/to/your/file1.pdf')
# add_to_knowledge_base('path/to/your/file2.txt')
#
# search_query = "your search term"
# search_results = search_knowledge_base(search_query)
#
# if search_results:
#     print(f"Found results for '{search_query}':")
#     for file_path, text in search_results.items():
#         print(f"- {file_path}")
#         # print(f"  Snippet: {text[:200]}...") # Uncomment to show snippets
# else:
#     print(f"No results found for '{search_query}'.")

##  Creating a function that uses the Gemini model to answer questions based on the knowledge base:

In [10]:
def answer_question_with_gemini(query):
    """
    Answers a natural language query using the Gemini model and the knowledge base.

    Args:
        query (str): The user's query.

    Returns:
        str: The answer generated by the Gemini model, or an informative message if no relevant information is found.
    """
    # 1. Search the knowledge base for relevant information
    relevant_results = search_knowledge_base(query)

    if not relevant_results:
        return "I couldn't find any relevant information in the knowledge base to answer your question."

    # 2. Prepare the context for the Gemini model
    context = ""
    for file_path, text in relevant_results.items():
        context += f"Information from {file_path}:\n{text}\n\n"

    # 3. Formulate the prompt for the Gemini model
    prompt = f"""Using the following information, answer the user's question.
If you cannot answer the question based on the provided information, please state that you cannot find the answer in the knowledge base.

Information:
{context}

User's question:
{query}

Answer:
"""

    # 4. Get the answer from the Gemini model
    try:
        response = model.generate_content(prompt)
        return response.text
    except Exception as e:
        return f"Error generating response from Gemini model: {e}"

# Example usage (you'll need to have added some files to the knowledge_base first)
# user_question = "What is the main topic of the PDF file?"
# gemini_answer = answer_question_with_gemini(user_question)
# print(gemini_answer)

In [3]:
%pip install python-dotenv


Defaulting to user installation because normal site-packages is not writeable
Note: you may need to restart the kernel to use updated packages.


In [4]:
pip install google-generativeai

Defaulting to user installation because normal site-packages is not writeable
Collecting google-generativeai
  Downloading google_generativeai-0.8.5-py3-none-any.whl.metadata (3.9 kB)
Collecting google-ai-generativelanguage==0.6.15 (from google-generativeai)
  Downloading google_ai_generativelanguage-0.6.15-py3-none-any.whl.metadata (5.7 kB)
Collecting google-api-core (from google-generativeai)
  Downloading google_api_core-2.28.1-py3-none-any.whl.metadata (3.3 kB)
Collecting google-api-python-client (from google-generativeai)
  Downloading google_api_python_client-2.185.0-py3-none-any.whl.metadata (7.0 kB)
Collecting proto-plus<2.0.0dev,>=1.22.3 (from google-ai-generativelanguage==0.6.15->google-generativeai)
  Downloading proto_plus-1.26.1-py3-none-any.whl.metadata (2.2 kB)
Collecting googleapis-common-protos<2.0.0,>=1.56.2 (from google-api-core->google-generativeai)
  Downloading googleapis_common_protos-1.71.0-py3-none-any.whl.metadata (9.4 kB)
Collecting grpcio<2.0.0,>=1.33.2 (fro

In [6]:
with open(".env", "w", encoding="utf-8") as f:
    f.write("GOOGLE_API_KEY=AIzaSyCi99NkTULytK4F4NKRyQDbfyVkyv-Dgso\n")


In [11]:
import os
from dotenv import load_dotenv
import google.generativeai as genai

# Load your .env file (optional if key is already set)
load_dotenv()

# Or set the API key manually
os.environ["GOOGLE_API_KEY"] = "AIzaSyCi99NkTULytK4F4NKRyQDbfyVkyv-Dgso"

# Configure Gemini
genai.configure(api_key=os.getenv("GOOGLE_API_KEY"))

# Use a supported model (from your list)
model = genai.GenerativeModel("gemini-2.5-flash")

# Test
response = model.generate_content("Hello Gemini 2.5, are you working?")
print(response.text)


Hello! Yes, I am here and ready to assist you. What can I help you with today?


In [8]:
!pip install -U google-generativeai


Defaulting to user installation because normal site-packages is not writeable


In [9]:
for m in genai.list_models():
    print(m.name)


models/embedding-gecko-001
models/gemini-2.5-pro-preview-03-25
models/gemini-2.5-flash-preview-05-20
models/gemini-2.5-flash
models/gemini-2.5-flash-lite-preview-06-17
models/gemini-2.5-pro-preview-05-06
models/gemini-2.5-pro-preview-06-05
models/gemini-2.5-pro
models/gemini-2.0-flash-exp
models/gemini-2.0-flash
models/gemini-2.0-flash-001
models/gemini-2.0-flash-exp-image-generation
models/gemini-2.0-flash-lite-001
models/gemini-2.0-flash-lite
models/gemini-2.0-flash-preview-image-generation
models/gemini-2.0-flash-lite-preview-02-05
models/gemini-2.0-flash-lite-preview
models/gemini-2.0-pro-exp
models/gemini-2.0-pro-exp-02-05
models/gemini-exp-1206
models/gemini-2.0-flash-thinking-exp-01-21
models/gemini-2.0-flash-thinking-exp
models/gemini-2.0-flash-thinking-exp-1219
models/gemini-2.5-flash-preview-tts
models/gemini-2.5-pro-preview-tts
models/learnlm-2.0-flash-experimental
models/gemma-3-1b-it
models/gemma-3-4b-it
models/gemma-3-12b-it
models/gemma-3-27b-it
models/gemma-3n-e4b-it
mo

In [13]:
!pip install yt-dlp


Defaulting to user installation because normal site-packages is not writeable
Collecting yt-dlp
  Downloading yt_dlp-2025.10.22-py3-none-any.whl.metadata (176 kB)
Downloading yt_dlp-2025.10.22-py3-none-any.whl (3.2 MB)
   ---------------------------------------- 0.0/3.2 MB ? eta -:--:--
   --- ------------------------------------ 0.3/3.2 MB ? eta -:--:--
   ------ --------------------------------- 0.5/3.2 MB 2.1 MB/s eta 0:00:02
   ------------ --------------------------- 1.0/3.2 MB 1.6 MB/s eta 0:00:02
   ---------------- ----------------------- 1.3/3.2 MB 1.5 MB/s eta 0:00:02
   ---------------- ----------------------- 1.3/3.2 MB 1.5 MB/s eta 0:00:02
   ---------------------- ----------------- 1.8/3.2 MB 1.4 MB/s eta 0:00:02
   ------------------------- -------------- 2.1/3.2 MB 1.4 MB/s eta 0:00:01
   ----------------------------- ---------- 2.4/3.2 MB 1.4 MB/s eta 0:00:01
   -------------------------------- ------- 2.6/3.2 MB 1.3 MB/s eta 0:00:01
   --------------------------------

In [4]:
!pip install moviepy --upgrade


Defaulting to user installation because normal site-packages is not writeable


In [3]:
import sys, subprocess
subprocess.check_call([sys.executable, "-m", "pip", "install", "moviepy"])


0

In [4]:
import os
from yt_dlp import YoutubeDL
import moviepy.editor as mp
import speech_recognition as sr

def download_youtube_video(url, output_path="downloaded_video.mp4"):
    try:
        ydl_opts = {
            'format': 'best',
            'outtmpl': output_path
        }
        with YoutubeDL(ydl_opts) as ydl:
            ydl.download([url])
        return output_path
    except Exception as e:
        return f"Error downloading YouTube video: {e}"

def extract_audio_from_video(video_path, audio_path="temp_audio.wav"):
    try:
        video = mp.VideoFileClip(video_path)
        video.audio.write_audiofile(audio_path)
        return audio_path
    except Exception as e:
        return f"Error extracting audio: {e}"

def transcribe_audio(audio_path):
    try:
        recognizer = sr.Recognizer()
        with sr.AudioFile(audio_path) as source:
            audio = recognizer.record(source)
        text = recognizer.recognize_google(audio)
        return text
    except Exception as e:
        return f"Error transcribing audio: {e}"

def youtube_to_text(url):
    video_path = download_youtube_video(url)
    if not os.path.exists(video_path):
        return video_path  # Return error message

    audio_path = extract_audio_from_video(video_path)
    if not os.path.exists(audio_path):
        return audio_path  # Return error message

    transcript = transcribe_audio(audio_path)

    # Optional cleanup
    os.remove(audio_path)

    return transcript


ModuleNotFoundError: No module named 'moviepy.editor'

In [3]:
url = "https://youtu.be/sljF4t0nOk0?si=lOOvA7aYdz6DcL5N"
text = youtube_to_text(url)
print(text)


NameError: name 'youtube_to_text' is not defined