In [None]:
!pip install yt-dlp
!pip install faster-whisper
!pip install fpdf
!pip install requests
!pip install pdfplumber python-docx
!pip install pyngrok aiohttp nest_asyncio flask flask-cors
!apt-get install fonts-dejavu-core ffmpeg

In [None]:
import os
import json
import requests
import textwrap
import glob
import time
import asyncio
import aiohttp
from urllib.parse import urlparse, parse_qs, urlencode, urlunparse
import concurrent.futures

import yt_dlp  # using yt-dlp for audio download
from faster_whisper import WhisperModel
from fpdf import FPDF

from flask import Flask, request, jsonify
from flask_cors import CORS
from pyngrok import ngrok

import nest_asyncio
nest_asyncio.apply()

# ===============================
# Configuration
# ===============================
# Prefer the OPENROUTER_API_KEY environment variable so the real secret never has to be
# saved inside the notebook; the literal below is only a placeholder fallback.
API_KEY = os.environ.get("OPENROUTER_API_KEY", "open-routeriapitoken")  # Replace with your actual key
# Inputs longer than this many characters are split into chunks of this size before summarization.
CHUNK_SIZE = 10000
# ===============================
# Global Prompt for Summarization
# ===============================
# The prompt is written as a list of sections; sections are separated by a blank
# line in the final string, and the lines inside a section by a single newline.
_PROMPT_SECTIONS = [
    "You are an AI assistant that summarizes and structures information from multiple sources "
    "(YouTube videos, textbooks, and documents) into an easy-to-read format.",

    "Given an input (a transcript, textbook chapter, or document), generate a structured summary that "
    "scales with the length of the input. The format should be:",

    "\n".join([
        "1. **Title & Metadata**",
        "   - Source Type: (Video, Book, or Document)",
        "   - Title: [Extracted from input]",
        "   - Author/Speaker: [If available]",
        "   - Date: [If available]",
        "   - Duration/Page Count: [Estimated from input]",
    ]),

    "2. **Quick Summary** (Short and concise for small inputs, more detailed for long inputs).",

    "\n".join([
        "3. **Structured Breakdown:**",
        "   - For videos: Timeline-based summary with timestamps.",
        "   - For books/documents: Section-based summary with chapter titles and key points.",
    ]),

    "4. **Key Concepts & Definitions** (Table format).",
    "5. **Self-Check Questions** (3-5 questions, scaling with content depth).",
    "6. **Actionable Insights** (What to do & common mistakes to avoid).",
    "7. **Further Learning** (Related videos, books, and resources).",

    "Adjust the depth of explanation, number of bullet points, and breakdown sections proportionally "
    "to the length of the input. Ensure clarity, conciseness, and logical organization.",
]
SYSTEM_PROMPT = "\n\n".join(_PROMPT_SECTIONS)


In [None]:
# ===============================
# Helper Functions for Different Input Types
# ===============================
def _clean_youtube_url(youtube_url):
    """Returns the URL with every query parameter except ``v`` removed (unchanged if there is no ``v``)."""
    parsed = urlparse(youtube_url)
    video_ids = parse_qs(parsed.query).get('v')
    if not video_ids:
        return youtube_url
    cleaned = urlunparse(parsed._replace(query=urlencode({'v': video_ids[0]})))
    print("Cleaned URL:", cleaned)
    return cleaned

def download_audio(youtube_url):
    """
    Downloads the best audio from a YouTube URL using yt-dlp.
    Cleans the URL and uses a cookies file (cookies.txt) if available.
    Returns the downloaded file path.

    Any ``audio.*`` file left in the working directory by an earlier run is deleted
    first, so a failed download can never hand back the audio of a previous video.

    Raises:
        FileNotFoundError: if no audio file exists after the download attempt. When
            yt-dlp itself failed, that error is attached as the exception's cause.
    """
    youtube_url = _clean_youtube_url(youtube_url)

    # Purge leftovers: the result is located by globbing "audio.*" below, and a stale
    # match would otherwise be returned when the download fails.
    for stale_file in glob.glob("audio.*"):
        try:
            os.remove(stale_file)
        except OSError as e:
            print("Could not remove stale file:", stale_file, e)

    ydl_opts = {
        'format': 'bestaudio/best',
        'outtmpl': 'audio.%(ext)s',
        'quiet': False
    }
    # Use cookies if a cookies.txt file is present
    cookies_file = "cookies.txt"
    if os.path.exists(cookies_file):
        ydl_opts["cookiefile"] = cookies_file
        print("Using cookies file:", cookies_file)

    print("Downloading audio from:", youtube_url)
    download_error = None
    with yt_dlp.YoutubeDL(ydl_opts) as ydl:
        try:
            ydl.download([youtube_url])
        except Exception as e:
            # Keep the error so it can be reported as the cause if no file was produced.
            download_error = e
            print("yt-dlp encountered an error:", e)

    # Skip yt-dlp's temporary artifacts of an interrupted download; they are not playable audio.
    files = sorted(
        path for path in glob.glob("audio.*")
        if not path.endswith((".part", ".ytdl"))
    )
    if files:
        return files[0]
    raise FileNotFoundError("No audio file found after download.") from download_error

def transcribe_audio(audio_path, model_size="base", device="cuda"):
    """
    Transcribes audio using faster-whisper (on GPU by default).

    Args:
        audio_path: path of the audio file to transcribe.
        model_size: Whisper model passed to WhisperModel; defaults to "base".
        device: device passed to WhisperModel; defaults to "cuda". Pass another
            value supported by faster-whisper (e.g. "cpu") when no GPU is available.

    Returns:
        The text of all segments, joined with newlines.
    """
    model = WhisperModel(model_size, device=device)
    # The second element of the result (transcription info) is not needed here.
    segments, _ = model.transcribe(audio_path)
    return "\n".join(segment.text for segment in segments)

def extract_text_from_pdf(pdf_path):
    """Returns the text of the PDF, one newline-terminated entry per page.

    Pages for which ``extract_text()`` yields nothing (``None`` or "") are skipped.
    """
    import pdfplumber

    with pdfplumber.open(pdf_path) as document:
        page_texts = (page.extract_text() for page in document.pages)
        # Consumed inside the ``with`` block, while the document is still open.
        return "".join(page_text + "\n" for page_text in page_texts if page_text)

def extract_text_from_docx(docx_path):
    """Returns the text of every entry in the document's ``paragraphs``, joined with newlines."""
    import docx

    paragraphs = docx.Document(docx_path).paragraphs
    return "\n".join(paragraph.text for paragraph in paragraphs)

def chunk_text(text, max_chars=CHUNK_SIZE):
    """
    Splits text into consecutive pieces of at most max_chars characters.

    The split is purely positional (it can fall in the middle of a word); the pieces
    concatenated in order reproduce the input. An empty text yields an empty list.

    Raises:
        ValueError: if max_chars is not positive. Without this check the function
            would never terminate, because the start offset would never advance.
    """
    if max_chars <= 0:
        raise ValueError("max_chars must be a positive integer.")
    return [text[start:start + max_chars] for start in range(0, len(text), max_chars)]


In [None]:
# ===============================
# Asynchronous Summarization Functions
# ===============================
async def async_summarize_text(session, text):
    """
    Sends one piece of text to the OpenRouter chat-completions endpoint.

    Args:
        session: the HTTP client session used for the POST request.
        text: the user message to summarize; SYSTEM_PROMPT is sent as the system message.

    Returns:
        The content of the first choice's message. An empty string is returned
        (never None, never an IndexError) when the response is not JSON, carries
        no choices (e.g. an error payload), or has no message content.
    """
    payload = {
        "model": "deepseek/deepseek-r1-distill-llama-70b:free",
        "messages": [
            {"role": "system", "content": SYSTEM_PROMPT},
            {"role": "user", "content": text}
        ],
    }
    headers = {
        "Authorization": f"Bearer {API_KEY}",
        "Content-Type": "application/json",
        "HTTP-Referer": "https://www.kaggle.com",
        "X-Title": "Multi-Source Summarizer"
    }
    async with session.post("https://openrouter.ai/api/v1/chat/completions", json=payload, headers=headers) as resp:
        try:
            # content_type=None disables the content-type check so that an error body
            # served with a different content type is still decoded when it is JSON.
            resp_json = await resp.json(content_type=None)
        except ValueError as e:
            print(f"Async LLM API returned a non-JSON response (HTTP {resp.status}):", e)
            return ""
    print("Async LLM API Response:", resp_json)

    choices = resp_json.get("choices") if isinstance(resp_json, dict) else None
    if not choices:
        # Covers a missing key, an empty list and an empty/absent body.
        print("Async LLM API response contained no choices; returning an empty summary.")
        return ""
    message = choices[0].get("message") or {}
    # "content" may be present but null; callers join the results as strings.
    return message.get("content") or ""

async def parallel_summarize(chunks):
    """Summarizes all chunks concurrently over one shared HTTP session.

    One task is started per chunk; the gathered results are returned as a list.
    """
    async with aiohttp.ClientSession() as http_session:
        pending = []
        for piece in chunks:
            pending.append(asyncio.create_task(async_summarize_text(http_session, piece)))
        summaries = await asyncio.gather(*pending)
    return summaries


In [None]:
# ===============================
# Synchronous Summarization Function
# ===============================
def summarize_text_sync(text, timeout=300):
    """
    Sends text to the OpenRouter chat-completions endpoint and waits for the summary.

    Args:
        text: the user message to summarize; SYSTEM_PROMPT is sent as the system message.
        timeout: seconds passed to requests as the request timeout, so a stalled
            connection cannot hang the caller forever.

    Returns:
        The content of the first choice's message. An empty string is returned
        (never None, never an IndexError) when the response is not JSON, carries
        no choices (e.g. an error payload), or has no message content.

    Raises:
        requests.RequestException: on connection failures or when the timeout expires.
    """
    payload = {
        "model": "deepseek/deepseek-r1-distill-llama-70b:free",
        "messages": [
            {"role": "system", "content": SYSTEM_PROMPT},
            {"role": "user", "content": text}
        ],
    }
    headers = {
        "Authorization": f"Bearer {API_KEY}",
        "Content-Type": "application/json",
        "HTTP-Referer": "https://www.kaggle.com",
        "X-Title": "Multi-Source Summarizer"
    }
    response = requests.post(
        "https://openrouter.ai/api/v1/chat/completions",
        headers=headers,
        json=payload,
        timeout=timeout,
    )
    try:
        # Parse the body once and reuse it for both logging and extraction.
        response_json = response.json()
    except ValueError as e:
        print(f"LLM API returned a non-JSON response (HTTP {response.status_code}):", e)
        return ""
    print("LLM API Response:", response_json)

    choices = response_json.get("choices") if isinstance(response_json, dict) else None
    if not choices:
        # Covers a missing key and an empty list (both seen on error payloads).
        print("LLM API response contained no choices; returning an empty summary.")
        return ""
    message = choices[0].get("message") or {}
    # "content" may be present but null; callers call .strip() on the result.
    return message.get("content") or ""

def save_as_pdf(text, filename="YouTube_Notes.pdf"):
    """
    Renders text line by line into a PDF file and returns the file name.

    The DejaVu TrueType font is used when it can be loaded. Otherwise the built-in
    Arial font is used and characters outside Latin-1 are replaced with "?",
    because that fallback font cannot encode them and the export would fail.
    """
    pdf = FPDF()
    pdf.set_auto_page_break(auto=True, margin=15)
    pdf.add_page()
    try:
        pdf.add_font('DejaVu', '', '/usr/share/fonts/truetype/dejavu/DejaVuSans.ttf', uni=True)
        pdf.set_font('DejaVu', '', 12)
        unicode_font = True
    except (RuntimeError, OSError):
        # OSError also covers a missing/unreadable font file reported as FileNotFoundError.
        pdf.set_font("Arial", size=12)
        unicode_font = False
    for line in text.split("\n"):
        if not unicode_font:
            # Degrade unsupported characters instead of crashing during output.
            line = line.encode("latin-1", "replace").decode("latin-1")
        pdf.multi_cell(0, 10, line)
    pdf.output(filename, "F")
    print(f"✅ PDF saved as {filename}")
    return filename

def save_as_markdown(text, filename="YouTube_Notes.md"):
    """Writes text to filename as UTF-8 (replacing any existing file) and returns filename."""
    with open(filename, mode="w", encoding="utf-8") as markdown_out:
        markdown_out.write(text)
    print(f"✅ Markdown file saved as {filename}")
    return filename


In [None]:
# ===============================
# Main Modular Pipeline Function
# ===============================
def _extract_text(input_source, input_type, timings):
    """Extracts the raw text for one (already lower-cased) input type and records stage durations in timings."""
    if input_type == "youtube":
        start_download = time.time()
        audio_path = download_audio(input_source)
        timings["download_audio"] = time.time() - start_download

        start_transcribe = time.time()
        extracted_text = transcribe_audio(audio_path)
        timings["transcription"] = time.time() - start_transcribe
        return extracted_text

    if input_type == "pdf":
        start_pdf = time.time()
        extracted_text = extract_text_from_pdf(input_source)
        timings["pdf_extraction"] = time.time() - start_pdf
        return extracted_text

    if input_type == "docx":
        start_docx = time.time()
        extracted_text = extract_text_from_docx(input_source)
        timings["docx_extraction"] = time.time() - start_docx
        return extracted_text

    if input_type == "text":
        # NOTE(review): a string that names an existing file is read from disk. If
        # input_source can come from an untrusted client, this exposes local files —
        # confirm whether path inputs should be allowed here.
        if os.path.exists(input_source):
            with open(input_source, "r", encoding="utf-8") as f:
                extracted_text = f.read()
        else:
            extracted_text = input_source
        timings["text_extraction"] = 0
        return extracted_text

    raise ValueError("Unsupported input type. Choose from 'youtube', 'pdf', 'docx', 'text'.")

def modular_pipeline(input_source, input_type):
    """
    Runs the full pipeline: extract text, summarize it, and save the summary.

    Args:
        input_source: a YouTube URL, a file path, or (for "text") the text itself.
        input_type: one of 'youtube', 'pdf', 'docx', 'text' (case-insensitive).

    Returns:
        A dict with the keys "final_summary", "chunk_summaries",
        "extracted_text_length", "timings", "pdf_file" and "markdown_file".

    Raises:
        ValueError: for an unsupported input type, or when no text could be extracted
            (summarizing an empty input would only produce an invented summary).
        RuntimeError: when the input was chunked and no chunk produced a summary.
    """
    timings = {}
    start_total = time.time()

    extracted_text = _extract_text(input_source, input_type.lower(), timings)
    if not extracted_text or not extracted_text.strip():
        raise ValueError("No text could be extracted from the input.")

    # Use asynchronous summarization for long inputs
    if len(extracted_text) > CHUNK_SIZE:
        chunks = chunk_text(extracted_text, max_chars=CHUNK_SIZE)
        start_chunks = time.time()
        raw_summaries = asyncio.run(parallel_summarize(chunks))
        timings["chunked_summarization"] = time.time() - start_chunks

        # A failed chunk may come back as None or ""; keep one entry per chunk but
        # make every entry a string so joining and JSON output stay well-defined.
        chunk_summaries = [summary or "" for summary in raw_summaries]
        combined_summary = "\n\n".join(summary for summary in chunk_summaries if summary.strip())
        if not combined_summary:
            raise RuntimeError("Summarization failed for every chunk.")
        print("Combined Chunk Summary:\n", combined_summary)  # Debug print

        start_final = time.time()
        final_summary = summarize_text_sync(combined_summary) or ""
        timings["final_summarization"] = time.time() - start_final

        # Fallback: if final_summary is empty, use combined_summary directly
        if not final_summary.strip():
            print("Final summary empty, using combined summary as fallback.")
            final_summary = combined_summary
            timings["final_summarization"] = 0
    else:
        start_sum = time.time()
        final_summary = summarize_text_sync(extracted_text) or ""
        timings["summarization"] = time.time() - start_sum
        chunk_summaries = [final_summary]

    # total_time covers extraction and summarization, not the file exports below.
    timings["total_time"] = time.time() - start_total

    pdf_file = save_as_pdf(final_summary)
    md_file = save_as_markdown(final_summary)

    return {
        "final_summary": final_summary,
        "chunk_summaries": chunk_summaries,
        "extracted_text_length": len(extracted_text),
        "timings": timings,
        "pdf_file": pdf_file,
        "markdown_file": md_file
    }


In [None]:
# ===============================
# Flask App Integration
# ===============================
app = Flask(__name__)
CORS(app)  # Enable CORS for cross-origin requests

# Input types the pipeline accepts; checked here so bad requests get a 400 instead of a 500.
_SUPPORTED_INPUT_TYPES = ("youtube", "pdf", "docx", "text")

@app.route("/summarize", methods=["POST"])
def summarize_endpoint():
    """
    POST /summarize — runs the summarization pipeline for an upload or a JSON request.

    Accepts either multipart/form-data with a "file" part and an optional
    "input_type" field (default "pdf"), or a JSON object with "input_type" and
    "input_source". Responds with the pipeline result as JSON, 400 for invalid
    requests and 500 (with the error message) when the pipeline fails.
    """
    uploaded_path = None
    # content_type is None when the client sends no Content-Type header.
    content_type = request.content_type or ""

    # Check if the request is multipart/form-data (i.e. file upload)
    if content_type.startswith("multipart/form-data"):
        file = request.files.get("file")
        input_type = request.form.get("input_type") or "pdf"  # Default to pdf if not provided
        if not file:
            return jsonify({"error": "No file provided"}), 400
        # The file name is client-controlled: drop any directory part so the upload
        # cannot be written outside the working directory, and add a random prefix
        # so it cannot overwrite (or later delete) an existing file.
        safe_name = os.path.basename((file.filename or "").replace("\\", "/"))
        if not safe_name:
            return jsonify({"error": "Invalid file name"}), 400
        uploaded_path = f"upload_{os.urandom(8).hex()}_{safe_name}"
        file.save(uploaded_path)
        input_source = uploaded_path
    else:
        # silent=True returns None for a missing/invalid JSON body instead of raising,
        # so the JSON error response below is used.
        data = request.get_json(silent=True)
        print("Received JSON:", data)
        if not data or not isinstance(data, dict):
            return jsonify({"error": "No JSON data provided"}), 400
        input_type = data.get("input_type")
        input_source = data.get("input_source")
        if not input_type or not input_source:
            return jsonify({"error": "Missing required parameters: input_type and input_source"}), 400
        if not isinstance(input_source, str):
            return jsonify({"error": "input_source must be a string"}), 400
        # NOTE(review): for 'pdf', 'docx' and 'text' the pipeline treats input_source as
        # a path on this machine, so a JSON client can name local files. Confirm whether
        # these types should be limited to uploads when the server is publicly reachable.

    if not isinstance(input_type, str) or input_type.lower() not in _SUPPORTED_INPUT_TYPES:
        return jsonify({"error": "Unsupported input type. Choose from 'youtube', 'pdf', 'docx', 'text'."}), 400

    try:
        result = modular_pipeline(input_source, input_type)
        return jsonify(result)
    except Exception as e:
        return jsonify({"error": str(e)}), 500
    finally:
        # Uploaded files are only needed for the duration of the request.
        if uploaded_path and os.path.exists(uploaded_path):
            try:
                os.remove(uploaded_path)
            except OSError as e:
                print("Could not remove uploaded file:", uploaded_path, e)


In [None]:
# ===============================
# Run Flask with ngrok
# ===============================
# Register the ngrok auth token before opening the tunnel. It is read from the
# NGROK_AUTHTOKEN environment variable so it does not depend on a later cell
# (app.run() below blocks, so cells after this one are not reached during a full run).
ngrok_auth_token = os.environ.get("NGROK_AUTHTOKEN")
if ngrok_auth_token:
    ngrok.set_auth_token(ngrok_auth_token)

PORT = 5000  # shared by the tunnel and the Flask server so the two cannot drift apart
public_url = ngrok.connect(PORT)
print("Public URL:", public_url)
app.run(port=PORT)

In [None]:
# Registers the ngrok auth token through the ngrok command-line tool.
# NOTE(review): this cell comes after the cell that calls app.run(); presumably it has
# to be executed before the tunnel is opened, since a top-to-bottom run would not
# reach it while the server is running — TODO confirm the intended cell order.
# NOTE(review): "ngrok_token" looks like a placeholder for the real token — confirm.
!ngrok authtoken ngrok_token