In [1]:
# Install required packages (Colab only)
# The lines below are IPython shell magics (`!`); they only work inside a notebook cell.
# Python dependencies: async HTTP/file I/O, PDF parsing (pymupdf), OCR bindings, HTML parsing.
!pip install aiohttp aiofiles pymupdf paddleocr bs4 nest-asyncio python-magic pytesseract -q
# System packages: Tesseract with Sanskrit and Hindi language packs, and libmagic1
# (presumably the native library backing python-magic — confirm).
!sudo apt-get install tesseract-ocr tesseract-ocr-san tesseract-ocr-hin libmagic1 -y
# Fetch the Sanskrit traineddata file into the Tesseract 4.00 tessdata directory.
# NOTE(review): the target path is version-specific — confirm it matches the installed Tesseract.
!wget https://github.com/tesseract-ocr/tessdata/raw/main/san.traineddata -P /usr/share/tesseract-ocr/4.00/tessdata/

import os, aiohttp, aiofiles, asyncio, hashlib, re, json, fitz, magic, tempfile
from urllib.parse import urlparse, urljoin
from datetime import datetime
from bs4 import BeautifulSoup
from paddleocr import PaddleOCR
from google.colab import drive
import nest_asyncio
import pytesseract
from PIL import Image

# Setup
# Patch asyncio so the notebook's event loop can be used re-entrantly, then
# mount Google Drive so the download/output folders below are persistent.
nest_asyncio.apply()
drive.mount("/content/drive")

# CONFIG
# Seed pages the crawler starts from.
BASE_URLS = [
    "https://ayushportal.nic.in/default.aspx"
]
# Raw downloaded documents and the per-document JSON records, respectively.
DOWNLOAD_DIR = "/content/drive/MyDrive/sanskritdocs/downloads"
OUTPUT_DIR = "/content/drive/MyDrive/sanskritdocs/outputs"
os.makedirs(DOWNLOAD_DIR, exist_ok=True)
os.makedirs(OUTPUT_DIR, exist_ok=True)

# CONSTANTS
# A link counts as a document when it ends in one of these extensions.
DOC_REGEX = re.compile(r'.*\.(pdf|epub|docx?|txt)$', re.IGNORECASE)
HEADERS = {"User-Agent": "Mozilla/5.0"}
# Caps the number of HTTP requests in flight at any one time.
sem = asyncio.Semaphore(2)
mime = magic.Magic(mime=True)

# Initialize OCR engines
# PaddleOCR is optional: when it cannot be constructed the crawler falls back
# to Tesseract only. Catch Exception (not a bare except) so Ctrl-C still works,
# and report the reason instead of hiding it.
try:
    paddle_ocr = PaddleOCR(use_angle_cls=True, lang='en', show_log=False)  # English
except Exception as e:
    print(f"PaddleOCR unavailable, using Tesseract only: {e}")
    paddle_ocr = None

class SanskritCrawler:
    """Crawl the seed sites for documents, download them and write JSON records.

    The class relies on module-level configuration defined elsewhere in this
    file: ``BASE_URLS``, ``DOWNLOAD_DIR``, ``OUTPUT_DIR``, ``DOC_REGEX``,
    ``HEADERS``, the request semaphore ``sem`` and the optional ``paddle_ocr``
    engine.

    Typical use: ``await SanskritCrawler().run()``.
    """

    def __init__(self):
        self.session = None      # aiohttp.ClientSession, assigned by run()
        self.visited = set()     # page URLs that have already been crawled
        self.found_docs = set()  # document URLs that matched DOC_REGEX

    def log(self, msg):
        """Print *msg* prefixed with the current local time (HH:MM:SS)."""
        print(f"[{datetime.now().strftime('%H:%M:%S')}] {msg}")

    async def fetch(self, url):
        """Return the HTML body of *url*.

        Returns "" when the request fails, the status is not 200, or the
        response's Content-Type does not contain ``html``.
        """
        try:
            async with sem:
                async with self.session.get(url, headers=HEADERS, timeout=30) as r:
                    if r.status == 200 and 'html' in r.headers.get("Content-Type", ""):
                        return await r.text()
        except Exception as e:
            self.log(f"Fetch error: {url} - {str(e)}")
        return ""

    async def crawl(self, url, depth=2):
        """Recursively collect document links reachable from *url*.

        Links matching ``DOC_REGEX`` are added to ``self.found_docs``; other
        links on the same host are followed until *depth* reaches zero.
        """
        # "page#top" and "page" are the same resource; drop the fragment so
        # the page is not fetched once per anchor.
        url = url.partition("#")[0]
        if url in self.visited or depth <= 0:
            return
        self.visited.add(url)
        html = await self.fetch(url)
        if not html:
            return
        soup = BeautifulSoup(html, 'html.parser')
        site = urlparse(url).netloc
        for tag in soup.find_all('a', href=True):
            full = urljoin(url, tag['href'].strip()).partition("#")[0]
            target = urlparse(full)
            # Skip mailto:, javascript:, ftp: etc. — the session cannot fetch them.
            if target.scheme not in ("http", "https"):
                continue
            if DOC_REGEX.search(full):
                if full not in self.found_docs:
                    self.found_docs.add(full)
                    self.log(f"📄 Found: {full}")
            elif target.netloc == site:
                await self.crawl(full, depth - 1)

    async def download(self, url):
        """Download *url* into ``DOWNLOAD_DIR`` and return the local path.

        An already-present file is reused. Returns "" on failure.

        NOTE(review): the local name is the URL's basename, so two different
        URLs sharing a basename map to the same cached file.
        """
        url_tag = hashlib.sha256(url.encode()).hexdigest()[:10]
        # A path ending in "/" has no basename; without a fallback the target
        # would be DOWNLOAD_DIR itself, which always "exists".
        fname = os.path.basename(urlparse(url).path) or url_tag
        path = os.path.join(DOWNLOAD_DIR, fname)
        if os.path.exists(path):
            return path
        # Stream into a temporary file and rename at the end, so an interrupted
        # transfer never leaves a truncated file that looks like a cache hit.
        tmp_path = f"{path}.{url_tag}.part"
        try:
            async with sem:
                async with self.session.get(url, headers=HEADERS, timeout=60) as r:
                    if r.status != 200:
                        self.log(f"❌ Download failed: {url} -> HTTP {r.status}")
                        return ""
                    async with aiofiles.open(tmp_path, 'wb') as f:
                        async for chunk in r.content.iter_chunked(65536):
                            await f.write(chunk)
            os.replace(tmp_path, path)
            return path
        except Exception as e:
            self.log(f"❌ Download failed: {url} -> {e}")
        finally:
            # No-op after a successful os.replace; cleans up after any failure.
            if os.path.exists(tmp_path):
                try:
                    os.remove(tmp_path)
                except OSError:
                    pass
        return ""

    def extract_text(self, path):
        """Return the text of the document at *path*.

        The embedded text layer is used when present; otherwise (or when the
        document cannot be read directly) the pages are OCR'd via ``run_ocr``.
        """
        try:
            # First try direct text extraction
            with fitz.open(path) as doc:
                pages = [pg.get_text() for pg in doc]
            full = "\n".join(t for t in pages if t.strip())
            if full.strip():
                return full
            self.log("🔍 No text found, using OCR")
        except Exception as e:
            self.log(f"Text extraction error: {e}")
        return self.run_ocr(path)

    def run_ocr(self, path):
        """Render every page of *path* to an image and OCR it.

        Returns the concatenated page texts (possibly partial or empty when
        OCR fails part-way through).
        """
        chunks = []
        try:
            with fitz.open(path) as doc:
                for pg in doc:
                    pix = pg.get_pixmap(dpi=200)
                    with tempfile.NamedTemporaryFile(suffix=".png") as tmp:
                        pix.save(tmp.name)
                        chunks.append(self._ocr_image(tmp.name))
        except Exception as e:
            self.log(f"OCR failed: {e}")
        return "\n".join(chunks).strip()

    def _ocr_image(self, image_path):
        """OCR one page image: PaddleOCR first, then Tesseract 'san', then 'eng'."""
        # Try PaddleOCR (English)
        if paddle_ocr:
            try:
                result = paddle_ocr.ocr(image_path, cls=True)
                if result and result[0]:
                    return "\n".join(line[1][0] for line in result[0])
            except Exception as e:
                self.log(f"PaddleOCR error, falling back to Tesseract: {e}")

        # Fallback to Tesseract (Sanskrit)
        with Image.open(image_path) as img:
            try:
                return pytesseract.image_to_string(img, lang='san')
            except Exception:
                # Final fallback to English
                return pytesseract.image_to_string(img, lang='eng')

    @staticmethod
    def _pub_year(info):
        """Return the 4-digit year from a PDF ``creationDate`` ("D:YYYY..."), or ""."""
        # The metadata dict itself and any of its values may be None.
        raw = (info or {}).get("creationDate") or ""
        if not raw.startswith("D:"):
            return ""
        year = raw[2:6]
        return year if len(year) == 4 and year.isdigit() else ""

    def extract_meta(self, path, url):
        """Build the JSON-serialisable record for the document at *path*.

        The record combines the PDF metadata (when readable), identifiers
        derived from *url*, a checksum of the file and its extracted text.
        """
        doc_id = hashlib.sha256(url.encode()).hexdigest()[:10]
        base = os.path.basename(path)
        try:
            with fitz.open(path) as doc:
                # metadata can be None (e.g. for a locked document).
                info = doc.metadata or {}
        except Exception:
            info = {}
        author = info.get("author")
        return {
            "site": urlparse(url).netloc,
            "document_id": doc_id,
            "title": info.get("title") or base,
            "authors": [author] if author else [],
            "pub_year": self._pub_year(info),
            "language": "Sanskrit",
            "download_url": url,
            "scraped_at": datetime.utcnow().isoformat() + "Z",
            "checksum": self.sha256(path),
            "content": self.extract_text(path)
        }

    def sha256(self, path):
        """Return the hex SHA-256 digest of the file at *path*, read in chunks."""
        h = hashlib.sha256()
        with open(path, 'rb') as f:
            for chunk in iter(lambda: f.read(65536), b""):
                h.update(chunk)
        return h.hexdigest()

    async def process_doc(self, url):
        """Download *url*, extract its record and save it as JSON in ``OUTPUT_DIR``.

        NOTE(review): extraction/OCR is synchronous and blocks the event loop
        while it runs.
        """
        path = await self.download(url)
        if not path:
            return
        meta = self.extract_meta(path, url)
        out_path = os.path.join(OUTPUT_DIR, f"{meta['document_id']}.json")
        with open(out_path, 'w', encoding='utf-8') as f:
            json.dump(meta, f, indent=2, ensure_ascii=False)
        self.log(f"📝 Saved: {out_path}")

    async def run(self):
        """Crawl every entry of ``BASE_URLS``, then process each document found."""
        async with aiohttp.ClientSession() as s:
            self.session = s
            self.log("🚀 Starting crawl...")
            crawl_results = await asyncio.gather(
                *(self.crawl(url, depth=2) for url in BASE_URLS),
                return_exceptions=True,
            )
            for url, res in zip(BASE_URLS, crawl_results):
                if isinstance(res, BaseException):
                    self.log(f"❌ Crawl failed: {url} -> {res!r}")
            self.log(f"📚 Total found: {len(self.found_docs)}")
            # Collect failures per document so one bad file does not abort the batch.
            docs = sorted(self.found_docs)
            doc_results = await asyncio.gather(
                *(self.process_doc(url) for url in docs),
                return_exceptions=True,
            )
            for url, res in zip(docs, doc_results):
                if isinstance(res, BaseException):
                    self.log(f"❌ Processing failed: {url} -> {res!r}")
            self.log("✅ Done.")

# Run it
# Top-level `await` only works where an event loop is already running (e.g. a
# notebook cell); in a plain script this would need asyncio.run(...) instead.
await SanskritCrawler().run()

[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/78.2 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m78.2/78.2 kB[0m [31m5.7 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m48.2/48.2 kB[0m [31m3.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.1/24.1 MB[0m [31m35.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m70.4/70.4 kB[0m [31m4.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.7/1.7 MB[0m [31m27.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m68.7/68.7 MB[0m [31m8.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.5/2.5 MB[0m [31m68.3 MB/s[0m eta [36m0:00:00[0

  paddle_ocr = PaddleOCR(use_angle_cls=True, lang='en', show_log=False)  # English


[06:06:46] 🚀 Starting crawl...
[06:06:51] 📄 Found: http://ayushportal.nic.in/pdf/Compendium_of_select_research_publications_on_the_Ayush_Interventions_for_COVID-19.pdf
[06:06:51] 📄 Found: http://ayushportal.nic.in/pdf/Research_and_Development_Initiatives_of_Ministry_of_Ayush_for_COVID-19.pdf
[06:06:51] 📄 Found: http://ayushportal.nic.in/pdf/China's_Policy_Initiatives_fo_National_and_Global_Promotion_of_TCM.pdf
[06:06:51] 📄 Found: http://ayushportal.nic.in/pdf/TKM_Study_Report.pdf
[06:06:51] 📄 Found: http://ayushportal.nic.in/pdf/Final_Dossier_03.01.2022_with_cover.pdf
[06:06:51] 📄 Found: http://ayushportal.nic.in/pdf/Concept_Note_with_cover_pages.pdf
[06:06:51] 📄 Found: http://ayushportal.nic.in/pdf/Ayurveda_and_Conventional_Medicine.pdf
[06:06:51] 📄 Found: http://ayushportal.nic.in/pdf/Document_Ayurveda.pdf
[06:06:51] 📄 Found: http://ayushportal.nic.in/pdf/Document_Nutritional.pdf
[06:06:51] 📄 Found: http://ccras.nic.in/sites/default/files/Notices/03112021_CCRAS_Post_Doctoral_Fellowsh