## Data Harvesting & Structuring Assignment

#### Task-1: Crawl and Download

In [2]:
! pip install requests beautifulsoup4 urllib3 tqdm

Defaulting to user installation because normal site-packages is not writeable
Collecting requests
  Downloading requests-2.32.4-py3-none-any.whl.metadata (4.9 kB)
Collecting beautifulsoup4
  Downloading beautifulsoup4-4.13.4-py3-none-any.whl.metadata (3.8 kB)
Collecting urllib3
  Downloading urllib3-2.5.0-py3-none-any.whl.metadata (6.5 kB)
Collecting tqdm
  Downloading tqdm-4.67.1-py3-none-any.whl.metadata (57 kB)
Collecting charset_normalizer<4,>=2 (from requests)
  Downloading charset_normalizer-3.4.2-cp313-cp313-win_amd64.whl.metadata (36 kB)
Collecting idna<4,>=2.5 (from requests)
  Downloading idna-3.10-py3-none-any.whl.metadata (10 kB)
Collecting certifi>=2017.4.17 (from requests)
  Downloading certifi-2025.7.9-py3-none-any.whl.metadata (2.4 kB)
Collecting soupsieve>1.2 (from beautifulsoup4)
  Downloading soupsieve-2.7-py3-none-any.whl.metadata (4.6 kB)
Collecting typing-extensions>=4.0.0 (from beautifulsoup4)
  Downloading typing_extensions-4.14.1-py3-none-any.whl.metadata (3.0 

In [28]:
target_urls = [
        "https://sanskritdocuments.org/scannedbooks/asisanskritpdfs.html",
        "https://sanskritdocuments.org/scannedbooks/asiallpdfs.html",
        "https://indianculture.gov.in/ebooks",
        "https://ignca.gov.in/divisionss/asi-books/",
        "https://archive.org/details/TFIC_ASI_Books/ACatalogueOfTheSamskritManuscriptsInTheAdyarLibraryPt.1/",
        "https://indianmanuscripts.com/",
        "https://niimh.nic.in/ebooks/ayuhandbook/index.php"
    ]

In [29]:
import os
import time
import random
import requests
from urllib.parse import urljoin, urlparse
from bs4 import BeautifulSoup
from urllib.robotparser import RobotFileParser
from tqdm import tqdm
#import webbrowser

In [51]:
# File formats to download
ALLOWED_EXTENSIONS = [".pdf", ".epub", ".html", ".htm"]

# Set headers with random project website
HEADERS = {
    "User-Agent": "Mozilla/5.0 (compatible; ASIbot/1.0; +https://example.org/asi-crawler)" 
}

# Ensure base download folder exists
DOWNLOAD_FOLDER = "Downloads_Docs"
# Track PDF download count per domain
download_count_per_domain = {}
os.makedirs(DOWNLOAD_FOLDER, exist_ok=True)

In [31]:
# Utility: Get domain's robots.txt rules
def can_fetch(url):
    parsed_url = urlparse(url)
    base_url = f"{parsed_url.scheme}://{parsed_url.netloc}"
    robots_url = urljoin(base_url, "/robots.txt")

    rp = RobotFileParser()
    try:
        rp.set_url(robots_url)
        rp.read()
        return rp.can_fetch(HEADERS["User-Agent"], url)
    except:
        return False  # Default to no access if error in robots.txt

In [32]:
# Utility: Save file
def save_file(url, output_path):
    try:
        with requests.get(url, headers=HEADERS, stream=True, timeout=20) as r:
            r.raise_for_status()
            with open(output_path, 'wb') as f:
                for chunk in r.iter_content(chunk_size=8192):
                    if chunk:
                        f.write(chunk)
    except Exception as e:
        print(f"[!] Failed to download {url}: {str(e)}")

In [61]:
# Recursive crawler and downloader
def crawl(url, visited, depth=0):
    if url in visited or depth > 5:
        return
    visited.add(url)

    if not can_fetch(url):
        print(f"[robots.txt] Skipping disallowed URL: {url}")
        return

    time.sleep(random.uniform(1, 2))  # Delay to respect server

    try:
        print(f"[+] Crawling: {url}")
        response = requests.get(url, headers=HEADERS, timeout=20)
        response.raise_for_status()
        soup = BeautifulSoup(response.text, 'html.parser')

        for link in soup.find_all('a', href=True):
            href = link['href']
            full_url = urljoin(url, href)
            if any(full_url.lower().endswith(ext) for ext in ALLOWED_EXTENSIONS):
                filename = os.path.basename(urlparse(full_url).path)
                file_dir = os.path.join(DOWNLOAD_FOLDER, urlparse(url).netloc.replace('.', '_'))
                os.makedirs(file_dir, exist_ok=True)
                filepath = os.path.join(file_dir, filename)

                if not os.path.exists(filepath):
                    print(f"[↓] Downloading: {full_url}")
                    save_file(full_url, filepath)
            elif urlparse(full_url).netloc == urlparse(url).netloc:
                crawl(full_url, visited, depth + 1)

    except Exception as e:
        print(f"[!] Error crawling {url}: {str(e)}")


In [None]:
#Takes taget url list and visited nodes as input and webscraped files from the given link
visited = set()
for url in target_urls:
    crawl(url, visited)

#### Task-2: Document Processing and JSON Creation

In [None]:
! pip install python-docx pypdf tika pytesseract pdf2image beautifulsoup4 lxml

Defaulting to user installation because normal site-packages is not writeable
Collecting python-docx
  Downloading python_docx-1.2.0-py3-none-any.whl.metadata (2.0 kB)
Collecting pypdf
  Downloading pypdf-5.8.0-py3-none-any.whl.metadata (7.1 kB)
Collecting tika
  Downloading tika-3.1.0-py3-none-any.whl.metadata (15 kB)
Collecting pytesseract
  Downloading pytesseract-0.3.13-py3-none-any.whl.metadata (11 kB)
Collecting pdf2image
  Downloading pdf2image-1.17.0-py3-none-any.whl.metadata (6.2 kB)
Collecting lxml
  Downloading lxml-6.0.0-cp313-cp313-win_amd64.whl.metadata (6.8 kB)
Collecting setuptools (from tika)
  Downloading setuptools-80.9.0-py3-none-any.whl.metadata (6.6 kB)
Collecting Pillow>=8.0.0 (from pytesseract)
  Downloading pillow-11.3.0-cp313-cp313-win_amd64.whl.metadata (9.2 kB)
Downloading python_docx-1.2.0-py3-none-any.whl (252 kB)
Downloading pypdf-5.8.0-py3-none-any.whl (309 kB)
Downloading tika-3.1.0-py3-none-any.whl (38 kB)
Downloading pytesseract-0.3.13-py3-none-any.wh

Sudo is disabled on this machine. To enable it, go to the ]8;;ms-settings:developers\Developer Settings page]8;;\ in the Settings app


In [28]:
! pip install PyPDF2

Defaulting to user installation because normal site-packages is not writeable
Collecting PyPDF2
  Downloading pypdf2-3.0.1-py3-none-any.whl.metadata (6.8 kB)
Downloading pypdf2-3.0.1-py3-none-any.whl (232 kB)
Installing collected packages: PyPDF2
Successfully installed PyPDF2-3.0.1


In [3]:
import os #Used to run through downloaded files
import hashlib #Used to generate unique HASH code for documents
import pytesseract #Used to extract text from document
from datetime import datetime
from tika import parser
from pdf2image import convert_from_path
from PyPDF2 import PdfReader
import json
from bs4 import BeautifulSoup

  __import__('pkg_resources').declare_namespace(__name__)


In [4]:
DOWNLOAD_FOLDER = "Downloads"
OUTPUT_JSON = "Metadata.json"
PROCESSED_HASHES = set()
#Set Tesseract in system PATH 
pytesseract.pytesseract.tesseract_cmd = r"C:\Program Files\Tesseract-OCR\tesseract.exe"

In [5]:
def compute_sha256(file_path):
    hash_sha = hashlib.sha256()
    with open(file_path, "rb") as f:
        for chunk in iter(lambda: f.read(4096), b""):
            hash_sha.update(chunk)
    return hash_sha.hexdigest()

In [6]:
def extract_text(file_path):
    try:
        parsed = parser.from_file(file_path)
        return parsed.get("content", "").strip()
    except:
        return ""

In [7]:
def ocr_pdf(file_path):
    try:
        #Set poppler path in system variables 
        images = convert_from_path(file_path, poppler_path=r"C:\poppler-24.08.0\Library\bin")
        config = '-l eng+hin+san'#To handle Hindi,English and Sanskrit Documents
        text = ""
        for img in images:
            text += pytesseract.image_to_string(img,config=config)
        return text
    except:
        return ""

In [8]:
def extract_pdf_metadata(file_path):
    try:
        reader = PdfReader(file_path)
        meta = reader.metadata or {}
        return {
            "title": meta.title or "",
            "authors": [meta.author] if meta.author else [],
            "pub_year": meta.get('/CreationDate', '')[2:6] if '/CreationDate' in meta else "",
            "language": meta.get('/Lang', '')
        }
    except:
        return {}

In [9]:
def construct_document_id(file_path, checksum):
    filename = os.path.basename(file_path)
    #Using checksum to create document ID to avoid duplicate names
    return f"{filename.split('.')[0]}_{checksum[:8]}"

def get_site_name(file_path):
    return os.path.basename(os.path.dirname(file_path)).replace('_', '.')


In [24]:
def process_file(file_path):
    checksum = compute_sha256(file_path)
    if checksum in PROCESSED_HASHES:
        return None  # Skip already processed file
    PROCESSED_HASHES.add(checksum)

    ext = os.path.splitext(file_path)[-1].lower()
    if ext == ".pdf":
        metadata = extract_pdf_metadata(file_path)
        content = extract_text(file_path)
        print(content)
        if not content.strip():
            content = ocr_pdf(file_path)
    else:
        metadata = {}
        content = extract_text(file_path)

    doc_id = construct_document_id(file_path, checksum)
    record = {
        "filename":file_path.split('/')[-1],
        "site": get_site_name(file_path),
        "document_id": doc_id,
        "title": metadata.get("title", ""),
        "authors": metadata.get("authors", []),
        "pub_year": metadata.get("pub_year", ""),
        "language": metadata.get("language", ""),
        "download_url": "",  # Add if stored
        "checksum": checksum,
        "scraped_at": datetime.utcnow().isoformat() + "Z",
        "content": content
    }

    return record

In [25]:
def walk_downloads():
    all_records = []
    for root, _, files in os.walk(DOWNLOAD_FOLDER):
        if(files):
            for file in range(5):
                if files[file].lower().endswith((".pdf", ".epub", ".htm", ".html")):
                    full_path = os.path.join(root, files[file])
                    record = process_file(full_path)
                    if record:
                        all_records.append(record)
    return all_records

In [26]:
metadata = walk_downloads()
print(metadata)
with open(OUTPUT_JSON, "w", encoding="utf-8") as f:
    json.dump(metadata, f, indent=2, ensure_ascii=False)
print(f"[✓] Processed {len(metadata)} documents. Output written to {OUTPUT_JSON}")

[]
[✓] Processed 0 documents. Output written to Metadata.json


#### Test

In [12]:
#   Checking the content extraction of english doc
file_path=r"Downloads\sanskritdocuments_org\17772.pdf"
#Set poppler path in system variables 
images = convert_from_path(file_path, poppler_path=r"C:\poppler-24.08.0\Library\bin")
print(images)
text = ""
for img in images:
    text += pytesseract.image_to_string(img)
text
    

[<PIL.PpmImagePlugin.PpmImageFile image mode=RGB size=1067x1655 at 0x2223C4EE270>, <PIL.PpmImagePlugin.PpmImageFile image mode=RGB size=1113x1651 at 0x2223C4ECAE0>, <PIL.PpmImagePlugin.PpmImageFile image mode=RGB size=1038x1591 at 0x2223C4ECBB0>, <PIL.PpmImagePlugin.PpmImageFile image mode=RGB size=1094x1591 at 0x2223C4ECC80>, <PIL.PpmImagePlugin.PpmImageFile image mode=RGB size=992x1591 at 0x2223C4ECD50>, <PIL.PpmImagePlugin.PpmImageFile image mode=RGB size=1094x1591 at 0x2223C4ECE20>, <PIL.PpmImagePlugin.PpmImageFile image mode=RGB size=992x1591 at 0x2223C4ECEF0>, <PIL.PpmImagePlugin.PpmImageFile image mode=RGB size=1094x1591 at 0x2223C4ECFC0>, <PIL.PpmImagePlugin.PpmImageFile image mode=RGB size=992x1591 at 0x2223C4ED090>, <PIL.PpmImagePlugin.PpmImageFile image mode=RGB size=1094x1591 at 0x2223C4ED160>, <PIL.PpmImagePlugin.PpmImageFile image mode=RGB size=992x1591 at 0x2223C4ED230>, <PIL.PpmImagePlugin.PpmImageFile image mode=RGB size=1094x1586 at 0x2223C4ED300>, <PIL.PpmImagePlugin

'GOVERNMENT OF INDIA\nDEPARTMENT OF ARCHAEOLOGY\n\n| CENTRAL ARCHAEOLOGICAL\n| LIBRARY\n\nee\n\ni\n\nCay no. GOI al\n\nD.G.A. 79.\nTHE COMING WORLD CIVILIZATION\nTHE COMING WORLD CIVILIZATION\nCopyright © 1956 by William Ernest Hocking\nPrinted in the United States of America\nAll rights in this book are reserved.\n\nNo part of the book may be used or reproduced\nin any manner whateoever without written per-\nmission except in the case of brief quotations\nembodied in critical articles and reviews, For\ninformation address Harper ¢ Brothers\n49 Bast 33rd Street, New York iG, N. ¥.\n\nFIRST EDITION\ner\nLibrary of Congress catalog card number: 56-10210\n\nTo\n\nJOHN WAUGH SCOTT\nof Glasgow and Cardiff\nwho has seen from afar\n\nand has aided\nthe arriving civitas mundi\n\nUT JAM UNIVERSUS HIC MUNDUS\nUNA CIVITAS SIT\nCOMMUNIS DEORUM ATQUE HOMINUM\nEXISTIMANDA\n\nCicero, De Legibua, 1, 7\nCONTENTS\n\nEnvoi\nstupy I. THE IMPOTENCE OF THE STATE\n\ni. The State as Developer of Human Nature\

In [24]:
text

'be tetas at * aR\n\nif\n\n~\n« < e\n” < ‘ . 4\n‘ 4 ‘i b =\n~\ni :\nTs ye sie > op\n“~ ; F r ~ % ‘\n: a ail K a>\n’ i F és\ny * a\nLs\nZ ; .\nSrimad Bhagavad Gita\n\n* Sanskrit Text\n* Hindi Translation and\n* English Translation\n\nSS ee ee ————E———————EeEeE—EeEe——————eEeEE\n\n(eee, ferdt wd sa)\n\nSrimad Bhagavad Gita\n(Sanskrit, Hindi & English)\n\nwFtAto,\nLPR ang\n\nEAS jf\n\nUy $a2G,\n\n©Star Publication 1993\n\nISBN 81-7 144-033-9\n\nPublishers :\n\nSTAR PUBLICATIONS PVT. LTD.\n4/5, Asaf Ali Road,\nNEW DELHI-110 002\n\nPrice : Rs. 350.00 (in India)\n$ 39.95 (abroad)\n\nsole distributors for India :\nHINDI BOOK CENTRE\nAsaf Ali Road,\nNEW DELHI-110 002\n\nPhoto-Typesetting by PULLSHOPPE, NEW DELHI-110 002 Ph. : 730502\nand printed at PRINT ART. Naraina-!, New Delhi\n\nS le) s\n\nZZ WM\n\n“lp.\n\nS\n\n“is\n\nSS\n\n| nage I\n\nby\n\nne:\n\nOW AD \\Lar_ Bogle R3)) Nin 17:6 oy Dts |stael\n\nZz\n\ncefla 350\n\n“Sresmad tar” ered & As od ar ait F | het oe F at star at\nad grerael Her 1

In [22]:
reader = PdfReader(file_path)
meta = reader.metadata or {}
op_dict={
            "title": meta.title or "",
            "authors": [meta.author] if meta.author else [],
            "pub_year": meta.get('/CreationDate', '')[2:6] if '/CreationDate' in meta else "",
            "language": meta.get('/Lang', '')
        }
op_dict

{'title': '', 'authors': [], 'pub_year': '2010', 'language': ''}

In [23]:
meta

{'/CreationDate': "D:20100405165310+05'30'",
 '/Creator': 'Adobe Acrobat 9.0',
 '/ModDate': "D:20100408123500-07'00'",
 '/Producer': 'Adobe Acrobat 9.0 Image Conversion Plug-in'}

In [16]:
for root, _, files in os.walk(DOWNLOAD_FOLDER):
    print(files)

[]
['10001.pdf', '10011.pdf', '10013.pdf', '10017.pdf', '10021.pdf', '10023.pdf', '10029.pdf', '10049.pdf', '10051.pdf', '10052.pdf', '10064.pdf', '10069.pdf', '10072.pdf', '10079.pdf', '10082.pdf', '10091.pdf', '10093.pdf', '10107.pdf', '1012.pdf', '10136.pdf', '10153.pdf', '10173.pdf', '10192.pdf', '1020.pdf', '10209.pdf', '10211.pdf', '10227.pdf', '10316.pdf', '10360.pdf', '10373.pdf', '10418.pdf', '10422.pdf', '10455.pdf', '10530.pdf', '10549.pdf', '10571.pdf', '10588.pdf', '10592.pdf', '10594.pdf', '10599.pdf', '10601.pdf', '10607.pdf', '10612.pdf', '10617.pdf', '10620.pdf', '10626.pdf', '10643.pdf', '10645.pdf', '10646.pdf', '10659.pdf', '10692.pdf', '10708.pdf', '10749.pdf', '10750.pdf', '10761.pdf', '10774.pdf', '10775.pdf', '10784.pdf', '10788.pdf', '10801.pdf', '10802.pdf', '10803.pdf', '10804.pdf', '10805.pdf', '10809.pdf', '10810.pdf', '10812.pdf', '10813.pdf', '10814.pdf', '10815.pdf', '10816.pdf', '10817.pdf', '10826.pdf', '10830.pdf', '10831.pdf', '10839.pdf', '10840.pdf