In [8]:
import os
import time
#import fitz 
import requests
import pandas as pd
from bs4 import BeautifulSoup
import html2text
from tqdm import tqdm

In [9]:
# ====== SETTINGS ======
EXCEL_FILE = r"E:\Income Tax Fine-tuning\Web Scraping URL's.xlsx"
URL_COLUMN = "Source"
SAVE_HTML_DIR = "scraped_html"
SAVE_PDF_DIR = "scraped_pdfs"
COMBINED_OUTPUT = "indian_tax_code_combined.txt"
HEADERS = {"User-Agent": "Mozilla/5.0"}

In [10]:
# ====== SETUP ======
os.makedirs(SAVE_HTML_DIR, exist_ok=True)
os.makedirs(SAVE_PDF_DIR, exist_ok=True)

In [20]:
# ====== UTILITY FUNCTIONS ======

def normalize_url(url):
    url = str(url).strip()
    if url.lower().startswith("wikipedia"):
        return "https://en.wikipedia.org/wiki/Income_tax_in_India"
    if url and not url.startswith("http"):
        if "." in url:
            return "https://" + url
    return url if url.startswith("http") else ""

def load_urls_from_excel(file_path, url_column="URL"):
    df = pd.read_excel(file_path)
    urls = (
        df[url_column]
        .astype(str)
        .apply(lambda x: x.strip())
    )
    # Only keep rows where the URL string length is reasonable
    urls = urls[urls.str.len() > 5]
    urls = urls.apply(normalize_url)
    urls = [u for u in urls if u.startswith("http")]
    print(f"[INFO] Loaded {len(urls)} valid URLs after cleaning")
    return urls


def get_html(url):
    try:
        res = requests.get(url, headers=HEADERS, timeout=15)
        res.raise_for_status()
        return res.text
    except Exception as e:
        print(f"[HTML ERROR] {url} -> {e}")
        return ""

def clean_html_to_text(html):
    soup = BeautifulSoup(html, "html.parser")
    for tag in soup(["script", "style", "header", "footer", "nav"]):
        tag.decompose()
    text = html2text.html2text(soup.get_text())
    return '\n'.join([line.strip() for line in text.splitlines() if line.strip()])

def extract_text_from_pdf(url, save_path):
    try:
        r = requests.get(url, headers=HEADERS)
        with open(save_path, "wb") as f:
            f.write(r.content)
        text = ""
        with fitz.open(save_path) as doc:
            for page in doc:
                text += page.get_text()
        return text.strip()
    except Exception as e:
        print(f"[PDF ERROR] {url} -> {e}")
        return ""

In [21]:
# ====== SCRAPER MAIN ======

def run_scraper_from_excel(excel_file):
    urls = load_urls_from_excel(excel_file, url_column=URL_COLUMN)
    print(f"[INFO] Loaded {len(urls)} valid URLs from {excel_file}")

    all_texts = []
    failed_urls = []

    for i, url in enumerate(tqdm(urls, desc="Scraping")):
        if not url.startswith("http"):
            print(f"[SKIPPED] Invalid URL format: {url}")
            failed_urls.append((url, "Invalid format"))
            continue

        filename_prefix = f"doc_{i:03}"
        text = ""

        try:
            if url.lower().endswith(".pdf"):
                save_path = os.path.join(SAVE_PDF_DIR, filename_prefix + ".pdf")
                text = extract_text_from_pdf(url, save_path)
            else:
                html = get_html(url)
                if not html:
                    failed_urls.append((url, "Empty HTML response"))
                    continue
                text = clean_html_to_text(html)
                save_path = os.path.join(SAVE_HTML_DIR, filename_prefix + ".txt")
                with open(save_path, "w", encoding="utf-8") as f:
                    f.write(text)

            # Save with metadata
            if len(text.strip()) > 100:
                all_texts.append(f"[SOURCE: {url}]\n{text.strip()}")
            else:
                failed_urls.append((url, "Too little text"))

        except Exception as e:
            print(f"[ERROR] {url} -> {e}")
            failed_urls.append((url, str(e)))

        # Smart sleep
        if any(d in url for d in ["cleartax", "taxmann"]):
            time.sleep(2.5)
        else:
            time.sleep(1.0)

    # Combine all into one file
    with open(COMBINED_OUTPUT, "w", encoding="utf-8") as f:
        f.write("\n\n".join(all_texts))

    print(f"[✅ DONE] Combined output saved to: {COMBINED_OUTPUT}")

    # Save failed URL log
    if failed_urls:
        log_file = "scraping_failed_urls.csv"
        pd.DataFrame(failed_urls, columns=["URL", "Reason"]).to_csv(log_file, index=False)
        print(f"[⚠️ LOG] Failed or skipped URLs logged in: {log_file}")


In [22]:
# ====== RUN ======
run_scraper_from_excel(EXCEL_FILE)

[INFO] Loaded 11 valid URLs after cleaning
[INFO] Loaded 11 valid URLs from E:\Income Tax Fine-tuning\Web Scraping URL's.xlsx


Scraping:   9%|▉         | 1/11 [00:00<00:02,  3.98it/s]

[HTML ERROR] https://incometaxindia.gov.in -> 503 Server Error: Service Temporarily Unavailable for url: https://incometaxindia.gov.in/


Scraping:  27%|██▋       | 3/11 [00:03<00:09,  1.14s/it]

[HTML ERROR] https://egazette.nic.in -> HTTPSConnectionPool(host='egazette.nic.in', port=443): Max retries exceeded with url: / (Caused by NameResolutionError("<urllib3.connection.HTTPSConnection object at 0x000001919844F8E0>: Failed to resolve 'egazette.nic.in' ([Errno 11001] getaddrinfo failed)"))


Scraping:  91%|█████████ | 10/11 [00:14<00:01,  1.49s/it]

[HTML ERROR] https://vakilno1.com -> 410 Client Error: Gone for url: https://vakilno1.com/


Scraping: 100%|██████████| 11/11 [00:15<00:00,  1.45s/it]

[HTML ERROR] https://legalserviceindia.com -> 406 Client Error: Not Acceptable for url: https://legalserviceindia.com/
[✅ DONE] Combined output saved to: indian_tax_code_combined.txt
[⚠️ LOG] Failed or skipped URLs logged in: scraping_failed_urls.csv





# Special case URL's

In [18]:
import requests
from bs4 import BeautifulSoup
import time
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException, WebDriverException
from PyPDF2 import PdfReader
import os
import socket
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry
from urllib.parse import urlparse
import logging


In [19]:
# Set up logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)

# ========== NETWORK CONNECTIVITY CHECK ==========
def check_internet_connection():
    """Check if internet connection is available"""
    try:
        socket.create_connection(("8.8.8.8", 53), timeout=5)
        return True
    except OSError:
        return False

def check_domain_resolution(domain):
    """Check if a specific domain can be resolved"""
    try:
        socket.gethostbyname(domain)
        return True
    except socket.gaierror:
        return False

In [20]:
# ========== PDF TEXT EXTRACTION ==========
def extract_text(pdf_path):
    """Extract text from PDF file"""
    text = ""
    try:
        if not os.path.exists(pdf_path):
            logger.error(f"PDF file not found: {pdf_path}")
            return ""
        
        reader = PdfReader(pdf_path)
        for page_num, page in enumerate(reader.pages):
            try:
                page_text = page.extract_text() or ""
                text += page_text
            except Exception as e:
                logger.warning(f"Error extracting text from page {page_num}: {e}")
                continue
        return text
    except Exception as e:
        logger.error(f"PDF extraction error for {pdf_path}: {e}")
        return ""


In [21]:
# ========== ROBUST HTTP SESSION ==========
def create_robust_session():
    """Create a robust HTTP session with retries and timeouts"""
    session = requests.Session()
    
    # Configure retries
    retry_strategy = Retry(
        total=3,
        status_forcelist=[429, 500, 502, 503, 504],
        method_whitelist=["HEAD", "GET", "OPTIONS"],
        backoff_factor=2
    )
    
    adapter = HTTPAdapter(max_retries=retry_strategy)
    session.mount("http://", adapter)
    session.mount("https://", adapter)
    
    # Set headers
    session.headers.update({
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
        'Accept-Language': 'en-US,en;q=0.5',
        'Accept-Encoding': 'gzip, deflate',
        'Connection': 'keep-alive',
        'Upgrade-Insecure-Requests': '1',
    })
    
    return session

In [22]:
# ========== SELENIUM SETUP ==========
def create_selenium_driver():
    """Create a robust Selenium WebDriver"""
    options = Options()
    options.add_argument("--headless")
    options.add_argument("--no-sandbox")
    options.add_argument("--disable-dev-shm-usage")
    options.add_argument("--disable-gpu")
    options.add_argument("--window-size=1920,1080")
    options.add_argument("--user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36")
    
    try:
        driver = webdriver.Chrome(options=options)
        driver.set_page_load_timeout(30)
        return driver
    except Exception as e:
        logger.error(f"Failed to create Chrome driver: {e}")
        return None


In [23]:
# ========== SCRAPE incometaxindia.gov.in ==========
def scrape_incometaxindia_with_selenium():
    """Scrape Income Tax India website using Selenium"""
    logger.info("Starting scrape of incometaxindia.gov.in")
    
    # Check domain resolution first
    if not check_domain_resolution("incometaxindia.gov.in"):
        logger.error("Cannot resolve incometaxindia.gov.in")
        return ""
    
    driver = create_selenium_driver()
    if not driver:
        return ""
    
    try:
        url = "https://incometaxindia.gov.in"
        driver.get(url)
        
        # Wait for page to load
        WebDriverWait(driver, 10).until(
            EC.presence_of_element_located((By.TAG_NAME, "body"))
        )
        
        html = driver.page_source
        soup = BeautifulSoup(html, "html.parser")
        
        # Remove script and style elements
        for script in soup(["script", "style"]):
            script.decompose()
        
        text = soup.get_text(separator="\n")
        
        # Clean up text
        lines = (line.strip() for line in text.splitlines())
        text = '\n'.join(line for line in lines if line)
        
        with open("scraped_incometaxindia.txt", "w", encoding="utf-8") as f:
            f.write(text)
        
        logger.info("Successfully scraped incometaxindia.gov.in")
        return text
        
    except TimeoutException:
        logger.error("Timeout while loading incometaxindia.gov.in")
        return ""
    except WebDriverException as e:
        logger.error(f"WebDriver error for incometaxindia.gov.in: {e}")
        return ""
    except Exception as e:
        logger.error(f"Unexpected error scraping incometaxindia.gov.in: {e}")
        return ""
    finally:
        if driver:
            driver.quit()

# ========== SCRAPE egazette.nic.in (PDF Example) ==========
def download_sample_egazette_pdf():
    """Download and extract text from egazette PDF"""
    logger.info("Starting egazette.nic.in PDF download")
    
    # Check domain resolution first
    if not check_domain_resolution("egazette.nic.in"):
        logger.error("Cannot resolve egazette.nic.in")
        return ""
    
    session = create_robust_session()
    pdf_url = "https://egazette.nic.in/WriteReadData/2023/248929.pdf"
    
    try:
        response = session.get(pdf_url, timeout=30)
        response.raise_for_status()
        
        pdf_filename = "egazette_sample.pdf"
        with open(pdf_filename, "wb") as f:
            f.write(response.content)
        
        logger.info(f"Downloaded PDF: {pdf_filename}")
        
        text = extract_text(pdf_filename)
        if text:
            with open("scraped_egazette.txt", "w", encoding="utf-8") as f:
                f.write(text)
            logger.info("Successfully extracted text from egazette PDF")
        else:
            logger.warning("No text extracted from egazette PDF")
        
        return text
        
    except requests.exceptions.RequestException as e:
        logger.error(f"Network error downloading egazette PDF: {e}")
        return ""
    except Exception as e:
        logger.error(f"Unexpected error with egazette PDF: {e}")
        return ""

# ========== SCRAPE taxmann.com ==========
def scrape_taxmann_sample():
    """Scrape Taxmann website"""
    logger.info("Starting scrape of taxmann.com")
    
    # Check domain resolution first
    if not check_domain_resolution("taxmann.com"):
        logger.error("Cannot resolve taxmann.com")
        return ""
    
    driver = create_selenium_driver()
    if not driver:
        return ""
    
    try:
        # Try to access a more general page first
        url = "https://www.taxmann.com"
        driver.get(url)
        
        WebDriverWait(driver, 10).until(
            EC.presence_of_element_located((By.TAG_NAME, "body"))
        )
        
        html = driver.page_source
        soup = BeautifulSoup(html, "html.parser")
        
        # Remove unwanted elements
        for element in soup(["script", "style", "nav", "footer", "header"]):
            element.decompose()
        
        # Try to find main content
        content = (soup.find("main") or 
                  soup.find("div", class_="content") or 
                  soup.find("div", id="content") or
                  soup.find("body"))
        
        text = content.get_text(separator="\n") if content else ""
        
        # Clean up text
        lines = (line.strip() for line in text.splitlines())
        text = '\n'.join(line for line in lines if line)
        
        with open("scraped_taxmann.txt", "w", encoding="utf-8") as f:
            f.write(text)
        
        logger.info("Successfully scraped taxmann.com")
        return text
        
    except Exception as e:
        logger.error(f"Error scraping taxmann.com: {e}")
        return ""
    finally:
        if driver:
            driver.quit()


In [24]:
# ========== ROBUST SCRAPING WITH REQUESTS ==========
def robust_scrape_with_requests(url, output_file):
    """Robust scraping using requests with comprehensive error handling"""
    logger.info(f"Starting scrape of {url}")
    
    # Parse URL to get domain
    parsed_url = urlparse(url)
    domain = parsed_url.netloc
    
    # Check domain resolution
    if not check_domain_resolution(domain):
        logger.error(f"Cannot resolve domain: {domain}")
        return ""
    
    session = create_robust_session()
    
    try:
        response = session.get(url, timeout=30)
        response.raise_for_status()
        
        soup = BeautifulSoup(response.text, "html.parser")
        
        # Remove unwanted elements
        for element in soup(["script", "style", "nav", "footer", "header"]):
            element.decompose()
        
        text = soup.get_text(separator="\n")
        
        # Clean up text
        lines = (line.strip() for line in text.splitlines())
        text = '\n'.join(line for line in lines if line)
        
        with open(output_file, "w", encoding="utf-8") as f:
            f.write(text)
        
        logger.info(f"Successfully scraped {url}")
        return text
        
    except requests.exceptions.RequestException as e:
        logger.error(f"Network error scraping {url}: {e}")
        return ""
    except Exception as e:
        logger.error(f"Unexpected error scraping {url}: {e}")
        return ""


In [25]:
# ========== MAIN EXECUTION FUNCTION ==========
def run_all_special_scrapers():
    """Run all scrapers with comprehensive error handling"""
    logger.info("Starting comprehensive scraping process")
    
    # Check internet connectivity first
    if not check_internet_connection():
        logger.error("No internet connection detected. Please check your network.")
        return
    
    # Create output directory
    output_dir = "special_scrape_outputs"
    os.makedirs(output_dir, exist_ok=True)
    os.chdir(output_dir)
    
    combined_texts = []
    scrapers = [
        ("incometaxindia.gov.in", scrape_incometaxindia_with_selenium),
        ("egazette.nic.in", download_sample_egazette_pdf),
        ("taxmann.com", scrape_taxmann_sample),
        ("vakilno1.com", lambda: robust_scrape_with_requests("https://vakilno1.com", "scraped_vakilno1.txt")),
        ("legalserviceindia.com", lambda: robust_scrape_with_requests("https://legalserviceindia.com", "scraped_legalserviceindia.txt"))
    ]
    
    for source_name, scraper_func in scrapers:
        try:
            logger.info(f"Processing {source_name}...")
            text = scraper_func()
            
            if text.strip():
                combined_texts.append(f"[SOURCE: {source_name}]\n{text}")
                logger.info(f"✅ Successfully processed {source_name}")
            else:
                logger.warning(f"⚠️  No content retrieved from {source_name}")
                
        except Exception as e:
            logger.error(f"❌ Failed to process {source_name}: {e}")
        
        # Wait between requests to be respectful
        time.sleep(2)
    
    # Combine all successful outputs
    if combined_texts:
        combined_content = "\n\n" + "="*80 + "\n\n".join(combined_texts)
        
        with open("indian_tax_code_special_sources.txt", "w", encoding="utf-8") as f:
            f.write(combined_content)
        
        logger.info("✅ COMPLETED: Combined all successful scrapes into: indian_tax_code_special_sources.txt")
        logger.info(f"Successfully processed {len(combined_texts)} out of {len(scrapers)} sources")
    else:
        logger.warning("⚠️  No content was successfully scraped from any source")

# ========== EXECUTE ==========
if __name__ == "__main__":
    run_all_special_scrapers()

2025-06-18 18:03:47,193 - INFO - Starting comprehensive scraping process
2025-06-18 18:03:47,294 - INFO - Processing incometaxindia.gov.in...
2025-06-18 18:03:47,295 - INFO - Starting scrape of incometaxindia.gov.in
2025-06-18 18:03:53,350 - INFO - Successfully scraped incometaxindia.gov.in
2025-06-18 18:03:59,528 - INFO - ✅ Successfully processed incometaxindia.gov.in
2025-06-18 18:04:01,539 - INFO - Processing egazette.nic.in...
2025-06-18 18:04:01,539 - INFO - Starting egazette.nic.in PDF download
2025-06-18 18:04:01,542 - ERROR - Cannot resolve egazette.nic.in
2025-06-18 18:04:03,562 - INFO - Processing taxmann.com...
2025-06-18 18:04:03,562 - INFO - Starting scrape of taxmann.com
2025-06-18 18:04:15,952 - INFO - Successfully scraped taxmann.com
2025-06-18 18:04:22,069 - INFO - ✅ Successfully processed taxmann.com
2025-06-18 18:04:24,078 - INFO - Processing vakilno1.com...
2025-06-18 18:04:24,078 - INFO - Starting scrape of https://vakilno1.com
2025-06-18 18:04:24,129 - ERROR - ❌ F

In [28]:
def combine_text_files(file1, file2, output_file):
    with open(file1, "r", encoding="utf-8") as f1, \
         open(file2, "r", encoding="utf-8") as f2, \
         open(output_file, "w", encoding="utf-8") as out:
        
        content1 = f1.read().strip()
        content2 = f2.read().strip()
        
        combined = content1 + "\n\n" + content2
        out.write(combined)
    
    print(f"[✅ DONE] Combined file saved to: {output_file}")

# Example usage
combine_text_files(
    r"E:\Income Tax Fine-tuning\indian_tax_code_combined.txt",
    r"E:\Income Tax Fine-tuning\indian_tax_code_special_sources.txt",
    "indian_tax_code_final_corpus.txt"
)


[✅ DONE] Combined file saved to: indian_tax_code_final_corpus.txt
