In [1]:
import os
import re
import hashlib
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.chrome.options import Options
from bs4 import BeautifulSoup
import time

# Regular expression matching one or more consecutive characters in the
# Unicode range U+0A80-U+0AFF (the Gujarati block). Anything outside that
# range (spaces, Latin letters, ASCII digits, ...) ends a match.
gujarati_pattern = re.compile(r'[\u0A80-\u0AFF]+')

# Directory to save output files (a relative path, so it is resolved against
# the current working directory of the kernel)
output_directory = 'crawled_data'

# Function to initialize the WebDriver (regular Chrome first, headless Chrome as a fallback)
def init_driver():
    """Create and return a Selenium Chrome WebDriver.

    A default Chrome session is attempted first. If starting it raises, a
    second attempt is made with the ``--headless``, ``--no-sandbox`` and
    ``--disable-dev-shm-usage`` Chrome arguments, and a message is printed.

    Returns:
        The started Chrome WebDriver. The caller is responsible for calling
        ``quit()`` on it when finished.

    Raises:
        Exception: whatever the headless attempt raises if it fails too.
    """
    try:
        driver = webdriver.Chrome()
    except Exception:
        # ``except Exception`` instead of a bare ``except`` so that
        # KeyboardInterrupt / SystemExit still propagate and the user can
        # abort a hanging browser start-up.
        chrome_options = Options()
        chrome_options.add_argument("--headless")
        chrome_options.add_argument("--no-sandbox")
        chrome_options.add_argument("--disable-dev-shm-usage")
        driver = webdriver.Chrome(options=chrome_options)
        print("Running in headless mode.")
    return driver

# Function to scroll and load more content dynamically
def scroll_down(driver, pause=2, max_scrolls=None):
    """Scroll to the bottom of the page until its height stops growing.

    After each scroll the function waits ``pause`` seconds and re-reads
    ``document.body.scrollHeight``; it stops as soon as two consecutive
    readings are equal.

    Args:
        driver: object exposing ``execute_script(script)`` (a Selenium
            WebDriver in this notebook).
        pause: seconds to wait after each scroll so new content can load.
            Defaults to 2, the value that used to be hard-coded.
        max_scrolls: optional upper bound on the number of scrolls. ``None``
            (the default) keeps the original unbounded behaviour; pass an
            integer to guarantee termination on pages that keep growing
            forever (endless feeds).

    Returns:
        None
    """
    last_height = driver.execute_script("return document.body.scrollHeight")
    scrolls_done = 0

    while max_scrolls is None or scrolls_done < max_scrolls:
        # Scroll down to the bottom
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        scrolls_done += 1

        # Wait for new content to load
        time.sleep(pause)

        # Compare the new page height with the previous one
        new_height = driver.execute_script("return document.body.scrollHeight")
        if new_height == last_height:
            # Height unchanged: nothing more was loaded, stop scrolling
            break
        last_height = new_height

# Function to extract Gujarati text from a webpage
def extract_gujarati_text(html_content):
    """Return only the Gujarati-script fragments found in an HTML document.

    The markup is parsed with BeautifulSoup's ``html.parser``, flattened to
    text with a single space between text nodes, and every run matched by
    ``gujarati_pattern`` is collected.

    Args:
        html_content: HTML markup as a string.

    Returns:
        str: the matched runs joined by single spaces; an empty string when
        nothing matches.
    """
    # Flatten the whole document to plain text
    flattened = BeautifulSoup(html_content, 'html.parser').get_text(separator=' ')

    # Keep only the runs of Gujarati characters
    gujarati_runs = gujarati_pattern.findall(flattened)
    return ' '.join(gujarati_runs)

# Function to save the extracted text to a file
def save_text_to_file(url, text, directory=None):
    """Write ``text`` to ``<directory>/<md5-of-url>.txt`` as UTF-8.

    The file name is the hexadecimal MD5 digest of the URL, so saving the
    same URL again overwrites the earlier file. MD5 is used here only to
    derive a file name, not for any security purpose.

    Args:
        url: URL the text was extracted from.
        text: content to write.
        directory: folder to write into. ``None`` (the default) uses the
            module-level ``output_directory``, as before.

    Returns:
        None
    """
    target_dir = output_directory if directory is None else directory

    # exist_ok=True replaces the separate "exists?" check, so there is no
    # window in which the directory can appear between check and creation.
    os.makedirs(target_dir, exist_ok=True)

    # Generate a unique filename based on the URL
    url_hash = hashlib.md5(url.encode()).hexdigest()
    filename = os.path.join(target_dir, f'{url_hash}.txt')

    with open(filename, 'w', encoding='utf-8') as file:
        file.write(text)






In [2]:
# Main function to crawl a dynamic webpage
def crawl_dynamic_website(url):
    """Load one page in a browser, scroll it fully, and save its Gujarati text.

    Steps: start a driver with ``init_driver``, open ``url``, scroll with
    ``scroll_down`` so lazily loaded content is present, extract the Gujarati
    text from the resulting page source, and write it with
    ``save_text_to_file`` when it is non-empty.

    Args:
        url: address of the page to crawl.

    Returns:
        None
    """
    driver = init_driver()
    try:
        driver.get(url)

        # Scroll down to load all dynamic content
        scroll_down(driver)

        # Get the page source after all content is loaded
        page_source = driver.page_source

        # Extract Gujarati text
        gujarati_text = extract_gujarati_text(page_source)

        # Save the text to a file (nothing is written for an empty result)
        if gujarati_text:
            save_text_to_file(url, gujarati_text)
    finally:
        # Always close the browser, even when a step above raised, so that
        # no Chrome process is left running.
        driver.quit()

In [3]:
# Main function to crawl a dynamic webpage
# NOTE(review): this cell defines the same function as the preceding cell;
# when both cells are run, this later definition is the one in effect.
# Consider deleting one of the two cells.
def crawl_dynamic_website(url):
    """Load one page in a browser, scroll it fully, and save its Gujarati text.

    Steps: start a driver with ``init_driver``, open ``url``, scroll with
    ``scroll_down`` so lazily loaded content is present, extract the Gujarati
    text from the resulting page source, and write it with
    ``save_text_to_file`` when it is non-empty.

    Args:
        url: address of the page to crawl.

    Returns:
        None
    """
    driver = init_driver()
    try:
        driver.get(url)

        # Scroll down to load all dynamic content
        scroll_down(driver)

        # Get the page source after all content is loaded
        page_source = driver.page_source

        # Extract Gujarati text
        gujarati_text = extract_gujarati_text(page_source)

        # Save the text to a file (nothing is written for an empty result)
        if gujarati_text:
            save_text_to_file(url, gujarati_text)
    finally:
        # Always close the browser, even when a step above raised, so that
        # no Chrome process is left running.
        driver.quit()

In [4]:
from urllib.parse import urlparse, urljoin

# Function to extract links and Gujarati text from a specific div tag, and visit new same-site links
def crawl_div_and_extract(driver, url, base_url, div_xpath):
    """Crawl ``url`` and every same-site page reachable through the target element.

    For each page: load it with ``driver``, locate the first element matching
    ``div_xpath``, save that element's Gujarati text (if any) with
    ``save_text_to_file``, then queue every not-yet-visited link inside the
    element whose host equals the host of ``base_url``. Pages without a
    matching element are skipped and contribute no links.

    The traversal is depth-first in the same order as the earlier recursive
    version, but uses an explicit stack so that long link chains cannot hit
    Python's recursion limit.

    Args:
        driver: Selenium WebDriver used to load pages.
        url: page to start from.
        base_url: ``scheme://host`` of the site; only links on this host are
            followed.
        div_xpath: despite the name, a CSS selector - it is passed to
            BeautifulSoup's ``select_one``.

    Returns:
        None

    Note:
        Visited URLs are tracked in the module-level ``visited_urls`` set
        (reset by ``start_recursive_crawl``). If that global does not exist
        yet, an empty set is created so the function can be called directly.
    """
    # Reuse the shared set when present; create it instead of raising NameError.
    visited = globals().setdefault('visited_urls', set())

    # Loop-invariant: the only host we are allowed to follow links to.
    base_netloc = urlparse(base_url).netloc

    # Mark the starting page so a link back to it is not crawled a second time.
    visited.add(url)
    pending = [url]

    while pending:
        current_url = pending.pop()

        # Visit the webpage and parse its source after loading
        driver.get(current_url)
        soup = BeautifulSoup(driver.page_source, 'html.parser')

        # Extract the specific <div> content
        target_div = soup.select_one(div_xpath)
        if not target_div:
            continue

        # Extract and save Gujarati text from the specific div
        gujarati_text = extract_gujarati_text(str(target_div))
        if gujarati_text:
            save_text_to_file(current_url, gujarati_text)

        # Normalize and filter the links found inside the div
        valid_links = []
        for anchor in target_div.find_all('a', href=True):
            # Resolve against the page the link was found on, so that
            # document-relative hrefs ("story.html", "../x") point where the
            # browser would take them rather than at the site root.
            normalized_link = urljoin(current_url, anchor['href'])

            # Only allow unseen links on the same host
            if urlparse(normalized_link).netloc == base_netloc and normalized_link not in visited:
                valid_links.append(normalized_link)
                visited.add(normalized_link)  # Mark the URL as visited

        # Push in reverse so the first link on the page is popped (visited) first.
        pending.extend(reversed(valid_links))

# Example: Main function to initiate the crawling
def start_recursive_crawl(start_url, div_xpath):
    """Start a fresh same-site crawl at ``start_url``.

    Resets the module-level ``visited_urls`` set, starts a browser with
    ``init_driver``, delegates the traversal to ``crawl_div_and_extract`` and
    always closes the browser afterwards.

    Args:
        start_url: first page to visit; its scheme and host define the site
            the crawl is restricted to.
        div_xpath: selector of the element to extract text and links from.
            It is forwarded unchanged to ``crawl_div_and_extract``.

    Returns:
        None
    """
    global visited_urls

    # Parse once and rebuild "scheme://host" as the base URL
    parsed_start = urlparse(start_url)
    base_url = parsed_start.scheme + "://" + parsed_start.netloc

    # Track visited URLs to avoid loops; the start page counts as visited so
    # that a link pointing back to it does not trigger a second crawl of it.
    visited_urls = {start_url}

    driver = init_driver()  # Initialize Selenium WebDriver
    try:
        # Start crawling from the initial URL
        crawl_div_and_extract(driver, start_url, base_url, div_xpath)
    finally:
        # Close the driver even if the crawl raised, so no browser is leaked
        driver.quit()



In [31]:
import pandas as pd

# url.csv is read without a header row, so its columns are labelled 0, 1, ...
df = pd.read_csv(filepath_or_buffer='url.csv', header=None)

# Column 0 as a plain Python list, echoed for a quick sanity check.
baseUrls = df.loc[:, 0].tolist()
print(baseUrls)

['https://www.divyabhaskar.co.in/', 'https://sandesh.com/', 'https://www.gujaratsamachar.com/', 'https://www.akilanews.com/', 'https://navgujaratsamay.com/', 'https://gujaratmitra.in/']


In [15]:
# Example usage: take the first entry of baseUrls and inspect the types involved.
# NOTE(review): the output saved under this cell appears to come from an
# earlier run in which baseUrls held different data - re-run to confirm.
url = baseUrls[0]
print(type(url))     # type of one baseUrls entry
print(type(url[0]))  # type of that entry's first element
# crawl_dynamic_website(url)

<class 'numpy.ndarray'>
<class 'str'>


In [None]:
# Example usage: crawl the first site in baseUrls, restricted to one container element.
start_url = baseUrls[0]  # Replace with the actual URL
# CSS selector of the target div (it ends up in BeautifulSoup's select_one, so XPath will not work)
div_xpath = 'div.ba1e62a6'
start_recursive_crawl(start_url=start_url, div_xpath=div_xpath)
