In [None]:
# %pip install requests
# %pip install langdetect
# %pip install beautifulsoup4 lxml 
# %pip install scrapy
# hashlib is part of the Python standard library -- no install needed
# %pip install selenium



In [None]:
import requests
from bs4 import BeautifulSoup
from langdetect import detect
import csv
import time



# Default upper bound on the number of pages crawl_multiple_pages() visits.
MAX_PAGES = 20




def is_gujarati(text):
    """Return True when language detection classifies ``text`` as Gujarati.

    Args:
        text: The string to classify. ``None``, empty and whitespace-only
            values are treated as "not Gujarati" without calling the detector.

    Returns:
        True if the detector reports the language code ``'gu'``; False for
        any other language or when detection fails.
    """
    if not text or not text.strip():
        return False
    try:
        return detect(text) == 'gu'
    except Exception:
        # Detection is best-effort: any detector failure means "not Gujarati".
        # Catching Exception (not a bare except) lets Ctrl-C still interrupt.
        return False

def crawl_page(url, timeout=10):
    """Download ``url`` and return the Gujarati text found in its <p>/<div> tags.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up, so a stalled
            connection cannot hang the crawl.

    Returns:
        A list of distinct Gujarati strings in document order. The list is
        empty when the request fails or no Gujarati text is found.
    """
    try:
        response = requests.get(url, timeout=timeout)
        response.raise_for_status()  # Treat HTTP error statuses as failures
    except requests.exceptions.RequestException as e:
        print(f"Failed to retrieve {url}: {e}")
        return []

    soup = BeautifulSoup(response.content, 'lxml')

    gujarati_texts = []
    seen = set()
    for element in soup.find_all(['p', 'div']):
        text = element.get_text(strip=True)
        # A <div> wrapping a single <p> yields the same string twice; skip
        # repeats (and empty strings) before running language detection.
        if not text or text in seen:
            continue
        seen.add(text)
        if is_gujarati(text):
            gujarati_texts.append(text)

    return gujarati_texts

def save_to_csv(data, filename):
    """Write every item of ``data`` to ``filename`` as a one-column CSV row.

    The file is overwritten, encoded as UTF-8, and written without a header.
    """
    with open(filename, mode='w', encoding='utf-8', newline='') as csv_file:
        csv.writer(csv_file).writerows([entry] for entry in data)

def crawl_multiple_pages(start_url, max_pages=MAX_PAGES, delay=1):
    """Crawl ``start_url`` and its numbered follow-up pages for Gujarati text.

    The first request goes to ``start_url`` itself; request ``i`` (for
    ``i >= 1``) goes to ``start_url`` with a ``page=i`` query parameter.
    Adjust the URL construction if the target site paginates differently.

    Args:
        start_url: Address of the first page.
        max_pages: Total number of pages to request.
        delay: Seconds to pause between consecutive requests.

    Returns:
        A list with the Gujarati strings collected from all pages, in crawl
        order.
    """
    all_gujarati_texts = []
    # Always derive the page URL from start_url. Appending to the previous
    # page's URL would pile parameters up ("?page=1?page=2...").
    separator = '&' if '?' in start_url else '?'

    for i in range(max_pages):
        current_url = start_url if i == 0 else f"{start_url}{separator}page={i}"
        print(f"Crawling page {i+1}: {current_url}")
        all_gujarati_texts.extend(crawl_page(current_url))

        # Avoid overloading the server; no need to wait after the last page
        if i < max_pages - 1:
            time.sleep(delay)

    return all_gujarati_texts


if __name__ == "__main__":
    # Entry point: crawl a handful of pages and persist whatever was found.
    start_url = "https://www.divyabhaskar.co.in/"
    gujarati_content = crawl_multiple_pages(start_url, max_pages=5)

    if not gujarati_content:
        print("No Gujarati content found.")
    else:
        # Save the results to a CSV file
        save_to_csv(gujarati_content, "gujarati_content.csv")
        print(f"Saved {len(gujarati_content)} lines of Gujarati content to 'gujarati_content.csv'.")


In [5]:
import requests
from bs4 import BeautifulSoup
from langdetect import detect
import time

def is_gujarati(text):
    """Return True when language detection classifies ``text`` as Gujarati.

    Args:
        text: The string to classify. ``None``, empty and whitespace-only
            values are treated as "not Gujarati" without calling the detector.

    Returns:
        True if the detector reports the language code ``'gu'``; False for
        any other language or when detection fails.
    """
    if not text or not text.strip():
        return False
    try:
        return detect(text) == 'gu'
    except Exception:
        # Detection is best-effort: any detector failure means "not Gujarati".
        # Catching Exception (not a bare except) lets Ctrl-C still interrupt.
        return False



In [None]:
def get_links_from_page(url, container_class='ba1e62a6', timeout=10):
    """Collect the hrefs listed inside a specific container of a webpage.

    The page is expected to contain a ``<div>`` with class
    ``container_class`` that holds a ``<ul>``; the first ``<a href>`` of every
    ``<div>`` nested in that list is collected.

    Args:
        url: Address of the page to fetch.
        container_class: CSS class of the ``<div>`` wrapping the link list.
            The default is a site-generated class name and will need updating
            when the site's markup changes.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        A list of distinct href values in document order, exactly as they
        appear in the markup (they may be relative). The list is empty when
        the request fails or the expected structure is missing.
    """
    try:
        response = requests.get(url, timeout=timeout)
        response.raise_for_status()
    except requests.exceptions.RequestException as e:
        print(f"Failed to retrieve {url}: {e}")
        return []

    soup = BeautifulSoup(response.content, 'lxml')

    # Find the main div that contains the ul tag
    main_div = soup.find('div', class_=container_class)
    if not main_div:
        print(f"No div found with the specified structure on {url}")
        return []

    # Extract the ul tag
    ul_tag = main_div.find('ul')
    if not ul_tag:
        print(f"No ul tag found in the div on {url}")
        return []

    links = []
    for div in ul_tag.find_all('div'):
        # Inside each div, find the first <a> tag that carries an href
        a_tag = div.find('a', href=True)
        if a_tag:
            links.append(a_tag['href'])

    # Nested divs all resolve to the same inner <a>, so the same href can be
    # collected several times; keep the first occurrence of each.
    return list(dict.fromkeys(links))

def crawl_page(url, timeout=10):
    """Download ``url`` and return the Gujarati text found in its <p>/<div> tags.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up, so a stalled
            connection cannot hang the crawl.

    Returns:
        A list of distinct Gujarati strings in document order. The list is
        empty when the request fails or no Gujarati text is found.
    """
    try:
        response = requests.get(url, timeout=timeout)
        response.raise_for_status()
    except requests.exceptions.RequestException as e:
        print(f"Failed to retrieve {url}: {e}")
        return []

    soup = BeautifulSoup(response.content, 'lxml')

    gujarati_texts = []
    seen = set()
    for element in soup.find_all(['p', 'div']):
        text = element.get_text(strip=True)
        # A <div> wrapping a single <p> yields the same string twice; skip
        # repeats (and empty strings) before running language detection.
        if not text or text in seen:
            continue
        seen.add(text)
        if is_gujarati(text):
            gujarati_texts.append(text)

    return gujarati_texts

def crawl_links_and_extract_text(start_url, delay=1):
    """Follow every link listed on ``start_url`` and gather Gujarati text.

    Args:
        start_url: Page whose link list is crawled; also the base used to
            resolve relative links.
        delay: Seconds to pause between consecutive requests.

    Returns:
        A list with the Gujarati strings collected from all linked pages, in
        crawl order. The list is empty when the start page yields no links.
    """
    # Local import keeps this notebook cell self-contained.
    from urllib.parse import urljoin

    # Step 1: Get all the links from the main page
    links = get_links_from_page(start_url)
    if not links:
        print(f"No links found on {start_url}")
        return []  # Always return a list so callers can iterate the result
    print(len(links))

    # Step 2: Visit each link and extract Gujarati text
    all_gujarati_texts = []
    for i, link in enumerate(links, start=1):
        print(f"Crawling link {i}/{len(links)}: {link}")
        # urljoin handles relative paths ("/a/b.html") without doubling the
        # slash, and leaves links that are already absolute untouched.
        all_gujarati_texts.extend(crawl_page(urljoin(start_url, link)))

        # Avoid overloading the server; no need to wait after the last link
        if i < len(links):
            time.sleep(delay)

    return all_gujarati_texts

if __name__ == "__main__":
    # Entry point: crawl the links of the start page and show what was found.
    start_url = "https://www.divyabhaskar.co.in/"  # Replace with your URL
    gujarati_content = crawl_links_and_extract_text(start_url)

    if not gujarati_content:
        print("No Gujarati content found.")
    else:
        # Print the extracted Gujarati content, one entry per line
        for content in gujarati_content:
            print(content)


Crawling link: /international/news/for-the-first-time-in-human-history-spacewalk-by-two-astronauts-700-km-above-earth-133632824.html
Saved 32 lines of Gujarati content to 'gujarati_contents.csv'.


In [1]:
import hashlib
import os
import re
from urllib.parse import urlparse

import scrapy
from scrapy.http import HtmlResponse

class GujaratiSpider(scrapy.Spider):
    """Crawl a Gujarati website and save each page's Gujarati text to disk.

    Starting from ``start_urls``, the spider follows links that stay within
    ``allowed_domains``. For every page it joins the Gujarati-script runs
    found in ``<p>`` and ``<div>`` text nodes and writes them to a file in
    ``output_directory`` named after the MD5 hash of the page URL.
    """

    name = 'gujarati_spider'

    # Seed URLs to start with
    start_urls = [
        'https://www.divyabhaskar.co.in/',  # Replace with the actual dynamic Gujarati language website
    ]

    # Domains the crawl may visit (subdomains included). parse() reads this
    # attribute, so it must be defined; keep it in sync with start_urls.
    allowed_domains = ['divyabhaskar.co.in']

    # Track visited URLs to avoid revisiting the same links.
    # NOTE: class-level set, shared by every instance of this spider.
    visited_urls = set()

    # Regular expression pattern to match Gujarati text
    gujarati_pattern = re.compile(r'[\u0A80-\u0AFF]+')

    # Directory to save each crawled page's content
    output_directory = 'crawled_data'

    def start_requests(self):
        """Schedule the seed URLs, routing them to parse_dynamic_content."""
        for url in self.start_urls:
            self.visited_urls.add(url)
            yield scrapy.Request(url, callback=self.parse_dynamic_content)

    def parse_dynamic_content(self, response):
        """Handle a seed page.

        Only the HTML present in ``response`` is processed; content a browser
        would load by scrolling or via XHR is not fetched here. The page is
        therefore handled like any other page by delegating to parse().
        """
        yield from self.parse(response)

    def parse(self, response):
        """Save the page's Gujarati text and follow its same-site links.

        Yields:
            A ``{'url', 'text'}`` item when the page contains Gujarati text,
            followed by one request per not-yet-visited link inside
            ``allowed_domains``.
        """
        gujarati_text = self.extract_and_save_text(response)
        if gujarati_text:
            # Callbacks must yield items or requests; yielding the raw string
            # would be iterated character by character.
            yield {'url': response.url, 'text': gujarati_text}

        for href in response.xpath('//a/@href').getall():
            link = self._absolute_allowed_link(response, href)
            if link is None or link in self.visited_urls:
                continue
            self.visited_urls.add(link)
            yield response.follow(link, self.parse)

    def _absolute_allowed_link(self, response, href):
        """Return ``href`` as an absolute in-domain http(s) URL, else None."""
        try:
            # Resolve relative hrefs ("/section/story.html") against the page.
            link = response.urljoin(href)
            parsed = urlparse(link)
            host = parsed.hostname or ''
        except ValueError:
            # Malformed hrefs cannot be crawled; skip them.
            return None

        if parsed.scheme not in ('http', 'https'):
            return None  # e.g. mailto:, javascript:, tel:

        in_domain = any(
            host == domain or host.endswith('.' + domain)
            for domain in self.allowed_domains
        )
        return link if in_domain else None

    def extract_and_save_text(self, response):
        """Extract the Gujarati text of ``response`` and write it to a file.

        Returns:
            The space-joined Gujarati runs found in the page's ``<p>`` and
            ``<div>`` text nodes. When the string is empty, no file is written.
        """
        # Extract text from paragraphs and divs
        page_text = response.xpath('//p/text()').getall()
        page_text += response.xpath('//div/text()').getall()
        page_text = ' '.join(page_text).strip()

        # Filter only Gujarati text
        gujarati_text = ' '.join(self.gujarati_pattern.findall(page_text))

        # Save the Gujarati content to a new text file
        if gujarati_text:
            # exist_ok avoids the check-then-create race of os.path.exists()
            os.makedirs(self.output_directory, exist_ok=True)

            # The MD5 of the URL serves as a stable, filesystem-safe filename
            url_hash = hashlib.md5(response.url.encode()).hexdigest()
            filename = os.path.join(self.output_directory, f'{url_hash}.txt')

            with open(filename, 'w', encoding='utf-8') as f:
                f.write(gujarati_text + '\n')

        # Return the extracted text for further processing if needed
        return gujarati_text


In [1]:
import os
import re
import hashlib
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.chrome.options import Options
from bs4 import BeautifulSoup
import time

# Regular expression to match Gujarati text: one or more consecutive
# characters in the range U+0A80-U+0AFF.
gujarati_pattern = re.compile(r'[\u0A80-\u0AFF]+')

# Directory to save output files (created on demand by save_text_to_file)
output_directory = 'crawled_data'

# Function to initialize the Chrome WebDriver (visible window, headless fallback)
def init_driver(headless=False):
    """Create and return a Chrome WebDriver.

    Args:
        headless: When True, start Chrome in headless mode straight away.
            When False (default), try a regular window first and fall back to
            headless mode if that fails.

    Returns:
        A ``webdriver.Chrome`` instance. The caller is responsible for
        calling ``quit()`` on it.
    """
    driver = None
    if not headless:
        try:
            driver = webdriver.Chrome()
        except Exception as exc:
            # Best-effort fallback. Catching Exception rather than using a
            # bare except lets Ctrl-C still abort the startup.
            print(f"Could not start Chrome with a window ({type(exc).__name__}).")

    if driver is None:
        chrome_options = Options()
        chrome_options.add_argument("--headless")
        chrome_options.add_argument("--no-sandbox")
        chrome_options.add_argument("--disable-dev-shm-usage")
        driver = webdriver.Chrome(options=chrome_options)
        print("Running in headless mode.")

    return driver

# Function to scroll and load more content dynamically
def scroll_down(driver, pause=2, max_scrolls=None):
    """Scroll to the bottom of the page until its height stops growing.

    Args:
        driver: WebDriver (or any object with ``execute_script``) showing the
            page to scroll.
        pause: Seconds to wait after each scroll for new content to load.
        max_scrolls: Upper bound on the number of scrolls. ``None`` (default)
            keeps scrolling until the height is stable, which may never
            happen on pages with an endless feed.
    """
    last_height = driver.execute_script("return document.body.scrollHeight")

    scrolls = 0
    while max_scrolls is None or scrolls < max_scrolls:
        # Scroll down to the bottom
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        scrolls += 1

        # Wait for new content to load
        time.sleep(pause)

        # An unchanged height means no further content was appended
        new_height = driver.execute_script("return document.body.scrollHeight")
        if new_height == last_height:
            break
        last_height = new_height

# Function to extract Gujarati text from a webpage
def extract_gujarati_text(html_content):
    """Return the Gujarati-script runs of ``html_content`` joined by spaces.

    The markup is parsed with ``html.parser``, its text is flattened with a
    space between nodes, and every match of ``gujarati_pattern`` is kept.
    """
    raw_text = BeautifulSoup(html_content, 'html.parser').get_text(separator=' ')
    gujarati_runs = gujarati_pattern.findall(raw_text)
    return ' '.join(gujarati_runs)

# Function to save the extracted text to a file
def save_text_to_file(url, text, directory=None):
    """Write ``text`` to a UTF-8 file named after the MD5 hash of ``url``.

    Args:
        url: Source address of the text; its MD5 hex digest is the file name.
        text: Content to write. An existing file for the same URL is
            overwritten.
        directory: Folder to write into. Defaults to the module-level
            ``output_directory``. It is created if missing.

    Returns:
        The path of the file that was written.
    """
    target_dir = output_directory if directory is None else directory

    # exist_ok avoids the check-then-create race of os.path.exists()
    os.makedirs(target_dir, exist_ok=True)

    # Generate a unique filename based on the URL
    url_hash = hashlib.md5(url.encode()).hexdigest()
    filename = os.path.join(target_dir, f'{url_hash}.txt')

    with open(filename, 'w', encoding='utf-8') as file:
        file.write(text)

    return filename

# Main function to crawl a dynamic webpage
def crawl_dynamic_website(url):
    """Render ``url`` in Chrome, scroll it fully, and save its Gujarati text.

    Args:
        url: Address of the page to crawl. The extracted text, if any, is
            written through save_text_to_file().
    """
    driver = init_driver()
    try:
        driver.get(url)

        # Scroll down to load all dynamic content
        scroll_down(driver)

        # Get the page source after all content is loaded
        page_source = driver.page_source
    finally:
        # Always close the browser, even when loading or scrolling fails,
        # so no Chrome process is left running.
        driver.quit()

    # Extract Gujarati text
    gujarati_text = extract_gujarati_text(page_source)

    # Save the text to a file
    if gujarati_text:
        save_text_to_file(url, gujarati_text)

# Example usage. Guarded so that importing this code does not launch a
# browser; in a notebook cell or a script the guard is true and the crawl runs.
if __name__ == "__main__":
    url = 'https://www.divyabhaskar.co.in/'  # Replace with the actual dynamic website URL
    crawl_dynamic_website(url)
