In [2]:
# For language detection
#!pip install langdetect 

In [7]:
import requests
from bs4 import BeautifulSoup
import time
import random
from langdetect import detect
import csv

In [8]:
detected_lang = detect("यह हिंदी भाषा में एक लेख है।")
print("Detected Language:", detected_lang)

Detected Language: hi


In [9]:
class WikipediaScraper:
    def __init__(self, max_depth, max_articles, desired_language, start_url, csv_filename):
        self.max_depth = max_depth
        self.max_articles = max_articles
        self.desired_language = desired_language
        self.visited = set()
        self.article_count = 0
        self.base_url = '/'.join(start_url.split('/')[:3])  # e.g., "https://hi.wikipedia.org"
        self.csv_filename = csv_filename
        
        with open(self.csv_filename, 'w', newline='', encoding='utf-8') as f:
            writer = csv.DictWriter(f, fieldnames=['title', 'url', 'full_text', 'word_count'])
            writer.writeheader()

    def get_page_content(self, url):
        """Fetch page content with error handling"""
        try:
            headers = {
                'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
            }
            response = requests.get(url, headers=headers)
            response.raise_for_status()
            return BeautifulSoup(response.text, 'html.parser')
        except requests.RequestException as e:
            print(f"Error fetching {url}: {e}")
            return None

    def get_first_line(self, soup):
        """Extract the first line of meaningful content"""
        if not soup:
            return None
        content = soup.find('div', class_='mw-parser-output')
        if content:
            for p in content.find_all('p', recursive=False):
                text = p.get_text(strip=True)
                if text:
                    sentences = text.split('. ')
                    return sentences[0] + ('.' if sentences else '')
            return content.get_text(strip=True).split('. ')[0] + '.'
        return None

    def check_language(self, text):
        """Check if the text is in the desired language"""
        if not text:
            return False
        try:
            detected_lang = detect(text)
            return detected_lang == self.desired_language
        except Exception as e:
            print(f"Language detection error: {e}")
            return False

    def extract_article_info(self, soup, url):
        """Extract title and full article text (max 500 words), list items min 5 words"""
        if not soup:
            return None

        title = soup.find('h1', id='firstHeading')
        if not title:
            return None

        content = soup.find('div', class_='mw-parser-output')
        if content:
            full_text = ""
            word_count = 0
            max_words = 500
            min_words_for_points = 5

            for element in content.find_all(['p', 'h2', 'h3', 'h4', 'h5', 'h6', 'li']):
                if word_count >= max_words:
                    break

                if element.get('class') in ['mw-references', 'reflist']:
                    continue

                if element.name == 'li':
                    text = element.get_text(strip=True)
                    words = text.split()
                    if len(words) < min_words_for_points:
                        continue
                    parent = element.find_parent(['ul', 'ol'])
                    prefix = '- ' if parent and parent.name == 'ul' else '1. '
                    text = f"{prefix}{text}"
                else:
                    text = element.get_text(strip=True)

                if text:
                    words = text.split()
                    remaining_words = max_words - word_count
                    
                    if len(words) <= remaining_words:
                        full_text += text + "\n"
                        word_count += len(words)
                    else:
                        truncated_words = words[:remaining_words]
                        full_text += " ".join(truncated_words) + "\n"
                        word_count = max_words
                        break
            
            if full_text.strip():
                return {
                    'title': title.text,
                    'url': url,
                    'full_text': full_text.strip(),
                    'word_count': word_count
                }
        
        return {
            'title': title.text,
            'url': url,
            'full_text': "No content found"
        }

    def get_wiki_links(self, soup):
        """Extract all Wikipedia article links and their display text from the page"""
        if not soup:
            return []
        
        links = []
        content = soup.find('div', class_='mw-parser-output')
        if content:
            for a_tag in content.find_all('a', href=True):
                href = a_tag['href']
                if (href.startswith('/wiki/') and 
                    not ':' in href and 
                    not href.startswith('/wiki/Main_Page')):
                    full_url = self.base_url + href
                    display_text = a_tag.get_text(strip=True) or full_url.split('/')[-1]  # Fallback to URL end if no text
                    links.append((display_text, full_url))
        return links  # Returns list of (display_text, url) tuples
    
    def write_to_csv(self, article_info):
        """Write article info directly to CSV"""
        with open(self.csv_filename, 'a', newline='', encoding='utf-8') as f:
            writer = csv.DictWriter(f, fieldnames=['title', 'url', 'full_text', 'word_count'])
            writer.writerow(article_info)
            
    def dfs_scrape(self, url, current_depth=0):
        """Recursive DFS traversal with language checking"""
        if (current_depth >= self.max_depth or 
            self.article_count >= self.max_articles or 
            url in self.visited):
            return

        self.visited.add(url)
        
        soup = self.get_page_content(url)
        if not soup:
            return

        # Check language of first line
        first_line = self.get_first_line(soup)
        if not self.check_language(first_line):
            print(f"Depth {current_depth}: Skipping {url.split('/')[-1]} (Language: {detect(first_line)}, Desired: {self.desired_language})")
            return  # Backtrack if language doesn't match

        # Get display text for this URL from the parent page (approximated here)
        title = soup.find('h1', id='firstHeading').text if soup.find('h1', id='firstHeading') else url.split('/')[-1]
        print(f"Depth {current_depth}: Scraping {title}")
        article_info = self.extract_article_info(soup, url)
        
        if article_info:
            self.write_to_csv(article_info)
            self.article_count += 1

        if self.article_count >= self.max_articles:
            return

        links = self.get_wiki_links(soup)
        random.shuffle(links)
        
        for display_text, next_url in links:
            if self.article_count < self.max_articles:
                time.sleep(random.uniform(1, 3))
                self.dfs_scrape(next_url, current_depth + 1)

    def scrape(self, start_url="https://hi.wikipedia.org/wiki/%E0%A4%B9%E0%A5%88%E0%A4%A6%E0%A4%B0%E0%A4%BE%E0%A4%AC%E0%A4%BE%E0%A4%A6_%E0%A4%95%E0%A5%87_%E0%A4%A8%E0%A4%BF%E0%A4%9C%E0%A4%BC%E0%A4%BE%E0%A4%AE"):
        """Start the scraping process"""
        self.dfs_scrape(start_url)

In [10]:
def main():
    start_url = "https://hi.wikipedia.org/wiki/%E0%A4%B9%E0%A5%88%E0%A4%A6%E0%A4%B0%E0%A4%BE%E0%A4%AC%E0%A4%BE%E0%A4%A6_%E0%A4%95%E0%A5%87_%E0%A4%A8%E0%A4%BF%E0%A4%9C%E0%A4%BC%E0%A4%BE%E0%A4%AE"
    scraper = WikipediaScraper(max_depth=2000, max_articles=2000, desired_language='hi', 
                               start_url=start_url, csv_filename="abc.csv")
    scraper.scrape()
    
    print(f"\nScraping complete. Results written to {scraper.csv_filename}")

if __name__ == "__main__":
    main()

Depth 0: Scraping हैदराबाद के निज़ाम
Depth 1: Scraping जीनोम वैली
Depth 2: Scraping यदाद्री भुवनगरी ज़िला
Depth 3: Scraping हैदराबाद जिला
Depth 4: Scraping अदिलाबाद ज़िला


KeyboardInterrupt: 

In [72]:
import requests
from bs4 import BeautifulSoup
from langdetect import detect
import csv
import random
import time
from concurrent.futures import ThreadPoolExecutor
from queue import Queue
import threading

class WikipediaScraper:
    def __init__(self, max_depth, max_articles, desired_language, start_url, csv_filename, max_workers=4):
        self.max_depth = max_depth
        self.max_articles = max_articles
        self.desired_language = desired_language
        self.visited = set()
        self.article_count = 0
        self.base_url = '/'.join(start_url.split('/')[:3])
        self.csv_filename = csv_filename
        self.max_workers = max_workers
        self.lock = threading.Lock()  # For thread-safe CSV writing and counter updates
        
        # Initialize CSV file with headers
        with open(self.csv_filename, 'w', newline='', encoding='utf-8') as f:
            writer = csv.DictWriter(f, fieldnames=['title', 'url', 'full_text', 'word_count'])
            writer.writeheader()

    def get_page_content(self, url):
        """Fetch page content with error handling"""
        try:
            headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'}
            response = requests.get(url, headers=headers)
            response.raise_for_status()
            return BeautifulSoup(response.text, 'html.parser')
        except requests.RequestException as e:
            print(f"Error fetching {url}: {e}")
            return None

    def get_first_line(self, soup):
        """Extract the first line of meaningful content"""
        if not soup:
            return None
        content = soup.find('div', class_='mw-parser-output')
        if content:
            for p in content.find_all('p', recursive=False):
                text = p.get_text(strip=True)
                if text:
                    sentences = text.split('. ')
                    return sentences[0] + ('.' if sentences else '')
            return content.get_text(strip=True).split('. ')[0] + '.'
        return None

    def check_language(self, text):
        """Check if the text is in the desired language"""
        if not text:
            return False
        try:
            detected_lang = detect(text)
            return detected_lang == self.desired_language
        except Exception as e:
            print(f"Language detection error: {e}")
            return False

    def extract_article_info(self, soup, url):
        """Extract title and full article text (max 500 words), list items min 5 words"""
        if not soup:
            return None

        title = soup.find('h1', id='firstHeading')
        if not title:
            return None

        content = soup.find('div', class_='mw-parser-output')
        if content:
            full_text = ""
            word_count = 0
            max_words = 500
            min_words_for_points = 5

            for element in content.find_all(['p', 'h2', 'h3', 'h4', 'h5', 'h6', 'li']):
                if word_count >= max_words:
                    break
                if element.get('class') in ['mw-references', 'reflist']:
                    continue
                if element.name == 'li':
                    text = element.get_text(strip=True)
                    words = text.split()
                    if len(words) < min_words_for_points:
                        continue
                    parent = element.find_parent(['ul', 'ol'])
                    prefix = '- ' if parent and parent.name == 'ul' else '1. '
                    text = f"{prefix}{text}"
                else:
                    text = element.get_text(strip=True)
                if text:
                    words = text.split()
                    remaining_words = max_words - word_count
                    if len(words) <= remaining_words:
                        full_text += text + "\n"
                        word_count += len(words)
                    else:
                        truncated_words = words[:remaining_words]
                        full_text += " ".join(truncated_words) + "\n"
                        word_count = max_words
                        break
            
            if full_text.strip():
                return {'title': title.text, 'url': url, 'full_text': full_text.strip(), 'word_count': word_count}
        
        return {'title': title.text, 'url': url, 'full_text': "No content found", 'word_count': 0}

    def write_to_csv(self, article_info):
        """Write article info directly to CSV with thread safety"""
        with self.lock:  # Ensure thread-safe writing
            with open(self.csv_filename, 'a', newline='', encoding='utf-8') as f:
                writer = csv.DictWriter(f, fieldnames=['title', 'url', 'full_text', 'word_count'])
                writer.writerow(article_info)

    def get_wiki_links(self, soup):
        """Extract all Wikipedia article links and their display text from the page"""
        if not soup:
            return []
        links = []
        content = soup.find('div', class_='mw-parser-output')
        if content:
            for a_tag in content.find_all('a', href=True):
                href = a_tag['href']
                if (href.startswith('/wiki/') and 
                    not ':' in href and 
                    not href.startswith('/wiki/Main_Page')):
                    full_url = self.base_url + href
                    display_text = a_tag.get_text(strip=True) or full_url.split('/')[-1]
                    links.append((display_text, full_url))
        return links

    def process_page(self, url, depth):
        """Process a single page and return links if successful"""
        with self.lock:
            if (depth >= self.max_depth or 
                self.article_count >= self.max_articles or 
                url in self.visited):
                return []
            self.visited.add(url)

        soup = self.get_page_content(url)
        if not soup:
            return []

        first_line = self.get_first_line(soup)
        if not self.check_language(first_line):
            print(f"Skipping {url.split('/')[-1]} (Language: {detect(first_line)}, Desired: {self.desired_language})")
            return []

        title = soup.find('h1', id='firstHeading').text if soup.find('h1', id='firstHeading') else url.split('/')[-1]
        print(f"Scraping {title}")
        article_info = self.extract_article_info(soup, url)
        
        if article_info:
            self.write_to_csv(article_info)
            with self.lock:
                self.article_count += 1
        
        return self.get_wiki_links(soup) if self.article_count < self.max_articles else []

    def scrape(self, start_url="https://hi.wikipedia.org/wiki/%E0%A4%B9%E0%A5%88%E0%A4%A6%E0%A4%B0%E0%A4%BE%E0%A4%AC%E0%A4%BE%E0%A4%A6_%E0%A4%95%E0%A5%87_%E0%A4%A8%E0%A4%BF%E0%A4%9C%E0%A4%BC%E0%A4%BE%E0%A4%AE"):
        """Start the scraping process with parallel workers"""
        queue = Queue()
        queue.put((start_url, 0))  # (url, depth)

        with ThreadPoolExecutor(max_workers=self.max_workers) as executor:
            while not queue.empty() and self.article_count < self.max_articles:
                futures = []
                # Grab up to max_workers items from queue
                for _ in range(min(self.max_workers, queue.qsize())):
                    if queue.empty():
                        break
                    url, depth = queue.get()
                    futures.append(executor.submit(self.process_page, url, depth))

                # Process results and add new links to queue
                for future in futures:
                    links = future.result()
                    random.shuffle(links)
                    for _, next_url in links:
                        if self.article_count < self.max_articles and next_url not in self.visited:
                            queue.put((next_url, depth + 1))
                    time.sleep(random.uniform(1, 3))  # Polite delay between requests

def main():
    start_url = "https://hi.wikipedia.org/wiki/%E0%A4%B9%E0%A5%88%E0%A4%A6%E0%A4%B0%E0%A4%BE%E0%A4%AC%E0%A4%BE%E0%A4%A6_%E0%A4%95%E0%A5%87_%E0%A4%A8%E0%A4%BF%E0%A4%9C%E0%A4%BC%E0%A4%BE%E0%A4%AE"
    scraper = WikipediaScraper(max_depth=2000, 
                             max_articles=2000, 
                             desired_language='hi', 
                             start_url=start_url, 
                             csv_filename="wikipedia_scrape.csv",
                             max_workers=4)
    scraper.scrape()
    print(f"\nScraping complete. Results written to {scraper.csv_filename}")

if __name__ == "__main__":
    main()

Scraping हैदराबाद के निज़ाम
Scraping हैदराबाद हाउस
Scraping महाराष्ट्र
Scraping बोल्लारम
Scraping मूसी नदी
Scraping वास्तुकला
Scraping निज़ाम-उल-मुल्क आसफजाह
Scraping आन्ध्र प्रदेश
Scraping हुसैन सागर
Scraping बिरला मंदिर, हैदराबाद
Scraping गोलकोण्डा
Scraping भारत के शहरों की सूची
Scraping कोलकाता
Scraping निजाम संग्रहालय
Scraping चौमोहल्ला पैलेस
Scraping निज़ामाबाद जिला
Scraping आसफ़ जाही राजवंश
Scraping फलकनुमा पैलेस
Scraping मीर उस्मान अली ख़ान
Scraping हैदराबादी खाना
Scraping जीनोम वैली
Scraping हैदराबाद जिला
Scraping गवर्नमेंट निज़ामिआ जनरल हॉस्पिटल
Scraping हैदराबाद विश्वविद्यालय
Scraping संस्कृति
Scraping तेलंगाना
Scraping ग्रेटर हैदराबाद नगर निगम
Scraping नई दिल्ली
Scraping हैदराबाद
Scraping राजशाही विभाग
Scraping बेगमपेट विमानक्षेत्र
Scraping मुग़ल साम्राज्य
Scraping कर्नाटक
Scraping हैदराबाद प्रांत
Scraping शमशाबाद, आगरा
Scraping सिकंदराबाद
Scraping अजमेर शरीफ़
Scraping कन्नड़
Scraping तेलुगू भाषा
Scraping राजीव गांधी अंतर्राष्ट्रीय क्रिकेट स्टेडियम
Scraping राजीव गाँधी अंतर्

LangDetectException: No features in text.

In [73]:
detect('.')
detect(' ')
detect('5')
detect('/')

LangDetectException: No features in text.

In [37]:
import requests
from bs4 import BeautifulSoup
import time
import random
from langdetect import detect
import csv

import numpy as np
import re
import pandas as pd


class scraper:
    def __init__(self, max_recurse, num_articles, max_article_length, desired_lang, root_url, csv_path):
        self.recursion_depth = max_recurse
        self.num_articles = num_articles
        self.desired_lang = desired_lang
        self.max_length = max_article_length
        self.seed_url = root_url
        self.num_reads = 0
        self.sink = csv_path
        self.csv_filename = csv_path
        self.base_url = '/'.join(root_url.split('/')[:3])
        self.visited = set()

        with open(self.sink, 'w', newline= '', encoding='utf-8') as f:
            file = csv.DictWriter(f, ["url","content","word_count"])
            file.writeheader()

    def get_response(self, url):
        try:
            headers = {
                'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
            }
            response = requests.get(url, headers=headers)
            response.raise_for_status()
            return BeautifulSoup(response.text, 'html.parser')
        except requests.RequestException as e:
            print(f"Error fetching {url}: {e}")
            return None

    def in_desired_lang(self, text):
        print(text)
        if(len(text)>=2): # ensuring text is not a special character
            try:
                detected_lang = detect(text)
                return detected_lang == self.desired_lang
            except Exception as e:
                print(f"Language detection error: {e}")
                return False
        else:
            print(text)
            print(f"Error while detecting the language")
            return False

    def get_title(self, page):
        if not page:
            return None
        
        title = page.find('h1', id="firstHeading")

        if not title:
            content = page.find('div', class_='mw-parser-output')
            if content:
                for p in content.find_all('p', recursive=False):
                    text = p.get_text(strip=True)
                    if text:
                        sentences = text.split('. ')
                        return sentences[0] + ('.' if sentences else ' ')
                return content.get_text(strip=True).split('. ')[0] + '.'
            
            print("found an invalid page")
            return None
        
        return title.text

    def extract_info(self, page, url):

        if not page:
            return None
        
        content = page.find('div', class_='mw-parser-output')

        if content:
            full_text = ""
            word_count = 0
            min_words_for_points = 5 # ensuring that special characters (or) single characters are not read
            for element in content.find_all(['p', 'h2', 'h3', 'h4', 'h5', 'h6', 'li']):
                if word_count >= self.max_length:
                    break

                else:
                    if element.get('class') in ['mw-references', 'reflist']:
                        continue

                    if element.name == 'li':
                        text = element.get_text(strip=True)
                        words = text.split()
                        if len(words) < min_words_for_points:
                            continue
                        parent = element.find_parent(['ul', 'ol'])
                        prefix = '- ' if parent and parent.name == 'ul' else '1. '
                        text = f"{prefix}{text}"
                    else:
                        text = element.get_text(strip=True)

                    if text:
                        words = text.split()
                        remaining_words = self.max_length - word_count
                        
                        if len(words) <= remaining_words:
                            full_text += text + "\n"
                            word_count += len(words)
                        else:
                            truncated_words = words[:remaining_words]
                            full_text += " ".join(truncated_words) + "\n"
                            word_count = self.max_length
                            break

            if full_text.strip():
                return { 'url': url, 'content': full_text.strip(), 'word_count': word_count }
        
        return {'url': url, 'content': "no content found", 'word_count': word_count}

    def get_links(self, page):
        if not page:
            return []
        
        links = []
        content = page.find('div', class_='mw-parser-output')
        if content:
            for a_tag in content.find_all('a', href=True):
                href = a_tag['href']
                if (href.startswith('/wiki/') and not ':' in href and not href.startswith('/wiki/Main_Page')):
                    full_url = self.base_url + href
                    display_text = a_tag.get_text(strip=True) or full_url.split('/')[-1]
                    links.append((display_text, full_url))

        return links 
    
    def preprocess_text(self, text):
        if not isinstance(text, str) or not text.strip():
            return ""
        
        # Step 1: Split text into sentences/segments (handling newlines and points)
        segments = text.replace('\n', ' ').split(' ')
        
        # Step 2: Keep only Hindi text
        hindi_text = []
        for segment in segments:
            if segment.strip():
                try:
                    if detect(segment) == 'hi':
                        hindi_text.append(segment)
                except:
                    continue
        
        # Rejoin segments into a single string
        text = ' '.join(hindi_text)
        
        # Step 3: Remove numbers
        text = re.sub(r'\d+', '', text)
        
        # Step 4: Remove text in parentheses
        text = re.sub(r'\([^()]*\)', '', text)
        
        # Step 5: Remove double quotes
        text = text.replace('"', '')
        
        # Step 6: Remove special characters (keep Hindi characters and spaces)
        # Hindi Unicode range: \u0900-\u097F
        text = re.sub(r'[^\u0900-\u097F\s]', '', text)
        
        # Step 7: Remove extra spaces and normalize
        text = re.sub(r'\s+', ' ', text).strip()
        
        return text

    def write_to_csv(self, info):
        with open(self.csv_filename, 'a', newline='', encoding='utf-8') as f:
            writer = csv.DictWriter(f, fieldnames=['url', 'content', 'word_count'])
            writer.writerow(info)

    def DFS(self, url, current_depth=0):
        if((url in self.visited) or (self.num_reads >= self.num_articles)):
            return
        
        self.visited.add(url)

        page = self.get_response(url)
        if not page:
            return
        
        title = self.get_title(page)
        if not title:
            return
        
        lang = self.in_desired_lang(title)
        if(not lang):
            print(f"Skipping the url, Language detected is different from desired.")
            return
        
        print(f"Depth {current_depth}: Scraping {title}")
        article_info = self.extract_info(page, url)

        if article_info:
            preprocessed_text = self.preprocess_text(article_info["content"])
            self.write_to_csv({"url":article_info["url"], "content":preprocessed_text, "word_count": article_info["word_count"]})
            self.num_reads += 1

        if self.num_reads >= self.num_articles:
            return
        
        links = self.get_links(page)
        
        if current_depth < self.recursion_depth:  
            for display_text, next_url in links:
                if self.num_reads < self.num_articles:
                    print(f"Scraping {display_text}")
                    self.DFS(next_url, current_depth + 1)  # Go deeper
        
    def fire(self):
        self.DFS(self.seed_url)

In [44]:
def main():
    start_url = "https://hi.wikipedia.org/wiki/%E0%A4%B9%E0%A5%88%E0%A4%A6%E0%A4%B0%E0%A4%BE%E0%A4%AC%E0%A4%BE%E0%A4%A6_%E0%A4%95%E0%A5%87_%E0%A4%A8%E0%A4%BF%E0%A4%9C%E0%A4%BC%E0%A4%BE%E0%A4%AE"
    my_scraper = scraper(max_recurse=4, num_articles=2, max_article_length=500, desired_lang="hi", root_url=start_url, csv_path="scraper.csv")
    my_scraper.fire()
    print(f"Saving to {my_scraper.csv_filename}")
if __name__ == "__main__":
    main()

हैदराबाद के निज़ाम
Depth 0: Scraping हैदराबाद के निज़ाम
Scraping क़मरुद्दीन खान
निज़ाम-उल-मुल्क आसफजाह
Depth 1: Scraping निज़ाम-उल-मुल्क आसफजाह
Saving to scraper.csv
