In [3]:
import wikipediaapi
import csv
from collections import deque

def wikipedia_scrap():
    gu_wiki = wikipediaapi.Wikipedia(
        user_agent="GujaratiScraperBot",
        language='gu',
        extract_format=wikipediaapi.ExtractFormat.WIKI
    )
    counter = 0
    visited_links = set()
    page_queue = deque()  # Queue for iterative scraping when recursion depth is reached

    # Function to scrape a page and its links with recursion limit handling
    def scrape_page(page_title, writer):
        nonlocal counter
        if page_title in visited_links:
            return
        visited_links.add(page_title)

        try:
            page = gu_wiki.page(page_title)
            if not page.exists() or len(page.text) == 0:
                return

            # Write the page data to CSV
            writer.writerow({
                'URL': page.fullurl,
                'DATA': page.text
            })
            counter += 1
            print(f"Page {page_title} scraped, Count: {counter}")

            # Recursively scrape links
            for link_title in page.links.keys():
                scrape_page(link_title, writer)
        except RecursionError:
            print(f"Recursion limit reached while scraping {page_title}. Switching to iteration.")
            # If recursion limit is reached, add remaining links to the queue for iterative processing
            for link_title in page.links.keys():
                if link_title not in visited_links:
                    page_queue.append(link_title)
        except Exception as e:
            print(f"Some exception occured : {e}")
            return

    # Function to scrape pages iteratively (fallback after recursion limit)
    def scrape_page_iteratively(writer):
        nonlocal counter
        while page_queue:
            try:
                page_title = page_queue.popleft()
                if page_title in visited_links:
                    continue
                visited_links.add(page_title)

                page = gu_wiki.page(page_title)
            
                if not page.exists() or len(page.text) == 0:
                    continue
            except Exception as e:
                print(f"Some exception occured : {e}")
                continue
            
            # Write the page data to CSV
            writer.writerow({
                'URL': page.fullurl,
                'DATA': page.text
            })
            counter += 1
            print(f"Page {page_title} scraped, Count: {counter}")

            # Add links from the current page to the queue
            for link_title in page.links.keys():
                if link_title not in visited_links:
                    page_queue.append(link_title)

    # Main scraping logic
    with open('gujarati_wiki_scraped.csv', 'w', newline='', encoding='utf-8') as csvfile:
        fieldnames = ['URL', 'DATA']
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
        writer.writeheader()

        # Start with the initial page
        start_page_title = "પાયથોન(પ્રોગ્રામિંગ_ભાષા)"
        scrape_page(start_page_title, writer)

        # If recursion limit was reached, continue with iterative scraping
        scrape_page_iteratively(writer)

    print(f"Data has been saved to gujarati_wiki_scraped.csv")

# Call the function to scrape pages
wikipedia_scrap()


Page પાયથોન(પ્રોગ્રામિંગ_ભાષા) scraped, Count: 1
Page C++(પ્રોગ્રામિંગ ભાષા) scraped, Count: 2
Page C (પ્રોગ્રામિંગ ભાષા) scraped, Count: 3
Page C શાર્પ (પ્રોગ્રામિંગ ભાષા) scraped, Count: 4
Page IP એડ્રેસ scraped, Count: 5
Page IPv4 scraped, Count: 6
Page Hypertext Transfer Protocol scraped, Count: 7
Page HTML scraped, Count: 8
Page ઇન્ટરનેટ scraped, Count: 9
Page ISBN (identifier) scraped, Count: 10
Page ઇન્ટરનેશનલ સ્ટાન્ડર્ડ સિરિયલ નંબર scraped, Count: 11
Page ઇન્ટરનેશનલ સ્ટાન્ડર્ડ બુક નંબર scraped, Count: 12
Page ઢાંચાની ચર્ચા:સ્ટબ scraped, Count: 13
Page સભ્ય:Dsvyas scraped, Count: 14
Page અંગ્રેજી ભાષા scraped, Count: 15
Page ઇંગ્લેન્ડ scraped, Count: 16
Page અફઘાનિસ્તાન scraped, Count: 17
Page ઈરાન scraped, Count: 18
Page અક્રોતીરી અને ધેકેલીયા scraped, Count: 19
Page અંગ્રેજી scraped, Count: 20
Page ઓસ્ટ્રેલિયા scraped, Count: 21
Page એડિલેઇડ scraped, Count: 22
Page ઑસ્ટ્રેલિયા scraped, Count: 23
Page ક્વીન્સલેન્ડ scraped, Count: 24
Page ક્રિકેટ scraped, Count: 25
Page ઈસ્ટોનિય

ReadTimeout: HTTPSConnectionPool(host='gu.wikipedia.org', port=443): Read timed out. (read timeout=10.0)