In [None]:
pip install scrapy



In [None]:
import scrapy
import pandas as pd
from scrapy.crawler import CrawlerProcess

class MultiPageSpider(scrapy.Spider):
    """Split each Wikipedia article in ``start_urls`` into titled sections.

    For every page the spider yields one dict per section with the keys
    ``page_url``, ``section_title`` and ``section_content``.  The text that
    precedes the first ``<h2>`` section is emitted under the title
    ``'Introduction'``; every ``<h2>``/``<h3>`` section follows in document
    order, with the text of its ``<p>`` elements joined by single spaces.

    Two MediaWiki heading layouts are handled:

    * legacy:  ``<h2><span class="mw-headline">Title</span>...</h2>``
    * wrapped: ``<div class="mw-heading ..."><h2>Title</h2>...</div>``

    In the wrapped layout the section paragraphs are siblings of the wrapper
    ``<div>``, not of the heading element itself, so sibling traversal starts
    from the wrapper.
    """

    name = 'multipage'
    start_urls = [
        'https://fr.wikipedia.org/wiki/Taj_Mahal',
    ]

    custom_settings = {
        'ITEM_PIPELINES': {
            '__main__.DataFramePipeline': 1,
        }
    }

    # XPath predicate that is true for an element carrying the exact class
    # token "mw-heading" (and not merely e.g. "mw-heading2").
    _WRAPPER_TEST = (
        'contains(concat(" ", normalize-space(@class), " "), " mw-heading ")'
    )

    @staticmethod
    def _text_of(node):
        """Return all descendant text of *node*, concatenated and stripped."""
        return ''.join(node.xpath('.//text()').getall()).strip()

    def _locate_heading(self, heading):
        """Resolve a heading element to ``(anchor, title)``.

        *anchor* is the element whose following siblings hold the section
        body: the ``mw-heading`` wrapper when there is one, otherwise the
        heading itself.  Returns ``None`` for headings that are neither
        wrapped nor carry a ``mw-headline`` span (the original spider skipped
        those as well), and for headings whose title is empty.
        """
        wrapper = heading.xpath(f'parent::div[{self._WRAPPER_TEST}]')
        headline = heading.xpath('.//span[@class="mw-headline"]')
        if headline:
            # Join every text node so titles with inline markup
            # (<i>, <sup>, ...) are not cut off after the first fragment.
            title = self._text_of(headline[0])
        elif wrapper:
            title = self._text_of(heading)
        else:
            return None
        if not title:
            return None
        return (wrapper[0] if wrapper else heading), title

    def _starts_section(self, node):
        """Tell whether *node* opens a new h2/h3 section in either layout."""
        return bool(node.xpath(
            f'self::h2 | self::h3 | self::div[{self._WRAPPER_TEST}][h2 or h3]'
        ))

    def _paragraphs_after(self, anchor):
        """Collect non-empty ``<p>`` texts after *anchor*, up to the next section."""
        texts = []
        for node in anchor.xpath('following-sibling::*'):
            if self._starts_section(node):
                break
            if node.xpath('self::p'):
                text = self._text_of(node)
                if text:
                    texts.append(text)
        return texts

    def parse(self, response):
        """Yield the introduction and every titled section of *response*.

        Args:
            response: the downloaded page; only ``response.url`` and
                ``response.xpath`` are used.

        Yields:
            dict: ``page_url``, ``section_title`` and ``section_content``
            for the introduction (when it has text) and for each section.
            A section without paragraphs is still yielded, with an empty
            ``section_content``.
        """
        page_url = response.url

        # Resolve every usable heading once, keeping document order.
        located = []
        for heading in response.xpath('//h2 | //h3'):
            found = self._locate_heading(heading)
            if found is not None:
                located.append((heading, found[0], found[1]))

        # Introduction: paragraphs that sit before the first <h2> section.
        # The "is not None" test is deliberate; the anchor is a selector
        # object, so its truthiness is not a reliable "found" signal.
        first_h2_anchor = next(
            (anchor for heading, anchor, _ in located if heading.xpath('self::h2')),
            None,
        )
        if first_h2_anchor is not None:
            introduction_paragraphs = [
                text
                for text in map(self._text_of,
                                first_h2_anchor.xpath('preceding-sibling::p'))
                if text
            ]
            if introduction_paragraphs:
                yield {
                    'page_url': page_url,
                    'section_title': 'Introduction',
                    'section_content': ' '.join(introduction_paragraphs).strip(),
                }

        # Sections: one item per heading, body limited to its own paragraphs.
        for _, anchor, section_title in located:
            yield {
                'page_url': page_url,
                'section_title': section_title,
                'section_content': ' '.join(self._paragraphs_after(anchor)).strip(),
            }

class DataFramePipeline:
    """Item pipeline that gathers scraped items into a pandas DataFrame.

    Items are buffered per pipeline instance while the spider runs.  When the
    spider closes, the resulting table is published on the class attribute
    ``DataFramePipeline.df`` so code outside the crawl can read it.
    """

    # Result of the most recent crawl; an empty frame until a spider closes
    # (and again afterwards if that spider produced no items).
    df = pd.DataFrame()

    def __init__(self):
        # Per-instance buffer of the items seen so far.
        self.items = []

    def process_item(self, item, spider):
        """Record *item* and return it unchanged."""
        self.items.append(item)
        return item

    def close_spider(self, spider):
        """Build, publish and print the DataFrame of collected items.

        Rows are grouped by ``page_url``.  The sort is stable so that rows
        sharing a URL keep the order in which they were scraped; the default
        sort algorithm gives no such guarantee and can scramble the sections
        of a page.  The original row index is preserved.

        Also sets the pandas display options ``display.max_columns`` and
        ``display.expand_frame_repr`` globally before printing.
        """
        if self.items:
            DataFramePipeline.df = pd.DataFrame(self.items).sort_values(
                by='page_url', kind='mergesort'
            )
            pd.set_option('display.max_columns', None)
            pd.set_option('display.expand_frame_repr', False)
            print(DataFramePipeline.df)
        else:
            DataFramePipeline.df = pd.DataFrame()
            print("No items to display")

# Crawl: register the spider with a fresh crawler process and run it.
process = CrawlerProcess()
process.crawl(MultiPageSpider)
process.start()

# Results: read the table the pipeline stored on its class attribute and
# print it under a heading.
df = DataFramePipeline.df
print("DataFrame:", df, sep="\n")

# Export: write the table to CSV without the index column, then confirm.
df.to_csv(path_or_buf='scraped_data.csv', index=False)
print("DataFrame saved to 'scraped_data.csv'")


INFO:scrapy.utils.log:Scrapy 2.11.2 started (bot: scrapybot)
2024-07-19 09:33:02 [scrapy.utils.log] INFO: Scrapy 2.11.2 started (bot: scrapybot)
INFO:scrapy.utils.log:Versions: lxml 4.9.4.0, libxml2 2.10.3, cssselect 1.2.0, parsel 1.9.1, w3lib 2.2.1, Twisted 24.3.0, Python 3.10.12 (main, Mar 22 2024, 16:50:05) [GCC 11.4.0], pyOpenSSL 24.1.0 (OpenSSL 3.2.2 4 Jun 2024), cryptography 42.0.8, Platform Linux-6.1.85+-x86_64-with-glibc2.35
2024-07-19 09:33:02 [scrapy.utils.log] INFO: Versions: lxml 4.9.4.0, libxml2 2.10.3, cssselect 1.2.0, parsel 1.9.1, w3lib 2.2.1, Twisted 24.3.0, Python 3.10.12 (main, Mar 22 2024, 16:50:05) [GCC 11.4.0], pyOpenSSL 24.1.0 (OpenSSL 3.2.2 4 Jun 2024), cryptography 42.0.8, Platform Linux-6.1.85+-x86_64-with-glibc2.35
INFO:scrapy.addons:Enabled addons:
[]
2024-07-19 09:33:02 [scrapy.addons] INFO: Enabled addons:
[]


See the documentation of the 'REQUEST_FINGERPRINTER_IMPLEMENTATION' setting for information on how to handle this deprecation.
  return cls(crawler

                                     page_url                    section_title                                    section_content
0     https://fr.wikipedia.org/wiki/Taj_Mahal                     Introduction  Le Taj Mahal (en devanagari ताजमहल, en persan ...
29    https://fr.wikipedia.org/wiki/Taj_Mahal                   Liens externes                                                   
28    https://fr.wikipedia.org/wiki/Taj_Mahal                Articles connexes                                                   
26    https://fr.wikipedia.org/wiki/Taj_Mahal                    Bibliographie                                                   
24    https://fr.wikipedia.org/wiki/Taj_Mahal                       Voir aussi                                                   
..                                        ...                              ...                                                ...
41  https://fr.wikipedia.org/wiki/Tour_Eiffel      Tour de très grande hauteur  À son inau

In [None]:
# Evaluate the scraped DataFrame as the last expression of the cell so the
# notebook renders it (presumably the table shown below this cell).
df

Unnamed: 0,page_url,section_title,section_content
0,https://fr.wikipedia.org/wiki/Taj_Mahal,Introduction,"Le Taj Mahal (en devanagari ताजमहल, en persan ..."
29,https://fr.wikipedia.org/wiki/Taj_Mahal,Liens externes,
28,https://fr.wikipedia.org/wiki/Taj_Mahal,Articles connexes,
26,https://fr.wikipedia.org/wiki/Taj_Mahal,Bibliographie,
24,https://fr.wikipedia.org/wiki/Taj_Mahal,Voir aussi,
...,...,...,...
41,https://fr.wikipedia.org/wiki/Tour_Eiffel,Tour de très grande hauteur,"À son inauguration, la tour Eiffel est la stru..."
42,https://fr.wikipedia.org/wiki/Tour_Eiffel,Fréquentation de la tour Eiffel,Après le succès populaire pendant l’Exposition...
43,https://fr.wikipedia.org/wiki/Tour_Eiffel,Exploitation commerciale,
45,https://fr.wikipedia.org/wiki/Tour_Eiffel,Revenus de l'exploitation,"La tour Eiffel a coûté 7,8 millions de francs-..."
