# IETF_MailCrawler
A Scrapy crawler for the IETF working-group (WG) mailing-list archive.

## Package Setup

In [18]:
from scrapy import Field
from scrapy.loader import ItemLoader, Item
from scrapy.loader.processors import MapCompose
from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor
from w3lib.html import remove_tags

## IETFItem

In [19]:
class IETFMailItem(Item):
    """Scrapy item with the three fields collected for one archived mail message.

    The fields are filled by ``IETFSpider.populate_item`` through an
    ``ItemLoader``.
    """

    # Filled from the "#msg-body h3::text" selector.
    title = Field()
    # Filled from the "#msg-info #msg-from::text" selector.
    date = Field()
    # Filled from the "pre.wordwrap::text" selector.
    content = Field()

## Crawler
Steps:
1. FOLLOWING: Follow the URLs matched by the first `Rule` (links on the browse page).
2. FOLLOWING: Follow the message URLs extracted in the `follow` method, and paginate at the same level.
3. SCRAPING: Scrape the fields and populate the item.

In [20]:
class IETFSpider(CrawlSpider):
    """Crawl the IETF mail archive and emit one ``IETFMailItem`` per message page.

    Crawl levels:
      1. ``start_urls`` is the archive browse page; ``rules`` follows the links
         found inside ``li.browse-link`` elements.
      2. ``follow`` handles every page reached through the rule: it schedules
         each message link and then asks ``paginate`` for the next page.
      3. ``populate_item`` scrapes a single message page into an item.
    """

    name = 'IETF'
    start_urls = ['https://mailarchive.ietf.org/arch/browse/']  # LEVEL 1

    # CSS selector for the elements that are, or contain, the message links.
    message_link_css = ".xtd.msg-detail"
    # TODO: placeholder carried over from the original code -- it is identical
    # to message_link_css. Point it at the real "next page" <a> element.
    next_page_css = ".xtd.msg-detail"

    # 1. FOLLOWING (LEVEL 1 -> LEVEL 2)
    # The callback is `follow`: this class defines no `parse` method, and the
    # Scrapy documentation advises against using `parse` as a CrawlSpider
    # rule callback.
    rules = (
        Rule(LinkExtractor(restrict_css='li.browse-link'), callback='follow'),
    )

    @staticmethod
    def _extract_hrefs(response, css_query):
        """Return the ``href`` values found on the elements matched by ``css_query``.

        Each matched element contributes the ``href`` of itself (when it is an
        ``<a>``) and of every ``<a>`` nested inside it, so the helper works
        whether the selector targets the link or a container around it.

        Args:
            response: The Scrapy response to search.
            css_query: CSS selector string.

        Returns:
            A list of href strings (possibly empty), in document order.
        """
        matched = response.css(css_query)
        return matched.xpath("descendant-or-self::a/@href").extract()

    # 2. FOLLOWING LEVEL 2
    def follow(self, response):
        """Schedule every message linked from a listing page, then the next page.

        Args:
            response: Response for a listing page reached through ``rules``
                or through ``paginate``.

        Yields:
            One request per message link (handled by ``populate_item``),
            followed by the next-page request when ``paginate`` finds one.
        """
        # Follow real URLs (href values); the serialized element HTML returned
        # by `.extract()` on the elements themselves is not a URL.
        for href in self._extract_hrefs(response, self.message_link_css):
            yield response.follow(href, self.populate_item)

        # `paginate` returns None on the last page; do not yield that.
        next_request = self.paginate(response)
        if next_request is not None:
            yield next_request

    # 2. PAGINATION LEVEL 2
    def paginate(self, response):
        """Build the request for the next listing page, if there is one.

        Args:
            response: Response for the current listing page.

        Returns:
            A request for the first link matched by ``next_page_css`` whose
            callback is ``follow``, or ``None`` when nothing matches.
        """
        hrefs = self._extract_hrefs(response, self.next_page_css)
        if not hrefs:
            return None
        # The next page is another listing page, so it goes back to `follow`.
        return response.follow(hrefs[0], self.follow)

    # 3. SCRAPING LEVEL 3
    def populate_item(self, response):
        """Scrape one message page into an ``IETFMailItem``.

        Args:
            response: Response for a single message page.

        Yields:
            The item loaded from the page.
        """
        item_loader = ItemLoader(item=IETFMailItem(), response=response)
        # Strip any HTML tags from every extracted value.
        item_loader.default_input_processor = MapCompose(remove_tags)

        # mapping item field to the website content
        item_loader.add_css("title", "#msg-body h3::text")
        # NOTE(review): "date" is read from the #msg-from element -- confirm
        # that this element really carries the message date.
        item_loader.add_css("date", "#msg-info #msg-from::text")
        item_loader.add_css("content", "pre.wordwrap::text")

        yield item_loader.load_item()

## Execution

In [26]:
!scrapy

Scrapy 2.8.0 - no active project

Usage:
  scrapy <command> [options] [args]

Available commands:
  bench         Run quick benchmark test
  fetch         Fetch a URL using the Scrapy downloader
  genspider     Generate new spider using pre-defined templates
  runspider     Run a self-contained spider (without creating a project)
  settings      Get settings values
  shell         Interactive scraping console
  startproject  Create new project
  version       Print Scrapy version
  view          Open URL in browser, as seen by Scrapy

  [ more ]      More commands available when run from project directory

Use "scrapy <command> -h" to see more info about a command


In [27]:
!scrapy shell


2023-04-24 20:22:16 [scrapy.utils.log] INFO: Scrapy 2.8.0 started (bot: scrapybot)
2023-04-24 20:22:16 [scrapy.utils.log] INFO: Versions: lxml 4.9.2.0, libxml2 2.10.3, cssselect 1.2.0, parsel 1.8.1, w3lib 1.21.0, Twisted 22.10.0, Python 3.9.16 (main, Mar  8 2023, 04:29:44) - [Clang 14.0.6 ], pyOpenSSL 23.1.1 (OpenSSL 1.1.1t  7 Feb 2023), cryptography 39.0.1, Platform macOS-10.16-x86_64-i386-64bit
2023-04-24 20:22:16 [scrapy.crawler] INFO: Overridden settings:
{'DUPEFILTER_CLASS': 'scrapy.dupefilters.BaseDupeFilter',
 'LOGSTATS_INTERVAL': 0}


See the documentation of the 'REQUEST_FINGERPRINTER_IMPLEMENTATION' setting for information on how to handle this deprecation.
  return cls(crawler)

2023-04-24 20:22:16 [scrapy.utils.log] DEBUG: Using reactor: twisted.internet.selectreactor.SelectReactor
2023-04-24 20:22:16 [scrapy.extensions.telnet] INFO: Telnet Password: 401bab528d26ba5b
2023-04-24 20:22:16 [scrapy.middleware] INFO: Enabled extensions:
['scrapy.extensions.corestats.CoreStats',
