In [None]:
!pip install scrapy
!pip install pandas
!pip install boto3

Collecting scrapy
  Downloading Scrapy-2.11.2-py2.py3-none-any.whl.metadata (5.3 kB)
Collecting Twisted>=18.9.0 (from scrapy)
  Downloading twisted-24.7.0-py3-none-any.whl.metadata (18 kB)
Collecting cssselect>=0.9.1 (from scrapy)
  Downloading cssselect-1.2.0-py2.py3-none-any.whl.metadata (2.2 kB)
Collecting itemloaders>=1.0.1 (from scrapy)
  Downloading itemloaders-1.3.1-py3-none-any.whl.metadata (3.9 kB)
Collecting parsel>=1.5.0 (from scrapy)
  Downloading parsel-1.9.1-py2.py3-none-any.whl.metadata (11 kB)
Collecting queuelib>=1.4.2 (from scrapy)
  Downloading queuelib-1.7.0-py2.py3-none-any.whl.metadata (5.7 kB)
Collecting service-identity>=18.1.0 (from scrapy)
  Downloading service_identity-24.1.0-py3-none-any.whl.metadata (4.8 kB)
Collecting w3lib>=1.17.0 (from scrapy)
  Downloading w3lib-2.2.1-py3-none-any.whl.metadata (2.1 kB)
Collecting zope.interface>=5.1.0 (from scrapy)
  Downloading zope.interface-7.0.3-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_6

In [None]:
%%bash
# Scaffold the Scrapy project, then generate one spider per news site inside it.
scrapy startproject news_scraper
cd news_scraper
# Each spider's domain is www.<spider name>.com, so it is derived from the name.
for site in livemint economics telegraf inc42 digitalterminal; do
    scrapy genspider "$site" "www.$site.com"
done

New Scrapy project 'news_scraper', using template directory '/usr/local/lib/python3.10/dist-packages/scrapy/templates/project', created in:
    /content/news_scraper

You can start your first spider with:
    cd news_scraper
    scrapy genspider example example.com
Created spider 'livemint' using template 'basic' in module:
  news_scraper.spiders.livemint
Created spider 'economics' using template 'basic' in module:
  news_scraper.spiders.economics
Created spider 'telegraf' using template 'basic' in module:
  news_scraper.spiders.telegraf
Created spider 'inc42' using template 'basic' in module:
  news_scraper.spiders.inc42
Created spider 'digitalterminal' using template 'basic' in module:
  news_scraper.spiders.digitalterminal


In [None]:
!pip install scrapy boto3



In [None]:
import scrapy
from scrapy.crawler import CrawlerProcess
import json

# Define the structure of the article item
class NewsArticle(scrapy.Item):
    """Container for one scraped news article.

    Every field is filled in by ``LivemintSpider.parse_article``; values are
    stored as extracted, so a selector that matches nothing leaves ``None``
    (or an empty string for ``article_content``).
    """
    # URL of the article page the item was scraped from (``response.url``).
    article_url = scrapy.Field()
    # Text of the first <h1> on the page.
    title = scrapy.Field()
    # Link text of the author anchor inside ``span.authorName``.
    author_name = scrapy.Field()
    # ``href`` of that same author anchor, as it appears in the page.
    author_url = scrapy.Field()
    # Paragraph text from ``div.contentSec`` joined with single spaces.
    article_content = scrapy.Field()
    # Raw text of ``span.pubtime``; it is not parsed into a date object.
    published_date = scrapy.Field()

# Spider to scrape articles from Livemint
class LivemintSpider(scrapy.Spider):
    """Crawl Livemint: home page -> category pages -> article pages.

    Yields one ``NewsArticle`` for every article page that is reached.
    """
    name = 'livemint'
    # Use the registered domain rather than the 'www' host, matching the later
    # definitions of this spider in the notebook, so links that point at other
    # livemint.com subdomains are not filtered out as off-site.
    allowed_domains = ['livemint.com']
    start_urls = ['https://www.livemint.com/']

    def parse(self, response):
        """Follow every link found in the site's navigation bar."""
        # Extract links to all category pages
        category_links = response.css('nav.nav a::attr(href)').getall()
        for link in category_links:
            # response.follow resolves relative hrefs against the current page.
            yield response.follow(link, self.parse_category)

    def parse_category(self, response):
        """Follow every headline link on a category page."""
        # Extract links to individual articles
        article_links = response.css('h2.headline a::attr(href)').getall()
        for link in article_links:
            yield response.follow(link, self.parse_article)

    def parse_article(self, response):
        """Build a ``NewsArticle`` from an article page and yield it.

        Fields whose selector matches nothing are left as ``None``;
        ``article_content`` becomes an empty string in that case.
        """
        article = NewsArticle()
        article['article_url'] = response.url
        article['title'] = response.css('h1::text').get()
        article['author_name'] = response.css('span.authorName a::text').get()
        article['author_url'] = response.css('span.authorName a::attr(href)').get()
        # Paragraph text nodes are joined with single spaces into one string.
        article['article_content'] = ' '.join(response.css('div.contentSec p::text').getall())
        article['published_date'] = response.css('span.pubtime::text').get()

        yield article

# Pipeline to write the scraped data to a local JSON file
class JsonWriterPipeline:
    """Item pipeline that writes each item to ``articles.json`` as one JSON object per line."""

    def open_spider(self, spider):
        """Open the output file for writing, truncating any existing content."""
        self.file = open('articles.json', 'w')

    def process_item(self, item, spider):
        """Write ``item`` as a single JSON line and return it unchanged."""
        serialised = json.dumps(dict(item))
        self.file.write(serialised + "\n")
        return item

    def close_spider(self, spider):
        """Close the output file once the spider has finished."""
        self.file.close()

# Configure the crawler process.
# The feed export is the only writer of articles.json. JsonWriterPipeline is
# deliberately not registered under ITEM_PIPELINES: it opens the same path in
# 'w' mode and writes one JSON object per line, so enabling both would mix two
# formats in a single file and leave it unparseable.
process = CrawlerProcess(settings={
    "FEEDS": {
        # overwrite=True replaces the file on every run; the local-file default
        # appends, which yields invalid JSON as soon as the crawl is run twice.
        "articles.json": {"format": "json", "overwrite": True},
    },
    "USER_AGENT": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36",
})

# Run the spider.
# process.start() runs the Twisted reactor and blocks until the crawl ends. The
# reactor cannot be started a second time in the same kernel, so running another
# crawl afterwards raises ReactorNotRestartable until the kernel is restarted.
process.crawl(LivemintSpider)
process.start()


INFO:scrapy.utils.log:Scrapy 2.11.2 started (bot: scrapybot)
2024-09-12 15:06:16 [scrapy.utils.log] INFO: Scrapy 2.11.2 started (bot: scrapybot)
INFO:scrapy.utils.log:Versions: lxml 4.9.4.0, libxml2 2.10.3, cssselect 1.2.0, parsel 1.9.1, w3lib 2.2.1, Twisted 24.7.0, Python 3.10.12 (main, Jul 29 2024, 16:56:48) [GCC 11.4.0], pyOpenSSL 24.2.1 (OpenSSL 3.3.2 3 Sep 2024), cryptography 43.0.1, Platform Linux-6.1.85+-x86_64-with-glibc2.35
2024-09-12 15:06:16 [scrapy.utils.log] INFO: Versions: lxml 4.9.4.0, libxml2 2.10.3, cssselect 1.2.0, parsel 1.9.1, w3lib 2.2.1, Twisted 24.7.0, Python 3.10.12 (main, Jul 29 2024, 16:56:48) [GCC 11.4.0], pyOpenSSL 24.2.1 (OpenSSL 3.3.2 3 Sep 2024), cryptography 43.0.1, Platform Linux-6.1.85+-x86_64-with-glibc2.35
INFO:scrapy.addons:Enabled addons:
[]
2024-09-12 15:06:16 [scrapy.addons] INFO: Enabled addons:
[]


See the documentation of the 'REQUEST_FINGERPRINTER_IMPLEMENTATION' setting for information on how to handle this deprecation.
  return cls(crawler

In [None]:
import scrapy
from scrapy.crawler import CrawlerProcess

class NewsArticle(scrapy.Item):
    """Scrapy item describing a single news article.

    The fields are populated in ``LivemintSpider.parse_article`` directly from
    CSS selector results, without any cleaning or type conversion.
    """
    # Address of the scraped page, taken from ``response.url``.
    article_url = scrapy.Field()
    # Headline: text of the page's first <h1>.
    title = scrapy.Field()
    # Author display name: text of the anchor in ``span.authorName``.
    author_name = scrapy.Field()
    # Author link: ``href`` attribute of the anchor in ``span.authorName``.
    author_url = scrapy.Field()
    # Body text: ``div.contentSec`` paragraph text joined with spaces.
    article_content = scrapy.Field()
    # Publication time as the raw text of ``span.pubtime``.
    published_date = scrapy.Field()

class LivemintSpider(scrapy.Spider):
    """Crawl livemint.com from the home page through category pages to articles.

    Yields one ``NewsArticle`` per article page.
    """
    name = 'livemint'
    allowed_domains = ['livemint.com']
    start_urls = ['https://www.livemint.com/']

    def parse(self, response):
        """Follow the links in the navigation bar of the home page."""
        # Extract category links (e.g., sections like Business, Technology)
        category_links = response.css('nav.nav a::attr(href)').getall()

        # Follow category links
        for link in category_links:
            yield response.follow(link, self.parse_category)

    def parse_category(self, response):
        """Follow every headline link found on a category page."""
        # Extract article links from the category page
        article_links = response.css('h2.headline a::attr(href)').getall()

        # Report through the spider's logger rather than print(), so the message
        # carries the spider name and obeys the configured log level.
        self.logger.debug("Found article links: %s", article_links)

        # Follow each article link
        for link in article_links:
            yield response.follow(link, self.parse_article)

    def parse_article(self, response):
        """Build a ``NewsArticle`` from an article page and yield it.

        Fields whose selector matches nothing are left as ``None``;
        ``article_content`` becomes an empty string in that case.
        """
        article = NewsArticle()
        article['article_url'] = response.url
        article['title'] = response.css('h1::text').get()
        article['author_name'] = response.css('span.authorName a::text').get()
        article['author_url'] = response.css('span.authorName a::attr(href)').get()
        # Paragraph text nodes are joined with single spaces into one string.
        article['article_content'] = ' '.join(response.css('div.contentSec p::text').getall())
        article['published_date'] = response.css('span.pubtime::text').get()

        yield article

# Configure and run the spider
process = CrawlerProcess(settings={
    "FEEDS": {
        # overwrite=True replaces articles.json instead of appending to what an
        # earlier run left behind; appended JSON arrays cannot be parsed.
        "articles.json": {"format": "json", "overwrite": True},
    },
    "USER_AGENT": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36",
    "DOWNLOAD_DELAY": 1,  # Adding a delay to avoid overloading the server
})

# No `process.crawlers` guard: a CrawlerProcess created just above never has
# crawlers yet, so that check was always true and did not protect against re-runs.
# NOTE: process.start() runs the Twisted reactor, which can only be started once
# per kernel. If a crawl has already run in this session, this cell raises
# ReactorNotRestartable -- restart the kernel before running it.
process.crawl(LivemintSpider)
process.start()


INFO:scrapy.utils.log:Scrapy 2.11.2 started (bot: scrapybot)
2024-09-12 15:13:38 [scrapy.utils.log] INFO: Scrapy 2.11.2 started (bot: scrapybot)
INFO:scrapy.utils.log:Versions: lxml 4.9.4.0, libxml2 2.10.3, cssselect 1.2.0, parsel 1.9.1, w3lib 2.2.1, Twisted 24.7.0, Python 3.10.12 (main, Jul 29 2024, 16:56:48) [GCC 11.4.0], pyOpenSSL 24.2.1 (OpenSSL 3.3.2 3 Sep 2024), cryptography 43.0.1, Platform Linux-6.1.85+-x86_64-with-glibc2.35
2024-09-12 15:13:38 [scrapy.utils.log] INFO: Versions: lxml 4.9.4.0, libxml2 2.10.3, cssselect 1.2.0, parsel 1.9.1, w3lib 2.2.1, Twisted 24.7.0, Python 3.10.12 (main, Jul 29 2024, 16:56:48) [GCC 11.4.0], pyOpenSSL 24.2.1 (OpenSSL 3.3.2 3 Sep 2024), cryptography 43.0.1, Platform Linux-6.1.85+-x86_64-with-glibc2.35
INFO:scrapy.addons:Enabled addons:
[]
2024-09-12 15:13:38 [scrapy.addons] INFO: Enabled addons:
[]


See the documentation of the 'REQUEST_FINGERPRINTER_IMPLEMENTATION' setting for information on how to handle this deprecation.
  return cls(crawler

ReactorNotRestartable: 

In [None]:
!pip install scrapy




In [None]:
import scrapy
from scrapy.crawler import CrawlerProcess

# Define your Scrapy item
class NewsArticle(scrapy.Item):
    """Fields collected for each article scraped by ``LivemintSpider``.

    Values are whatever the spider's CSS selectors return, so a field is
    ``None`` when its selector finds no match.
    """
    # Page URL of the article (``response.url``).
    article_url = scrapy.Field()
    # Text content of the first <h1> element.
    title = scrapy.Field()
    # Text of the author link inside ``span.authorName``.
    author_name = scrapy.Field()
    # Target (``href``) of the author link inside ``span.authorName``.
    author_url = scrapy.Field()
    # Space-joined paragraph text taken from ``div.contentSec``.
    article_content = scrapy.Field()
    # Unparsed text of ``span.pubtime``.
    published_date = scrapy.Field()

# Define your spider
class LivemintSpider(scrapy.Spider):
    """Spider that walks Livemint's navigation, category and article pages.

    Each article page produces one ``NewsArticle`` item.
    """
    name = 'livemint'
    allowed_domains = ['livemint.com']
    start_urls = ['https://www.livemint.com/']

    def parse(self, response):
        """Schedule a category request for every navigation-bar link."""
        yield from (
            response.follow(href, self.parse_category)
            for href in response.css('nav.nav a::attr(href)').getall()
        )

    def parse_category(self, response):
        """Schedule an article request for every headline link on the page."""
        yield from (
            response.follow(href, self.parse_article)
            for href in response.css('h2.headline a::attr(href)').getall()
        )

    def parse_article(self, response):
        """Yield a ``NewsArticle`` populated from the article page."""
        # Paragraph text nodes are merged into one space-separated string.
        paragraphs = response.css('div.contentSec p::text').getall()

        yield NewsArticle(
            article_url=response.url,
            title=response.css('h1::text').get(),
            author_name=response.css('span.authorName a::text').get(),
            author_url=response.css('span.authorName a::attr(href)').get(),
            article_content=' '.join(paragraphs),
            published_date=response.css('span.pubtime::text').get(),
        )


In [None]:
# Run the spider
from twisted.internet import reactor
from scrapy.crawler import CrawlerRunner

runner = CrawlerRunner({
    "FEEDS": {
        # overwrite=True replaces articles.json on each run; the local-file
        # default appends, which leaves several JSON arrays in one file and
        # breaks json.load() on it.
        "articles.json": {"format": "json", "overwrite": True},
    },
    "USER_AGENT": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36",
    "DOWNLOAD_DELAY": 1,  # Delay to avoid overloading the server
})

# Run spider within the reactor's event loop
def run_spider():
    """Schedule the Livemint crawl and stop the reactor once it ends.

    Returns:
        The Deferred for the crawl, after the reporting and stop callbacks
        have been attached.
    """
    deferred = runner.crawl(LivemintSpider)
    # Print a crawl failure before shutting down; without this the stop
    # callback below would discard the error silently.
    deferred.addErrback(lambda failure: print(failure.getTraceback()))
    # Stop the reactor whether the crawl succeeded or failed, so reactor.run() returns.
    deferred.addBoth(lambda _: reactor.stop())
    return deferred

run_spider()

# Start the Twisted reactor to run Scrapy; blocks until reactor.stop() is called.
# The reactor can be run only once per kernel session.
reactor.run()




See the documentation of the 'REQUEST_FINGERPRINTER_IMPLEMENTATION' setting for information on how to handle this deprecation.
  return cls(crawler)


In [None]:
import json

# Read the exported feed back into memory as a list of article dicts
with open('articles.json') as feed_file:
    articles = json.load(feed_file)

# Pretty-print up to the first five articles as a quick sanity check
for record in articles[:5]:
    formatted = json.dumps(record, indent=2)
    print(formatted)
