<a href="https://colab.research.google.com/github/Nikakhtar/dynascrap/blob/main/Untitled10.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install scrapy
!pip install nest-asyncio
!pip install scrapy-user-agents
!pip install pandas

import json
import os
import scrapy
import pandas as pd
import sqlite3
import asyncio
from scrapy.crawler import CrawlerRunner
from scrapy.utils.project import get_project_settings
from twisted.internet import reactor, defer
import nest_asyncio
from io import StringIO

# Fix async issues in Google Colab
nest_asyncio.apply()

# Define Dynamic Scrapy Spider
class DynamicNewsSpider(scrapy.Spider):
    """Spider whose CSS selectors are supplied at construction time as JSON.

    Listing pages are parsed with ``list_rules``: ``item_url`` selects the
    links to follow and the optional ``next_page_url`` selects the link to
    the next listing page. Each followed page is parsed with ``item_rules``,
    where every key becomes an item field and every value is the CSS
    selector used to fill it.
    """

    name = "dynascrap_news"

    # Scrapy will save output in this file
    custom_settings = {
        'FEEDS': {
            'output.json': {'format': 'json', 'encoding': 'utf8'}
        },
        'LOG_ENABLED': True  # Enable logging for debugging
    }

    def __init__(self, website_url, list_rules_json, item_rules_json, search_keyword, file_name, pagination_urls=None, *args, **kwargs):
        """Store the crawl configuration.

        Args:
            website_url: First listing page to request.
            list_rules_json: JSON object with the listing-page selectors.
            item_rules_json: JSON object mapping field name -> CSS selector.
            search_keyword: Stored on the spider; not used by the parsing
                methods in this class.
            file_name: Stored on the spider; not used by the parsing
                methods in this class.
            pagination_urls: Optional iterable of extra listing pages to
                request in addition to ``website_url``.

        Raises:
            json.JSONDecodeError: If either rules argument is not valid JSON.
        """
        super().__init__(*args, **kwargs)
        self.website_url = website_url
        self.search_keyword = search_keyword
        self.list_rules = json.loads(list_rules_json)
        self.item_rules = json.loads(item_rules_json)
        # Unpacking accepts any iterable (list, tuple, ...) as well as None.
        self.start_urls = [website_url, *(pagination_urls or [])]
        self.file_name = file_name  # Unique filename for each website

    def parse(self, response):
        """Yield a request per article link, then one for the next listing page."""
        # First, scrape the listing page to extract article URLs
        article_links = response.css(self.list_rules["item_url"]).getall()

        if not article_links:
            self.log(f"No article links found on {response.url}")
            return

        for article_url in article_links:
            absolute_url = response.urljoin(article_url)  # Convert relative URL to absolute
            print(f"----=----> {absolute_url}")
            yield response.follow(absolute_url, callback=self.parse_item)

        # The pagination rule is optional: a site without one is crawled
        # from its start URLs only instead of failing with a KeyError.
        next_page_selector = self.list_rules.get("next_page_url")
        if not next_page_selector:
            return

        next_page_url = response.css(next_page_selector).get()
        if next_page_url:
            absolute_next_page_url = response.urljoin(next_page_url)
            print(f"----****----> NEXT PAGE {absolute_next_page_url}")
            yield response.follow(absolute_next_page_url, callback=self.parse)

    def parse_item(self, response):
        """Yield one item holding the first match of every item-rule selector."""
        item = {
            key: response.css(selector).get()
            for key, selector in self.item_rules.items()
        }

        # The URL actually fetched always replaces any scraped 'article_url'.
        item['article_url'] = response.url
        yield item

# Function to create a clean filename
def sanitize_filename(url):
    """Return ``articles_<url>.json`` with the scheme removed and '/' and '.' turned into '_'."""
    # Applied in order: drop the scheme prefixes first, then flatten
    # path separators and dots into underscores.
    substitutions = (
        ('https://', ''),
        ('http://', ''),
        ('/', '_'),
        ('.', '_'),
    )
    cleaned = url
    for old, new in substitutions:
        cleaned = cleaned.replace(old, new)
    return "articles_" + cleaned + ".json"

# Function to run multiple spiders in the same process
def run_multiple_spiders():
    """Crawl every entry of the module-level ``websites`` list, one after another.

    The spiders run sequentially inside a single Twisted reactor. This call
    blocks until every crawl has finished, or until one of them fails, and
    then returns once the reactor has stopped.

    Reads the module-level names ``websites`` and ``file_name``.
    """
    settings = get_project_settings()
    runner = CrawlerRunner(settings)

    @defer.inlineCallbacks
    def crawl():
        try:
            for site in websites:
                # Pass settings properly before running Scrapy
                settings.set('FEEDS', {file_name: {'format': 'json', 'encoding': 'utf8'}})
                print(f"Starting Scrapy for {site['website_url']} → Saving to {file_name}")

                # Run Scrapy spider; the same rules JSON feeds both the
                # listing-page and the item-page selectors.
                yield runner.crawl(DynamicNewsSpider,
                                   website_url=site["website_url"],
                                   list_rules_json=site["scraping_rules"],
                                   item_rules_json=site["scraping_rules"],
                                   search_keyword=site["search_keyword"],
                                   file_name=file_name,
                                   pagination_urls=site.get("pagination_urls"))
        finally:
            # Stop the reactor even when a crawl raises; otherwise the
            # reactor.run() call below would never return.
            reactor.stop()

    crawl()
    reactor.run()

# Function to wait for JSON file to be fully saved
def wait_for_file(timeout=60, path=None, poll_interval=2):
    """Block until a file exists or the timeout expires.

    Args:
        timeout: Maximum number of seconds to wait.
        path: File to wait for. Defaults to the module-level ``file_name``.
        poll_interval: Seconds to sleep between existence checks.

    Returns:
        True if the file exists, False if the timeout expired first.
    """
    import time

    target = file_name if path is None else path
    # A monotonic clock keeps the timeout correct even if the system
    # wall clock is adjusted while waiting.
    deadline = time.monotonic() + timeout

    while not os.path.exists(target):
        if time.monotonic() > deadline:
            print(f"Timeout: File {target} not found after {timeout} seconds!")
            return False
        time.sleep(poll_interval)  # Wait before checking again

    return True

# Function to process JSON data and store it in SQLite
def _load_json_arrays(text):
    """Parse back-to-back JSON documents in *text* and merge them into one list."""
    decoder = json.JSONDecoder()
    records = []
    pos = 0
    end = len(text)
    while pos < end:
        # raw_decode does not accept leading whitespace, so skip it here.
        if text[pos].isspace():
            pos += 1
            continue
        chunk, pos = decoder.raw_decode(text, pos)
        if isinstance(chunk, list):
            records.extend(chunk)
        else:
            records.append(chunk)
    return records


def _tags_to_text(value):
    """Render a tags cell as comma-separated text; missing values become ''."""
    if isinstance(value, (list, tuple, set)):
        return ','.join(str(tag) for tag in value)
    if value is None or pd.isna(value):
        return ''
    # A single string is kept whole rather than split into characters.
    return str(value)


def process_data(path=None, db_path="dynascrap.db"):
    """Load the scraped JSON feed and append its rows to the ``articles`` table.

    The feed file may hold several JSON arrays written back to back (one per
    appended export). They are merged into a single list, the file is
    rewritten as that one valid JSON array, and every record is inserted
    into the SQLite database.

    Args:
        path: JSON feed to read. Defaults to the module-level ``file_name``.
        db_path: SQLite database file to write to.

    Returns:
        None. A missing or unparsable file is reported with ``print`` and
        nothing is written to the database.
    """
    source = file_name if path is None else path
    if not os.path.exists(source):
        print(f"File {source} not found! Skipping...")
        return

    print(f"Processing {source} and storing in database...")

    try:
        with open(source, 'r+', encoding='utf-8') as f:
            # Parse document by document instead of replacing '][' in the
            # raw text, which would also rewrite '][' inside field values.
            records = _load_json_arrays(f.read())
            merged = json.dumps(records, ensure_ascii=False)

            # Write back the corrected JSON data
            f.seek(0)
            f.write(merged)
            f.truncate()  # Remove any remaining old content

        # Load fixed JSON into pandas
        df = pd.read_json(StringIO(merged))

    except (ValueError, json.JSONDecodeError) as e:
        print(f"Error reading or parsing JSON from {source}: {e}")
        return

    text_columns_before_tags = (
        'article_title', 'article_author', 'article_source_name',
        'article_summary', 'article_content', 'article_subject',
    )
    text_columns_after_tags = (
        'article_main_pic', 'article_publish_date', 'article_url',
    )

    # Columns absent from the feed fall back to "-" (or '' for tags).
    rows = [
        (
            *(row.get(column, "-") for column in text_columns_before_tags),
            _tags_to_text(row.get('article_tags', [])),
            *(row.get(column, "-") for column in text_columns_after_tags),
        )
        for _, row in df.iterrows()
    ]

    conn = sqlite3.connect(db_path)
    try:
        cursor = conn.cursor()

        # Create table
        cursor.execute("""
            CREATE TABLE IF NOT EXISTS articles (
                id INTEGER PRIMARY KEY AUTOINCREMENT,
                article_title TEXT,
                article_author TEXT,
                article_source_name TEXT,
                article_summary TEXT,
                article_content TEXT,
                article_subject TEXT,
                article_tags TEXT,
                article_main_pic TEXT,
                article_publish_date TEXT,
                article_url TEXT
            )
            """)

        cursor.executemany("""
            INSERT INTO articles (article_title, article_author, article_source_name, article_summary,
            article_content, article_subject, article_tags, article_main_pic, article_publish_date, article_url)
            VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?)""",
            rows
        )
        conn.commit()
    finally:
        # Release the database handle even if an insert fails.
        conn.close()

# List of websites and their scraping rules

# Both catalogue categories share one page layout, so the selector set is
# serialized to JSON once and reused by every entry below.
_CATALOGUE_RULES = json.dumps({
    "item_url": "article h3 a::attr(href)",
    "next_page_url": "#default > div > div > div > div > section > div:nth-child(2) > div > ul > li.next > a::attr(href)",
    "article_title": "article h1::text",
    "article_content": "#content_inner > article > p *::text",
    "article_author": "#content_inner > article > div.row > div.col-sm-6.product_main > p.price_color",
    "article_url": "h3 a::attr(href)"
})

# One (start URL, search keyword) pair per site to crawl.
_CATALOGUE_TARGETS = (
    ("https://books.toscrape.com/catalogue/category/books/mystery_3/index.html", "mystery"),
    ("https://books.toscrape.com/catalogue/category/books/biography_36/index.html", "science"),
)

websites = [
    {
        "website_url": start_url,
        "scraping_rules": _CATALOGUE_RULES,
        "search_keyword": keyword,
        "pagination_urls": [],  # a fresh, empty list for each site
    }
    for start_url, keyword in _CATALOGUE_TARGETS
]


# Feed file shared by the whole pipeline: run_multiple_spiders(),
# wait_for_file() and process_data() all read this module-level name,
# so it must be assigned before any of them is called.
file_name = 'output.json'
# Run all scrapers in the same process
run_multiple_spiders()

# Ensure JSON file is properly saved before processing
wait_for_file()
process_data()

# NOTE(review): this message is printed unconditionally, including when the
# feed file never appeared or could not be parsed.
print("Scraping completed and data stored successfully!")


Collecting scrapy
  Downloading Scrapy-2.12.0-py2.py3-none-any.whl.metadata (5.3 kB)
Collecting Twisted>=21.7.0 (from scrapy)
  Downloading twisted-24.11.0-py3-none-any.whl.metadata (20 kB)
Collecting cssselect>=0.9.1 (from scrapy)
  Downloading cssselect-1.2.0-py2.py3-none-any.whl.metadata (2.2 kB)
Collecting itemloaders>=1.0.1 (from scrapy)
  Downloading itemloaders-1.3.2-py3-none-any.whl.metadata (3.9 kB)
Collecting parsel>=1.5.0 (from scrapy)
  Downloading parsel-1.10.0-py2.py3-none-any.whl.metadata (11 kB)
Collecting queuelib>=1.4.2 (from scrapy)
  Downloading queuelib-1.7.0-py2.py3-none-any.whl.metadata (5.7 kB)
Collecting service-identity>=18.1.0 (from scrapy)
  Downloading service_identity-24.2.0-py3-none-any.whl.metadata (5.1 kB)
Collecting w3lib>=1.17.0 (from scrapy)
  Downloading w3lib-2.3.1-py3-none-any.whl.metadata (2.3 kB)
Collecting zope.interface>=5.1.0 (from scrapy)
  Downloading zope.interface-7.2-cp311-cp311-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_6