In [None]:
!pip install scrapy


Collecting scrapy
  Downloading Scrapy-2.11.0-py2.py3-none-any.whl (286 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m286.4/286.4 kB[0m [31m2.4 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting Twisted<23.8.0,>=18.9.0 (from scrapy)
  Downloading Twisted-22.10.0-py3-none-any.whl (3.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.1/3.1 MB[0m [31m53.0 MB/s[0m eta [36m0:00:00[0m
Collecting cssselect>=0.9.1 (from scrapy)
  Downloading cssselect-1.2.0-py2.py3-none-any.whl (18 kB)
Collecting itemloaders>=1.0.1 (from scrapy)
  Downloading itemloaders-1.1.0-py3-none-any.whl (11 kB)
Collecting parsel>=1.5.0 (from scrapy)
  Downloading parsel-1.8.1-py2.py3-none-any.whl (17 kB)
Collecting queuelib>=1.4.2 (from scrapy)
  Downloading queuelib-1.6.2-py2.py3-none-any.whl (13 kB)
Collecting service-identity>=18.1.0 (from scrapy)
  Downloading service_identity-23.1.0-py3-none-any.whl (12 kB)
Collecting w3lib>=1.17.0 (from scrapy)
  Downloading w3lib-2.1.

In [None]:
%%writefile vnc_spider.py
import scrapy

class VNCSpider(scrapy.Spider):
    """Crawl VietnamNet section listing pages and yield one item per headline.

    Every item is a dict with the keys ``title``, ``link`` (absolute URL) and
    ``category`` (English label derived from the URL of the listing page).
    """

    name = 'vnc1'

    # URL slug of each section -> English label stored on the items.
    # Insertion order defines both the crawl order and the lookup order used
    # by ``get_category``.
    CATEGORY_BY_SLUG = {
        'chinh-tri': 'Politics',
        'kinh-doanh': 'Business',
        'the-thao': 'Sports',
        'giao-duc': 'Education',
        'suc-khoe': 'Wellness',
        'giai-tri': 'Entertainment',
    }

    # Pages 1..400 of every section, one section after the other.
    # Only the outermost iterable of a comprehension may reference a
    # class-level name, so the base URL and page range stay as literals here.
    start_urls = [
        f'https://vietnamnet.vn/{slug}-page{page}'
        for slug in CATEGORY_BY_SLUG
        for page in range(1, 401)
    ]

    def parse(self, response):
        """Yield a ``{'title', 'link', 'category'}`` dict for each headline anchor.

        Args:
            response: The downloaded listing page.

        Yields:
            dict: ``title`` is the anchor's ``title`` attribute, ``link`` is the
            anchor's ``href`` resolved against the page URL, and ``category``
            is the label returned by ``get_category`` for ``response.url``
            (``'Unknown'`` when the URL matches no known section).
        """
        category = self.get_category(response.url)

        # Read both attributes from the same <a> element so that an anchor
        # missing one of them cannot shift the pairing of the remaining ones.
        for anchor in response.xpath('//div/h3/a'):
            title = anchor.xpath('@title').get()
            link = anchor.xpath('@href').get()
            if title is None or link is None:
                # Incomplete headline: nothing reliable to record.
                continue
            yield {
                'title': title,
                'link': response.urljoin(link),
                'category': category,
            }

    def get_category(self, url):
        """Return the category label for ``url``.

        Args:
            url: URL string to classify.

        Returns:
            str: The label of the first slug in ``CATEGORY_BY_SLUG`` that occurs
            anywhere in ``url``, or ``'Unknown'`` when none does.
        """
        for slug, label in self.CATEGORY_BY_SLUG.items():
            if slug in url:
                return label
        return 'Unknown'


Overwriting vnc_spider.py


In [None]:
%%writefile csv_pipeline.py
import csv

class CsvPipeline:
    """Item pipeline that appends each item's title and category to a CSV file.

    The file ``vietnamnet_data.csv`` is (re)created in the current working
    directory when the spider opens and closed when the spider closes.
    """

    def open_spider(self, spider):
        """Create the output file and write the header row."""
        handle = open('vietnamnet_data.csv', 'w', newline='', encoding='utf-8')
        self.file = handle
        self.writer = csv.writer(handle)
        # Header row
        self.writer.writerow(('title', 'category'))

    def close_spider(self, spider):
        """Close the output file opened in ``open_spider``."""
        self.file.close()

    def process_item(self, item, spider):
        """Write one ``title, category`` row and pass the item on unchanged.

        Raises:
            KeyError: If ``item`` lacks the ``title`` or ``category`` key.
        """
        row = [item[column] for column in ('title', 'category')]
        self.writer.writerow(row)
        return item


Writing csv_pipeline.py


In [None]:
from scrapy.crawler import CrawlerProcess
from vnc_spider import VNCSpider
import csv_pipeline


# Run the spider in-process; every scraped item is handed to CsvPipeline.
process = CrawlerProcess(settings={
    'ITEM_PIPELINES': {'csv_pipeline.CsvPipeline': 300},
    # At DEBUG level Scrapy logs every scraped item, which floods the cell
    # output over 2400 listing pages. INFO keeps periodic stats and errors.
    'LOG_LEVEL': 'INFO',
})


process.crawl(VNCSpider)
# Blocks until the crawl has finished. The underlying Twisted reactor cannot
# be started twice, so re-running this cell requires restarting the kernel.
process.start()


[1;30;43mStreaming output truncated to the last 5000 lines.[0m
DEBUG:scrapy.core.scraper:Scraped from <200 https://vietnamnet.vn/giai-tri-page352>
{'title': 'Long Kan kể câu chuyện văn hóa trên sàn diễn thời trang', 'link': 'https://vietnamnet.vn/long-kan-ke-cau-chuyen-van-hoa-tren-san-dien-thoi-trang-2074247.html', 'category': 'Entertainment'}
2023-12-15 18:45:25 [scrapy.core.scraper] DEBUG: Scraped from <200 https://vietnamnet.vn/giai-tri-page352>
{'title': 'Long Kan kể câu chuyện văn hóa trên sàn diễn thời trang', 'link': 'https://vietnamnet.vn/long-kan-ke-cau-chuyen-van-hoa-tren-san-dien-thoi-trang-2074247.html', 'category': 'Entertainment'}
DEBUG:scrapy.core.scraper:Scraped from <200 https://vietnamnet.vn/giai-tri-page352>
{'title': "Chủ tịch Hoa hậu Hòa bình Quốc tế: Thiên Ân trượt top 10 vì 'lưng dài, hông to'", 'link': 'https://vietnamnet.vn/chu-tich-miss-grand-international-tiet-lo-ly-do-thien-an-truot-top-10-2074147.html', 'category': 'Entertainment'}
2023-12-15 18:45:25 [s

In [None]:
import pandas as pd


# Load the rows written by CsvPipeline (columns: title, category).
df = pd.read_csv('vietnamnet_data.csv', encoding='utf-8')


# Group rows by category. A stable sort keeps rows that share a category in
# their original file order, so the same input always yields the same output.
df_sorted = df.sort_values(by='category', kind='stable')


# 'utf-8-sig' prepends a byte-order mark to the file; presumably chosen so
# spreadsheet tools detect the encoding of the Vietnamese text.
df_sorted.to_csv('sorted_vietnamnet_data.csv', index=False, encoding='utf-8-sig')


In [None]:
from google.colab import files
# Hand the sorted CSV to the browser as a download (Colab runtime only).
sorted_csv_name = 'sorted_vietnamnet_data.csv'
files.download(sorted_csv_name)
