<a href="https://colab.research.google.com/github/PiehTVH/AI-Learning/blob/main/Data_Crawling.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
%%shell
# Install Chromium, chromium-driver and selenium on an Ubuntu-based runtime.
#
# Ubuntu no longer distributes chromium-browser outside of snap, so the packages
# are pulled from the Debian buster repositories instead.
#
# Proposed solution: https://askubuntu.com/questions/1204571/how-to-install-chromium-without-snap

# Register the Debian buster repositories; each entry is tied to its own keyring
# file, which is created in the "Add keys" step below.
cat > /etc/apt/sources.list.d/debian.list <<'EOF'
deb [arch=amd64 signed-by=/usr/share/keyrings/debian-buster.gpg] http://deb.debian.org/debian buster main
deb [arch=amd64 signed-by=/usr/share/keyrings/debian-buster-updates.gpg] http://deb.debian.org/debian buster-updates main
deb [arch=amd64 signed-by=/usr/share/keyrings/debian-security-buster.gpg] http://deb.debian.org/debian-security buster/updates main
EOF

# Add keys: fetch the three Debian signing keys from the Ubuntu keyserver ...
apt-key adv --keyserver keyserver.ubuntu.com --recv-keys DCC9EFBF77E11517
apt-key adv --keyserver keyserver.ubuntu.com --recv-keys 648ACFD622F3D138
apt-key adv --keyserver keyserver.ubuntu.com --recv-keys 112695A0E562B32A

# ... and export each one into the keyring file referenced by "signed-by" above.
apt-key export 77E11517 | gpg --dearmour -o /usr/share/keyrings/debian-buster.gpg
apt-key export 22F3D138 | gpg --dearmour -o /usr/share/keyrings/debian-buster-updates.gpg
apt-key export E562B32A | gpg --dearmour -o /usr/share/keyrings/debian-security-buster.gpg

# Prefer debian repo for chromium* packages only
# Note the double-blank lines between entries
# NOTE(review): the first entry pins release "eoan", while the apt output recorded
# in this notebook shows a "jammy" runtime -- confirm the pin is still intended.
cat > /etc/apt/preferences.d/chromium.pref << 'EOF'
Package: *
Pin: release a=eoan
Pin-Priority: 500


Package: *
Pin: origin "deb.debian.org"
Pin-Priority: 300


Package: chromium*
Pin: origin "deb.debian.org"
Pin-Priority: 700
EOF

# Install chromium and chromium-driver
# -y answers the confirmation prompt so the cell can run unattended ("Run all").
apt-get update
apt-get install -y chromium chromium-driver

# Install selenium
pip install selenium

Executing: /tmp/apt-key-gpghome.roTrcFN3PR/gpg.1.sh --keyserver keyserver.ubuntu.com --recv-keys DCC9EFBF77E11517
gpg: key DCC9EFBF77E11517: public key "Debian Stable Release Key (10/buster) <debian-release@lists.debian.org>" imported
gpg: Total number processed: 1
gpg:               imported: 1
Executing: /tmp/apt-key-gpghome.AbFkrtmPUk/gpg.1.sh --keyserver keyserver.ubuntu.com --recv-keys 648ACFD622F3D138
gpg: key DC30D7C23CBBABEE: public key "Debian Archive Automatic Signing Key (10/buster) <ftpmaster@debian.org>" imported
gpg: Total number processed: 1
gpg:               imported: 1
Executing: /tmp/apt-key-gpghome.WlEPnY53ta/gpg.1.sh --keyserver keyserver.ubuntu.com --recv-keys 112695A0E562B32A
gpg: key 4DFAB270CAA96DFA: public key "Debian Security Archive Automatic Signing Key (10/buster) <ftpmaster@debian.org>" imported
gpg: Total number processed: 1
gpg:               imported: 1
Get:1 https://cloud.r-project.org/bin/linux/ubuntu jammy-cran40/ InRelease [3,626 B]
Get:2 http://de



In [2]:
# Disabled alternative: headless Chrome driver setup.
# Every line in this cell is commented out, so running it does nothing; the crawl
# cell below builds a headless Firefox driver instead.

# from tqdm import tqdm
# from selenium import webdriver
# from selenium.webdriver.common.by import By
# from selenium.webdriver.support.ui import WebDriverWait
# from selenium.webdriver.support import expected_conditions as EC

# chrome_options = webdriver.ChromeOptions()
# chrome_options.add_argument("--headless=new")
# chrome_options.add_argument("--no-sandbox")
# driver = webdriver.Chrome(
#     options=chrome_options
# )


# 2. Crawl

In [69]:
import os
import pprint
import random
import time

import requests
from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver.common.by import By
from selenium.webdriver.firefox.options import Options as FirefoxOptions
from selenium.webdriver.firefox.service import Service
from tqdm import tqdm

# Crawl the "thoi-su" listing pages of vietnamnet.vn with a headless Firefox
# driver and save every text article as ./vn_news_corpus/article_<id>.txt.

# --- Browser setup ------------------------------------------------------------
user_agent = 'Mozilla'  # NOTE(review): assigned but never passed to the browser options
firefox_options = FirefoxOptions()
firefox_options.add_argument('--headless')
firefox_options.set_preference("javascript.enabled", False)
driver = webdriver.Firefox(options=firefox_options)

# --- Crawl configuration ------------------------------------------------------
# Create a folder for storing articles
root_dir = "./vn_news_corpus"
os.makedirs(root_dir, exist_ok=True)
n_pages = 10 # Change if we want more articles
article_id = 0  # running index used in the output file names
page_load_delay = 1  # seconds to pause after opening each article page

# NOTE(review): Selenium evaluates an XPath that starts with "//" against the
# whole document, even when find_element(s) is called on an element. The
# expressions are kept exactly as they were; switch to ".//" only after
# confirming every target really lives inside the main content container.
news_lst_xpath = '//div[@class="topStory-15nd"]/div/div[1]/a'
main_content_xpath = '//div[@class="content-detail content-mobile-change"]'
video_content_xpath = '//div[@class="video-detail"]'
author_xpath = '//span[@class="name"]'
paragraphs_xpath = '//div[@class="maincontent main-content"]/p'


def _element_text(parent, by, locator):
    """Return the stripped text of the first match found via ``parent``.

    Returns '' when no element matches, so a missing optional field (author,
    abstract, ...) does not abort the crawl.
    """
    try:
        return parent.find_element(by, locator).text.strip()
    except NoSuchElementException:
        return ''


def _read_article(browser):
    """Build the text of the article currently loaded in ``browser``.

    Returns:
        A single string "title abstract author paragraphs", or None when the
        page has no main content container or is a video article.
    """
    # Try to get main content tag
    try:
        main_content_tag = browser.find_element(By.XPATH, main_content_xpath)
    except NoSuchElementException:
        return None

    # Ignore video article (find_elements returns [] instead of raising)
    if main_content_tag.find_elements(By.XPATH, video_content_xpath):
        return None

    # Title (h1 tag), abstract (h2 tag) and author name (span tag)
    title = _element_text(main_content_tag, By.TAG_NAME, 'h1')
    abstract = _element_text(main_content_tag, By.TAG_NAME, 'h2')
    author = _element_text(main_content_tag, By.XPATH, author_xpath)

    # Paragraphs (all p tags in div "maincontent main-content")
    paragraphs = ' '.join(
        paragraph_tag.text.strip()
        for paragraph_tag in main_content_tag.find_elements(
            By.XPATH, paragraphs_xpath
        )
    )

    # Combine title, abstract, author and paragraphs into 1 string
    return " ".join([title, abstract, author, paragraphs])


try:
    for page_idx in tqdm(range(n_pages)):
        # Access to table page
        main_url = f'https://vietnamnet.vn/thoi-su-page{page_idx}'
        driver.get(main_url)

        # Get list of articles (list of URLs); anchors without an href are
        # dropped because get_attribute returns None for them.
        news_tags = driver.find_elements(By.XPATH, news_lst_xpath)
        news_page_urls = [
            href
            for href in (news_tag.get_attribute('href') for news_tag in news_tags)
            if href
        ]

        for news_page_url in news_page_urls:
            # Access to article page. Every navigation uses an absolute URL
            # collected above, so there is no need to go back to the listing.
            driver.get(news_page_url)
            time.sleep(page_load_delay)

            final_content = _read_article(driver)
            if final_content is None:
                continue

            # Save article to .txt file
            article_savepath = os.path.join(root_dir, f"article_{article_id}.txt")
            pprint.pprint(article_savepath)

            # Explicit UTF-8: the articles are Vietnamese text and the platform
            # default encoding is not guaranteed to be able to represent them.
            with open(article_savepath, "w", encoding="utf-8") as f:
                f.write(final_content)
            article_id += 1
finally:
    # Always shut the headless browser down, even if the crawl is interrupted
    # or a page load fails; `driver` is not usable after this cell finishes.
    driver.quit()

  0%|          | 0/10 [00:00<?, ?it/s]

'./vn_news_corpus/article_0.txt'
'./vn_news_corpus/article_1.txt'
'./vn_news_corpus/article_2.txt'
'./vn_news_corpus/article_3.txt'
'./vn_news_corpus/article_4.txt'
'./vn_news_corpus/article_5.txt'
'./vn_news_corpus/article_6.txt'
'./vn_news_corpus/article_7.txt'
'./vn_news_corpus/article_8.txt'
'./vn_news_corpus/article_9.txt'
'./vn_news_corpus/article_10.txt'
'./vn_news_corpus/article_11.txt'
'./vn_news_corpus/article_12.txt'
'./vn_news_corpus/article_13.txt'


 10%|█         | 1/10 [00:30<04:33, 30.41s/it]

'./vn_news_corpus/article_14.txt'
'./vn_news_corpus/article_15.txt'
'./vn_news_corpus/article_16.txt'
'./vn_news_corpus/article_17.txt'
'./vn_news_corpus/article_18.txt'
'./vn_news_corpus/article_19.txt'
'./vn_news_corpus/article_20.txt'
'./vn_news_corpus/article_21.txt'
'./vn_news_corpus/article_22.txt'
'./vn_news_corpus/article_23.txt'
'./vn_news_corpus/article_24.txt'
'./vn_news_corpus/article_25.txt'
'./vn_news_corpus/article_26.txt'
'./vn_news_corpus/article_27.txt'


 20%|██        | 2/10 [00:55<03:37, 27.14s/it]

'./vn_news_corpus/article_28.txt'
'./vn_news_corpus/article_29.txt'
'./vn_news_corpus/article_30.txt'
'./vn_news_corpus/article_31.txt'
'./vn_news_corpus/article_32.txt'
'./vn_news_corpus/article_33.txt'
'./vn_news_corpus/article_34.txt'
'./vn_news_corpus/article_35.txt'
'./vn_news_corpus/article_36.txt'
'./vn_news_corpus/article_37.txt'
'./vn_news_corpus/article_38.txt'
'./vn_news_corpus/article_39.txt'
'./vn_news_corpus/article_40.txt'
'./vn_news_corpus/article_41.txt'


 30%|███       | 3/10 [01:19<03:00, 25.72s/it]

'./vn_news_corpus/article_42.txt'
'./vn_news_corpus/article_43.txt'
'./vn_news_corpus/article_44.txt'
'./vn_news_corpus/article_45.txt'
'./vn_news_corpus/article_46.txt'
'./vn_news_corpus/article_47.txt'
'./vn_news_corpus/article_48.txt'
'./vn_news_corpus/article_49.txt'
'./vn_news_corpus/article_50.txt'
'./vn_news_corpus/article_51.txt'
'./vn_news_corpus/article_52.txt'
'./vn_news_corpus/article_53.txt'
'./vn_news_corpus/article_54.txt'


 40%|████      | 4/10 [01:43<02:30, 25.11s/it]

'./vn_news_corpus/article_55.txt'
'./vn_news_corpus/article_56.txt'
'./vn_news_corpus/article_57.txt'
'./vn_news_corpus/article_58.txt'
'./vn_news_corpus/article_59.txt'
'./vn_news_corpus/article_60.txt'
'./vn_news_corpus/article_61.txt'
'./vn_news_corpus/article_62.txt'
'./vn_news_corpus/article_63.txt'
'./vn_news_corpus/article_64.txt'
'./vn_news_corpus/article_65.txt'
'./vn_news_corpus/article_66.txt'
'./vn_news_corpus/article_67.txt'
'./vn_news_corpus/article_68.txt'


 50%|█████     | 5/10 [02:08<02:04, 24.94s/it]

'./vn_news_corpus/article_69.txt'
'./vn_news_corpus/article_70.txt'
'./vn_news_corpus/article_71.txt'
'./vn_news_corpus/article_72.txt'
'./vn_news_corpus/article_73.txt'
'./vn_news_corpus/article_74.txt'
'./vn_news_corpus/article_75.txt'
'./vn_news_corpus/article_76.txt'
'./vn_news_corpus/article_77.txt'
'./vn_news_corpus/article_78.txt'
'./vn_news_corpus/article_79.txt'
'./vn_news_corpus/article_80.txt'
'./vn_news_corpus/article_81.txt'
'./vn_news_corpus/article_82.txt'


 60%|██████    | 6/10 [02:32<01:39, 24.92s/it]

'./vn_news_corpus/article_83.txt'
'./vn_news_corpus/article_84.txt'
'./vn_news_corpus/article_85.txt'
'./vn_news_corpus/article_86.txt'
'./vn_news_corpus/article_87.txt'
'./vn_news_corpus/article_88.txt'
'./vn_news_corpus/article_89.txt'
'./vn_news_corpus/article_90.txt'
'./vn_news_corpus/article_91.txt'
'./vn_news_corpus/article_92.txt'
'./vn_news_corpus/article_93.txt'
'./vn_news_corpus/article_94.txt'
'./vn_news_corpus/article_95.txt'
'./vn_news_corpus/article_96.txt'


 70%|███████   | 7/10 [02:57<01:14, 24.84s/it]

'./vn_news_corpus/article_97.txt'
'./vn_news_corpus/article_98.txt'
'./vn_news_corpus/article_99.txt'
'./vn_news_corpus/article_100.txt'
'./vn_news_corpus/article_101.txt'
'./vn_news_corpus/article_102.txt'
'./vn_news_corpus/article_103.txt'
'./vn_news_corpus/article_104.txt'
'./vn_news_corpus/article_105.txt'
'./vn_news_corpus/article_106.txt'
'./vn_news_corpus/article_107.txt'
'./vn_news_corpus/article_108.txt'
'./vn_news_corpus/article_109.txt'


 80%|████████  | 8/10 [03:22<00:49, 24.78s/it]

'./vn_news_corpus/article_110.txt'
'./vn_news_corpus/article_111.txt'
'./vn_news_corpus/article_112.txt'
'./vn_news_corpus/article_113.txt'
'./vn_news_corpus/article_114.txt'
'./vn_news_corpus/article_115.txt'
'./vn_news_corpus/article_116.txt'
'./vn_news_corpus/article_117.txt'
'./vn_news_corpus/article_118.txt'
'./vn_news_corpus/article_119.txt'
'./vn_news_corpus/article_120.txt'
'./vn_news_corpus/article_121.txt'
'./vn_news_corpus/article_122.txt'
'./vn_news_corpus/article_123.txt'
'./vn_news_corpus/article_124.txt'


 90%|█████████ | 9/10 [03:47<00:24, 24.78s/it]

'./vn_news_corpus/article_125.txt'
'./vn_news_corpus/article_126.txt'
'./vn_news_corpus/article_127.txt'
'./vn_news_corpus/article_128.txt'
'./vn_news_corpus/article_129.txt'
'./vn_news_corpus/article_130.txt'
'./vn_news_corpus/article_131.txt'
'./vn_news_corpus/article_132.txt'
'./vn_news_corpus/article_133.txt'
'./vn_news_corpus/article_134.txt'
'./vn_news_corpus/article_135.txt'
'./vn_news_corpus/article_136.txt'
'./vn_news_corpus/article_137.txt'


100%|██████████| 10/10 [04:10<00:00, 25.06s/it]

'./vn_news_corpus/article_138.txt'





In [70]:
# Mount Google Drive at /content/gdrive so later cells can copy files into it.
from google.colab import drive

drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [None]:
# Recursively archive /content/vn_news_corpus into /content/vn_news_corpus.zip.
# NOTE(review): the crawl cell writes to the relative path ./vn_news_corpus, so this
# assumes the notebook's working directory is /content -- TODO confirm.
!zip -r /content/vn_news_corpus.zip /content/vn_news_corpus

In [75]:
# Copy the archive into the dataset folder on the mounted Drive.
# The folder is created first (and the destination ends with "/") so that cp can
# never write the zip to a plain file named "dataset" when the folder is missing.
!mkdir -p '/content/gdrive/MyDrive/Coordinate/2024/Module1/data_handling_project/dataset' && cp '/content/vn_news_corpus.zip' '/content/gdrive/MyDrive/Coordinate/2024/Module1/data_handling_project/dataset/'