# 1.4 Accessing web data without an API

In [2]:
# Import libraries

import pandas as pd
import time
from selenium import webdriver
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
import matplotlib.pyplot as plt
import os
import logging
import requests
import bs4

In [3]:
# Build the Chrome launch options by registering each command-line switch in turn
chrome_options = Options()
for chrome_flag in (
    "--headless",  # Run in headless mode for testing
    "--no-sandbox",
    "--disable-dev-shm-usage",
):
    chrome_options.add_argument(chrome_flag)

In [4]:
# Have ChromeDriverManager install the driver, then hand whatever install()
# returns to a Service instance
chromedriver_path = ChromeDriverManager().install()
service = Service(chromedriver_path)

In [5]:
# Start Chrome through the prepared Service, applying the configured options
driver = webdriver.Chrome(options=chrome_options, service=service)

## Scraping the 20th-century Wikipedia page using Selenium

In [6]:
# Store the article URL in `page_url` and navigate the Selenium driver to it

page_url = "https://en.wikipedia.org/wiki/Key_events_of_the_20th_century"
driver.get(page_url)

In [7]:
# Collect every element on the loaded page whose class name is 'div-col'

character_elems = driver.find_elements(By.CLASS_NAME, 'div-col')

In [10]:
# Print the text of each matched element, or a notice when nothing matched
if not character_elems:
    print("No elements found with the specified class name.")
else:
    for matched in character_elems:
        print(matched.text)

No elements found with the specified class name.


## Parsing the 20th-century Wikipedia page with BeautifulSoup

In [11]:
from bs4 import BeautifulSoup
import requests

In [12]:
# Fetch the article over HTTP with requests.
# NOTE(review): this rebinds `page_url` from the URL string to the Response
# object returned by requests.get(); no visible cell reads it afterwards (the
# soup below is built from driver.page_source). The name is kept so that any
# code relying on it keeps working.
# A timeout is passed because requests otherwise waits indefinitely for the
# server; on expiry the call raises instead of hanging the notebook.

page_url = requests.get(
    "https://en.wikipedia.org/wiki/Key_events_of_the_20th_century",
    timeout=30,
)

In [14]:
# Take the HTML currently held by the Selenium driver and parse it with
# BeautifulSoup using the 'html.parser' backend
rendered_html = driver.page_source
soup = BeautifulSoup(rendered_html, 'html.parser')

In [15]:
# Quit the Selenium driver; its page source was already handed to
# BeautifulSoup in the previous cell, so the browser is no longer needed
driver.quit()

In [20]:
# Announce the step, then pull the text out of the parsed document with
# get_text() and keep it in `page_content` for saving
print("Extracting text from the page...")
page_content = soup.get_text()

Extracting text from the page...


In [21]:
# Write the extracted text to a UTF-8 encoded .txt file, announcing the target first
output_file = 'key_events_20th_century.txt'
print(f"Saving content to {output_file}...")
with open(output_file, mode='w', encoding='utf-8') as txt_handle:
    txt_handle.write(page_content)

Saving content to key_events_20th_century.txt...


In [23]:
# Resolve `output_file` to an absolute path via os.path.abspath so the
# confirmation message can show the full location
absolute_path = os.path.abspath(output_file)

In [24]:
# Report whether the output file exists; on success also show its absolute path
if not os.path.exists(output_file):
    print(f"Failed to create the file {output_file}.")
else:
    print(f"File {output_file} has been created successfully.")
    print(f"File path: {absolute_path}")

File key_events_20th_century.txt has been created successfully.
File path: c:\Users\okumb\Downloads\20th_century\venv_20th\Scripts\key_events_20th_century.txt
