In [11]:
# ✅ Cell 1 — SETUP (Imports)
# ------------------------------------------------------------
# This cell:
# 1) Imports all required libraries
# 2) Imports `logging` for optional debugging (no logging configuration is applied here)
# ------------------------------------------------------------

# Core libraries
import logging
import os
import time

import matplotlib.pyplot as plt
import pandas as pd

# Selenium libraries
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait




In [12]:
# ✅ Cell 2 — Start the browser and open the Wikipedia page
# ------------------------------------------------------------
# What this cell does:
# 1) Imports Selenium WebDriver (to control Chrome)
# 2) Imports ChromeDriverManager (to automatically download/use the right ChromeDriver)
# 3) Launches a Chrome browser session
# 4) Opens the target Wikipedia page in the browser
# ------------------------------------------------------------

# ChromeDriverManager automatically handles ChromeDriver installation + version matching
from webdriver_manager.chrome import ChromeDriverManager

# Selenium WebDriver is the tool that controls the browser
from selenium import webdriver

# Wikipedia article this notebook scrapes.
page_url = "https://en.wikipedia.org/wiki/Key_events_of_the_20th_century"

# Launch a Chrome session with default settings.
# NOTE(review): no driver path or Service is passed, so the ChromeDriverManager
# import above is not used by this call; Selenium is presumably locating the
# driver binary on its own — confirm for the installed Selenium version.
driver = webdriver.Chrome()

# Navigate the new browser session to the article.
driver.get(page_url)


In [13]:
# ✅ Cell 3 — Scrape the full page content and save to a .txt file
# ------------------------------------------------------------
# This cell:
# 1) Waits for the page content to render
# 2) Finds the main Wikipedia content container (mw-content-text)
# 3) Extracts all visible text from that container
# 4) Saves the text as a .txt file in the current working directory
# 5) Prints the saved file location + basic stats
# 6) Closes the browser
# ------------------------------------------------------------

# Upper bound (seconds) on how long to wait for the article container to appear.
PAGE_LOAD_TIMEOUT_S = 10

# Everything that touches the browser runs inside try/finally so the Chrome
# session is always shut down, even if the wait, the text extraction, or the
# file write raises.
try:
    # Explicit wait instead of a fixed sleep: returns as soon as the main
    # Wikipedia content container is present in the DOM, and raises
    # selenium's TimeoutException if it has not appeared within
    # PAGE_LOAD_TIMEOUT_S seconds.
    content = WebDriverWait(driver, PAGE_LOAD_TIMEOUT_S).until(
        EC.presence_of_element_located((By.ID, "mw-content-text"))
    )

    # Visible text of the content container.
    text = content.text

    # Relative path, so the file lands in the current working directory.
    output_file = "key_events_20th_century.txt"
    with open(output_file, "w", encoding="utf-8") as f:
        f.write(text)

    print("✅ Saved file to:", os.path.abspath(output_file))
    print("🔢 Characters written:", len(text))
finally:
    # Always release the browser process, on success or failure.
    driver.quit()
    print("✅ Browser closed.")

✅ Saved file to: /Users/sonaabraham/key_events_20th_century.txt
🔢 Characters written: 108969
✅ Browser closed.
