In [1]:
%pip install requests beautifulsoup4 lxml pandas

Note: you may need to restart the kernel to use updated packages.


In [1]:
import os, time, re
import pandas as pd
import requests
from bs4 import BeautifulSoup

TARGET_URL = "https://en.wikipedia.org/wiki/Key_events_of_the_20th_century"  
assert TARGET_URL.startswith("http"), "Please paste a valid http(s) URL."

In [2]:
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
                  "AppleWebKit/537.36 (KHTML, like Gecko) "
                  "Chrome/120.0 Safari/537.36"
}
resp = requests.get(TARGET_URL, headers=headers, timeout=30)
resp.raise_for_status()
html = resp.text
len(html)

602295

In [3]:
soup = BeautifulSoup(html, "lxml")

def clean(txt: str) -> str:
    # normalize whitespace
    return re.sub(r"\s+", " ", txt).strip()

elements = []
for css in ["h1","h2","h3","h4","p","li"]:
    for el in soup.select(css):
        txt = clean(el.get_text(separator=" "))
        if txt:
            elements.append({"tag": css, "text": txt})

len(elements), elements[:10]

(547,
 [{'tag': 'h1', 'text': 'Key events of the 20th century'},
  {'tag': 'h2', 'text': 'Contents'},
  {'tag': 'h2', 'text': 'Historic events in the 20th century'},
  {'tag': 'h2', 'text': 'See also'},
  {'tag': 'h2', 'text': 'References'},
  {'tag': 'h2', 'text': 'Sources'},
  {'tag': 'h2', 'text': 'External links'},
  {'tag': 'h3', 'text': 'World at the beginning of the century'},
  {'tag': 'h3', 'text': 'Spanish flu'},
  {'tag': 'h3', 'text': 'Between the wars'}])

In [4]:
import numpy as np

df = pd.DataFrame(elements)
# Example heuristic: keep rows that look like years (1900–1999) or contain a 4-digit year
year_pat = r"\b(?:19\d{2}|20\d{2})\b"
df["has_year"] = df["text"].str.contains(year_pat, regex=True)
events_df = df[df["has_year"] | df["tag"].isin(["h2","h3","li"])].copy()

# (Optional) drop near-duplicates
events_df["text_norm"] = events_df["text"].str.lower()
events_df = events_df.drop_duplicates(subset=["text_norm"]).drop(columns=["text_norm"])

events_df.head(10)

Unnamed: 0,tag,text,has_year
1,h2,Contents,False
2,h2,Historic events in the 20th century,False
3,h2,See also,False
4,h2,References,False
5,h2,Sources,False
6,h2,External links,False
7,h3,World at the beginning of the century,False
8,h3,Spanish flu,False
9,h3,Between the wars,False
10,h3,Global war: World War II (1939–1945),True


In [5]:
OUTPUT_TXT = "key_events_20th_century.txt"

# choose which frame to save: `df` (all) OR `events_df` (filtered)
to_save = events_df if not events_df.empty else df

with open(OUTPUT_TXT, "w", encoding="utf-8") as f:
    for _, row in to_save.iterrows():
        f.write(f"[{row['tag']}] {row['text']}\n")

print(f"Saved {len(to_save)} lines to {OUTPUT_TXT}")
os.listdir()


Saved 471 lines to key_events_20th_century.txt


['.git',
 '.ipynb_checkpoints',
 '20th-Century-Scrape.ipynb',
 '20th_century_demo.ipynb',
 '20th_century_text_analysis.ipynb',
 'key_events_20th_century.csv',
 'key_events_20th_century.txt',
 'README.md',
 'requirements_20th_century.txt',
 'requirements_20th_century.txt.lnk',
 'STask_3_LambdaF.ipynb',
 'Test.NB.S.ipynb',
 'test_notebook.ipynb']

In [6]:
import re

# Compile the pattern once so we can use .search()
year_pat = re.compile(r"\b(18|19|20)\d{2}\b")

def first_year(s: str):
    """Find the first 4-digit year (1800–2099) in a string."""
    if not isinstance(s, str):
        return None
    m = year_pat.search(s)
    return m.group(0) if m else None

# Now safely apply to your text column
events_df["year"] = events_df["text"].apply(first_year)
events_df["event"] = events_df["text"]
events_df[["year", "event"]].head(10)


Unnamed: 0,year,event
1,,Contents
2,,Historic events in the 20th century
3,,See also
4,,References
5,,Sources
6,,External links
7,,World at the beginning of the century
8,,Spanish flu
9,,Between the wars
10,1939.0,Global war: World War II (1939–1945)


In [7]:
events_df[["year","event"]].to_csv("key_events_20th_century.csv", index=False, encoding="utf-8")