In [1]:
import spacy
import pandas as pd
from itertools import combinations

  import pkg_resources


In [2]:
with open("20th_century_key_events.txt", "r", encoding="utf-8") as f:
    text = f.read()

print("Characters in text:", len(text))
print("\nPreview:\n")
print(text[:500])

Characters in text: 110314

Preview:

Jump to content
Main menu
Search
Appearance
Donate
Create account
Log in
Wiki Loves Folklore
Photograph your local culture, help Wikipedia and win!
PARTICIPATE NOW
[Help with translations!]
Toggle the table of contents
Key events of the 20th century
2 languages
Article
Talk
Read
Edit
View history
Tools
From Wikipedia, the free encyclopedia
The 20th century changed the world in unprecedented ways. The World Wars sparked tension between countries and led to the creation of atomic bombs, the Cold W


In [3]:
bad_starts = (
    "Jump to content",
    "Main menu",
    "Search",
    "Appearance",
    "Donate",
    "Create account",
    "Log in",
    "Wiki Loves",
    "Toggle the table of contents",
    "2 languages",
    "Article",
    "Talk",
    "Read",
    "Edit",
    "View history",
    "Tools"
)

clean_lines = []
for line in text.splitlines():
    stripped = line.strip()
    if stripped.startswith(bad_starts):
        continue
    if stripped == "":
        continue
    clean_lines.append(stripped)

clean_text = "\n".join(clean_lines)

with open("20th_century_key_events_clean.txt", "w", encoding="utf-8") as f:
    f.write(clean_text)

print(clean_text[:500])

Photograph your local culture, help Wikipedia and win!
PARTICIPATE NOW
[Help with translations!]
Key events of the 20th century
From Wikipedia, the free encyclopedia
The 20th century changed the world in unprecedented ways. The World Wars sparked tension between countries and led to the creation of atomic bombs, the Cold War led to the space race and the creation of space-based rockets, and the World Wide Web was created. These advancements have played a significant role in citizens' lives and s


### Text Wrangling Observations

The scraped text included navigation elements and metadata from Wikipedia 
(e.g., "Jump to content", "Main menu", "Tools", etc.). 

These elements are not part of the article content and would interfere 
with Named Entity Recognition (NER) by introducing irrelevant entities.

Therefore:
- Navigation and interface text was removed.
- Empty lines were excluded.
- The cleaned version was saved as a new `.txt` file for analysis.

This ensures that the NER model processes only meaningful article content.

In [4]:
nlp = spacy.load("en_core_web_sm")

doc = nlp(clean_text)

# Preview first 20 named entities
[(ent.text, ent.label_) for ent in doc.ents[:20]]

[('Wikipedia', 'GPE'),
 ('the 20th century', 'DATE'),
 ('Wikipedia', 'ORG'),
 ('The 20th century', 'DATE'),
 ('the Cold War', 'EVENT'),
 ('the 21st century', 'DATE'),
 ('today', 'DATE'),
 ('the 20th', 'DATE'),
 ('Edwardian', 'NORP'),
 ('1914', 'DATE'),
 ('the 20th century', 'DATE'),
 ('The 1900s', 'DATE'),
 ('the decade', 'DATE'),
 ('1914', 'DATE'),
 ('the Panama Canal', 'LOC'),
 ('Africa', 'LOC'),
 ('the 1900s', 'DATE'),
 ('the Congo Free State', 'ORG'),
 ('1914', 'DATE'),
 ('the First World War', 'EVENT')]

In [5]:
sentence_entities = []

for sent in doc.sents:
    entities = [ent.text for ent in sent.ents if ent.label_ == "GPE"]
    if len(entities) > 1:
        sentence_entities.append(entities)

sentence_entities[:10]

[['British Empire', 'France', 'the Russian Empire', 'Empire'],
 ['Russia', 'Tsar'],
 ['Germany', 'Russia'],
 ['Germany', 'the Ottoman Empire'],
 ['Yugoslavia', 'Czechoslovakia'],
 ['Germany', 'Italy'],
 ['Germany', 'Germany'],
 ['Germany', 'the United States'],
 ['Austria', 'Germany'],
 ['Moscow', 'Czechoslovakia', 'Britain', 'France', 'Poland']]

In [6]:
# Normalize extracted GPE entities to better match our lookup list
alias_map = {
    "Britain": "United Kingdom",
    "UK": "United Kingdom",
    "U.K.": "United Kingdom",
    "US": "United States",
    "U.S.": "United States",
    "the United States": "United States",
    "the Soviet Union": "Soviet Union",
}

def normalize_entity(e: str) -> str:
    e = e.strip()
    if e.lower().startswith("the "):
        e = e[4:]
    return alias_map.get(e, e)

sentence_entities_norm = [
    [normalize_entity(e) for e in sent]
    for sent in sentence_entities
]

In [7]:
countries = [
    "United States","Germany","France","United Kingdom","Russia","China","Japan",
    "Italy","Spain","Canada","Australia","India","Brazil","Mexico","Soviet Union",
    "Poland","Austria","Hungary","Turkey","Netherlands","Belgium","Sweden",
    "Norway","Finland","Denmark","Greece","Portugal","Switzerland","Ireland"
]

filtered_sentences = []

for sentence in sentence_entities_norm:
    filtered = [country for country in sentence if country in countries]
    if len(filtered) > 1:
        filtered_sentences.append(filtered)

filtered_sentences[:10]

[['Germany', 'Russia'],
 ['Germany', 'Italy'],
 ['Germany', 'Germany'],
 ['Germany', 'United States'],
 ['Austria', 'Germany'],
 ['United Kingdom', 'France', 'Poland'],
 ['United Kingdom', 'France', 'Germany', 'Poland', 'Poland', 'Soviet Union'],
 ['Poland', 'Soviet Union', 'Germany'],
 ['Germany', 'Soviet Union'],
 ['Denmark', 'Norway']]

In [8]:
from itertools import combinations
import pandas as pd

relationships = []

for sentence in filtered_sentences:
    for pair in combinations(set(sentence), 2):
        relationships.append(pair)

df_relationships = pd.DataFrame(relationships, columns=["Country1", "Country2"])

df_relationships.head()

Unnamed: 0,Country1,Country2
0,Germany,Russia
1,Germany,Italy
2,Germany,United States
3,Austria,Germany
4,United Kingdom,France


In [9]:
df_relationships.to_csv("country_relationships.csv", index=False)

df_relationships.shape

(109, 2)