### Imports:

In [2]:
import os
from pathlib import Path
import re
import itertools
import collections

import pandas as pd
import spacy

### Project folder in the notebook:

In [3]:
os.chdir(r"C:\Users\User\Downloads\20th-Century")
print("Now in:", os.getcwd())

Now in: C:\Users\User\Downloads\20th-Century


### Loading the text file:

In [4]:
DATA_TXT = "key_events_20th_century.txt"
assert Path(DATA_TXT).exists(), f"Can't find {DATA_TXT} in {os.getcwd()}"

with open(DATA_TXT, "r", encoding="utf-8") as f:
    lines = [ln.strip() for ln in f if ln.strip()]
text = "\n".join(lines)

len(text), text[:500]

(89070,
 '[h2] Contents\n[h2] Historic events in the 20th century\n[h2] See also\n[h2] References\n[h2] Sources\n[h2] External links\n[h3] World at the beginning of the century\n[h3] Spanish flu\n[h3] Between the wars\n[h3] Global war: World War II (1939–1945)\n[h3] The post-war world\n[h3] The world at the end of the century\n[h4] "The war to end all wars": World War I (1914–1918)\n[h4] The Cold War (1947–1991)\n[p] The new beginning of the 20th century marked significant changes. The 1900s saw the decade herald a ser')

### Inspecting weird characters & country name consistency:

In [5]:
# Look for non-ASCII characters
weird_chars = sorted({ch for ch in text if ord(ch) > 127})
weird_chars

['®',
 'ã',
 'é',
 'í',
 'ö',
 'У',
 'а',
 'к',
 'н',
 'р',
 'с',
 'ь',
 'ї',
 'ا',
 'ب',
 'ة',
 'ر',
 'ع',
 'ل',
 'ي',
 '–',
 '—',
 '’',
 '沖',
 '県',
 '縄',
 '／']

In [6]:
# Sample random lines to see how country names look in context
for i, line in enumerate(lines[:50], start=1):
    print(f"{i:3}: {line}")

  1: [h2] Contents
  2: [h2] Historic events in the 20th century
  3: [h2] See also
  4: [h2] References
  5: [h2] Sources
  6: [h2] External links
  7: [h3] World at the beginning of the century
  8: [h3] Spanish flu
  9: [h3] Between the wars
 10: [h3] Global war: World War II (1939–1945)
 11: [h3] The post-war world
 12: [h3] The world at the end of the century
 13: [h4] "The war to end all wars": World War I (1914–1918)
 14: [h4] The Cold War (1947–1991)
 15: [p] The new beginning of the 20th century marked significant changes. The 1900s saw the decade herald a series of inventions, including the automobile , airplane and radio broadcasting . 1914 saw the completion of the Panama Canal .
 16: [p] From 1914 to 1918, the First World War, and its aftermath, caused major changes in the power balance of the world, destroying or transforming some of the most powerful empires.
 17: [p] The First World War (or simply WWI), termed "The Great War" by contemporaries, started in July 1914 and 

### Text wrangling notes

- The text includes some special characters like en-dashes (–) and curly quotes (“ ”), but these do not break tokenization or NER.
- Country names appear mostly in standard forms (e.g., "United States", "United Kingdom", "Germany").
- Some historical names appear, like "Soviet Union" and "USSR". I will treat these as Russia in the country lookup.
- No major cleaning required besides normalizing whitespace; I will keep the original text but could normalize quotes/dashes if needed.

### Normalizing special characters:

In [7]:
cleaned = text.replace("\u2013", "-").replace("\u2014", "-")  # en/em dashes -> "-"
cleaned = cleaned.replace("\u2019", "'").replace("\u201c", '"').replace("\u201d", '"')

# Use cleaned text going forward
text = cleaned

### Cleaned file:

In [8]:
CLEAN_TXT = "key_events_20th_century_clean.txt"
with open(CLEAN_TXT, "w", encoding="utf-8") as f:
    f.write(text)
print("Saved cleaned text to", CLEAN_TXT)

Saved cleaned text to key_events_20th_century_clean.txt


### Loading spaCy model & creating doc:

In [9]:
nlp = spacy.load("en_core_web_sm")

doc = nlp(text)
len(list(doc.sents)), len(doc)

(1555, 19598)

### Collecting entities per sentence:

In [10]:
sentence_entities = []  # list of dicts: {"sent_id": int, "sentence": str, "ents": [(text, label), ...]}

for i, sent in enumerate(doc.sents):
    ents = [(ent.text, ent.label_) for ent in sent.ents]
    sentence_entities.append({
        "sent_id": i,
        "sentence": sent.text.strip(),
        "entities": ents,
    })

len(sentence_entities), sentence_entities[:3]

(1555,
 [{'sent_id': 0,
   'sentence': '[h2] Contents\n[h2] Historic events in the 20th century\n[h2] See also\n[h2] References\n[h2] Sources\n[h2] External links',
   'entities': [('Historic', 'PERSON'), ('the 20th century', 'DATE')]},
  {'sent_id': 1,
   'sentence': '[h3] World at the beginning of the century\n[h3] Spanish flu',
   'entities': [('the beginning of the century', 'DATE'),
    ('Spanish', 'NORP')]},
  {'sent_id': 2, 'sentence': '[h3] Between the wars', 'entities': []}])

#### Dataframe:

In [11]:
sent_df = pd.DataFrame(sentence_entities)
sent_df.head()

Unnamed: 0,sent_id,sentence,entities
0,0,[h2] Contents\n[h2] Historic events in the 20t...,"[(Historic, PERSON), (the 20th century, DATE)]"
1,1,[h3] World at the beginning of the century\n[h...,"[(the beginning of the century, DATE), (Spanis..."
2,2,[h3] Between the wars,[]
3,3,[h3] Global war: World War II (1939-1945),"[(World War II, EVENT), (1939-1945, DATE)]"
4,4,[h3] The post-war world,[]


### Simple curated country list + synonyms:

In [12]:
countries = [
    "United States", "USA", "United Kingdom", "UK", "Britain", "England",
    "France", "Germany", "Italy", "Spain", "Russia", "Soviet Union", "USSR",
    "China", "Japan", "India", "Pakistan", "Israel", "Egypt", "Turkey",
    "Vietnam", "Korea", "North Korea", "South Korea", "Canada",
    "Australia", "New Zealand", "South Africa", "Brazil", "Argentina"
]

# Normalize base country names (what we want in the network)
country_aliases = {
    "USA": "United States",
    "U.S.": "United States",
    "United States of America": "United States",
    "Britain": "United Kingdom",
    "England": "United Kingdom",
    "Soviet Union": "Russia",
    "USSR": "Russia",
    "Korean": "Korea",  # optional / context dependent
}

def normalize_country(name: str) -> str | None:
    name = name.strip()
    # alias mapping
    if name in country_aliases:
        return country_aliases[name]
    # exact match
    if name in countries:
        return name
    # simple title-case check
    if name.title() in countries:
        return name.title()
    return None

### Build a sentence → set of normalized country names:

In [13]:
sent_country_map = []  # each row: {"sent_id", "sentence", "countries": set([...])}

for row in sentence_entities:
    sent_id = row["sent_id"]
    sent_text = row["sentence"]
    ents = row["entities"]
    
    countries_in_sent = set()
    for ent_text, label in ents:
        if label in ("GPE", "LOC"):
            norm = normalize_country(ent_text)
            if norm:
                countries_in_sent.add(norm)
    
    if countries_in_sent:
        sent_country_map.append({
            "sent_id": sent_id,
            "sentence": sent_text,
            "countries": countries_in_sent,
        })

len(sent_country_map), sent_country_map[:5]

(120,
 [{'sent_id': 13,
   'sentence': 'After a period of diplomatic and military escalation known as the July Crisis , by the end of July 1914 two coalitions were at war: the Allies, comprised initially of the British Empire , France , and the Russian Empire ; and the Central Powers , comprised initially of the German Empire and Austria-Hungary .',
   'countries': {'France'}},
  {'sent_id': 14,
   'sentence': '[ 1 ] [ 2 ]\n[p] In 1917, Russia ended hostile actions against the Central Powers after the fall of the Tsar.',
   'countries': {'Russia'}},
  {'sent_id': 15,
   'sentence': 'The Bolsheviks negotiated the Treaty of Brest-Litovsk with Germany, although it was a huge cost to Russia.',
   'countries': {'Germany', 'Russia'}},
  {'sent_id': 16,
   'sentence': 'In the treaty, Bolshevik Russia ceded the Baltic states to Germany, and its province of Kars Oblast in the South Caucasus to the Ottoman Empire .',
   'countries': {'Germany'}},
  {'sent_id': 18,
   'sentence': '[ 3 ] Although 

#### Dataframe:

In [14]:
countries_sent_df = pd.DataFrame(sent_country_map)
countries_sent_df.head()

Unnamed: 0,sent_id,sentence,countries
0,13,After a period of diplomatic and military esca...,{France}
1,14,"[ 1 ] [ 2 ]\n[p] In 1917, Russia ended hostile...",{Russia}
2,15,The Bolsheviks negotiated the Treaty of Brest-...,"{Germany, Russia}"
3,16,"In the treaty, Bolshevik Russia ceded the Balt...",{Germany}
4,18,[ 3 ] Although Germany shifted huge forces fro...,{Germany}


### Building pair counts:

In [15]:
pair_counter = collections.Counter()

for row in sent_country_map:
    countries_in_sent = sorted(row["countries"])
    if len(countries_in_sent) < 2:
        continue  # need at least 2 to form a relationship
    # all unique pairs within this sentence
    for c1, c2 in itertools.combinations(countries_in_sent, 2):
        pair_counter[(c1, c2)] += 1

len(pair_counter), list(pair_counter.items())[:10]

(40,
 [(('Germany', 'Russia'), 2),
  (('France', 'United Kingdom'), 3),
  (('France', 'Germany'), 3),
  (('Germany', 'United Kingdom'), 1),
  (('Germany', 'Japan'), 2),
  (('Japan', 'Russia'), 1),
  (('Germany', 'Italy'), 2),
  (('China', 'Japan'), 3),
  (('Italy', 'Japan'), 2),
  (('India', 'Japan'), 1)])

### Converting to dataframe for export:

In [16]:
relations = [
    {"source": c1, "target": c2, "weight": w}
    for (c1, c2), w in pair_counter.items()
]

relations_df = pd.DataFrame(relations).sort_values("weight", ascending=False)
relations_df.head(20)

Unnamed: 0,source,target,weight
7,China,Japan,3
2,France,Germany,3
1,France,United Kingdom,3
0,Germany,Russia,2
13,China,North Korea,2
8,Italy,Japan,2
23,India,Pakistan,2
6,Germany,Italy,2
4,Germany,Japan,2
30,Pakistan,Russia,1


### Build country relationships using 5-word window:

In [19]:
import re
import pandas as pd
from collections import Counter
from itertools import combinations

countries = [
    "France", "Germany", "United Kingdom", "United States", "Russia", "Italy",
]
countries_set = set(countries)


In [21]:
with open("key_events_20th_century_clean.txt", "r", encoding="utf-8") as f:
    text = f.read().lower()

In [22]:
window_size = 5
pair_counter = Counter()

for i in range(len(tokens)):
    # current window: token i and the next 4 tokens
    window_tokens = tokens[i:i+window_size]
    
    # which countries appear in this window?
    window_countries = [t for t in window_tokens if t in countries_set]
    
    # get all unique unordered pairs of countries in the window
    for c1, c2 in combinations(sorted(set(window_countries)), 2):
        pair_counter[(c1, c2)] += 1

In [26]:
{("France", "Germany"): 123, ("United States", "United Kingdom"): 89}

{('France', 'Germany'): 123, ('United States', 'United Kingdom'): 89}

### Dataframe:

In [27]:
rows = []
for (c1, c2), count in pair_counter.items():
    rows.append({"source": c1, "target": c2, "weight": count})

relationships_df = pd.DataFrame(rows)
relationships_df.sort_values("weight", ascending=False, inplace=True)

relationships_df.head()

Unnamed: 0,source,target,weight
1,germany,italy,10
0,germany,russia,3
2,italy,russia,1
3,france,germany,1


### CSV file:

In [28]:
REL_CSV = "country_relationships_20th_century.csv"
relations_df.to_csv(REL_CSV, index=False, encoding="utf-8")
REL_CSV, relations_df.shape

('country_relationships_20th_century.csv', (40, 3))