### Install requirements txt file 
    pip install -r requirements_1.6.txt

### Import libraries

In [1]:
import pandas as pd
import numpy as np
import spacy
from spacy import displacy
import networkx as nx
import os
import matplotlib.pyplot as plt
import scipy
import re

In [2]:
# Download English module

!python -m spacy download en_core_web_sm

Collecting en-core-web-sm==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0-py3-none-any.whl (12.8 MB)
     ---------------------------------------- 0.0/12.8 MB ? eta -:--:--
      --------------------------------------- 0.3/12.8 MB ? eta -:--:--
     - -------------------------------------- 0.5/12.8 MB 1.8 MB/s eta 0:00:07
     --- ------------------------------------ 1.0/12.8 MB 1.9 MB/s eta 0:00:07
     ---- ----------------------------------- 1.3/12.8 MB 1.9 MB/s eta 0:00:07
     ----- ---------------------------------- 1.8/12.8 MB 1.9 MB/s eta 0:00:06
     ------- -------------------------------- 2.4/12.8 MB 2.0 MB/s eta 0:00:06
     -------- ------------------------------- 2.6/12.8 MB 2.0 MB/s eta 0:00:06
     --------- ------------------------------ 3.1/12.8 MB 2.0 MB/s eta 0:00:05
     ----------- ---------------------------- 3.7/12.8 MB 2.0 MB/s eta 0:00:05
     ------------- -------------------------- 4.

In [3]:
# Load spacy English module

NER = spacy.load("en_core_web_sm")

### 3.	Load the twentieth-century text file.

In [4]:
# Load the text file

with open('key_events_20th_century.txt', 'r', errors='ignore') as file:
    data = file.read().replace('\n', ' ')

In [5]:
print(data[:1000])  # Preview first 1000 characters

The 20th century changed the world in unprecedented ways. The World Wars sparked tension between countries and led to the creation of atomic bombs, the Cold War led to the Space Race and the creation of space-based rockets, and the World Wide Web was created. These advancements have played a significant role in citizens' lives and shaped the 21st century into what it is today. Historic events in the 20th century[edit] World at the beginning of the century[edit] Main article: Edwardian era The new beginning of the 20th century marked significant changes. The 1900s saw the decade herald a series of inventions, including the automobile, airplane and radio broadcasting. 1914 saw the completion of the Panama Canal. The Scramble for Africa continued in the 1900s and resulted in wars and genocide across the continent. The atrocities in the Congo Free State shocked the civilized world. From 1914 to 1918, the First World War, and its aftermath, caused major changes in the power balance of the w

In [6]:
# Load countries from CSV using pandas
country_df = pd.read_csv('countries_list_20th_century_1.5.csv')

# Convert to a list and strip whitespace
country_list = country_df['country_name'].str.strip().str.lower().tolist()

In [7]:
print(country_list) 

['afghanistan', 'albania', 'algeria', 'andorra', 'angola', 'antigua and barbuda', 'argentina', 'armenia', 'australia', 'austria', 'azerbaijan', 'bahamas, the', 'bahrain', 'bangladesh', 'barbados', 'belarus', 'belgium', 'belize', 'benin', 'bhutan', 'bolivia', 'bosnia and herzegovina', 'botswana', 'brazil', 'brunei', 'bulgaria', 'burkina faso', 'burundi', 'cambodia', 'cameroon', 'canada', 'cape verde', 'central african republic', 'chad', 'chile', "china, people's republic of", 'colombia', 'comoros', 'congo, democratic republic of the', 'congo, republic of the', 'costa rica', 'croatia', 'cuba', 'cyprus', 'czech republic', 'denmark', 'djibouti', 'dominica', 'dominican republic', 'east timor', 'ecuador', 'egypt', 'el salvador', 'equatorial guinea', 'eritrea', 'estonia', 'eswatini', 'ethiopia', 'fiji', 'finland', 'france', 'gabon', 'gambia, the', 'georgia', 'germany', 'ghana', 'greece', 'grenada', 'guatemala', 'guinea', 'guinea', 'bissau', 'guyana', 'haiti', 'honduras', 'hungary', 'iceland',

### 4.	Evaluate whether the text needs wrangling—are there any special characters used? Are the names of the countries in your list the same as the names in the text? Write down your observations in a markdown cell and take the necessary steps to correct any issues you’ve found. If anything does need correcting, make sure you save your file as a .txt

In [8]:
type(data)

str

In [9]:
# Fix and clean the text
cleaned = data.lower()
cleaned = re.sub(r"[^a-z0-9\s.,'’“”‘-]", " ", cleaned)  # Replace non-standard chars with space
cleaned = re.sub(r"\s+", " ", cleaned).strip()  # Normalize whitespace


#### Removed all special characters

In [28]:
print(cleaned[:100])

the 20th century changed the world in unprecedented ways. the world wars sparked tension between cou


In [44]:
# Save cleaned
with open("cleaned_text.txt", "w", encoding="utf-8") as f:
    f.write(cleaned)


In [21]:
country_aliases = {
    'korea, north': 'north korea',
    'korea, south': 'south korea',
    "china, people's republic of": 'china',
    'congo, democratic republic of the': 'congo',
    # add more as needed
}


In [26]:
found_countries = []
not_found_countries = []

for country in country_list:
    # Use alias if available, else use original
    alias = country_aliases.get(country, country)
    
    if re.search(rf'\b{re.escape(alias)}\b', cleaned):
        found_countries.append(country)
    else:
        not_found_countries.append(country)

print("Countries found in text:", found_countries)

Countries found in text: ['afghanistan', 'albania', 'algeria', 'angola', 'australia', 'austria', 'bangladesh', 'belarus', 'belgium', 'bulgaria', 'cambodia', 'canada', 'cape verde', "china, people's republic of", 'congo, democratic republic of the', 'cuba', 'denmark', 'egypt', 'estonia', 'finland', 'france', 'germany', 'ghana', 'greece', 'guinea', 'guinea', 'bissau', 'hungary', 'india', 'iran', 'iraq', 'israel', 'italy', 'japan', 'kenya', 'korea, north', 'korea, south', 'laos', 'latvia', 'lebanon', 'libya', 'lithuania', 'luxembourg', 'mexico', 'mongolia', 'morocco', 'mozambique', 'netherlands', 'norway', 'pakistan', 'panama', 'papua new guinea', 'philippines', 'poland', 'romania', 'russia', 'rwanda', 'seychelles', 'singapore', 'solomon islands', 'south africa', 'spain', 'sweden', 'thailand', 'ukraine', 'united kingdom', 'united states', 'vietnam']


found_countries = []
not_found_countries = []

for country in country_list:
    alias = country_aliases.get(country, country)  # Use alias if available
    if re.search(rf'\b{re.escape(alias)}\b', cleaned):
        found_countries.append(country)
    else:
        not_found_countries.append(country)

In [27]:
print("Countries not found in text:", not_found_countries)

Countries not found in text: ['andorra', 'antigua and barbuda', 'argentina', 'armenia', 'azerbaijan', 'bahamas, the', 'bahrain', 'barbados', 'belize', 'benin', 'bhutan', 'bolivia', 'bosnia and herzegovina', 'botswana', 'brazil', 'brunei', 'burkina faso', 'burundi', 'cameroon', 'central african republic', 'chad', 'chile', 'colombia', 'comoros', 'congo, republic of the', 'costa rica', 'croatia', 'cyprus', 'czech republic', 'djibouti', 'dominica', 'dominican republic', 'east timor', 'ecuador', 'el salvador', 'equatorial guinea', 'eritrea', 'eswatini', 'ethiopia', 'fiji', 'gabon', 'gambia, the', 'georgia', 'grenada', 'guatemala', 'guyana', 'haiti', 'honduras', 'iceland', 'indonesia', 'ireland', 'ivory coast', 'jamaica', 'jordan', 'kazakhstan', 'kiribati', 'kuwait', 'kyrgyzstan', 'lesotho', 'liberia', 'liechtenstein', 'madagascar', 'malawi', 'malaysia', 'maldives', 'mali', 'malta', 'marshall islands', 'mauritania', 'mauritius', 'micronesia, federated states of', 'moldova', 'monaco', 'monten

#### Found which countries from the list are and are not in the text. Created aliases for some countries that I verified existed in the text but were not matching. 

### 5.	Use the text file to create a NER object.

In [30]:
book = NER(cleaned)

### 6.	Split the sentence entities from the NER object.

In [33]:
entities = [ent.text for ent in book.ents]
print(entities)

['the 20th century', 'the 21st century', 'today', 'the 20th century', 'the beginning of the century', 'the 20th century', 'the 1900s', 'the decade', '1914', 'the panama canal', 'africa', 'the 1900s', '1914 to 1918', 'first', '1914', '1918', 'franz', 'first', 'july 1914', 'november 1918', 'franz', 'july', 'the end of july 1914', 'british', 'france', 'russian', 'german', 'austria-hungary', '1 2', '1917', 'russia', 'germany', 'russia', 'russia', 'germany', '3', 'germany', 'american', '1918', '4', 'germans', '5', '6', '7', 'british', '8', '9', 'austria-hungary', '1918', 'the next twenty years', 'first', 'germans', 'germans', '10', 'germany', 'kaiser wilhelm', '11', 'europe', 'new states', 'yugoslavia', 'czechoslovakia', '12', '13', 'spanish', '1918', '1920', 'between 17 to 100 million', '14 15', '16', 'one', '17', '1917', 'nicholas ii', 'communist', 'europe', '19', 'european', 'vladimir lenin', '1924', 'a few years', 'joseph stalin', 'the soviet union', 'one', '20', '21', 'socialists', 'st

### 7.	Filter the entities so that you end up only with the ones from your countries list

In [34]:
# Extract entities labeled as GPE or LOC (already lowercase)
entities = [ent.text.lower() for ent in book.ents if ent.label_ in ("GPE", "LOC")]

In [35]:
# Filter entities to keep only those present in your country_list
filtered_countries = [ent for ent in entities if ent in country_list]

In [36]:
# Optional: Remove duplicates but keep original order
filtered_countries = list(dict.fromkeys(filtered_countries))

In [37]:
print("Countries found in text:", filtered_countries)

Countries found in text: ['france', 'russia', 'germany', 'italy', 'austria', 'spain', 'poland', 'estonia', 'latvia', 'lithuania', 'finland', 'denmark', 'norway', 'netherlands', 'luxembourg', 'greece', 'albania', 'ukraine', 'libya', 'egypt', 'iraq', 'iran', 'japan', 'morocco', 'algeria', 'thailand', 'singapore', 'australia', 'solomon islands', 'india', 'pakistan', 'israel', 'south africa', 'laos', 'cambodia', 'vietnam', 'kenya', 'ghana', 'bulgaria', 'romania', 'hungary', 'mongolia', 'cuba', 'canada', 'united states', 'rwanda', 'philippines', 'afghanistan', 'lebanon']


### Try again with aliases

In [38]:
# Your alias mapping
country_aliases = {
    'korea, north': 'north korea',
    'korea, south': 'south korea',
    "china, people's republic of": 'china',
    'congo, democratic republic of the': 'congo',
    # add more as needed
}

# Create a reverse alias map for quick lookup: alias → canonical name
# Example: 'north korea' → 'korea, north'
reverse_aliases = {v: k for k, v in country_aliases.items()}

# Extract entities labeled as GPE or LOC (already lowercase)
entities = [ent.text.lower() for ent in book.ents if ent.label_ in ("GPE", "LOC")]

filtered_countries = []

for ent in entities:
    # Map alias back to canonical if exists, else keep original
    canonical_name = reverse_aliases.get(ent, ent)
    
    # Check if canonical name is in your country list
    if canonical_name in country_list:
        filtered_countries.append(canonical_name)

# Remove duplicates but preserve order
filtered_countries = list(dict.fromkeys(filtered_countries))

print("Countries found in text (canonical names):", filtered_countries)


Countries found in text (canonical names): ['france', 'russia', 'germany', 'italy', 'austria', 'spain', 'poland', 'estonia', 'latvia', 'lithuania', 'finland', 'denmark', 'norway', 'netherlands', 'luxembourg', 'greece', 'albania', 'ukraine', 'libya', 'egypt', 'iraq', 'iran', 'japan', 'morocco', 'algeria', "china, people's republic of", 'thailand', 'singapore', 'australia', 'solomon islands', 'india', 'pakistan', 'israel', 'korea, north', 'south africa', 'laos', 'cambodia', 'vietnam', 'kenya', 'ghana', 'bulgaria', 'romania', 'hungary', 'mongolia', 'cuba', 'korea, south', 'canada', 'united states', 'rwanda', 'philippines', 'afghanistan', 'lebanon']


### 8.	Create the relationships dataframe

In [39]:
import itertools

# Generate unique country pairs (no duplicates like (germany, germany))
pairs = list(itertools.combinations(filtered_countries, 2))

# Create DataFrame
relationships_df = pd.DataFrame(pairs, columns=['country_1', 'country_2'])

print(relationships_df)


        country_1    country_2
0          france       russia
1          france      germany
2          france        italy
3          france      austria
4          france        spain
...           ...          ...
1321       rwanda  afghanistan
1322       rwanda      lebanon
1323  philippines  afghanistan
1324  philippines      lebanon
1325  afghanistan      lebanon

[1326 rows x 2 columns]


In [42]:
# Step 1: Generate all pairwise combinations (excluding self-pairs)
pairs = list(itertools.combinations(filtered_countries, 2))

# Step 2: Sort each pair alphabetically so (france, germany) == (germany, france)
sorted_pairs = [tuple(sorted(pair)) for pair in pairs]

# Step 3: Create a DataFrame and count occurrences
relationships_df = pd.DataFrame(sorted_pairs, columns=['country_1', 'country_2'])
relationships_df = relationships_df.value_counts().reset_index(name='mentions')

# Step 4: Sort by number of mentions (descending)
relationships_df = relationships_df.sort_values(by='mentions', ascending=False)

print(relationships_df)

          country_1      country_2  mentions
0       afghanistan        albania         1
890     afghanistan      australia         1
888     afghanistan       bulgaria         1
887     afghanistan       cambodia         1
886     afghanistan         canada         1
...             ...            ...       ...
441        thailand  united states         1
440        thailand        vietnam         1
439         ukraine  united states         1
438         ukraine        vietnam         1
1325  united states        vietnam         1

[1326 rows x 3 columns]


### 9.	Save and export your dataframe.

In [43]:
relationships_df.to_csv("country_relationships.csv", index=False)