### Install requirements txt file 
    pip install -r requirements_1.6.txt

### Import libraries

In [1]:
import pandas as pd
import numpy as np
import spacy
from spacy import displacy
import networkx as nx
import os
import matplotlib.pyplot as plt
import scipy
import re

In [2]:
# Download English module

!python -m spacy download en_core_web_sm

Collecting en-core-web-sm==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0-py3-none-any.whl (12.8 MB)
     ---------------------------------------- 0.0/12.8 MB ? eta -:--:--
      --------------------------------------- 0.3/12.8 MB ? eta -:--:--
     --- ------------------------------------ 1.0/12.8 MB 4.1 MB/s eta 0:00:03
     ----- ---------------------------------- 1.8/12.8 MB 4.0 MB/s eta 0:00:03
     -------- ------------------------------- 2.6/12.8 MB 4.1 MB/s eta 0:00:03
     ---------- ----------------------------- 3.4/12.8 MB 4.1 MB/s eta 0:00:03
     ------------- -------------------------- 4.5/12.8 MB 4.1 MB/s eta 0:00:03
     ---------------- ----------------------- 5.2/12.8 MB 4.1 MB/s eta 0:00:02
     ------------------- -------------------- 6.3/12.8 MB 4.1 MB/s eta 0:00:02
     --------------------- ------------------ 6.8/12.8 MB 4.1 MB/s eta 0:00:02
     ----------------------- ---------------- 7.

In [3]:
# Load spacy English module

NER = spacy.load("en_core_web_sm")

### 3.	Load the twentieth-century text file.

In [4]:
# Load the text file

with open('key_events_20th_century.txt', 'r', errors='ignore') as file:
    data = file.read().replace('\n', ' ')

In [5]:
print(data[:1000])  # Preview first 1000 characters

The 20th century changed the world in unprecedented ways. The World Wars sparked tension between countries and led to the creation of atomic bombs, the Cold War led to the Space Race and the creation of space-based rockets, and the World Wide Web was created. These advancements have played a significant role in citizens' lives and shaped the 21st century into what it is today. Historic events in the 20th century[edit] World at the beginning of the century[edit] Main article: Edwardian era The new beginning of the 20th century marked significant changes. The 1900s saw the decade herald a series of inventions, including the automobile, airplane and radio broadcasting. 1914 saw the completion of the Panama Canal. The Scramble for Africa continued in the 1900s and resulted in wars and genocide across the continent. The atrocities in the Congo Free State shocked the civilized world. From 1914 to 1918, the First World War, and its aftermath, caused major changes in the power balance of the w

In [6]:
# Load countries from CSV using pandas
country_df = pd.read_csv('countries_list_20th_century_1.5.csv')

# Convert to a list and strip whitespace
country_list = country_df['country_name'].str.strip().str.lower().tolist()

In [7]:
print(country_list) 

['afghanistan', 'albania', 'algeria', 'andorra', 'angola', 'antigua and barbuda', 'argentina', 'armenia', 'australia', 'austria', 'azerbaijan', 'bahamas, the', 'bahrain', 'bangladesh', 'barbados', 'belarus', 'belgium', 'belize', 'benin', 'bhutan', 'bolivia', 'bosnia and herzegovina', 'botswana', 'brazil', 'brunei', 'bulgaria', 'burkina faso', 'burundi', 'cambodia', 'cameroon', 'canada', 'cape verde', 'central african republic', 'chad', 'chile', "china, people's republic of", 'colombia', 'comoros', 'congo, democratic republic of the', 'congo, republic of the', 'costa rica', 'croatia', 'cuba', 'cyprus', 'czech republic', 'denmark', 'djibouti', 'dominica', 'dominican republic', 'east timor', 'ecuador', 'egypt', 'el salvador', 'equatorial guinea', 'eritrea', 'estonia', 'eswatini', 'ethiopia', 'fiji', 'finland', 'france', 'gabon', 'gambia, the', 'georgia', 'germany', 'ghana', 'greece', 'grenada', 'guatemala', 'guinea', 'guinea', 'bissau', 'guyana', 'haiti', 'honduras', 'hungary', 'iceland',

### 4.	Evaluate whether the text needs wrangling—are there any special characters used? Are the names of the countries in your list the same as the names in the text? Write down your observations in a markdown cell and take the necessary steps to correct any issues you’ve found. If anything does need correcting, make sure you save your file as a .txt

In [8]:
type(data)

str

In [9]:
# Fix and clean the text
cleaned = data.lower()
cleaned = re.sub(r"[^a-z0-9\s.,'’“”‘-]", " ", cleaned)  # Replace non-standard chars with space
cleaned = re.sub(r"\s+", " ", cleaned).strip()  # Normalize whitespace


#### Removed all special characters

In [10]:
print(cleaned[:1000])

the 20th century changed the world in unprecedented ways. the world wars sparked tension between countries and led to the creation of atomic bombs, the cold war led to the space race and the creation of space-based rockets, and the world wide web was created. these advancements have played a significant role in citizens' lives and shaped the 21st century into what it is today. historic events in the 20th century edit world at the beginning of the century edit main article edwardian era the new beginning of the 20th century marked significant changes. the 1900s saw the decade herald a series of inventions, including the automobile, airplane and radio broadcasting. 1914 saw the completion of the panama canal. the scramble for africa continued in the 1900s and resulted in wars and genocide across the continent. the atrocities in the congo free state shocked the civilized world. from 1914 to 1918, the first world war, and its aftermath, caused major changes in the power balance of the worl

In [11]:
# Save cleaned
with open("cleaned_text.txt", "w", encoding="utf-8") as f:
    f.write(cleaned)


In [12]:
country_aliases = {
    'korea, north': 'north korea',
    'korea, south': 'south korea',
    "china, people's republic of": 'china',
    'congo, democratic republic of the': 'congo',
    # add more as needed
}


In [13]:
found_countries = []
not_found_countries = []

for country in country_list:
    # Use alias if available, else use original
    alias = country_aliases.get(country, country)
    
    if re.search(rf'\b{re.escape(alias)}\b', cleaned):
        found_countries.append(country)
    else:
        not_found_countries.append(country)

print("Countries found in text:", found_countries)

Countries found in text: ['afghanistan', 'albania', 'algeria', 'angola', 'australia', 'austria', 'bangladesh', 'belarus', 'belgium', 'bulgaria', 'cambodia', 'canada', 'cape verde', "china, people's republic of", 'congo, democratic republic of the', 'cuba', 'denmark', 'egypt', 'estonia', 'finland', 'france', 'germany', 'ghana', 'greece', 'guinea', 'guinea', 'bissau', 'hungary', 'india', 'iran', 'iraq', 'israel', 'italy', 'japan', 'kenya', 'korea, north', 'korea, south', 'laos', 'latvia', 'lebanon', 'libya', 'lithuania', 'luxembourg', 'mexico', 'mongolia', 'morocco', 'mozambique', 'netherlands', 'norway', 'pakistan', 'panama', 'papua new guinea', 'philippines', 'poland', 'romania', 'russia', 'rwanda', 'seychelles', 'singapore', 'solomon islands', 'south africa', 'spain', 'sweden', 'thailand', 'ukraine', 'united kingdom', 'united states', 'vietnam']


In [14]:
print("Countries not found in text:", not_found_countries)

Countries not found in text: ['andorra', 'antigua and barbuda', 'argentina', 'armenia', 'azerbaijan', 'bahamas, the', 'bahrain', 'barbados', 'belize', 'benin', 'bhutan', 'bolivia', 'bosnia and herzegovina', 'botswana', 'brazil', 'brunei', 'burkina faso', 'burundi', 'cameroon', 'central african republic', 'chad', 'chile', 'colombia', 'comoros', 'congo, republic of the', 'costa rica', 'croatia', 'cyprus', 'czech republic', 'djibouti', 'dominica', 'dominican republic', 'east timor', 'ecuador', 'el salvador', 'equatorial guinea', 'eritrea', 'eswatini', 'ethiopia', 'fiji', 'gabon', 'gambia, the', 'georgia', 'grenada', 'guatemala', 'guyana', 'haiti', 'honduras', 'iceland', 'indonesia', 'ireland', 'ivory coast', 'jamaica', 'jordan', 'kazakhstan', 'kiribati', 'kuwait', 'kyrgyzstan', 'lesotho', 'liberia', 'liechtenstein', 'madagascar', 'malawi', 'malaysia', 'maldives', 'mali', 'malta', 'marshall islands', 'mauritania', 'mauritius', 'micronesia, federated states of', 'moldova', 'monaco', 'monten

#### Found which countries from the list are and are not in the text. Created aliases for some countries that I verified existed in the text but were not matching. 

### 5.	Use the text file to create a NER object.

In [15]:
book = NER(cleaned) # already lowercase and without special characters

### 6.	Split the sentence entities from the NER object.

In [16]:
df_sentences = [] # empty shell to store results

# Loop through sentences, get entity list for each sentence
for sent in book.sents:
    entity_list = [ent.text for ent in sent.ents]
    df_sentences.append({"sentence": sent, "entities": entity_list})
    
df_sentences = pd.DataFrame(df_sentences)

In [17]:
df_sentences.head()

Unnamed: 0,sentence,entities
0,"(the, 20th, century, changed, the, world, in, ...",[the 20th century]
1,"(the, world, wars, sparked, tension, between, ...",[]
2,"(these, advancements, have, played, a, signifi...","[the 21st century, today]"
3,"(historic, events, in, the, 20th, century, edi...","[the 20th century, the beginning of the centur..."
4,"(the, 1900s, saw, the, decade, herald, a, seri...","[the 1900s, the decade]"


In [18]:
country_df.head()

Unnamed: 0.1,Unnamed: 0,country_name
0,1,Afghanistan
1,2,Albania
2,3,Algeria
3,4,Andorra
4,5,Angola


In [19]:
countries_df = country_df.drop(columns=["Unnamed: 0"]) #remove Unnamed

In [20]:
countries_df['country_name'] = countries_df['country_name'].str.lower()

In [21]:
countries_df.head()

Unnamed: 0,country_name
0,afghanistan
1,albania
2,algeria
3,andorra
4,angola


In [22]:
country_aliases = {
    'korea, north': 'north korea',
    'korea, south': 'south korea',
    "china, people's republic of": 'china',
    'congo, democratic republic of the': 'congo',
    # add more as needed
}

In [23]:
# Map aliases: default to original name if not in aliases
countries_df['country_alias'] = countries_df['country_name'].map(country_aliases)

In [24]:
countries_df.head()

Unnamed: 0,country_name,country_alias
0,afghanistan,
1,albania,
2,algeria,
3,andorra,
4,angola,


In [25]:
countries_df[countries_df['country_name'].str.contains("korea", case=False, na=False)]

Unnamed: 0,country_name,country_alias
91,"korea, north",
92,"korea, south",


In [26]:
countries_df['country_name'] = countries_df['country_name'].str.strip()

In [27]:
print(countries_df.loc[91, 'country_name'])  # should be 'korea, north'
print(countries_df.loc[92, 'country_name'])  # should be 'korea, south'

korea, north
korea, south


In [28]:
countries_df['country_alias'] = countries_df['country_name'].map(country_aliases)

In [29]:
countries_df[countries_df['country_name'].str.contains("korea", case=False, na=False)]

Unnamed: 0,country_name,country_alias
91,"korea, north",north korea
92,"korea, south",south korea


In [30]:
countries_df[countries_df['country_name'].str.contains("china", case=False, na=False)]

Unnamed: 0,country_name,country_alias
35,"china, people's republic of",china


In [31]:
countries_df[countries_df['country_name'].str.contains("congo", case=False, na=False)]

Unnamed: 0,country_name,country_alias
38,"congo, democratic republic of the",congo
39,"congo, republic of the",


### 7.	Filter the entities so that you end up only with the ones from your countries list

In [32]:
# Function to filter out entities not of interest

def filter_entity(ent_list, countries_df):
    valid_names = set(countries_df['country_name']).union(
        set(countries_df['country_alias'].dropna())
    )
    return [ent for ent in ent_list if ent in valid_names]

In [33]:
# Check

filter_entity(["germany", "germans", "20"], countries_df)

['germany']

In [34]:
df_sentences['country_entities'] = df_sentences['entities'].apply(lambda x: filter_entity(x, countries_df))

In [35]:
df_sentences['country_entities'].head(20)

0                    []
1                    []
2                    []
3                    []
4                    []
5                    []
6                    []
7                    []
8                    []
9                    []
10                   []
11             [france]
12             [russia]
13    [germany, russia]
14    [russia, germany]
15                   []
16            [germany]
17                   []
18                   []
19                   []
Name: country_entities, dtype: object

In [36]:
# Filter out sentences that don't have any character entities

df_sentences_filtered = df_sentences[df_sentences['country_entities'].map(len) > 0]

In [37]:
df_sentences_filtered.tail(10)

Unnamed: 0,sentence,entities,country_entities
1164,"(colonial, cartographies, ,, postcolonial, bor...","[colonial cartographies, afghanistan, pakistan]","[afghanistan, pakistan]"
1196,"(the, moldovans, romania, ,, russia, ,, and, t...","[moldovans, russia]",[russia]
1208,"(now, ,, north, korea, may, be, the, one, true...","[north korea, one]",[north korea]
1249,"(selling, ', operation, passage, to, freedom, ...","[thomas dooley, american, vietnam]",[vietnam]
1252,"(military, pressures, against, north, vietnam,...","[vietnam, february 1964, january 1965]",[vietnam]
1266,"(nixon, prolonged, vietnam, war, for, politica...","[nixon, vietnam, johnson]",[vietnam]
1275,"(stuck, in, endless, preliminaries, vietnam, a...","[vietnam, paris, november 1968, january 1969]",[vietnam]
1499,"(anti, -, american, behavior, in, the, middle,...","[anti-american, the middle east, lebanon]",[lebanon]
1504,"(the, rise, of, china, and, india, a, new, asi...","[china, india, asian]","[china, india]"
1505,"(singapore, world, scientific, .)",[singapore],[singapore]


### 8.	Create the relationships dataframe

In [38]:
# Defining relationships 

# window size = 5 : this defines how many sentences will be looked at simultaneously 
relationships = [] # create an empty list

for i in range(df_sentences_filtered.index[-1]):
    end_i = min(i+5, df_sentences_filtered.index[-1])
    country_list = sum((df_sentences_filtered.loc[i: end_i].country_entities), [])
    
    # Remove duplicated characters that are next to each other
    country_unique = [country_list[i] for i in range(len(country_list)) 
                   if (i==0) or country_list[i] != country_list[i-1]]
    
    if len(country_unique) > 1:
        for idx, a in enumerate(country_unique[:-1]):
            b = country_unique[idx + 1]
            relationships.append({"source": a, "target": b})

In [39]:
relationship_df = pd.DataFrame(relationships)

In [40]:
relationship_df

Unnamed: 0,source,target
0,france,russia
1,france,russia
2,russia,germany
3,germany,russia
4,france,russia
...,...,...
629,india,singapore
630,china,india
631,india,singapore
632,china,india


In [41]:
# Sort the cases with a->b and b->a

relationship_df = pd.DataFrame(np.sort(relationship_df.values, axis = 1), columns = relationship_df.columns)
relationship_df.head(5)

Unnamed: 0,source,target
0,france,russia
1,france,russia
2,germany,russia
3,germany,russia
4,france,russia


In [42]:
# Summarize the interactions

relationship_df["value"] = 1
relationship_df = relationship_df.groupby(["source","target"], sort=False, as_index=False).sum()

In [43]:
relationship_df.head(10)

Unnamed: 0,source,target,value
0,france,russia,11
1,germany,russia,22
2,germany,italy,28
3,austria,germany,10
4,france,spain,1
5,france,poland,14
6,france,germany,19
7,germany,poland,23
8,estonia,germany,5
9,estonia,latvia,12


### 9.	Save and export your dataframe.

In [45]:
relationship_df.to_csv("country_relationships.csv", index=False)