### Importing libraries (NLP and Network analysis)

In [2]:
import pandas as pd
import numpy as np
import spacy
from spacy import displacy
import networkx as nx
import os
import matplotlib.pyplot as plt
import scipy
import re

In [3]:
# Download English module
# Fetches the small English spaCy model (en_core_web_sm) that spacy.load() expects below.
# Both stdout and stderr are redirected to /dev/null, so a failed download prints nothing
# here; it would presumably only show up when the model is loaded.
!python -m spacy download en_core_web_sm > /dev/null 2>&1

In [4]:
# Load spacy English module
# NER holds the loaded pipeline; it is later called on the raw text (NER(data)) to
# produce the parsed document.
NER = spacy.load("en_core_web_sm")

In [5]:
# Load the text file
# The encoding is pinned to UTF-8 instead of relying on the platform default: the
# article contains non-ASCII characters (en dashes, curly quotes), and with
# errors='ignore' a wrong default codec would silently drop them rather than fail.
with open('Key_events_20th_century.txt', 'r', encoding='utf-8', errors='ignore') as file:
    # Replace line breaks with spaces so the whole article becomes one continuous string.
    data = file.read().replace('\n', ' ').strip()

In [6]:
# Replace every quote-like character (straight, curly, backtick) and every literal
# "[edit]" marker with a single space before the text is handed to spaCy.
noise_pattern = re.compile(r"[‘’“”'\"`]|(\[edit\])")
data = noise_pattern.sub(" ", data)

In [7]:
# Run the spaCy pipeline over the whole cleaned text. `book` is the parsed document
# whose sentences (.sents) and per-sentence entities (.ents) are read further down.
book = NER(data)

In [8]:
# Visualize identified entities
# Only the slice book[273:600] is rendered to keep the output short.
# NOTE(review): the bounds 273 and 600 are unexplained magic numbers, presumably
# hand-picked to skip the page header/table of contents — confirm or name them.
displacy.render(book[273:600], style="ent", jupyter=True)

The rendered entities carry labels such as "DATE", "EVENT", "GPE" (geopolitical entity), "ORG" (organization), etc. At first glance the model performs adequately, but there are misclassifications: for example, "The Assassination of Archduke" is tagged as "WORK_OF_ART", which shows there is room for improvement in the model's accuracy.

In [11]:
## Get named entity list per sentence
# Build one record per sentence in the parsed document: the sentence text and the
# text of every entity found inside that sentence.
df_sentences = [
    {
        "sentence": sentence.text,
        "entities": [entity.text for entity in sentence.ents],
    }
    for sentence in book.sents
]

In [12]:
# Turn the list of per-sentence records into a DataFrame with "sentence" and
# "entities" columns. Note that this rebinds df_sentences from a list to a DataFrame.
df_sentences = pd.DataFrame(df_sentences)

In [13]:
# Preview the first ten sentences together with the entities found in each.
df_sentences.head(10)

Unnamed: 0,sentence,entities
0,Key events of the 20th century - Wikipedia ...,[the 20th century - Wikipedia ...
1,The war to end all wars : World War I (1914–19...,"[World War I, 1.1.2The, Russian Revolution and..."
2,Overlord 1.3.6Final days 1.3.7Th...,"[1.3.6Final days, the Pacific , 1.3.7.1Ba..."
3,1.3.11The Holocaust 1.3.12The Nuclear A...,"[1.4.1The, Cold War, 1.4.3War, the Cold War ..."
4,1.5The world at the end of the century,[the end of the century]
5,2See also 3References 4Sources ...,"[2See, 5External, the 20th century 2, Artic..."
6,The World Wars sparked tension between countri...,"[the Cold War, the Space Race, the World Wide ..."
7,These advancements have played a significant r...,"[the 21st century, today]"
8,Events in the 20th century The world at the b...,"[the 20th century, the beginning of the centur..."
9,The 1900s saw the decade herald a series of in...,"[The 1900s, the decade]"


In [14]:
# Import countries
# index_col=0 makes the first CSV column the row index instead of a data column,
# leaving "country_name" as the only data column (see the preview below).
country_df = pd.read_csv("countries_list.csv", index_col=0)

In [15]:
# Trim leading/trailing whitespace from the country names so that the exact-match
# lookup in filter_entity is not defeated by stray spaces in the CSV.
country_df['country_name'] = country_df['country_name'].str.strip()

In [16]:
# Preview the first rows of the country lookup table.
country_df.head()

Unnamed: 0,country_name
1,Afghanistan
2,Albania
3,Algeria
4,Andorra
5,Angola


In [17]:
# Function to filter out entities not of interest
def filter_entity(ent_list, country_df):
    """Keep only the entities that exactly match a known country name.

    Parameters
    ----------
    ent_list : iterable of str
        Entity texts extracted from one sentence.
    country_df : pandas.DataFrame
        Frame with a ``country_name`` column listing the accepted names.

    Returns
    -------
    list of str
        The entries of ``ent_list`` that appear in ``country_df['country_name']``,
        in their original order and with duplicates preserved.
    """
    # Build the lookup once per call. The earlier version rebuilt a full list of
    # country names for every single entity and then searched it linearly.
    country_names = set(country_df['country_name'])
    return [ent for ent in ent_list if ent in country_names]

In [18]:
# Check
# Quick sanity check: only "Afghanistan" is a listed country, so it is the only
# entry expected to survive the filter.
filter_entity(["Afghanistan", "CF", "2"], country_df)

['Afghanistan']

In [19]:
# For every sentence, keep only those entities that are country names.
# country_df is forwarded to filter_entity as its second positional argument.
df_sentences['country_entities'] = df_sentences['entities'].apply(filter_entity, args=(country_df,))

In [20]:
# Preview the new country_entities column. A bare last expression is used instead of
# print() so the frame renders as a table rather than wrapped, truncated plain text.
df_sentences.head(10)

                                            sentence  \
0  Key events of the 20th century - Wikipedia    ...   
1  The war to end all wars : World War I (1914–19...   
2  Overlord        1.3.6Final days        1.3.7Th...   
3  1.3.11The Holocaust        1.3.12The Nuclear A...   
4   1.5The world at the end of the century             
5  2See also        3References        4Sources  ...   
6  The World Wars sparked tension between countri...   
7  These advancements have played a significant r...   
8  Events in the 20th century  The world at the b...   
9  The 1900s saw the decade herald a series of in...   

                                            entities country_entities  
0  [the 20th century - Wikipedia                 ...               []  
1  [World War I, 1.1.2The, Russian Revolution and...               []  
2  [1.3.6Final days, the Pacific      , 1.3.7.1Ba...               []  
3  [1.4.1The, Cold War, 1.4.3War, the Cold War   ...               []  
4                      

In [21]:
# Keep only the sentences in which at least one country was recognised.
has_country = df_sentences['country_entities'].map(len).gt(0)
df_sentences_filtered = df_sentences.loc[has_country]

In [22]:
# Inspect the last ten country-bearing sentences. The index keeps the original
# sentence numbers from df_sentences.
df_sentences_filtered.tail(10)

Unnamed: 0,sentence,entities,country_entities
1184,The division of Germany - The Cold War (1945–1...,"[Germany, The Cold War, 1945–1989]",[Germany]
1216,The forgotten violence that helped India break...,[India],[India]
1221,Indian Independence Day: everything you need t...,"[Indian, Partition, India, Pakistan, 70 years]","[India, Pakistan]"
1231,"The Philippines, 1898–1946 | US House of Repre...","[Philippines, 1898–1946, US House of Represent...",[Philippines]
1303,"The Moldovans: Romania, Russia, and the Politi...","[Moldovans, Romania, Russia, the Politics of C...","[Romania, Russia]"
1368,Selling Operation Passage to Freedom : Dr. Th...,"[Thomas Dooley, the Religious Overtones of Ear...",[Vietnam]
1402,Stuck in Endless Preliminaries: Vietnam and th...,"[Stuck, Vietnam, the Battle of the Paris Peace...",[Vietnam]
1673,Anti-American Behavior in the Middle East: Evi...,"[Anti-American, the Middle East, Lebanon]",[Lebanon]
1678,The Rise of China and India: A New Asian Drama.,"[China, India, New Asian Drama]",[India]
1679,Singapore: World Scientific.,"[Singapore, World Scientific]",[Singapore]


## Create relationships

In [24]:
#Create relationships
# Slide a window over the country-bearing sentences and link every country to the
# next different country mentioned inside that window.
# NOTE(review): the window is positional (.iloc) over the *filtered* frame, so it spans
# window_size country-bearing rows, which may be far apart in the original text —
# confirm this is intended.
relationships = []
window_size = 5  # Define the window size for looking at multiple sentences

n_rows = len(df_sentences_filtered)
for start in range(n_rows - 1):
    stop = min(start + window_size, n_rows)

    # Flatten the per-sentence country lists of this window into one ordered sequence.
    window_countries = [
        country
        for sentence_countries in df_sentences_filtered.iloc[start:stop]['country_entities']
        for country in sentence_countries
    ]

    # Collapse runs of the same country so that a country is never linked to itself.
    collapsed = []
    for country in window_countries:
        if not collapsed or collapsed[-1] != country:
            collapsed.append(country)

    # Each consecutive pair in the collapsed sequence becomes one edge.
    for a, b in zip(collapsed, collapsed[1:]):
        relationships.append({"source": a, "target": b})

In [25]:
# One row per recorded (source, target) pair. Duplicates are expected at this stage,
# because overlapping windows record the same pair more than once.
relationship_df = pd.DataFrame(relationships)
relationship_df

Unnamed: 0,source,target
0,France,Italy
1,Italy,Russia
2,Russia,Germany
3,Germany,Bulgaria
4,Bulgaria,Russia
...,...,...
651,Lebanon,India
652,India,Singapore
653,Lebanon,India
654,India,Singapore


In [26]:
# Sort the cases with a->b and b->a
# np.sort(..., axis=1) orders the two names within each row, so a pair recorded as
# (Russia, Germany) and one recorded as (Germany, Russia) end up as the same
# (Germany, Russia) row. The relationship is thereby treated as undirected.
relationship_df = pd.DataFrame(np.sort(relationship_df.values, axis = 1), columns = relationship_df.columns)
relationship_df.head(5)

Unnamed: 0,source,target
0,France,Italy
1,Italy,Russia
2,Germany,Russia
3,Bulgaria,Germany
4,Bulgaria,Russia


In [28]:
# Summarize the interactions
# Every recorded pair starts with weight 1; summing per (source, target) then gives
# the number of times that pair was observed.
# The weight column is only initialised when it does not exist yet. Without this
# guard, re-running the cell would overwrite the aggregated counts with 1 and
# collapse every edge weight to 1. With it, a re-run just re-sums groups that are
# already unique and leaves the frame unchanged.
if "value" not in relationship_df.columns:
    relationship_df["value"] = 1
relationship_df = relationship_df.groupby(["source","target"], sort=False, as_index=False).sum()

In [32]:
# Preview the aggregated edge list: one row per country pair with its weight.
relationship_df.head(10)

Unnamed: 0,source,target,value
0,France,Italy,13
1,Italy,Russia,1
2,Germany,Russia,16
3,Bulgaria,Germany,2
4,Bulgaria,Russia,2
5,Germany,Italy,26
6,Austria,Germany,9
7,Germany,Spain,4
8,France,Spain,4
9,France,Poland,9


In [33]:
# Persist the weighted edge list. No index argument is given, so the row index is
# written as the first, unnamed column.
relationship_df.to_csv('country_relationship.csv')