In [1]:
# importing Libraries

import pandas as pd
import numpy as np
import spacy
from spacy import displacy
import networkx as nx
import os
import matplotlib.pyplot as plt
import scipy
import re

In [2]:
# Download English module

!python -m spacy download en_core_web_sm

Collecting en-core-web-sm==3.7.1
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.7.1/en_core_web_sm-3.7.1-py3-none-any.whl (12.8 MB)
     ---------------------------------------- 0.0/12.8 MB ? eta -:--:--
     ---- ----------------------------------- 1.3/12.8 MB 8.4 MB/s eta 0:00:02
     --------- ------------------------------ 2.9/12.8 MB 8.0 MB/s eta 0:00:02
     -------------- ------------------------- 4.7/12.8 MB 8.1 MB/s eta 0:00:01
     --------------------- ------------------ 6.8/12.8 MB 8.7 MB/s eta 0:00:01
     --------------------------- ------------ 8.9/12.8 MB 8.9 MB/s eta 0:00:01
     --------------------------------- ------ 10.7/12.8 MB 8.9 MB/s eta 0:00:01
     ---------------------------------------- 12.8/12.8 MB 8.9 MB/s eta 0:00:00
Installing collected packages: en-core-web-sm
Successfully installed en-core-web-sm-3.7.1
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_we

In [3]:
# Load spacy English module

NER = spacy.load("en_core_web_sm")

## Use the text file to create a NER object

In [4]:
# Load the path by using a raw string
path = r'C:\Users\okumb\Downloads\20th-century\.venv\Scripts\key_events_20th_century.txt'

# Open the file using 'with' to ensure it gets closed after reading
with open(path, 'r', errors='ignore') as file: 
    data = file.read().replace('\n', ' ')

In [5]:
book = NER(data)

In [6]:
# Visualize identified entities

displacy.render(book[273:20000], style = "ent", jupyter = True)

## Split the sentence entities from the NER object.

In [7]:
df_sentences = [] # empty shell to store results

# Loop through sentences, get entity list for each sentence
for sent in book.sents:
    entity_list = [ent.text for ent in sent.ents]
    df_sentences.append({"sentence": sent, "entities": entity_list})
    
df_sentences = pd.DataFrame(df_sentences)

In [8]:
# View the Dataframe
df_sentences.head()

Unnamed: 0,sentence,entities
0,"( , Key, events, of, the, 20th, century, -, W...",[the 20th century - Wikipedia ...
1,"(Color, (, beta)AutomaticLightDarkReport, an, ...",[]
2,"(Create, account, , Log, in, , Person...",[Log in Personal]
3,"(1, Historic, events, in, the, 20th, century, ...","[the 20th century, the 20th century, 1.1, the ..."
4,"(1.1.2, The, Russian, Revolution, and, Communi...","[1.1.2, The Russian Revolution and Communism ..."


## Filter the entities with the ones from your countries list.

In [9]:
# Import Countries dataframe
path = r"C:\Users\okumb\Downloads\countries_list_20th_century_1.5.csv"

# Extract the CSV file as a Dataframe
df_country = pd.read_csv(path)

In [10]:
df_country.head()

Unnamed: 0.1,Unnamed: 0,country_name
0,1,Afghanistan
1,2,Albania
2,3,Algeria
3,4,Andorra
4,5,Angola


In [21]:
# deleting 'Unnamed: 0' column in the df_country dataframe

df_country.drop('Unnamed: 0', axis=1, inplace=True)

In [52]:
df_country.head()

Unnamed: 0,country_name
0,Afghanistan
1,Albania
2,Algeria
3,Andorra
4,Angola


In [57]:
def filter_entity(ent_list, country_series):
    country_set = set(country_series.str.lower().str.strip())  # Normalize country names
    matched_entities = []
    
    for ent in ent_list:
        ent_lower = ent.lower().strip()  # Normalize entity
        print(f"Checking entity: '{ent_lower}'")
        for country in country_set:
            if country in ent_lower:
                print(f"Matched '{ent_lower}' with country '{country}'")
                matched_entities.append(ent)
                break  # Once matched, stop further checks for this entity
    
    return matched_entities


In [62]:
# Apply the function
df_sentences['character_entities'] = df_sentences['entities'].apply(lambda x: filter_entity(x, df_country['country_name']))


Checking entity: 'the 20th century - wikipedia                            jump'
Checking entity: 'navigation 	   main'
Checking entity: 'contribute 	   helplearn'
Checking entity: 'log in         personal'
Checking entity: 'the 20th century'
Checking entity: 'the 20th century'
Checking entity: '1.1'
Checking entity: 'the beginning of the century'
Checking entity: 'world war i'
Checking entity: '1.1.2'
Checking entity: 'the russian revolution and communism'
Matched 'the russian revolution and communism' with country 'russia'
Checking entity: '1.2.1'
Checking entity: 'world war ii'
Checking entity: '1.3.1'
Checking entity: 'europe'
Checking entity: 'blitzkrieg         1.3.3'
Checking entity: 'the pacific       1.3.7.1 background           1.3.8 japanese expansion'
Matched 'the pacific       1.3.7.1 background           1.3.8 japanese expansion' with country 'japan'
Checking entity: '1.3.9'
Checking entity: 'the holocaust         1.3.12'
Checking entity: '1.4'
Checking entity: 'the cold w

In [63]:
df_sentences['character_entities'].head(20)

0                                                    []
1                                                    []
2                                                    []
3                                                    []
4     [The Russian Revolution and Communism           ]
5                                                    []
6                                                    []
7     [the Pacific       1.3.7.1 Background         ...
8                                                    []
9                                                    []
10                                                   []
11                                                   []
12                                                   []
13                                                   []
14                                                   []
15                                                   []
16                                                   []
17                                              

In [64]:
# Filter out sentences that don't have any character entities

df_sentences_filtered = df_sentences[df_sentences['character_entities'].map(len) > 0]

In [65]:
df_sentences_filtered.tail()

Unnamed: 0,sentence,entities,character_entities
1376,"("", Cuban, Missile, Crisis, :, For, thirteen, ...","[Cuban Missile Crisis: For, thirteen days, Oct...","[Cuban Missile Crisis: For, the Cuban Missile ..."
1380,"("", Nuclear, Close, Calls, :, The, Cuban, Miss...",[Nuclear Close Calls: The Cuban Missile Crisis],[Nuclear Close Calls: The Cuban Missile Crisis]
1616,"("", Anti, -, American, Behavior, in, the, Midd...","[Anti-American, the Middle East, Lebanon]",[Lebanon]
1621,"(The, Rise, of, China, and, India, :, A, New, ...","[China, India, New Asian Drama]",[India]
1622,"(Singapore, :, World, Scientific, .)","[Singapore, World Scientific]",[Singapore]


## Create the relationships dataframe

In [66]:
# Defining relationships 

# window size = 5 : this defines how many sentences will be looked at simultaneously 
relationships = [] # create an empty list

for i in range(df_sentences_filtered.index[-1]):
    end_i = min(i+5, df_sentences_filtered.index[-1])
    char_list = sum((df_sentences_filtered.loc[i: end_i].character_entities), [])
    
    # Remove duplicated characters that are next to each other
    char_unique = [char_list[i] for i in range(len(char_list)) 
                   if (i==0) or char_list[i] != char_list[i-1]]
    
    if len(char_unique) > 1:
        for idx, a in enumerate(char_unique[:-1]):
            b = char_unique[idx + 1]
            relationships.append({"source": a, "target": b})

In [67]:
relationship_df = pd.DataFrame(relationships)

In [68]:
relationship_df.head()

Unnamed: 0,source,target
0,The Russian Revolution and Communism,the Pacific 1.3.7.1 Background ...
1,The Russian Revolution and Communism,the Pacific 1.3.7.1 Background ...
2,The Russian Revolution and Communism,the Pacific 1.3.7.1 Background ...
3,Serbian,Russians
4,Serbian,Russians


In [69]:
# Sort the cases with a->b and b->a

relationship_df = pd.DataFrame(np.sort(relationship_df.values, axis = 1), columns = relationship_df.columns)
relationship_df.head(5)

Unnamed: 0,source,target
0,The Russian Revolution and Communism,the Pacific 1.3.7.1 Background ...
1,The Russian Revolution and Communism,the Pacific 1.3.7.1 Background ...
2,The Russian Revolution and Communism,the Pacific 1.3.7.1 Background ...
3,Russians,Serbian
4,Russians,Serbian


In [70]:
# Summarize the interactions

relationship_df["value"] = 1
relationship_df = relationship_df.groupby(["source","target"], sort=False, as_index=False).sum()

In [71]:
relationship_df.head(10)

Unnamed: 0,source,target,value
0,The Russian Revolution and Communism,the Pacific 1.3.7.1 Background ...,3
1,Russians,Serbian,6
2,France,Russians,4
3,France,Russia,6
4,Germany,Russia,21
5,Austria-Hungary,Germany,6
6,Austria-Hungary,Bulgaria,6
7,Bulgaria,the Ottoman Empire,6
8,Russia,the Ottoman Empire,5
9,Bolshevik Russia,Russia,5


In [72]:
relationship_df.to_csv('country_relationship.csv')