In [1]:
import pandas as pd 
import numpy as np
import spacy
from spacy import displacy
from spacy.lang.en import English
import networkx as nx
import os
import matplotlib.pyplot as plt
import scipy
import re

!python -m spacy download en_core_web_sm

Collecting en-core-web-sm==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0-py3-none-any.whl (12.8 MB)
     ---------------------------------------- 0.0/12.8 MB ? eta -:--:--
     ------------------------------ -------- 10.0/12.8 MB 56.6 MB/s eta 0:00:01
     ------------------------------------ -- 12.1/12.8 MB 61.4 MB/s eta 0:00:01
     --------------------------------------  12.6/12.8 MB 23.1 MB/s eta 0:00:01
     ---------------------------------------- 12.8/12.8 MB 19.9 MB/s  0:00:00
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')


In [2]:
# Load spacy English module

NER = spacy.load("en_core_web_sm")

In [3]:
with open('20th_century_article_Wiki.txt', 'r', errors='ignore') as file: 
   data = file.read().replace( '\n', ' ')

wiki = NER(data)

In [4]:
# Visualize identified entities

displacy.render(wiki[18000:20000], style = "ent", jupyter = True)

In [5]:
# Loop through sentences, get entity list for each sentence
wiki_hold = []
for sent in wiki.sents:
       entity_list = [ent.text for ent in sent.ents]
       wiki_hold.append({'sentence': sent, 'entities': entity_list})

df_sentences = pd.DataFrame(wiki_hold)

In [6]:
df_sentences.head(10)

Unnamed: 0,sentence,entities
0,"( , Key, events, of, the, 20th, century, -,...",[the 20th century]
1,"(articleAbout, WikipediaContact, us, \t\t...",[Search Search ...
2,"(The, rise, of, dictatorship, , 1.4,...","[1.4, World War II, 1.4.1]"
3,"(The, war, in, Europe, , 1.4.2, Blitzk...","[Europe, 1.4.2, Blitzkrieg 1.4.3, Oper..."
4,"(Turning, tides, )",[]
5,"(1.4.5, Operation, Overlord, , 1.4.6, ...",[Operation Overlord 1.4.6 Final days ...
6,"(Allied, offensive, , 1.4.10, Final, d...",[Allied offensive 1.4.10 Final days ...
7,"(The, Holocaust, , 1.4.12, The, Nuclea...","[The Nuclear Age, 1.5]"
8,"(The, post, -, war, world, , 1.5.1)",[1.5.1]
9,"(The, end, of, empires, :, decolonization, ...","[The Cold War, 1947â€“1991]"


In [7]:
csv_filepath = r"C:\Users\stefa\20th-century\countries_list_20th_century_1.5.csv"

In [8]:
df = pd.read_csv(csv_filepath)

In [9]:
df = df.drop('Unnamed: 0', axis=1)
df

Unnamed: 0,country_name
0,Afghanistan
1,Albania
2,Algeria
3,Andorra
4,Angola
...,...
204,Sahrawi Arab Democratic Republic
205,Somaliland
206,South Ossetia
207,Taiwan


In [10]:
#df['country_name'] = df['country_name'].str.replace()
#df['country_name'] = df['country_name'].str.replace(r'\s+', '', regex=True)
df = df.applymap(lambda x: x.strip() if isinstance(x, str) else x)
df_new = df
print(df_new)

                         country_name
0                         Afghanistan
1                             Albania
2                             Algeria
3                             Andorra
4                              Angola
..                                ...
204  Sahrawi Arab Democratic Republic
205                        Somaliland
206                     South Ossetia
207                            Taiwan
208                      Transnistria

[209 rows x 1 columns]


  df = df.applymap(lambda x: x.strip() if isinstance(x, str) else x)


In [11]:
def filter_entity(ent_list, df_new):
       return [ent for ent in ent_list
                  if ent in list(df_new['country_name'])]

In [12]:
df_sentences['country_entities'] = df_sentences['entities'].apply(lambda x: filter_entity(x, df_new))

In [13]:
# Filter out sentences that don’t have any character entities

df_sentences_filtered = df_sentences[df_sentences['country_entities'].map(len) > 0]

df_sentences_filtered.tail(10)

Unnamed: 0,sentence,entities,country_entities
1117,"("", The, forgotten, violence, that, helped, In...",[India],[India]
1122,"("", Indian, Independence, Day, :, everything, ...","[Indian Independence Day, Partition between In...",[Pakistan]
1131,"(^, "", The, Philippines, ,, 1898â€“1946, |, US...","[Philippines, 1898â€“1946, US House of Represe...",[Philippines]
1161,"("", Colonial, Cartographies, ,, Postcolonial, ...","[Colonial Cartographies, Postcolonial Borders,...",[Afghanistan]
1199,"(The, Moldovans, :, Romania, ,, Russia, ,, and...","[Moldovans, Romania, Russia, the Politics of C...","[Romania, Russia]"
1263,"("", Selling, ', Operation, Passage, to, Freedo...","[Thomas Dooley, the Religious Overtones of Ear...",[Vietnam]
1294,"("", Stuck, in, Endless, Preliminaries, :, Viet...","[Stuck in Endless Preliminaries, Vietnam, the ...",[Vietnam]
1569,"("", Anti, -, American, Behavior, in, the, Midd...","[Anti-American Behavior, the Middle East, a Fi...",[Lebanon]
1575,"(The, Rise, of, China, and, India, :, A, New, ...","[India, New Asian]",[India]
1576,"(Singapore, :, World, Scientific, .)","[Singapore, World Scientific]",[Singapore]


In [14]:
relationships = []  # create an empty list

# Iterate over the DataFrame, but stop before the last index
for i in range(len(df_sentences_filtered) - 1):
    end_i = min(i + 5, len(df_sentences_filtered) - 1)
    
    country_list = []
    # Loop over the relevant rows and extend the country_list
    for j in range(i, end_i + 1):
        country_list.extend(df_sentences_filtered.iloc[j].country_entities)
    
    # Remove consecutive duplicates
    country_unique = [country_list[k] for k in range(len(country_list)) if (k == 0) or country_list[k] != country_list[k - 1]]

    # Generate relationships if there are multiple unique countries
    if len(country_unique) > 1:
        for idx, a in enumerate(country_unique[:-1]):
            b = country_unique[idx + 1]
            relationships.append({"source": a, "target": b})

In [15]:
relationship_df = pd.DataFrame(relationships)

relationship_df

Unnamed: 0,source,target
0,France,Austria
1,Austria,Russia
2,Russia,Germany
3,Germany,Russia
4,Russia,Germany
...,...,...
769,Lebanon,India
770,India,Singapore
771,Lebanon,India
772,India,Singapore


In [16]:
relationship_df.to_csv('country_relationship.csv') 