In [4]:
import pandas as pd
import numpy as np
import spacy
from spacy import displacy
import networkx as nx
import os
import matplotlib.pyplot as plt
import scipy
import re

In [5]:
# Download English module

!python -m spacy download en_core_web_sm

Collecting en-core-web-sm==3.4.1
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.4.1/en_core_web_sm-3.4.1-py3-none-any.whl (12.8 MB)
     ---------------------------------------- 0.0/12.8 MB ? eta -:--:--
      --------------------------------------- 0.3/12.8 MB ? eta -:--:--
     ---- ----------------------------------- 1.6/12.8 MB 6.0 MB/s eta 0:00:02
     ---------- ----------------------------- 3.4/12.8 MB 6.7 MB/s eta 0:00:02
     -------------- ------------------------- 4.7/12.8 MB 6.8 MB/s eta 0:00:02
     ------------------- -------------------- 6.3/12.8 MB 6.8 MB/s eta 0:00:01
     ---------------------- ----------------- 7.3/12.8 MB 6.6 MB/s eta 0:00:01
     -------------------------- ------------- 8.4/12.8 MB 6.3 MB/s eta 0:00:01
     ------------------------------ --------- 9.7/12.8 MB 6.3 MB/s eta 0:00:01
     ---------------------------------- ----- 11.0/12.8 MB 6.2 MB/s eta 0:00:01
     ---------------------------------------  1

In [6]:
# Load spacy English module

NER = spacy.load("en_core_web_sm")

## Load 20th Century file

In [8]:
# Load the book

with open('20th_century.txt', 'r', errors='ignore') as file:
    data = file.read().replace('\n', '')

In [9]:
book = NER(data)

In [10]:
# Visualize identified entities

displacy.render(book[273:500], style = "ent", jupyter = True)

## Get named entity list per sentence

In [12]:
df_sentences = [] # empty shell to store results

# Loop through sentences, get entity list for each sentence
for sent in book.sents:
    entity_list = [ent.text for ent in sent.ents]
    df_sentences.append({"sentence": sent, "entities": entity_list})
    
df_sentences = pd.DataFrame(df_sentences)

In [13]:
df_sentences.head(10)

Unnamed: 0,sentence,entities
0,"(Key, events, of, the, 20th, century, -, Wikip...","[the 20th century - WikipediaJump, Contribute,..."
1,"(accountLog, inPersonal, toolsDonate, Create, ...","[20th, the 20th century]"
2,"(World, War, I, (, 1914â€“1918)1.1.2Russian, R...",[World War I]
3,"(depression1.2.2The, rise, of, dictatorship1.3...","[World War II, Pacific1.3.7.1Background1.3.8Ja..."
4,"(begins1.4The, post, -, war, world1.4.1The, en...","[Cold War, 1947â€“1991)1.4.3War, the Cold War1..."
5,"(What, links, hereRelated, changesUpload, file...","[pageGet, URLDownload, Download, Wikipedia, en..."
6,"(The, World, Wars, sparked, tension, between, ...","[the Cold War, the Space Race]"
7,"(These, advancements, have, played, a, signifi...","[the 21st century, today]"
8,"(Historic, events, in, the, 20th, century[edit...",[the 20th]
9,"(Edwardian, eraThe, new, beginning, of, the, 2...","[Edwardian, the 20th century]"


## Load Country names

In [86]:
# Import characters

countries_df = pd.read_csv("countries_list_20th_century_1.5.csv")

In [88]:
countries_df.head()

Unnamed: 0,country_name
0,Afghanistan
1,Albania
2,Algeria
3,Andorra
4,Angola


## Filtering entities from the text

In [91]:
# Function to filter out entities not of interest

def filter_entity(ent_list, countries_df):
    return [ent for ent in ent_list 
            if ent in list(countries_df['country_name'])]

In [93]:
print(countries_df.columns)


Index(['country_name'], dtype='object')


In [95]:
df_sentences['country_entities']=df_sentences['entities'].apply(lambda x: filter_entity(x,countries_df))

In [111]:
countries_df

Unnamed: 0,country_name
0,Afghanistan
1,Albania
2,Algeria
3,Andorra
4,Angola
...,...
204,Sahrawi Arab Democratic Republic
205,Somaliland
206,South Ossetia
207,Taiwan


In [115]:
df_sentences.head(10)

Unnamed: 0,sentence,entities,country_entities
0,"(Key, events, of, the, 20th, century, -, Wikip...","[the 20th century - WikipediaJump, Contribute,...",[]
1,"(accountLog, inPersonal, toolsDonate, Create, ...","[20th, the 20th century]",[]
2,"(World, War, I, (, 1914â€“1918)1.1.2Russian, R...",[World War I],[]
3,"(depression1.2.2The, rise, of, dictatorship1.3...","[World War II, Pacific1.3.7.1Background1.3.8Ja...",[]
4,"(begins1.4The, post, -, war, world1.4.1The, en...","[Cold War, 1947â€“1991)1.4.3War, the Cold War1...",[]
5,"(What, links, hereRelated, changesUpload, file...","[pageGet, URLDownload, Download, Wikipedia, en...",[]
6,"(The, World, Wars, sparked, tension, between, ...","[the Cold War, the Space Race]",[]
7,"(These, advancements, have, played, a, signifi...","[the 21st century, today]",[]
8,"(Historic, events, in, the, 20th, century[edit...",[the 20th],[]
9,"(Edwardian, eraThe, new, beginning, of, the, 2...","[Edwardian, the 20th century]",[]


In [117]:
# Check

filter_entity(['Afghanistan'], countries_df)

['Afghanistan']

In [119]:
df_sentences_filter=df_sentences[df_sentences['country_entities'].map(len) > 0]

In [121]:
df_sentences_filter.head()

Unnamed: 0,sentence,entities,country_entities
15,"(After, a, period, of, diplomatic, and, milita...","[the July Crisis, the end of July 1914, Britis...","[France, Russia]"
16,"(The, Bolsheviks, negotiated, the, Treaty, of,...","[the Treaty of Brest-Litovsk, Germany, Russia]","[Germany, Russia]"
17,"(In, the, treaty, ,, Bolshevik, Russia, ceded,...","[Bolshevik Russia, Baltic, Germany, Kars Oblas...",[Germany]
18,"(It, also, recognized, the, independence, of, ...","[Germany, Allied, American]",[Germany]
24,"(Many, Germans, felt, these, reparations, were...","[Germans, Germany, Allied, Kaiser, Europe]",[Germany]


In [123]:
print(countries_df.columns)


Index(['country_name'], dtype='object')


## Create Relationships of Countries

In [136]:
# Defining relationships 

# window size = 5 : this defines how many sentences will be looked at simultaneously 
relationships = [] # create an empty list

for i in range(df_sentences_filter.index[-1]):
    end_i = min(i+5, df_sentences_filter.index[-1])
    country_list = sum((df_sentences_filter.loc[i: end_i].country_entities), [])
    
    # Remove duplicated characters that are next to each other
    country_unique = [country_list[i] for i in range(len(country_list)) 
                   if (i==0) or country_list[i] != country_list[i-1]]
    
    if len(country_unique) > 1:
        for idx, a in enumerate(country_unique[:-1]):
            b = country_unique[idx + 1]
            relationships.append({"source": a, "target": b})

In [159]:
relationship_df = pd.DataFrame(relationships)

In [161]:
relationship_df

Unnamed: 0,source,target
0,France,Russia
1,France,Russia
2,Russia,Germany
3,Germany,Russia
4,France,Russia
...,...,...
677,India,Singapore
678,India,Singapore
679,India,Singapore
680,India,Singapore


In [163]:
# Sort the cases with a->b and b->a

relationship_df = pd.DataFrame(np.sort(relationship_df.values, axis = 1), columns = relationship_df.columns)
relationship_df.head(5)

Unnamed: 0,source,target
0,France,Russia
1,France,Russia
2,Germany,Russia
3,Germany,Russia
4,France,Russia


In [148]:
# Summarize the interactions

relationship_df["value"] = 1
relationship_df = relationship_df.groupby(["source","target"], sort=False, as_index=False).sum()

In [151]:
relationship_df.head(10)

Unnamed: 0,source,target,value
0,France,Russia,12
1,Germany,Russia,21
2,Germany,Italy,26
3,Austria,Germany,11
4,Germany,Spain,2
5,France,Spain,1
6,France,Poland,11
7,France,Germany,30
8,Germany,Poland,29
9,Estonia,Germany,5


In [167]:
relationship_df.to_csv('country_relationship.csv',index=False)