# 1.6 Intro to NLP and Network Analysis
## Table of Contents:
### 1. Installing New Libraries
### 2. Importing libraries
### 3. Load 20th Century text
### 4. Get named entity list per sentence
### 5. Load country names
### 6. Filtering entities from the text
### 7. Create relationships

### 1. Installing New Libraries

In [1]:
pip install spacy==3.4.3

Collecting spacy==3.4.3
  Downloading spacy-3.4.3-cp311-cp311-win_amd64.whl.metadata (24 kB)
Collecting spacy-legacy<3.1.0,>=3.0.10 (from spacy==3.4.3)
  Downloading spacy_legacy-3.0.12-py2.py3-none-any.whl.metadata (2.8 kB)
Collecting spacy-loggers<2.0.0,>=1.0.0 (from spacy==3.4.3)
  Downloading spacy_loggers-1.0.5-py3-none-any.whl.metadata (23 kB)
Collecting murmurhash<1.1.0,>=0.28.0 (from spacy==3.4.3)
  Downloading murmurhash-1.0.10-cp311-cp311-win_amd64.whl.metadata (2.0 kB)
Collecting cymem<2.1.0,>=2.0.2 (from spacy==3.4.3)
  Downloading cymem-2.0.8-cp311-cp311-win_amd64.whl.metadata (8.6 kB)
Collecting preshed<3.1.0,>=3.0.2 (from spacy==3.4.3)
  Downloading preshed-3.0.9-cp311-cp311-win_amd64.whl.metadata (2.2 kB)
Collecting thinc<8.2.0,>=8.1.0 (from spacy==3.4.3)
  Downloading thinc-8.1.12-cp311-cp311-win_amd64.whl.metadata (15 kB)
Collecting wasabi<1.1.0,>=0.9.1 (from spacy==3.4.3)
  Downloading wasabi-0.10.1-py3-none-any.whl.metadata (28 kB)
Collecting srsly<3.0.0,>=2.4.3 (fr

In [2]:
pip install networkx==2.8.8

Collecting networkx==2.8.8
  Downloading networkx-2.8.8-py3-none-any.whl.metadata (5.1 kB)
Downloading networkx-2.8.8-py3-none-any.whl (2.0 MB)
   ---------------------------------------- 0.0/2.0 MB ? eta -:--:--
   ---------------------------------------- 0.0/2.0 MB ? eta -:--:--
    --------------------------------------- 0.0/2.0 MB 660.6 kB/s eta 0:00:04
   ---- ----------------------------------- 0.2/2.0 MB 2.5 MB/s eta 0:00:01
   ------------------------ --------------- 1.2/2.0 MB 9.7 MB/s eta 0:00:01
   ---------------------------------------- 2.0/2.0 MB 11.7 MB/s eta 0:00:00
Installing collected packages: networkx
Successfully installed networkx-2.8.8
Note: you may need to restart the kernel to use updated packages.


In [3]:
pip install scipy==1.9.3

Collecting scipy==1.9.3
  Downloading scipy-1.9.3-cp311-cp311-win_amd64.whl.metadata (58 kB)
     ---------------------------------------- 0.0/58.5 kB ? eta -:--:--
     ------- -------------------------------- 10.2/58.5 kB ? eta -:--:--
     ------------------- ------------------ 30.7/58.5 kB 330.3 kB/s eta 0:00:01
     -------------------------------------- 58.5/58.5 kB 442.5 kB/s eta 0:00:00
Collecting numpy<1.26.0,>=1.18.5 (from scipy==1.9.3)
  Downloading numpy-1.25.2-cp311-cp311-win_amd64.whl.metadata (5.7 kB)
Downloading scipy-1.9.3-cp311-cp311-win_amd64.whl (39.9 MB)
   ---------------------------------------- 0.0/39.9 MB ? eta -:--:--
   ---------------------------------------- 0.2/39.9 MB 5.6 MB/s eta 0:00:08
   - -------------------------------------- 1.1/39.9 MB 13.7 MB/s eta 0:00:03
   -- ------------------------------------- 2.1/39.9 MB 19.1 MB/s eta 0:00:02
   -- ------------------------------------- 2.1/39.9 MB 19.1 MB/s eta 0:00:02
   -- -------------------------------

  You can safely remove it manually.


### 2. Import Libraries

In [1]:
import pandas as pd
import numpy as np
import spacy
from spacy import displacy
import networkx as nx
import os
import matplotlib.pyplot as plt
import scipy
import re

In [2]:
# Download English module

!python -m spacy download en_core_web_sm

Collecting en-core-web-sm==3.4.1
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.4.1/en_core_web_sm-3.4.1-py3-none-any.whl (12.8 MB)
     ---------------------------------------- 0.0/12.8 MB ? eta -:--:--
     ---------------------------------------- 0.0/12.8 MB ? eta -:--:--
     --------------------------------------- 0.0/12.8 MB 640.0 kB/s eta 0:00:20
     --------------------------------------- 0.1/12.8 MB 469.7 kB/s eta 0:00:28
      --------------------------------------- 0.2/12.8 MB 1.1 MB/s eta 0:00:12
      --------------------------------------- 0.2/12.8 MB 1.0 MB/s eta 0:00:13
      --------------------------------------- 0.3/12.8 MB 1.1 MB/s eta 0:00:12
     - -------------------------------------- 0.3/12.8 MB 1.2 MB/s eta 0:00:11
     - -------------------------------------- 0.4/12.8 MB 1.2 MB/s eta 0:00:11
     - -------------------------------------- 0.5/12.8 MB 1.3 MB/s eta 0:00:10
     - -------------------------------------- 

In [3]:
# Load spacy English module

NER = spacy.load("en_core_web_sm")

### 3. Load 20th Century text

In [55]:
# Load the text

with open('20th_Century_Wiki.txt', 'r', errors='ignore') as file:
    data = file.read().replace('\n', '').lower()

In [56]:
#Limit the text length

data_limited = data[:10000]

data_limited



### The text looks good, I dont see special charachters used and I know that the countries are the same as in the list I created

In [57]:
book = NER(data)

In [58]:
# Visualize identified entities

displacy.render(book[273:1000], style = "ent", jupyter = True)

### 4. Get named entity list per sentence

In [59]:
df_sentences = [] # empty shell to store results

# Loop through sentences, get entity list for each sentence
for sent in book.sents:
    entity_list = [ent.text for ent in sent.ents]
    df_sentences.append({"sentence": sent, "entities": entity_list})
    
df_sentences = pd.DataFrame(df_sentences)

In [60]:
df_sentences.head(40)

Unnamed: 0,sentence,entities
0,"(key, events, of, the, 20th, century, -, wikip...","[the 20th century, 20th, the 20th century, wor..."
1,"(the, world, wars, sparked, tension, between, ...",[]
2,"(these, advancements, have, played, a, signifi...","[the 21st century, the 20th, the 20th century]"
3,"(the, 1900s, saw, the, decade, herald, a, seri...","[the 1900s, the decade, 1914 to 1918, first]"
4,"(""the, war, to, end, all, wars, "", :, world, w...","[sarajevo, archduke franz, 1914, 1918]"
5,"(the, war, and, by, extension, the, century, a...","[the century, sarajevo, franz ferdinand]"
6,"(this, was, similar, to, how, the, 9/11, was, ...","[9/11, serbian, russians]"
7,"(interwoven, alliances, ,, an, increasing, arm...","[europe, british, france, italy, russia]"
8,"(germany, ,, austria, -, hungary, ,, bulgaria,...","[germany, austria, hungary, bulgaria, powers""...."
9,"(the, bolsheviks, negotiated, the, treaty, of,...","[germany, russia]"


### 5. Load country names

In [61]:
# Import countries

df_countries = pd.read_csv("df_countries.csv", index_col = 0)

In [62]:
df_countries.head(10)

Unnamed: 0,Country
0,afghanistan
1,albania
2,algeria
3,andorra
4,angola
5,antigua and barbuda
6,argentina
7,armenia
8,australia
9,austria


### 6. Filtering entities from the text

In [63]:
# Function to filter out entities not of interest

def filter_entity(ent_list, df_countries):
    return [ent for ent in ent_list 
            if ent in list(df_countries['Country'])]

In [64]:
# Check

#filter_entity(["germany", "CF", "2"], df_countries)
filter_entity(["Germany", "monkey", "norway"], df_countries)

['norway']

In [65]:
df_sentences['country_entities'] = df_sentences['entities'].apply(lambda x: filter_entity(x, df_countries))

In [66]:
df_sentences['country_entities']

0      []
1      []
2      []
3      []
4      []
       ..
984    []
985    []
986    []
987    []
988    []
Name: country_entities, Length: 989, dtype: object

In [67]:
# Filter out sentences that don't have any country entities

df_sentences_filtered = df_sentences[df_sentences['country_entities'].map(len) > 0]

In [68]:
df_sentences_filtered.tail(10)

Unnamed: 0,sentence,entities,country_entities
677,"("", indian, independence, day, :, everything, ...","[indian, india, pakistan, 70 years]","[india, pakistan]"
702,"("", colonial, cartographies, ,, postcolonial, ...",[afghanistan],[afghanistan]
725,"(the, moldovans, :, romania, ,, russia, ,, and...","[moldovans, romania, russia]","[romania, russia]"
764,"("", selling, "", operation, passage, to, freedo...","[thomas dooley, american, vietnam]",[vietnam]
766,"("", military, pressures, against, north, vietn...","[vietnam, february 1964-january 1965]",[vietnam]
777,"("", nixon, prolonged, vietnam, war, for, polit...","[nixon, vietnam]",[vietnam]
784,"("", stuck, in, endless, preliminaries, :, viet...","[vietnam, paris, november 1968-january 1969]",[vietnam]
940,"("", anti, -, american, behavior, in, the, midd...","[anti-american, the middle east, lebanon]",[lebanon]
945,"(the, rise, of, china, and, india, :, a, new, ...","[china, india, asian]",[india]
946,"(singapore, :, world, scientific, .)",[singapore],[singapore]


### 7. Create relationships

In [69]:
# Defining relationships 

# window size = 5 
relationships = [] # create an empty list

for i in range(df_sentences_filtered.index[-1]):
    end_i = min(i+5, df_sentences_filtered.index[-1])
    char_list = sum((df_sentences_filtered.loc[i: end_i].country_entities), [])
    
    # Remove duplicated characters that are next to each other
    char_unique = [char_list[i] for i in range(len(char_list)) 
                   if (i==0) or char_list[i] != char_list[i-1]]
    
    if len(char_unique) > 1:
        for idx, a in enumerate(char_unique[:-1]):
            b = char_unique[idx + 1]
            relationships.append({"source": a, "target": b})

In [70]:
relationship_df = pd.DataFrame(relationships)

In [71]:
relationship_df

Unnamed: 0,source,target
0,france,italy
1,italy,russia
2,france,italy
3,italy,russia
4,russia,germany
...,...,...
598,india,singapore
599,india,singapore
600,india,singapore
601,india,singapore


In [72]:
# Sort the cases with a->b and b->a

relationship_df = pd.DataFrame(np.sort(relationship_df.values, axis = 1), columns = relationship_df.columns)

In [73]:
relationship_df.head(5)

Unnamed: 0,source,target
0,france,italy
1,italy,russia
2,france,italy
3,italy,russia
4,germany,russia


In [74]:
# Summarize the interactions in the new column "Value"

relationship_df["value"] = 1
relationship_df = relationship_df.groupby(["source","target"], sort=False, as_index=False).sum()

In [75]:
relationship_df.head(10)

Unnamed: 0,source,target,value
0,france,italy,13
1,italy,russia,6
2,germany,russia,27
3,austria,germany,17
4,austria,hungary,6
5,bulgaria,hungary,6
6,bulgaria,russia,6
7,germany,italy,33
8,germany,spain,4
9,france,spain,4


In [76]:
relationship_df.to_csv('20th_century_relationships.csv')