In [1]:
import numpy as np
import spacy
from spacy import displacy
import networkx as nx
import os
import matplotlib.pyplot as plt
import scipy
import re

In [3]:
!python -m spacy download en_core_web_sm

Collecting en-core-web-sm==3.7.1
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.7.1/en_core_web_sm-3.7.1-py3-none-any.whl (12.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m5.1 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
Installing collected packages: en-core-web-sm
Successfully installed en-core-web-sm-3.7.1
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')


In [4]:
# Initialize the model
NER = spacy.load("en_core_web_sm")

# Test the model
doc = NER("This is a test sentence.")
for token in doc:
    print(token.text, token.pos_)

This PRON
is AUX
a DET
test NOUN
sentence NOUN
. PUNCT


In [6]:
# Open and read a text file
file_path = r'/Users/renubalaji/Documents/GitHubProjects/20th-century/key_events_20th_century.txt'

with open(file_path, 'r', encoding='utf-8') as file:
    text = file.read()

# Display the first few lines of the text
print(text[:500])  # Display the first 500 characters

Contents
Historic events in the 20th century
World at the beginning of the century
Between the wars
Global war: World War II (1939–1945)
The post-war world
The world at the end of the century
See also
References
Sources
External links

The 20th century changed the world in unprecedented ways. The World Wars sparked tension between countries and led to the creation of atomic bombs, the Cold War led to the Space Race and the creation of space-based rockets, and the World Wide Web was created. Thes


# Creating a NER object

In [7]:
book = NER(text)

In [9]:
displacy.render(book[273:20000], style = "ent", jupyter = True)

# Splitting the sentence entities

In [10]:
df_sentences = []

In [12]:
!pip install pandas

Collecting pandas
  Downloading pandas-2.0.3-cp38-cp38-macosx_11_0_arm64.whl.metadata (18 kB)
Collecting tzdata>=2022.1 (from pandas)
  Using cached tzdata-2024.2-py2.py3-none-any.whl.metadata (1.4 kB)
Downloading pandas-2.0.3-cp38-cp38-macosx_11_0_arm64.whl (10.7 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m10.7/10.7 MB[0m [31m5.1 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hUsing cached tzdata-2024.2-py2.py3-none-any.whl (346 kB)
Installing collected packages: tzdata, pandas
Successfully installed pandas-2.0.3 tzdata-2024.2


In [13]:
import pandas as pd
for sent in book.sents:
    entity_list = [ent.text for ent in sent.ents]
    df_sentences.append({"sentence":sent, "entities":entity_list})
df_sentences = pd.DataFrame(df_sentences)

In [14]:
df_sentences.head(10)

Unnamed: 0,sentence,entities
0,"(Contents, \n, Historic, events, in, the, 20th...","[the 20th century, the beginning of the centur..."
1,"(The, World, Wars, sparked, tension, between, ...","[the Cold War, the Space Race, the World Wide ..."
2,"(These, advancements, have, played, a, signifi...","[the 21st century, today]"
3,"(The, new, beginning, of, the, 20th, century, ...",[the 20th century]
4,"(The, 1900s, saw, the, decade, herald, a, seri...","[The 1900s, the decade]"
5,"(From, 1914, to, 1918, ,, the, First, World, W...","[1914 to 1918, the First World War]"
6,"(The, First, World, War, (, or, simply, WWI, )...","[The First World War, The Great War, 1914, 1918]"
7,"(The, war, and, by, extension, the, century, a...","[the century, Sarajevo, the Austro-Hungarian E..."
8,"(This, was, similar, to, how, the, 9/11, was, ...",[]
9,"(century.[1, ])",[]


# Filter data using the Main characters

In [15]:
def filter_entity(ent_list, character_df):
    # Your function logic here
    filtered_entities = [ent for ent in ent_list if ent in character_df['character'].values]
    return filtered_entities

In [16]:
print(df_sentences.columns)

Index(['sentence', 'entities'], dtype='object')


In [17]:
def filter_entity(ent_list, character_df):
    """
    Filters the entities from ent_list based on whether they appear in character_df['character'].

    Args:
        ent_list (list): A list of entities to filter.
        character_df (pd.DataFrame): A DataFrame containing a 'character' column with valid entities.

    Returns:
        list: A list of entities that match with those in the character DataFrame.
    """
    # Filter entities that exist in the character DataFrame
    filtered_entities = [ent for ent in ent_list if ent in character_df['character'].values]
    return filtered_entities

# Example usage:

# Assuming ent_list contains the extracted entities (strings)
ent_list = ["Germany", "France", "Einstein", "Russia", "Japan"]

# Assuming character_df is a DataFrame containing valid characters (including countries)
data = {'character': ["Germany", "Russia", "France", "Japan", "Italy"]}
character_df = pd.DataFrame(data)

# Filter the entities
filtered_result = filter_entity(ent_list, character_df)
print(filtered_result)

['Germany', 'France', 'Russia', 'Japan']


In [19]:
path = os.path.join(os.path.expanduser("~"), "relationships.csv")

# Try saving the file with error handling
try:
    df_sentences.to_csv(path, index=False)
    print(f"DataFrame created and saved at {path}.")
except Exception as e:
    print(f"Error saving the file: {e}")

DataFrame created and saved at /Users/renubalaji/relationships.csv.


# Create the relationships dataframe

In [21]:
countries_df = pd.read_csv("countries_20th_century.csv", index_col = 0)

In [22]:
countries_df.head()

Unnamed: 0,country_name
1,Afghanistan
2,Albania
3,Algeria
4,Andorra
5,Angola


In [23]:
countries_df.dtypes

country_name    object
dtype: object

In [24]:
countries_df['country_name'].to_list

<bound method IndexOpsMixin.tolist of 1                              Afghanistan 
2                                  Albania 
3                                  Algeria 
4                                  Andorra 
5                                   Angola 
                       ...                 
205       Sahrawi Arab Democratic Republic 
206                             Somaliland 
207                          South Ossetia 
208                                 Taiwan 
209                            Transnistria
Name: country_name, Length: 209, dtype: object>

In [25]:
countries_df['country_name'] = countries_df['country_name'].str.strip().str.lower()

In [26]:
countries_df['country_name'].to_list

<bound method IndexOpsMixin.tolist of 1                           afghanistan
2                               albania
3                               algeria
4                               andorra
5                                angola
                     ...               
205    sahrawi arab democratic republic
206                          somaliland
207                       south ossetia
208                              taiwan
209                        transnistria
Name: country_name, Length: 209, dtype: object>

In [31]:
def filter_entity(ent_list, countries_df):
    cleaned_countries = countries_df['country_name'].to_list()
    return [ent.strip().lower() for ent in ent_list if ent.strip().lower() in cleaned_countries]

In [32]:
df_sentences['character_entity'] = df_sentences['entities'].apply(lambda x: filter_entity(x, countries_df))

In [33]:
df_sentences.head()

Unnamed: 0,sentence,entities,character_entity
0,"(Contents, \n, Historic, events, in, the, 20th...","[the 20th century, the beginning of the centur...",[]
1,"(The, World, Wars, sparked, tension, between, ...","[the Cold War, the Space Race, the World Wide ...",[]
2,"(These, advancements, have, played, a, signifi...","[the 21st century, today]",[]
3,"(The, new, beginning, of, the, 20th, century, ...",[the 20th century],[]
4,"(The, 1900s, saw, the, decade, herald, a, seri...","[The 1900s, the decade]",[]


In [34]:
def x(x):
    if len(x) > 0:
        return x

In [35]:
df_sentences_filtered = df_sentences[df_sentences['character_entity'].apply (lambda x: len(x) > 0)]

In [36]:
df_sentences_filtered

Unnamed: 0,sentence,entities,character_entity
12,"(The, Allies, ,, known, initially, as, "", The,...","[The Triple Entente, the British Empire, Franc...","[france, russia]"
13,"(Germany, ,, Austria, -, Hungary, ,, Bulgaria,...","[Germany, Austria-Hungary, Bulgaria, the Ottom...","[germany, bulgaria]"
14,"(In, 1917, ,, Russia, ended, hostile, actions,...","[1917, Russia, the Central Powers, Tsar]",[russia]
15,"(The, Bolsheviks, negotiated, the, Treaty, of,...","[Bolsheviks, the Treaty of Brest-Litovsk, Germ...","[germany, russia]"
16,"(In, the, treaty, ,, Bolshevik, Russia, ceded,...","[Bolshevik Russia, Baltic, Germany, Kars Oblas...",[germany]
...,...,...,...
263,"(This, was, obviously, disquieting, to, the, U...","[the United States, Cuba]",[cuba]
265,"(After, a, tense, week, ,, the, Soviet, Union,...","[a tense week, the Soviet Union, Cold War, the...",[united states]
270,"(In, the, 1990s, ,, work, on, the, Internation...","[the 1990s, the International Space Station, t...","[russia, japan]"
274,"(Boris, Yeltsin, ,, president, of, Russia, ,, ...","[Boris Yeltsin, Russia]",[russia]
