In [1]:
import pandas as pd
import spacy
import networkx as nx
import re

# Load the NLP model
nlp = spacy.load("en_core_web_sm")

In [2]:
# Load your text file
file_path = '/Users/sonaabraham/20th-century/text_mining/key_events_20th_century.txt' 

with open(file_path, 'r', encoding='utf-8') as f:
    raw_text = f.read()


# Basic cleaning: Replace newlines with spaces
clean_text = raw_text.replace('\n', ' ')

# Quick check: Check for odd characters
special_chars = re.findall(r'[^a-zA-Z0-9\s.,]', clean_text)
print(f"First 10 special characters found: {special_chars[:10]}")

First 10 special characters found: ['-', "'", '[', ']', '[', ']', ':', '-', '"', '"']


Data Wrangling Observations

Newline Removal: The raw text contained multiple newline characters (\n) that would interfere with SpaCy's ability to recognize sentences across line breaks. I replaced these with spaces to maintain a continuous text flow.

Special Characters: Initial analysis revealed characters like [, ], and : (likely from Wikipedia-style citations or headings). While these don't break the NER process, identifying them helped confirm the text's structure.

Entity Consistency: I have targeted specific country names to ensure the network analysis remains focused on Geopolitical Entities (GPE) rather than dates or general events.

In [3]:
doc = nlp(clean_text)
sent_entity_df = []

for sent in doc.sents:
    # Extract entities (keeping GPE for countries)
    entities = [ent.text for ent in sent.ents]
    sent_entity_df.append({"sentence": sent.text, "entities": entities})

df_sentences = pd.DataFrame(sent_entity_df)
df_sentences.head()

Unnamed: 0,sentence,entities
0,The 20th century changed the world in unpreced...,[The 20th century]
1,The World Wars sparked tension between countri...,"[The World Wars, the Cold War, the Space Race,..."
2,These advancements have played a significant r...,"[the 21st century, today]"
3,Historic events in the 20th century[edit] Worl...,"[20th, 1914]"
4,The new beginning of the 20th century marked s...,[the 20th century]


In [4]:
# Define the list of countries you want to track
target_countries = ["Germany", "France", "United Kingdom", "Russia", "USA", "Japan", "Soviet Union", "China", "Italy"]

# Function to keep only the countries we care about
def filter_entities(ent_list, target_list):
    return [ent for ent in ent_list if ent in target_list]

# Apply the filter using a lambda function
df_sentences['entities_filtered'] = df_sentences['entities'].apply(lambda x: filter_entities(x, target_countries))

# Remove rows where no target countries were mentioned
df_filtered = df_sentences[df_sentences['entities_filtered'].map(len) > 0].reset_index(drop=True)

df_filtered.head()

Unnamed: 0,sentence,entities,entities_filtered
0,After a period of diplomatic and military esca...,"[the July Crisis, the end of July 1914, the Br...",[France]
1,"In 1917, Russia ended hostile actions against ...","[1917, Russia, the Central Powers, Tsar]",[Russia]
2,The Bolsheviks negotiated the Treaty of Brest-...,"[Bolsheviks, Germany, Russia]","[Germany, Russia]"
3,"In the treaty, Bolshevik Russia ceded the Balt...","[Bolshevik Russia, Baltic, Germany, Kars Oblas...",[Germany]
4,It also recognized the independence of Ukraine...,"[Germany, Allied, American, 1918.[4]",[Germany]


In [5]:
relationships = []

# Iterate through the filtered dataframe
for i in range(len(df_filtered)):
    # Define the window (Current sentence + 4 next sentences)
    end_index = min(i + 5, len(df_filtered))
    
    # Flatten the list of entities found in this 5-sentence window
    window_entities = sum(df_filtered.iloc[i:end_index]['entities_filtered'], [])
    
    # Remove duplicates within the same window (e.g., if 'Germany' is mentioned twice)
    unique_entities = list(set(window_entities))
    
    # Create combinations if there's more than one country in the window
    if len(unique_entities) > 1:
        for idx, a in enumerate(unique_entities[:-1]):
            for b in unique_entities[idx + 1:]:
                # Sort alphabetically so (USA, Russia) is the same as (Russia, USA)
                pair = sorted([a, b])
                relationships.append({"source": pair[0], "target": pair[1]})

# Create the raw relationship dataframe
df_rels_raw = pd.DataFrame(relationships)

In [6]:
# Group by source/target and count occurrences
relationship_df = df_rels_raw.groupby(['source', 'target']).size().reset_index()

# Rename columns for clarity
relationship_df.columns = ['source', 'target', 'value']

# Sort to see the most frequent interactions
relationship_df = relationship_df.sort_values(by='value', ascending=False)

relationship_df.head(10)

Unnamed: 0,source,target,value
7,France,Germany,33
13,Germany,Japan,31
12,Germany,Italy,28
3,China,Japan,23
14,Germany,Russia,19
19,Japan,Russia,18
1,China,Germany,18
8,France,Italy,13
17,Italy,Japan,12
4,China,Russia,11


In [7]:
# Exporting the final relationship dataframe to a CSV file
relationship_df.to_csv('country_relationships_20th_century.csv', index=False)

# Optional: Export the cleaned text as well
with open('cleaned_text_20th_century.txt', 'w', encoding='utf-8') as f:
    f.write(clean_text)

print("Final dataframe and cleaned text have been exported successfully.")

Final dataframe and cleaned text have been exported successfully.
