In [30]:
import pandas as pd
import spacy
import networkx as nx
import re

# Load the NLP model
nlp = spacy.load("en_core_web_sm")

In [31]:
import re

# Load your text file
file_path = 'text_mining/key_events_20th_century.txt' 

with open(file_path, 'r', encoding='utf-8') as f:
    raw_text = f.read()

# 1. Replace newlines with spaces
clean_text = raw_text.replace('\n', ' ')

# 2. Remove Wikipedia artifacts: [edit] and citations like [1], [12]
clean_text = re.sub(r'\[edit\]', '', clean_text)
clean_text = re.sub(r'\[\d+\]', '', clean_text)
# This removes anything inside square brackets like [3], [edit], or [14]
clean_text = re.sub(r'\[.*?\]', '', clean_text)

# This removes extra spaces left behind by the removal
clean_text = re.sub(r'\s+', ' ', clean_text).strip()

print(clean_text[:500]) # Check the first 500 characters

# 3. Save the corrected text as a .txt file (Rubric Requirement)
with open('cleaned_text_20th_century.txt', 'w', encoding='utf-8') as f:
    f.write(clean_text)

print("Text wrangled and saved to cleaned_text_20th_century.txt")

The 20th century changed the world in unprecedented ways. The World Wars sparked tension between countries and led to the creation of atomic bombs, the Cold War led to the Space Race and the creation of space-based rockets, and the World Wide Web was created. These advancements have played a significant role in citizens' lives and shaped the 21st century into what it is today. Historic events in the 20th century World at the beginning of the century Main article: Edwardian era Map of colonial an
Text wrangled and saved to cleaned_text_20th_century.txt


### Data Wrangling & Observations

**1. Text Wrangling Requirements:**
* **Special Characters & Noise:** The raw Wikipedia text contains many "noise" elements like `[edit]` tags and numerical citations (e.g., `[1]`, `[14]`). These can confuse the NLP model when it tries to determine sentence boundaries. I have implemented a Regex cleaning step to remove these.
* **Line Breaks:** Newline characters (`\n`) have been replaced with spaces to ensure SpaCy processes the narrative as a continuous flow.

**2. Country List Evaluation:**
* **Name Mismatches:** The provided `countries_lookup.csv` contains modern country names. However, 20th-century history is dominated by entities that no longer exist or were referred to differently, such as the **Soviet Union (USSR)**, **Yugoslavia**, and **Czechoslovakia**. 
* **Correction Step:** To ensure accurate relationship mining, I have manually appended these historical aliases to the target list loaded from the CSV.

**3. Output:**
* The fully wrangled text is saved as `cleaned_text_20th_century.txt` as a requirement for the downstream NER process.

In [25]:
doc = nlp(clean_text)
sent_entity_df = []

for sent in doc.sents:
    # Extract entities (keeping GPE for countries)
    entities = [ent.text for ent in sent.ents]
    sent_entity_df.append({"sentence": sent.text, "entities": entities})

df_sentences = pd.DataFrame(sent_entity_df)
df_sentences.head()

Unnamed: 0,sentence,entities
0,The 20th century changed the world in unpreced...,[The 20th century]
1,The World Wars sparked tension between countri...,"[The World Wars, the Cold War, the Space Race,..."
2,These advancements have played a significant r...,"[the 21st century, today]"
3,Historic events in the 20th century World at t...,"[the 20th century, the beginning of the centur..."
4,The new beginning of the 20th century marked s...,[the 20th century]


In [26]:
# 1. Load the target countries from your CSV file
countries_df = pd.read_csv('text_mining/countries_lookup.csv')

# 2. Extract from 'country' column and add historical aliases
target_countries = countries_df['country'].unique().tolist()
historical_aliases = ["Soviet Union", "USSR", "USA", "Great Britain", "Yugoslavia", "Czechoslovakia"]
target_countries.extend(historical_aliases)

# Ensure unique values and clean strings
target_countries = list(set([c.strip() for c in target_countries]))

# 3. Apply the filter using your existing function
def filter_entities(ent_list, target_list):
    return [ent for ent in ent_list if ent in target_list]

df_sentences['entities_filtered'] = df_sentences['entities'].apply(lambda x: filter_entities(x, target_countries))

# Remove rows where no target countries were mentioned
df_filtered = df_sentences[df_sentences['entities_filtered'].map(len) > 0].reset_index(drop=True)
df_filtered.head()

Unnamed: 0,sentence,entities,entities_filtered
0,After a period of diplomatic and military esca...,"[the July Crisis, the end of July 1914, the Br...","[France, Austria, Hungary]"
1,"In 1917, Russia ended hostile actions against ...","[1917, Russia, the Central Powers, Tsar]",[Russia]
2,The Bolsheviks negotiated the Treaty of Brest-...,"[Bolsheviks, Germany, Russia]","[Germany, Russia]"
3,"In the treaty, Bolshevik Russia ceded the Balt...","[Bolshevik Russia, Baltic, Germany, Kars Oblas...",[Germany]
4,It also recognized the independence of Ukraine.,[Ukraine],[Ukraine]


In [27]:
relationships = []

# Iterate through the filtered dataframe
for i in range(len(df_filtered)):
    # Define the window (Current sentence + 4 next sentences)
    end_index = min(i + 5, len(df_filtered))
    
    # Flatten the list of entities found in this 5-sentence window
    window_entities = sum(df_filtered.iloc[i:end_index]['entities_filtered'], [])
    
    # Remove duplicates within the same window (e.g., if 'Germany' is mentioned twice)
    unique_entities = list(set(window_entities))
    
    # Create combinations if there's more than one country in the window
    if len(unique_entities) > 1:
        for idx, a in enumerate(unique_entities[:-1]):
            for b in unique_entities[idx + 1:]:
                # Sort alphabetically so (USA, Russia) is the same as (Russia, USA)
                pair = sorted([a, b])
                relationships.append({"source": pair[0], "target": pair[1]})

# Create the raw relationship dataframe
df_rels_raw = pd.DataFrame(relationships)

In [28]:
# Group by source/target and count occurrences
relationship_df = df_rels_raw.groupby(['source', 'target']).size().reset_index()

# Rename columns for clarity
relationship_df.columns = ['source', 'target', 'value']

# Sort to see the most frequent interactions
relationship_df = relationship_df.sort_values(by='value', ascending=False)

relationship_df.head(10)

Unnamed: 0,source,target,value
277,France,Germany,32
319,Germany,Italy,31
320,Germany,Japan,29
332,Germany,Poland,22
152,China,Japan,19
454,Italy,Japan,15
215,Czechoslovakia,Yugoslavia,15
471,Japan,Russia,15
413,India,Pakistan,14
194,Czechoslovakia,Germany,14


In [22]:
# Exporting the final relationship dataframe to a CSV file
relationship_df.to_csv('country_relationships_20th_century.csv', index=False)

# Optional: Export the cleaned text as well
with open('cleaned_text_20th_century.txt', 'w', encoding='utf-8') as f:
    f.write(clean_text)

print("Final dataframe and cleaned text have been exported successfully.")

Final dataframe and cleaned text have been exported successfully.
