In [28]:
import os
import subprocess

# Define the required packages and their versions
required_packages = {
    "spacy": "3.4.3",
    "networkx": "2.8.8",
    "matplotlib": "3.5.3",
    "scipy": "1.9.3",
}

# List of packages to uninstall that may conflict
conflicting_packages = [
    "numpy",  # May have version conflicts with other libraries
    "pandas",  # Not required, but may be present and cause conflicts
]

# Function to uninstall packages
def uninstall_packages(packages):
    for package in packages:
        print(f"Uninstalling {package}...")
        subprocess.run(["pip", "uninstall", package, "-y"])

# Function to install required packages
def install_packages(packages):
    for package, version in packages.items():
        print(f"Installing {package}=={version}...")
        subprocess.run(["pip", "install", f"{package}=={version}"])

# Uninstall conflicting packages
uninstall_packages(conflicting_packages)

# Install required packages
install_packages(required_packages)

# Optionally, you can also check the installed versions
print("\nInstalled package versions:")
for package in required_packages.keys():
    result = subprocess.run(["pip", "show", package], capture_output=True, text=True)
    print(result.stdout)

Uninstalling numpy...
Uninstalling pandas...
Installing spacy==3.4.3...
Installing networkx==2.8.8...
Installing matplotlib==3.5.3...
Installing scipy==1.9.3...

Installed package versions:
Name: spacy
Version: 3.4.3
Summary: Industrial-strength Natural Language Processing (NLP) in Python
Home-page: https://spacy.io
Author: Explosion
Author-email: contact@explosion.ai
License: MIT
Location: c:\users\chitr\anaconda3\lib\site-packages
Requires: catalogue, cymem, jinja2, langcodes, murmurhash, numpy, packaging, pathy, preshed, pydantic, requests, setuptools, spacy-legacy, spacy-loggers, srsly, thinc, tqdm, typer, wasabi
Required-by: 

Name: networkx
Version: 2.8.8
Summary: Python package for creating and manipulating graphs and networks
Home-page: https://networkx.org/
Author: Aric Hagberg
Author-email: hagberg@lanl.gov
License: 
Location: c:\users\chitr\anaconda3\lib\site-packages
Requires: 
Required-by: 

Name: matplotlib
Version: 3.5.3
Summary: Python plotting package
Home-page: https:

In [2]:
import sys
import subprocess

# Define the required packages and their versions
required_packages = {
    "spacy": "3.4.3",
    "networkx": "2.8.8",
    "matplotlib": "3.5.3",
    "scipy": "1.9.3",
}

# List of packages to uninstall that may conflict
conflicting_packages = [
    "backports",
    "numpy",
    "pandas",  # Not required, but may be present and cause conflicts
]

# Function to uninstall packages
def uninstall_packages(packages):
    for package in packages:
        print(f"Uninstalling {package}...")
        subprocess.check_call([sys.executable, "-m", "pip", "uninstall", package, "-y"])

# Function to install required packages
def install_packages(packages):
    for package, version in packages.items():
        print(f"Installing {package}=={version}...")
        subprocess.check_call([sys.executable, "-m", "pip", "install", f"{package}=={version}"])

# Uninstall conflicting packages
uninstall_packages(conflicting_packages)

# Install backports package to fix import error
subprocess.check_call([sys.executable, "-m", "pip", "install", "backports.tarfile"])

# Install required packages
install_packages(required_packages)

# Optionally, you can also check the installed versions
print("\nInstalled package versions:")
for package in required_packages.keys():
    result = subprocess.check_output([sys.executable, "-m", "pip", "show", package]).decode('utf-8')
    print(result)


Uninstalling backports...
Uninstalling numpy...
Uninstalling pandas...
Installing spacy==3.4.3...
Installing networkx==2.8.8...
Installing matplotlib==3.5.3...
Installing scipy==1.9.3...

Installed package versions:
Name: spacy
Version: 3.4.3
Summary: Industrial-strength Natural Language Processing (NLP) in Python
Home-page: https://spacy.io
Author: Explosion
Author-email: contact@explosion.ai
License: MIT
Location: c:\users\chitr\anaconda3\lib\site-packages
Requires: catalogue, cymem, jinja2, langcodes, murmurhash, numpy, packaging, pathy, preshed, pydantic, requests, setuptools, spacy-legacy, spacy-loggers, srsly, thinc, tqdm, typer, wasabi
Required-by: 

Name: networkx
Version: 2.8.8
Summary: Python package for creating and manipulating graphs and networks
Home-page: https://networkx.org/
Author: Aric Hagberg
Author-email: hagberg@lanl.gov
License: 
Location: c:\users\chitr\anaconda3\lib\site-packages
Requires: 
Required-by: 

Name: matplotlib
Version: 3.5.3
Summary: Python plotting

In [5]:
import numpy as np
import spacy
from spacy import displacy
import networkx as nx
import os
import matplotlib.pyplot as plt
import scipy
import re

Matplotlib is building the font cache; this may take a moment.


In [15]:
# Install the spaCy model
!python -m spacy download en_core_web_sm

# Load the model
import spacy

# Initialize the model
NER = spacy.load("en_core_web_sm")

# Test the model
doc = NER("This is a test sentence.")
for token in doc:
    print(token.text, token.pos_)

Collecting en-core-web-sm==3.4.1
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.4.1/en_core_web_sm-3.4.1-py3-none-any.whl (12.8 MB)
     ---------------------------------------- 12.8/12.8 MB 1.4 MB/s eta 0:00:00
[+] Download and installation successful
You can now load the package via spacy.load('en_core_web_sm')
This PRON
is AUX
a DET
test NOUN
sentence NOUN
. PUNCT


In [13]:
# Open and read a text file
file_path = r'C:\Users\chitr\Desktop\Renu_Data\20th_century.txt.txt'

with open(file_path, 'r', encoding='utf-8') as file:
    text = file.read()

# Display the first few lines of the text
print(text[:500])  # Display the first 500 characters

Contents
Historic events in the 20th century
World at the beginning of the century
Between the wars
Global war: World War II (1939–1945)
The post-war world
The world at the end of the century
See also
References
Sources
External links

The 20th century changed the world in unprecedented ways. The World Wars sparked tension between countries and led to the creation of atomic bombs, the Cold War led to the Space Race and the creation of space-based rockets, and the World Wide Web was created. Thes


# Creating a NER object

In [17]:
book = NER(text)

In [63]:
displacy.render(book[273:20000], style = "ent", jupyter = True)

# Splitting the sentence entities

In [64]:
df_sentences = []

In [65]:
!pip install pandas



In [66]:
import pandas as pd
for sent in book.sents:
    entity_list = [ent.text for ent in sent.ents]
    df_sentences.append({"sentence":sent, "entities":entity_list})
df_sentences = pd.DataFrame(df_sentences)

In [67]:
df_sentences.head(10)

Unnamed: 0,sentence,entities
0,"(Contents, \n, Historic, events, in, the, 20th...","[the 20th century, the beginning of the centur..."
1,"(The, World, Wars, sparked, tension, between, ...","[the Cold War, the Space Race]"
2,"(These, advancements, have, played, a, signifi...","[the 21st century, today]"
3,"(The, new, beginning, of, the, 20th, century, ...",[the 20th century]
4,"(The, 1900s, saw, the, decade, herald, a, seri...","[The 1900s, the decade]"
5,"(From, 1914, to, 1918, ,, the, First, World, W...","[1914 to 1918, the First World War]"
6,"(The, First, World, War, (, or, simply, WWI, )...","[The First World War, WWI, The Great War, 1914..."
7,"(The, war, and, by, extension, the, century, a...","[the century, Erzherzog Franz Ferdinand, Gavri..."
8,"(This, was, similar, to, how, the, 9/11, was, ...","[9/11, Slavic, Serbian, Russians, Serbs]"
9,"(Interwoven, alliances, ,, an, increasing, arm...","[Europe, Allies, The Triple Entente, British E..."


# Filter data using the Main characters

In [68]:
def filter_entity(ent_list, character_df):
    # Your function logic here
    filtered_entities = [ent for ent in ent_list if ent in character_df['character'].values]
    return filtered_entities

In [69]:
print(df_sentences.columns)

Index(['sentence', 'entities'], dtype='object')


In [70]:
def filter_entity(ent_list, character_df):
    """
    Filters the entities from ent_list based on whether they appear in character_df['character'].

    Args:
        ent_list (list): A list of entities to filter.
        character_df (pd.DataFrame): A DataFrame containing a 'character' column with valid entities.

    Returns:
        list: A list of entities that match with those in the character DataFrame.
    """
    # Filter entities that exist in the character DataFrame
    filtered_entities = [ent for ent in ent_list if ent in character_df['character'].values]
    return filtered_entities

# Example usage:

# Assuming ent_list contains the extracted entities (strings)
ent_list = ["Germany", "France", "Einstein", "Russia", "Japan"]

# Assuming character_df is a DataFrame containing valid characters (including countries)
data = {'character': ["Germany", "Russia", "France", "Japan", "Italy"]}
character_df = pd.DataFrame(data)

# Filter the entities
filtered_result = filter_entity(ent_list, character_df)
print(filtered_result)

['Germany', 'France', 'Russia', 'Japan']


In [71]:
# Update the path to Desktop
desktop_path = os.path.join(os.path.expanduser("~"), "Desktop", "Renu_Data", "relationships.csv")

# Try saving the file with error handling
try:
    df.to_csv(desktop_path, index=False)
    print(f"DataFrame created and saved at {desktop_path}.")
except Exception as e:
    print(f"Error saving the file: {e}")

DataFrame created and saved at C:\Users\chitr\Desktop\Renu_Data\relationships.csv.


# Create the relationships dataframe

In [72]:
import spacy
import pandas as pd
import os

# Load pre-trained spaCy NER model
nlp = spacy.load("en_core_web_sm")

# Correct path to the file in Renu_Data on Desktop
desktop_path = os.path.join(os.path.expanduser("~"), "Desktop", "Renu_Data", "20th_century.txt")

# Load the text from the file
try:
    with open(desktop_path, 'r', encoding='utf-8') as file:
        text = file.read()
except FileNotFoundError as e:
    print(f"Error: {e}")
    print("Please check if the file path is correct.")

# Process the text using spaCy NER
doc = nlp(text)

# List of countries (can be expanded with pycountry or custom list)
countries = ["Germany", "Russia", "France", "United States", "Japan", "China", "Italy", "India", "United Kingdom", "Canada"]

# Extract entities recognized as countries
entities = [(ent.text, ent.label_) for ent in doc.ents if ent.label_ == "GPE" and ent.text in countries]

# Create relationships DataFrame
relationships = []
for sent in doc.sents:
    sent_entities = [ent.text for ent in sent.ents if ent.label_ == "GPE" and ent.text in countries]
    if len(sent_entities) > 1:
        for i in range(len(sent_entities)):
            for j in range(i + 1, len(sent_entities)):
                relationships.append((sent_entities[i], sent_entities[j]))

# Convert to DataFrame
df = pd.DataFrame(relationships, columns=["Entity1", "Entity2"])

# Save the CSV to the desktop
output_path = os.path.join(os.path.expanduser("~"), "Desktop", "relationships.csv")
df.to_csv(output_path, index=False)

print(f"DataFrame created and saved at {output_path}.")


Error: [Errno 2] No such file or directory: 'C:\\Users\\chitr\\Desktop\\Renu_Data\\20th_century.txt'
Please check if the file path is correct.
DataFrame created and saved at C:\Users\chitr\Desktop\relationships.csv.
