## Knowledge Graph Construction, Embedding Processing, and Vector Index Creation in Neo4j

In [58]:
# folder directory
%cd '/###'

/content/drive/My Drive/spencer/SpencerData


In [59]:
# path to dataset
# NOTE(review): the leading '/' makes this an absolute path, so it is resolved
# from the filesystem root rather than from the directory selected with %cd
# in the previous cell — confirm this is intended.
data_path ='/processed_sos_raw_text.csv'

In [33]:
# install the libraries
!pip3 install -q langchain
!pip3 install -q openai
!pip3 install -q neo4j

In [34]:
import os
import json
import pandas as pd

In [None]:
# load environment variables from a .env filor or  set it manually
!pip3 install python-dotenv
from dotenv import load_dotenv
load_dotenv()

# Set the OpenAI API key env variable manually
os.environ["OPENAI_API_KEY"] = "###"

print(os.environ["OPENAI_API_KEY"])

# Load the Processed Email Dataset

In [None]:
# load data into dataframe
df = pd.read_csv(data_path)

# Preview only the first rows; rendering the whole frame bloats the notebook
# and puts far more raw email content into the saved output than is needed.
df.head()

In [61]:
# confirm the total number of rows and columns
df.shape

(108753, 12)

In [None]:
# confirm the data type and non-null count of each column
df.info()

In [None]:
# Cast every column except the two timestamp columns to pandas' nullable
# 'string' dtype (ID columns such as Revision_ID, Email_ID and Document_ID included).
timestamp_columns = ("Time_Received", "Time_Modified")
columns_to_convert = [name for name in df.columns if name not in timestamp_columns]
df[columns_to_convert] = df[columns_to_convert].astype('string')

# Parse 'Time_Received' and 'Time_Modified' into datetime values.
for timestamp_column in timestamp_columns:
    df[timestamp_column] = pd.to_datetime(df[timestamp_column])

df.head()

In [40]:
!pip3 install -U langchain-community --quiet

# Connect to Neo4j Database

In [42]:
# Neo4j DB credentials.
# Each value is read from the environment (for example from the .env file
# loaded earlier with load_dotenv()) so secrets do not have to be written into
# the notebook; the literals below are only fallbacks when a variable is unset.
NEO4J_URI = os.getenv("NEO4J_URI", "neo4j+s://###")
NEO4J_USERNAME = os.getenv("NEO4J_USERNAME", "neo4j")
NEO4J_PASSWORD = os.getenv("NEO4J_PASSWORD", "####")
NEO4J_DATABASE = os.getenv("NEO4J_DATABASE", "neo4j")

In [43]:
# Neo4jGraph lives in langchain-community; the old `langchain.graphs` path is
# deprecated and is not available in newer langchain releases.
from langchain_community.graphs import Neo4jGraph

# connect to neo4j graph, targeting the configured database explicitly
graph = Neo4jGraph(
    url=NEO4J_URI,
    username=NEO4J_USERNAME,
    password=NEO4J_PASSWORD,
    database=NEO4J_DATABASE,
)

In [44]:
from datetime import datetime

# function to sanitize and format data
def sanitize(text):
    """Return ``str(text)`` with quotes and curly braces removed and outer whitespace trimmed."""
    # One translation pass deletes ', ", { and } wherever they occur.
    removal_table = str.maketrans('', '', "'\"{}")
    return str(text).translate(removal_table).strip()

# Function to format datetime for Neo4j
def format_datetime(date_str):
    """Convert a timestamp string to ISO 8601 with a 'T' separator.

    Strings in the 'YYYY-MM-DD HH:MM:SS' layout are returned as
    'YYYY-MM-DDTHH:MM:SS'. Other ISO-style inputs (date only, fractional
    seconds, or a UTC offset) are parsed with ``datetime.fromisoformat`` and
    returned through ``isoformat()`` so no precision is dropped.

    Raises:
        ValueError: if ``date_str`` cannot be parsed as a timestamp (e.g. 'NaT').
    """
    try:
        dt = datetime.strptime(date_str, '%Y-%m-%d %H:%M:%S')
    except ValueError:
        # Not the plain seconds-resolution layout: fall back to the ISO parser,
        # which still raises ValueError for genuinely invalid input.
        return datetime.fromisoformat(date_str).isoformat()
    return dt.strftime('%Y-%m-%dT%H:%M:%S')

In [45]:
!pip install -q tiktoken

In [None]:
!pip install -U langchain-openai --quiet

# Create email embedding needed for vector search

In [48]:
# Rebind df to a copy of itself; the 'embedding' column is added to this frame
# in a later cell. The bare `df` below displays the frame.
df = df.copy()
df

In [None]:
!pip install --upgrade openai --quiet

In [None]:
df.info()

In [51]:
import openai
from datetime import datetime

# Reuse the key that is already in the environment (set earlier in the notebook
# or loaded from a .env file). The placeholder is only written when no key is
# present, so a real key is never overwritten by "###".
if not os.getenv('OPENAI_API_KEY'):
    os.environ['OPENAI_API_KEY'] = "###"

# Initialize the OpenAI client
openai.api_key = os.getenv('OPENAI_API_KEY')
client = openai.OpenAI(api_key=openai.api_key)

def get_embedding(text, model="text-embedding-3-small"):
    """Return the embedding vector for ``text``, or None if the request is rejected.

    Args:
        text: String to embed; newlines are replaced with spaces before sending.
        model: Name of the OpenAI embedding model to use.

    Returns:
        The embedding from the first item of the API response, or None when the
        API rejects the request as invalid (the error is printed).
    """
    text = text.replace("\n", " ")
    try:
        response = client.embeddings.create(input=[text], model=model)
        return response.data[0].embedding
    except openai.BadRequestError as e:
        # The `openai.OpenAI` client used here requires openai>=1.0, where the
        # old `openai.error.InvalidRequestError` no longer exists; its
        # replacement is the top-level `openai.BadRequestError`.
        print(f"Error: {e}")
        return None

def sanitize(text):
    """Stringify ``text``, drop quotes and curly braces, and trim surrounding whitespace.

    Same behavior as the ``sanitize`` defined in an earlier cell of this notebook.
    """
    cleaned = str(text)
    for unwanted in ("'", '"', '{', '}'):
        cleaned = cleaned.replace(unwanted, '')
    return cleaned.strip()

def format_datetime(date_str):
    """Convert a timestamp string to ISO 8601 with a 'T' separator.

    Strings in the 'YYYY-MM-DD HH:MM:SS' layout are returned as
    'YYYY-MM-DDTHH:MM:SS'. Other ISO-style inputs (date only, fractional
    seconds, or a UTC offset) are parsed with ``datetime.fromisoformat`` and
    returned through ``isoformat()`` so no precision is dropped.

    Raises:
        ValueError: if ``date_str`` cannot be parsed as a timestamp (e.g. 'NaT').
    """
    try:
        dt = datetime.strptime(date_str, '%Y-%m-%d %H:%M:%S')
    except ValueError:
        # Not the plain seconds-resolution layout: fall back to the ISO parser,
        # which still raises ValueError for genuinely invalid input.
        return datetime.fromisoformat(date_str).isoformat()
    return dt.strftime('%Y-%m-%dT%H:%M:%S')

def concatenate_email_fields(row, max_length=10000):
    """Flatten one email row into a single ' | '-separated 'Label: value' string.

    Text fields are passed through ``sanitize``; the two timestamp fields are
    rendered with ``isoformat()``; the ID fields are inserted as they are.

    Args:
        row: Mapping-like row providing the email columns listed below.
        max_length: Maximum number of characters kept from the joined text.

    Returns:
        The joined text, truncated to ``max_length`` characters.
    """
    labelled_values = [
        ("Sender_Name", sanitize(row['Sender_Name'])),
        ("Receiver_Name", sanitize(row['Receiver_Name'])),
        ("Email_Subject", sanitize(row['Email_Subject'])),
        ("Email_Content", sanitize(row['Email_Content'])),
        ("Time_Received", row['Time_Received'].isoformat()),
        ("Sender_Address", sanitize(row['Sender_Address'])),
        ("Time_Modified", row['Time_Modified'].isoformat()),
        ("Conversation_ID", row['Conversation_ID']),
        ("Revision_ID", row['Revision_ID']),
        ("Email_ID", row['Email_ID']),
        ("Document_ID", row['Document_ID']),
        ("Registration_Person_ID", row['Registration_Person_ID']),
    ]
    joined = " | ".join(f"{label}: {value}" for label, value in labelled_values)
    return joined[:max_length]

# Build the embedding input for one row and request its embedding
def process_row(row):
    """Return the embedding for ``row``: its concatenated email text passed to ``get_embedding``."""
    return get_embedding(concatenate_email_fields(row))

# Embed every row; process_row is passed directly, no lambda wrapper needed.
df['embedding'] = df.apply(process_row, axis=1)

# Keep only rows that received an embedding. The explicit copy makes df an
# independent frame, so the column assignments performed on it in later cells
# do not trigger pandas' SettingWithCopyWarning.
df = df[df['embedding'].notnull()].copy()

# Save the dataframe with embeddings to a CSV file
df.to_csv('embedded_emails.csv', index=False)

print("Embeddings created and saved to csv")


Embeddings created and saved to csv


In [None]:
# inspect the column with embeddings
df.head()

In [None]:
# check datatype
df.info()

# Neo4j Graph Creation: Nodes, Relationships, and Properties


In [54]:
# Turn both timestamp columns into plain strings for the graph-loading step.
for time_column in ("Time_Received", "Time_Modified"):
    df[time_column] = df[time_column].astype(str)

In [55]:

# Function to sanitize and format data
def sanitize(text):
    """Return ``str(text)`` without quotes or curly braces, stripped of outer whitespace."""
    dropped_chars = {"'", '"', '{', '}'}
    kept = ''.join(ch for ch in str(text) if ch not in dropped_chars)
    return kept.strip()

# Function to format datetime for Neo4j
def format_datetime(date_str):
    """Convert a timestamp string to ISO 8601 with a 'T' separator.

    Strings in the 'YYYY-MM-DD HH:MM:SS' layout are returned as
    'YYYY-MM-DDTHH:MM:SS'. Other ISO-style inputs (date only, fractional
    seconds, or a UTC offset) are parsed with ``datetime.fromisoformat`` and
    returned through ``isoformat()`` so no precision is dropped.

    Raises:
        ValueError: if ``date_str`` cannot be parsed as a timestamp (e.g. 'NaT').
    """
    try:
        dt = datetime.strptime(date_str, '%Y-%m-%d %H:%M:%S')
    except ValueError:
        # Not the plain seconds-resolution layout: fall back to the ISO parser,
        # which still raises ValueError for genuinely invalid input.
        return datetime.fromisoformat(date_str).isoformat()
    return dt.strftime('%Y-%m-%dT%H:%M:%S')

# Create Email nodes and relationships.
#
# All values are sent as Cypher parameters instead of being interpolated into
# the query text, so characters such as backslashes or quotes inside an email
# cannot break or alter a statement. sanitize() is still applied so the stored
# strings are the same as the ones this notebook produced before.

# Conversation + Email + PART_OF in ONE statement: `c` must be bound in the same
# query as the relationship MERGE, otherwise `(e)-[:PART_OF]->(c)` attaches the
# email to a fresh anonymous node instead of its Conversation.
EMAIL_QUERY = '''
MERGE (c:Conversation {id: $conversation_id})
MERGE (e:Email {
    revisionId: $revision_id,
    emailId: $email_id,
    documentId: $document_id,
    conversationId: $conversation_id,
    senderName: $sender_name,
    senderAddress: $sender_address,
    subject: $email_subject,
    content: $email_content,
    timeReceived: datetime($time_received),
    timeModified: datetime($time_modified),
    embedding: $embedding
})
MERGE (e)-[:PART_OF]->(c)
'''

# NOTE(review): Person nodes are keyed by a property called `email`, but the
# value stored in it is the sender/receiver *name* — kept as-is to preserve the
# existing graph schema; confirm whether this is intended.
SENDER_QUERY = '''
MERGE (sender:Person {email: $sender_name})
WITH sender
MATCH (e:Email {revisionId: $revision_id})
MERGE (sender)-[:SENT]->(e)
'''

# One round-trip for all receivers of an email instead of one query each.
RECEIVERS_QUERY = '''
UNWIND $receivers AS receiver_name
MERGE (receiver:Person {email: receiver_name})
WITH receiver
MATCH (e:Email {revisionId: $revision_id})
MERGE (receiver)-[:RECEIVED]->(e)
'''

failed_records = 0

for index, row in df.iterrows():
    # Label used in the error message if the row fails before its revision id
    # has been parsed.
    record_label = f"at index {index}"

    # Parsing happens inside the try block so that one malformed row is
    # reported and skipped rather than aborting the whole load.
    try:
        revision_id = int(row['Revision_ID'])
        record_label = revision_id

        email_params = {
            "conversation_id": sanitize(row['Conversation_ID']),
            "revision_id": revision_id,
            "email_id": int(row['Email_ID']),
            "document_id": int(row['Document_ID']),
            "sender_name": sanitize(row['Sender_Name']),
            "sender_address": sanitize(row['Sender_Address']),
            "email_subject": sanitize(row['Email_Subject']),
            "email_content": sanitize(row['Email_Content']),
            "time_received": format_datetime(row['Time_Received']),
            "time_modified": format_datetime(row['Time_Modified']),
            "embedding": list(row['embedding']),
        }

        # Receiver_Name holds a bracketed, comma-separated list of quoted names.
        raw_receivers = row['Receiver_Name'].strip('[]').split(', ')
        receivers = [sanitize(name.strip().strip("'")) for name in raw_receivers]
        # Drop empty entries so no Person node with an empty key is created.
        receivers = [name for name in receivers if name]

        # Create Conversation and Email nodes and link them
        graph.query(EMAIL_QUERY, params=email_params)

        # Create the sender Person node and its SENT relationship
        graph.query(
            SENDER_QUERY,
            params={
                "sender_name": email_params["sender_name"],
                "revision_id": revision_id,
            },
        )

        # Create receiver Person nodes and their RECEIVED relationships
        if receivers:
            graph.query(
                RECEIVERS_QUERY,
                params={"receivers": receivers, "revision_id": revision_id},
            )

    except Exception as e:
        failed_records += 1
        print(f"Error processing record {record_label}: {e}")

if failed_records:
    print(f"Graph data loaded with {failed_records} failed record(s); see the errors above")
else:
    print("Graph data created and loaded successfully")

Graph data created and loaded successfully


# Creating a Vector Index on Email Embeddings in Neo4j


In [62]:
from neo4j import GraphDatabase

# Define the Cypher command to create the vector index.
# `vector.dimensions` must equal the length of the vectors stored in
# Email.embedding.
create_index_cypher = """
CREATE VECTOR INDEX emailEmbeddings IF NOT EXISTS
FOR (e:Email)
ON (e.embedding)
OPTIONS {
  indexConfig: {
    `vector.dimensions`: 1536,
    `vector.similarity_function`: 'cosine'
  }
}
"""

# Open the driver only for the duration of the index creation so its
# connections are released afterwards, and run against the configured database.
with GraphDatabase.driver(NEO4J_URI, auth=(NEO4J_USERNAME, NEO4J_PASSWORD)) as driver:
    with driver.session(database=NEO4J_DATABASE) as session:
        # consume() forces the statement to complete here, so a failure is
        # raised before the success message is printed.
        session.run(create_index_cypher).consume()

print("Vector index 'emailEmbeddings' has been created (if it did not exist)")


Vector index 'emailEmbeddings' has been created (if it did not exist)
