In [48]:
import pandas as pd
from openai import OpenAI

df = pd.read_csv('CMG_EDA_1.csv')
# Return empty string if the value is NaN, apply to columns 'Head text', 'Relation text' and 'Tail text'
df['Head text'] = df['Head text'].apply(lambda x: '' if pd.isnull(x) else x)
df['Relation text'] = df['Relation text'].apply(lambda x: '' if pd.isnull(x) else x)
df['Tail text'] = df['Tail text'].apply(lambda x: '' if pd.isnull(x) else x)

# embedding = client.embeddings.create()

client = OpenAI(api_key="sk-proj-pjGUyMNiHPDiH7CLYv4rT3BlbkFJQpZiY1U0sYAqOATygpnJ")

# Generate embeddings for each row in the dataframe, using text-embedding-3-small, on column 'Head text', 'Relation text' and 'Tail text'
def generate_embeddings(df):
    embeddings = []
    for i in range(len(df)):
        head_text = df['Head text'][i]
        relation_text = df['Relation text'][i]
        tail_text = df['Tail text'][i]
        # embed and store the embeddings in separated lists
        head_embedding = client.embeddings.create(input=head_text, model="text-embedding-3-small")
        relation_embedding = client.embeddings.create(input=relation_text, model="text-embedding-3-small")
        tail_embedding = client.embeddings.create(input=tail_text, model="text-embedding-3-small")
        embeddings.append([head_embedding, relation_embedding, tail_embedding])
    return embeddings

# update the dataframe with the embeddings
def update_df(df, embeddings):
    df['Head embedding'] = [embedding[0] for embedding in embeddings]
    df['Relation embedding'] = [embedding[1] for embedding in embeddings]
    df['Tail embedding'] = [embedding[2] for embedding in embeddings]
    return df

# function to generate embeddings for a certain number of rows
def generate_embeddings_batch(df, start, end=None):
    # end = -1 means the end of the dataframe
    if end == -1:
        end = len(df)
    embeddings = []
    for i in range(start, end):
        head_text = df['Head text'][i]
        relation_text = df['Relation text'][i]
        tail_text = df['Tail text'][i]
        # embed and store the embeddings in separated lists
        head = client.embeddings.create(input=[head_text], model="text-embedding-3-small")
        head_embedding = [r.embedding for r in head.data]
        relation = client.embeddings.create(input=[relation_text], model="text-embedding-3-small")
        relation_embedding = [r.embedding for r in relation.data]
        if tail_text.strip():
            tail = client.embeddings.create(input=[tail_text], model="text-embedding-3-small")
            tail_embedding = [r.embedding for r in tail.data]
        else:
            tail_embedding = []
        embeddings.append([head_embedding, relation_embedding, tail_embedding])
    return embeddings




In [47]:
# check df head, relation and tail text if there is any NaN value
print(df['Head text'].isnull().sum())
print(df['Relation text'].isnull().sum())
print(df['Tail text'].isnull().sum())

# print the df Head, Relation, Tail for checking
print(df['Head text'])
print(df['Relation text'])
print(df['Tail text'])
# check number of items in the Head column, Relation column and Tail column
print(len(df['Head text']))
print(len(df['Relation text']))
print(len(df['Tail text']))


0
0
0
0      Flexible endoscopes equipped with video cameras
1                     specialized, longer enteroscopes
2                                            endoscopy
3                                            endoscopy
4                                            endoscopy
                            ...                       
993                                     fistula-in-ano
994                                  anorectal abscess
995                                  anorectal abscess
996                                     fistula-in-ano
997                               rectovaginal fistula
Name: Head text, Length: 998, dtype: object
0        can be used to view
1      can be used to assess
2                can combine
3                      gives
4            often outweighs
               ...          
993                      and
994                      and
995                  implies
996                  implies
997                  implies
Name: Relation text, Lengt

In [23]:
# embeddings = generate_embeddings(df)
# Test generate embeddings batch with 1 row
embeddings = generate_embeddings_batch(df, 0, 1)
print(embeddings)

[[[[0.023966971784830093, 0.011609974317252636, 0.0076818810775876045, -0.00022041054035071284, -0.05518006160855293, 0.0023935844656080008, -0.023904720321297646, -0.00251341937109828, 0.0343630351126194, -0.020804576575756073, 0.04581737890839577, -0.07171415537595749, 0.016471846029162407, -0.033192697912454605, 0.023705514147877693, -0.05049872025847435, -0.01980854570865631, -0.005829887930303812, -0.006309227552264929, 0.05981159955263138, 0.03491085022687912, -0.007451549172401428, -0.007327045779675245, -0.05503065511584282, -0.001314292661845684, -0.042729686945676804, -0.032943692058324814, 0.05453264340758324, 0.019173577427864075, 0.0067792292684316635, 0.032171767204999924, -0.03550846874713898, -0.03329230099916458, -0.0067543284967541695, -0.08067842572927475, 0.05234137549996376, 0.0028153411112725735, 0.0034487538505345583, -0.026021283119916916, 0.025672672316432, 0.0508224293589592, 0.005941941402852535, 0.018638210371136665, -0.00992917362600565, 0.00836042687296867

In [33]:
# pring embeddings's shape. note that embeddings is a list of lists of lists
head_embedding = embeddings[0][0]
relation_embedding = embeddings[0][1]
tail_embedding = embeddings[0][2]
print("head_embedding shape:", head_embedding[0])
print("relation_embedding shape:", relation_embedding[0])
print("tail_embedding shape:", tail_embedding[0])

head_embedding shape: [0.023966971784830093, 0.011609974317252636, 0.0076818810775876045, -0.00022041054035071284, -0.05518006160855293, 0.0023935844656080008, -0.023904720321297646, -0.00251341937109828, 0.0343630351126194, -0.020804576575756073, 0.04581737890839577, -0.07171415537595749, 0.016471846029162407, -0.033192697912454605, 0.023705514147877693, -0.05049872025847435, -0.01980854570865631, -0.005829887930303812, -0.006309227552264929, 0.05981159955263138, 0.03491085022687912, -0.007451549172401428, -0.007327045779675245, -0.05503065511584282, -0.001314292661845684, -0.042729686945676804, -0.032943692058324814, 0.05453264340758324, 0.019173577427864075, 0.0067792292684316635, 0.032171767204999924, -0.03550846874713898, -0.03329230099916458, -0.0067543284967541695, -0.08067842572927475, 0.05234137549996376, 0.0028153411112725735, 0.0034487538505345583, -0.026021283119916916, 0.025672672316432, 0.0508224293589592, 0.005941941402852535, 0.018638210371136665, -0.00992917362600565, 

In [None]:
embeddings_all = generate_embeddings_batch(df, 0, -1)
print(embeddings_all)

In [1]:
from neo4j import GraphDatabase
import pandas as pd

In [29]:
df_xlsx = pd.read_excel('output_embeddings.xlsx')

# connect to the Neo4j database
uri = "bolt://localhost:54621"
driver = GraphDatabase.driver(uri, auth=("neo4j", "saiyan94"))

def import_to_neo4j(df_xlsx):
    with driver.session() as session:
        for _, row in df_xlsx.iterrows():
            head_embedding = row['Head embedding']
            relation_embedding = row['Relation embedding']
            tail_embedding = row['Tail embedding']

            # store text
            head_text = row['Head text'].replace("'", "''")
            relation_text = row['Relation text'].replace("'", "''")
            # if tail text is not NaN, replace single quote with double single quote
            if pd.notnull(row['Tail text']):
                tail_text = row['Tail text'].replace("'", "''")

            # store Event ID, Paragraph ID, Article ID
            event_id = row['Event ID']
            paragraph_id = row['Paragraph ID']
            article_id = row['Article ID']

            # Extract and process head and tail labels as lists, replace spaces with underscores, replace "-" with "_"
            
            head_labels = [label.strip().replace(" ", "_").replace("-", "_").replace("'", "''") for label in row['Head labels'].split(",")]
            tail_labels = [label.strip().replace(" ", "_").replace("-", "_").replace("'", "''") for label in row['Tail labels'].split(",")]
            relation_label = row['Relation label'].replace(" ", "_").replace("-", "_").replace("'", "''")

            # Create Cypher labels for head and tail nodes
            head_labels_cypher = ':'.join(head_labels)
            tail_labels_cypher = ':'.join(tail_labels)

            # Create Cypher query to create head and tail nodes, with properties and relation
            query = f"""
            MERGE (head:{head_labels_cypher} {{
                text: $head_text, 
                embedding: $head_embedding, 
                event_id: $event_id, 
                paragraph_id: $paragraph_id, 
                article_id: $article_id
            }})
            MERGE (tail:{tail_labels_cypher} {{
                text: $tail_text, 
                embedding: $tail_embedding, 
                event_id: $event_id, 
                paragraph_id: $paragraph_id, 
                article_id: $article_id
            }})
            MERGE (head)-[:{relation_label} {{
                text: $relation_text,
                embedding: $relation_embedding,
                event_id: $event_id,
                paragraph_id: $paragraph_id,
                article_id: $article_id
            }}]->(tail)
            """

            parameters = {
                'head_text': head_text,
                'head_embedding': head_embedding,
                'tail_text': tail_text,
                'tail_embedding': tail_embedding,
                'event_id': event_id,
                'paragraph_id': paragraph_id,
                'article_id': article_id,
                'relation_text': relation_text,
                'relation_embedding': relation_embedding
            }
            session.run(query, parameters)




In [37]:
document_relation_query = f"""
MATCH (a), (b) 
WHERE a.article_id = b.article_id 
AND a.event_id <> b.event_id 
CREATE (a)-[:IS_IN_THE_SAME_ARTICLE]->(b)
"""

paragraph_relation_query = f"""
MATCH (a), (b)
WHERE a.paragraph_id = b.paragraph_id
AND a.event_id <> b.event_id
CREATE (a)-[:IS_IN_THE_SAME_PARAGRAPH]->(b)
"""


In [40]:
document_contain_paragraph_query = f"""
MATCH (n) 
WITH DISTINCT n.article_id AS article_id, n.paragraph_id AS paragraph_id 
WHERE article_id IS NOT NULL AND paragraph_id IS NOT NULL
MERGE (a:Article {{article_id: article_id}})
MERGE (p:Paragraph {{paragraph_id: paragraph_id}})
MERGE (a)-[:CONTAINS]->(p)
"""
paragraph_contain_event_query = f"""
MATCH (n) 
WITH n, n.paragraph_id AS paragraph_id, n.article_id AS article_id 
WHERE paragraph_id IS NOT NULL AND article_id IS NOT NULL
MERGE (e:Event {{event_id: n.event_id, paragraph_id: paragraph_id}})
MERGE (p:Paragraph {{paragraph_id: paragraph_id}})
MERGE (p)-[:CONTAINS]->(e)
"""
head_relation_tail_of_event_query = f"""
MATCH p=()-[r]->()
MATCH (e:Event)
WHERE r.event_id = e.event_id
RETURN p
"""

In [30]:
import_to_neo4j(df_xlsx)

In [34]:
with driver.session() as session:
    session.run(document_relation_query)

In [35]:
with driver.session() as session:
    session.run(paragraph_relation_query)