# Importing data into Neo4j
## Compact Version
Builds a graph where each node only has one label

- Source: https://snap.stanford.edu/data/wikispeedia.html

In [36]:
import os
import pandas as pd
import numpy as np
from neo4j import GraphDatabase
from urllib import request

We need to clean the files to remove comments first, since they caused trouble in read_csv(): 

In [37]:
if(os.getcwd()[-31:] != "DataRes-Research-Spring-22\data"):
    os.chdir("data")

# print(os.listdir(os.getcwd()))
for file in os.listdir(os.getcwd()):
    if os.path.isfile(file):
        with open(file, "r") as f:
            lines = f.readlines()
        with open(file, "w") as f:
            for line in lines:
                if len(line) > 1:
                    if line.strip("\n")[0] != "#":
                        f.write(line)

Reading in files as data frames:

In [38]:
members = pd.read_csv('articles.tsv', sep = "\t", header=None)
interactions = pd.read_csv('links.tsv', sep = "\t", header=None)
categories = pd.read_csv('categories.tsv', sep = "\t", header=None)

Checking to see if the data frames are loaded properly:

In [39]:
members.head()

Unnamed: 0,0
0,%C3%81ed%C3%A1n_mac_Gabr%C3%A1in
1,%C3%85land
2,%C3%89douard_Manet
3,%C3%89ire
4,%C3%93engus_I_of_the_Picts


In [40]:
interactions.head()

Unnamed: 0,0,1
0,%C3%81ed%C3%A1n_mac_Gabr%C3%A1in,Bede
1,%C3%81ed%C3%A1n_mac_Gabr%C3%A1in,Columba
2,%C3%81ed%C3%A1n_mac_Gabr%C3%A1in,D%C3%A1l_Riata
3,%C3%81ed%C3%A1n_mac_Gabr%C3%A1in,Great_Britain
4,%C3%81ed%C3%A1n_mac_Gabr%C3%A1in,Ireland


In [41]:
categories.head()

Unnamed: 0,0,1
0,%C3%81ed%C3%A1n_mac_Gabr%C3%A1in,subject.History.British_History.British_Histor...
1,%C3%81ed%C3%A1n_mac_Gabr%C3%A1in,subject.People.Historical_figures
2,%C3%85land,subject.Countries
3,%C3%85land,subject.Geography.European_Geography.European_...
4,%C3%89douard_Manet,subject.People.Artists


#### Connecting to the DBMS
If there is any authentification error after you run any queries, then your username and password are wrong

In [42]:
# Seen from :server status
uri = "bolt://localhost:7687"

# default user for graph database is neo4j
# auth = ("neo4j", "password")
auth = ("neo4j","abc")

driver = GraphDatabase.driver(uri = "bolt://localhost:7687", auth = auth)
driver.verify_connectivity()

  driver.verify_connectivity()


'Neo4j/4.4.7'

#### Adding nodes and relationships to graph

Creating nodes (without text as metadata):

In [43]:
# def create_nodes(tx, id, text) -> None:
#     """
#     parameters of create_nodes are metadata for nodes
#     """
#     query = """
#             MERGE (p:Page {id: $id, text: $text})
#             """
#     tx.run(query, id = id, text = text)

In [44]:
def create_nodes(tx, id) -> None:
    """
    parameters of create_nodes are metadata for nodes
    """
    query = """
            MERGE (p:Page {id: $id})
            """
    tx.run(query, id = id)

In [45]:
for id in members[0]:
    driver.session().write_transaction(create_nodes, id)

#Takes ~36 seconds to run

Adding categories as labels:

In [46]:
def add_label(tx, id, new_label):
    query = """
            MATCH (n {{ id: "{id_0}" }})
	    SET n : {new}
	    RETURN n
            """.format(id_0 = id,new = new_label)
    tx.run(query, id = id)

In [47]:
# for id in members[0]:
#     category_string = ((categories[categories[0] == id])[1].tolist())[0]
#     str = category_string.split(".")[1:]
#     if (len(str) < 1):
#         print(str)
    # for category_string in category:
    #     # print(category_string)
    #     str = category_string.split(".")[1:][0]
    #     if (str == "Page"):
    #         print("g")
        # if (category_string.split(".")[1:][0]) == "Page":
        #     driver.session().write_transaction(add_label, id, (category_string.split(".")[1:][0]))
        # else:
        #     driver.session().write_transaction(add_label, id, (category_string.split(".")[1:][1]))

In [48]:
for id in members[0]:
    if (len((categories[categories[0] == id])) >= 1):
        category_string = ((categories[categories[0] == id])[1].tolist())[0]
        str = category_string.split(".")[1:][0]
        driver.session().write_transaction(add_label, id, str)
            
# Takes ~56 seconds to run

In [49]:
# for id in members[0]:
#     category = (categories[categories[0] == id])[1].tolist()
#     for category_string in category:
#         if (category_string.split(".")[1:][0]) == "Page":
#             driver.session().write_transaction(add_label, id, (category_string.split(".")[1:][0]))
#         else:
#             driver.session().write_transaction(add_label, id, (category_string.split(".")[1:][1]))
        #for cat in category_string.split(".")[1:]:
        #     print(cat)
        #     print(len(category_string.split(".")[1:]))
            #driver.session().write_transaction(add_label, id, cat)
        #print()

# Takes ~56 seconds to run

Building links between related pages:

In [50]:
def create_relationships(tx, id1, id2) -> None:
    """
    Args:
        id1 is the id of first node
        id2 is id of second 
        NOTE: SRC-->DEST
    """
    query = """
            MATCH (p:Page {id: $id1})
            MATCH (n:Page {id: $id2})
            MERGE (p)-[l:LINKED]->(n)
            """
    tx.run(query, id1 = id1, id2 = id2)

# We use merge rather than create as merge will not create duplicates

In [51]:
for i in range(len(interactions)):
    driver.session().write_transaction(create_relationships, interactions.loc[i][0], interactions.loc[i][1])

# Takes ~6 minutes 39 seconds to run

In [52]:
driver.session().run("""
                    MATCH (p:Page)
                    REMOVE p:Page
                    """)

<neo4j.work.result.Result at 0x1f0e8b51c70>