## Initial Setup

In [1]:
# load dependencies

import os
import time

import pandas as pd
from neo4j import GraphDatabase

In [2]:
# load data

df_NODE = pd.read_csv('./data/node.csv')
df_RELATION = pd.read_csv('./data/relation.csv')

In [3]:
# neo4j connection

# connection class setting
class Neo4jConnection:
    """Small wrapper around a Neo4j driver that runs each query in its own session.

    The driver is created in ``__init__``. If creation fails, the error is
    printed and the instance is left without a driver; ``query`` then fails
    its ``assert``.

    The object can be used as a context manager so the driver is always
    closed::

        with Neo4jConnection(uri, user, pwd) as con:
            con.query("RETURN 1")
    """

    def __init__(self, uri, user, pwd):
        """Store the connection settings and try to create the driver.

        Parameters
        ----------
        uri : str
            Connection URI handed to ``GraphDatabase.driver``.
        user, pwd : str
            Credentials passed as the ``auth`` tuple.
        """
        self.__uri = uri
        self.__user = user
        self.__pwd = pwd
        self.__driver = None
        try:
            self.__driver = GraphDatabase.driver(self.__uri, auth=(self.__user, self.__pwd))
        except Exception as e:
            # Best effort: report the problem and leave the driver unset.
            print("Failed to create the driver:", e)

    def __enter__(self):
        """Return the connection itself so it can be bound with ``as``."""
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        """Close the driver on leaving the ``with`` block; never suppresses exceptions."""
        self.close()
        return False

    def close(self):
        """Close the underlying driver if one was created."""
        if self.__driver is not None:
            self.__driver.close()

    def query(self, query, parameters=None, db=None):
        """Run ``query`` in a fresh session and return all records as a list.

        Parameters
        ----------
        query : str
            Cypher statement to execute.
        parameters : dict, optional
            Query parameters forwarded to ``session.run``.
        db : str, optional
            Database to open the session on. When ``None`` the session is
            opened without a ``database`` argument.

        Returns
        -------
        list or None
            The records produced by the query, or ``None`` when opening the
            session or running the query raised (the error is printed).
        """
        assert self.__driver is not None, "Driver not initialized!"
        response = None
        # Only pass `database` when the caller asked for a specific one.
        session_kwargs = {} if db is None else {"database": db}
        try:
            # The `with` block closes the session on success and on failure.
            with self.__driver.session(**session_kwargs) as session:
                response = list(session.run(query, parameters))
        except Exception as e:
            # Deliberate best effort: callers detect failure through `None`.
            print("Query failed:", e)
        return response

# open the connection
# Settings are read from the environment when present. The fallbacks are the
# original local-development values, so an unconfigured setup behaves as before.
# Set NEO4J_PASSWORD outside the notebook for any shared or non-local server.
con = Neo4jConnection(
    os.environ.get("NEO4J_URI", "bolt://neo4j:7687"),
    user=os.environ.get("NEO4J_USER", "neo4j"),
    pwd=os.environ.get("NEO4J_PASSWORD", "1234"),
)

## Database Prep

In [4]:
# Create (or recreate) the `mlgit` database, then start it.
####################################################################################
# WARNING: `CREATE OR REPLACE` replaces any existing `mlgit` database, so re-running
# this cell deletes data that graph applications saved there automatically
# (for example `charts`).
####################################################################################

con.query('CREATE OR REPLACE DATABASE mlgit')
con.query('START DATABASE mlgit')

[]

In [5]:
# constraint setting
# Require `id` to be unique across :Repository nodes in the `mlgit` database.
# `IF NOT EXISTS` is part of the statement, so the cell can be re-run.
# NOTE(review): `ON ... ASSERT` looks like the Neo4j 4.x constraint syntax; newer
# servers may expect `FOR ... REQUIRE` — confirm against the server version in use.

con.query("CREATE CONSTRAINT repository IF NOT EXISTS ON (r:Repository) ASSERT r.id IS UNIQUE", db='mlgit')

[]

## Define functions for loading data

In [6]:
# insert data function
def insert_data(query, rows, db, batch_size = 1000, connection = None):
    """Send ``rows`` to Neo4j in batches, running ``query`` once per batch.

    Parameters
    ----------
    query : str
        Cypher statement that reads its input from the ``$rows`` parameter
        and returns a record with a ``total`` field.
    rows : pandas.DataFrame
        Data to send. Each batch is converted with ``to_dict('records')``.
    db : str or None
        Database name forwarded to the connection's ``query`` method.
    batch_size : int, default 1000
        Maximum number of rows per query. Must be positive.
    connection : object, optional
        Object exposing ``query(query, parameters=..., db=...)``. Defaults to
        the notebook-level ``con``.

    Returns
    -------
    dict or None
        ``{"total", "batches", "time"}`` as of the last batch, or ``None``
        when ``rows`` is empty. The running summary is printed after each
        batch.

    Raises
    ------
    ValueError
        If ``batch_size`` is not positive (it would otherwise never advance).
    RuntimeError
        If a batch returns ``None`` or no records, i.e. the query failed.
    """
    if batch_size <= 0:
        raise ValueError("batch_size must be a positive integer, got {!r}".format(batch_size))
    if connection is None:
        # Fall back to the connection created earlier in the notebook.
        connection = con

    total = 0
    result = None
    start = time.time()

    for batch, offset in enumerate(range(0, len(rows), batch_size), start=1):
        # .iloc makes the slice positional regardless of the frame's index.
        chunk = rows.iloc[offset:offset + batch_size]
        res = connection.query(query,
                               parameters = {'rows': chunk.to_dict('records')},
                               db = db)
        if not res:
            # Stop with a clear message instead of failing on res[0] below.
            raise RuntimeError(
                "Batch {} (rows {}-{}) returned no result: the query failed or "
                "returned no records. Total so far: {}.".format(
                    batch, offset, offset + len(chunk) - 1, total))
        total += res[0]['total']
        result = {"total":total, "batches":batch, "time":time.time()-start}
        print(result)

    return result

In [7]:
# add repository node
def add_repo(rows, db):
    """Upsert one ``:Repository`` node per row of ``rows``.

    Nodes are matched on ``id`` alone and the remaining properties are then
    set, so re-running the load updates existing nodes instead of trying to
    create a second node with the same ``id``.

    Column to property mapping: ``NodeID`` -> ``id``, ``Repository`` ->
    ``name``, ``OwnerName`` -> ``owner``, ``About_Topics`` -> ``about``,
    ``StarsNorm`` -> ``star``, ``ForkNorm`` -> ``fork``, ``WatcherNorm`` ->
    ``watcher``.

    Parameters
    ----------
    rows : pandas.DataFrame
        Rows providing the columns listed above.
    db : str
        Target database name.

    Returns
    -------
    dict or None
        Whatever ``insert_data`` returns for the load.
    """
    query = '''
        UNWIND $rows AS row
        MERGE (r:Repository {id: row.NodeID})
        SET r.name = row.Repository, r.owner = row.OwnerName, r.about = row.About_Topics,
            r.star = row.StarsNorm, r.fork = row.ForkNorm, r.watcher = row.WatcherNorm
        RETURN count(*) as total
    '''
    return insert_data(query, rows, db, batch_size = 1000)

# add repository connection
def add_connect_similar(rows, db):
    """Upsert an undirected ``SIMILAR`` relationship for each row of ``rows``.

    Both endpoints are looked up by ``id`` (``Node1_ID`` and ``Node2_ID``);
    rows whose endpoints are not found contribute nothing. The relationship
    is matched on the node pair alone and ``weight`` is then set from
    ``Score``, so reloading changed scores updates the existing relationship
    instead of adding a parallel one.

    Parameters
    ----------
    rows : pandas.DataFrame
        Rows providing ``Node1_ID``, ``Node2_ID`` and ``Score``.
    db : str
        Target database name.

    Returns
    -------
    dict or None
        Whatever ``insert_data`` returns for the load.
    """
    query = '''
        UNWIND $rows AS row
        MATCH (repo1:Repository {id: row.Node1_ID})
        MATCH (repo2:Repository {id: row.Node2_ID})
        MERGE (repo1)-[s:SIMILAR]-(repo2)
        SET s.weight = row.Score
        RETURN count(*) as total
    '''
    return insert_data(query, rows, db, batch_size = 1000)

## Upload data to graph database

In [8]:
# Load the node table into the `mlgit` database; the returned summary is the cell output.
add_repo(rows=df_NODE, db='mlgit')

{'total': 989, 'batches': 1, 'time': 0.4336049556732178}


{'total': 989, 'batches': 1, 'time': 0.4336049556732178}

In [9]:
# Load the relation table into the `mlgit` database; the returned summary is the cell output.
add_connect_similar(rows=df_RELATION, db='mlgit')

{'total': 1000, 'batches': 1, 'time': 0.2432236671447754}
{'total': 1208, 'batches': 2, 'time': 0.27915000915527344}


{'total': 1208, 'batches': 2, 'time': 0.27915000915527344}