In [1]:
! sudo cp /home/jovyan/work/jars/neo4j-connector-apache-spark_2.12-4.1.0_for_spark_3.jar /usr/local/spark/jars/neo4j-connector-apache-spark_2.12-4.1.0_for_spark_3.jar

In [4]:
import pyspark
from pyspark.sql import SparkSession
# NEO4J  CONFIGURATION
bolt_url = "bolt://neo4j:7687"
# Spark init
spark = SparkSession.builder \
    .master("local") \
    .appName('jupyter-pyspark') \
    .getOrCreate()
sc = spark.sparkContext
sc.setLogLevel("ERROR")
print(bolt_url)

bolt://neo4j:7687


In [7]:
cypher_query = '''
MATCH (s:Supplier)-[:SUPPLIES]->(p:Product) 
WHERE s.country = 'USA' AND p.discontinued=false 
RETURN s.contactName, s.companyName, s.country, p.productName, 
p.unitPrice, p.discontinued, p.unitsInStock
'''
df = spark.read.format("org.neo4j.spark.DataSource")\
  .option("url", bolt_url) \
  .option("query", cypher_query) \
  .load()
print(bolt_url)

bolt://neo4j:7687


In [9]:
df.printSchema()

root
 |-- s.contactName: string (nullable = true)
 |-- s.companyName: string (nullable = true)
 |-- s.country: string (nullable = true)
 |-- p.productName: string (nullable = true)
 |-- p.unitPrice: double (nullable = true)
 |-- p.discontinued: boolean (nullable = true)
 |-- p.unitsInStock: long (nullable = true)



In [10]:
df.toPandas()

                                                                                

Unnamed: 0,s.contactName,s.companyName,s.country,p.productName,p.unitPrice,p.discontinued,p.unitsInStock
0,Shelley Burke,New Orleans Cajun Delights,USA,Louisiana Hot Spiced Okra,17.0,False,4
1,Shelley Burke,New Orleans Cajun Delights,USA,Chef Anton's Cajun Seasoning,22.0,False,53
2,Shelley Burke,New Orleans Cajun Delights,USA,Louisiana Fiery Hot Pepper Sauce,21.05,False,76
3,Regina Murphy,Grandma Kelly's Homestead,USA,Grandma's Boysenberry Spread,25.0,False,120
4,Regina Murphy,Grandma Kelly's Homestead,USA,Northwoods Cranberry Sauce,40.0,False,6
5,Regina Murphy,Grandma Kelly's Homestead,USA,Uncle Bob's Organic Dried Pears,30.0,False,15
6,Cheryl Saylor,Bigfoot Breweries,USA,Sasquatch Ale,14.0,False,111
7,Cheryl Saylor,Bigfoot Breweries,USA,Laughing Lumberjack Lager,14.0,False,22
8,Cheryl Saylor,Bigfoot Breweries,USA,Steeleye Stout,18.0,False,20
9,Robb Merchant,New England Seafood Cannery,USA,Jack's New England Clam Chowder,9.65,False,85


In [20]:
file_name = "file:///home/jovyan/datasets/fudgemart/fudgemart-employees.json"
employees = spark.read.option("multiline", True).json(file_name)
employees.toPandas()
print(file_name)

file:///home/jovyan/datasets/fudgemart/fudgemart-employees.json


In [21]:
employees.printSchema()

root
 |-- employee_birthdate: string (nullable = true)
 |-- employee_department: string (nullable = true)
 |-- employee_firstname: string (nullable = true)
 |-- employee_fulltime: string (nullable = true)
 |-- employee_hiredate: string (nullable = true)
 |-- employee_hourlywage: double (nullable = true)
 |-- employee_id: long (nullable = true)
 |-- employee_jobtitle: string (nullable = true)
 |-- employee_lastname: string (nullable = true)
 |-- employee_ssn: string (nullable = true)
 |-- employee_supervisor_id: long (nullable = true)
 |-- employee_termdate: string (nullable = true)



In [25]:
from pyspark.sql.functions import col, concat, lit
employees = spark.read.option("multiline", True).json(file_name) \
    .withColumn("employee_name",concat(col("Employee_firstname"),lit(" "), col("employee_lastname")))
    
tmp = employees.select("employee_id","employee_name", "employee_jobtitle", "employee_department")
print(file_name)
tmp.toPandas()

file:///home/jovyan/datasets/fudgemart/fudgemart-employees.json


Unnamed: 0,employee_id,employee_name,employee_jobtitle,employee_department
0,1,Arial Photo,Sales Associate,Electronics
1,2,Sal Ladd,Sales Associate,Electronics
2,3,Dustin Dawind,Sales Associate,Hardware
3,4,Sandi Shores,Sales Associate,Hardware
4,5,Isabelle Gunnering,Department Manager,Electronics
5,6,Lee Hvmeehom,Department Manager,Hardware
6,7,Allan Wrench,Sales Associate,Housewares
7,8,Ally Gator,Sales Associate,Sporting Goods
8,9,Alma Frienzergon,Sales Associate,Housewares
9,10,Artie Choke,Sales Associate,Hardware


In [26]:
cypher_merge = '''
MERGE (e:Employee
    {
        name: event.employee_name,
        dept: event.employee_department,
        jobtitle: event.employee_jobtitle,
        id: event.employee_id
    })
'''

tmp.write.format("org.neo4j.spark.DataSource").mode("Overwrite") \
    .option("url", bolt_url) \
    .option("query", cypher_merge) \
    .save()
print(cypher_merge)


MERGE (e:Employee
    {
        name: event.employee_name,
        dept: event.employee_department,
        jobtitle: event.employee_jobtitle,
        id: event.employee_id
    })



In [27]:
cypher_query = '''
MATCH (e:Employee)
RETURN e.name, e.dept, e.jobtitle, e.id;
'''
df = spark.read.format("org.neo4j.spark.DataSource") \
    .option("url", bolt_url) \
    .option("query", cypher_query) \
    .load()

df.show()
print(bolt_url)

+------------------+--------------+------------------+----+
|            e.name|        e.dept|        e.jobtitle|e.id|
+------------------+--------------+------------------+----+
|       Arial Photo|   Electronics|   Sales Associate|   1|
|          Sal Ladd|   Electronics|   Sales Associate|   2|
|     Dustin Dawind|      Hardware|   Sales Associate|   3|
|      Sandi Shores|      Hardware|   Sales Associate|   4|
|Isabelle Gunnering|   Electronics|Department Manager|   5|
|      Lee Hvmeehom|      Hardware|Department Manager|   6|
|      Allan Wrench|    Housewares|   Sales Associate|   7|
|        Ally Gator|Sporting Goods|   Sales Associate|   8|
|  Alma Frienzergon|    Housewares|   Sales Associate|   9|
|       Artie Choke|      Hardware|   Sales Associate|  10|
|       Bette Alott|Sporting Goods|   Sales Associate|  11|
|      Bill Melator|Sporting Goods|   Sales Associate|  12|
|       Bob Enweave|Sporting Goods|   Sales Associate|  13|
|   Chris P. Nugget|   Electronics|   Sa

In [32]:
cipher_sql = '''
MATCH(e:Employee)
MATCH(s:Employee)
WHERE e.id = event.employee_id AND s.id = event.employee_supervisor_id
MERGE (s)-[:SUPERVISES]->(e)
'''

sup_employee = employees.select("employee_supervisor_id", "employee_id").orderBy("employee_supervisor_id")
sup_employee.toPandas()

sup_employee.write.format("org.neo4j.spark.DataSource") \
    .mode("Overwrite") \
    .option("url", bolt_url) \
    .option("query", cipher_sql) \
    .save()
print(cipher_sql)


MATCH(e:Employee)
MATCH(s:Employee)
WHERE e.id = event.employee_id AND s.id = event.employee_supervisor_id
MERGE (s)-[:SUPERVISES]->(e)

