In [None]:
import $ivy.`org.apache.spark::spark-sql:3.5.5`

In [None]:
// neo4j-spark-connector
/*
import coursierapi._
interp.repositories() ++= Seq(MavenRepository.of("https://repos.spark-packages.org/"))
interp.load.ivy(("neo4j" % "neo4j-spark-connector" % "5.3.3-s_2.12"))
*/

## Spark Session with a Neo4j connection

In [None]:
import org.apache.spark.sql._

In [None]:
// Build (or reuse) a local SparkSession carrying the session-level Neo4j
// connection settings; the DataSource reads/writes in this notebook pass no
// connection options of their own and therefore depend on these.
// The endpoint and credentials can be overridden with the NEO4J_URL,
// NEO4J_USERNAME and NEO4J_PASSWORD environment variables. The fallbacks are the
// values that used to be hard-coded, so an unconfigured local run is unchanged
// while real credentials no longer have to be stored in the notebook.
// NOTE(review): `spark.jars.packages` is normally resolved by spark-submit —
// confirm it actually loads the connector when the session is created in-kernel.
val spark = SparkSession
  .builder()
  .master("local[*]")
  .appName("SparkNeo4j")
  .config("spark.log.level", "WARN")
  .config("spark.jars.packages", "neo4j:neo4j-spark-connector:5.3.3-s_2.12")
  .config("neo4j.url", sys.env.getOrElse("NEO4J_URL", "neo4j://localhost:7687"))
  .config("neo4j.authentication.type", "basic")
  .config("neo4j.authentication.basic.username", sys.env.getOrElse("NEO4J_USERNAME", "neo4j"))
  .config("neo4j.authentication.basic.password", sys.env.getOrElse("NEO4J_PASSWORD", "password"))
  .getOrCreate()

import spark.implicits._

## Read data from Neo4j into Spark

### Movies

In [None]:
// Read through the Neo4j data source, selecting by the :Movie label.
val movies =
  spark.read
    .format("org.neo4j.spark.DataSource")
    .option("labels", ":Movie")
    .load()

In [None]:
// Print the schema of the movies DataFrame.
movies.printSchema()

In [None]:
// Preview up to 10 rows without truncating long cell values.
movies.show(numRows = 10, truncate = false)

### Person

In [None]:
// Read through the Neo4j data source, selecting by the :Person label.
val person =
  spark.read
    .format("org.neo4j.spark.DataSource")
    .option("labels", ":Person")
    .load()

In [None]:
// Print the schema of the person DataFrame.
person.printSchema()

In [None]:
// Preview up to 10 rows without truncating long cell values.
person.show(numRows = 10, truncate = false)

### ACTED_IN

In [None]:
// Read ACTED_IN relationships whose source is labelled :Person and whose
// target is labelled :Movie.
val actedin =
  spark.read
    .format("org.neo4j.spark.DataSource")
    .options(
      Map(
        "relationship" -> "ACTED_IN",
        "relationship.source.labels" -> ":Person",
        "relationship.target.labels" -> ":Movie"
      )
    )
    .load()

In [None]:
// Print the schema of the ACTED_IN relationship DataFrame.
actedin.printSchema()

In [None]:
// Preview up to 10 rows without truncating long cell values.
actedin.show(numRows = 10, truncate = false)

### DataFrame with nodes as map

In [None]:
// Same ACTED_IN read as above, but with `relationship.nodes.map` enabled
// (the surrounding heading describes this as "nodes as map").
val actedinMap =
  spark.read
    .format("org.neo4j.spark.DataSource")
    .options(
      Map(
        "relationship.nodes.map" -> "true",
        "relationship" -> "ACTED_IN",
        "relationship.source.labels" -> ":Person",
        "relationship.target.labels" -> ":Movie"
      )
    )
    .load()

In [None]:
// Print the schema of the map-style ACTED_IN DataFrame.
actedinMap.printSchema()

In [None]:
// Preview up to 10 rows without truncating long cell values.
actedinMap.show(numRows = 10, truncate = false)

### DIRECTED

In [None]:
// Read DIRECTED relationships whose source is labelled :Person and whose
// target is labelled :Movie.
val directed =
  spark.read
    .format("org.neo4j.spark.DataSource")
    .options(
      Map(
        "relationship" -> "DIRECTED",
        "relationship.source.labels" -> ":Person",
        "relationship.target.labels" -> ":Movie"
      )
    )
    .load()

In [None]:
// Print the schema of the DIRECTED relationship DataFrame.
directed.printSchema()

In [None]:
// Preview up to 10 rows without truncating long cell values.
directed.show(numRows = 10, truncate = false)

### Read arbitrary data via Cypher query

In [None]:
// Run an arbitrary Cypher query through the data source. The query returns
// two columns, Recommended and Strength, ordered by Strength descending.
val cypher =
  spark.read
    .format("org.neo4j.spark.DataSource")
    .option("query", """
                    // Extend Tom Hanks co-actors, to find co-co-actors who haven't worked with Tom Hanks
                    MATCH (tom:Person {name:"Tom Hanks"})-[:ACTED_IN]->(m)<-[:ACTED_IN]-(coActors),
                    (coActors)-[:ACTED_IN]->(m2)<-[:ACTED_IN]-(cocoActors)
                    WHERE NOT (tom)-[:ACTED_IN]->()<-[:ACTED_IN]-(cocoActors)
                        AND tom <> cocoActors
                    RETURN cocoActors.name AS Recommended, count(*) AS Strength
                    ORDER BY Strength DESC
                    """)
    .load()

In [None]:
// Print the schema of the query result.
cypher.printSchema()

In [None]:
// Preview up to 10 rows without truncating long cell values.
cypher.show(numRows = 10, truncate = false)

### Return all the actors that have also directed a movie.

In [None]:
// People who have both acted in and directed at least one movie, with the
// titles of each group collected into a list.
// The two relationship MATCH clauses produce one row per (acted, directed)
// pair for a person, so a plain collect() would repeat every acted title once
// per directed movie and vice versa; DISTINCT collapses those repeats.
val actorsDirectors = spark.read
                        .format("org.neo4j.spark.DataSource")
                        .option("query", """
                                MATCH (p:Person)
                                MATCH (p)-[:ACTED_IN]->(m:Movie)
                                MATCH (p)-[:DIRECTED]->(m1:Movie)
                                RETURN p.name AS name, collect(DISTINCT m.title) AS acted_in, collect(DISTINCT m1.title) AS directed
                                """)
                        .load()

In [None]:
// Print the schema of the actors-who-direct result.
actorsDirectors.printSchema()

In [None]:
// Preview up to 10 rows without truncating long cell values.
actorsDirectors.show(numRows = 10, truncate = false)

## Write data from Spark to Neo4j

In [None]:
// Load the products CSV with inferred column types. No header option is set,
// so the default positional column names (_c0, _c1, _c2) are renamed here.
val products =
  spark.read
    .option("inferSchema", true)
    .csv("desktop-csv-import/products.csv")
    .withColumnsRenamed(
      Map(
        "_c0" -> "id",
        "_c1" -> "name",
        "_c2" -> "price"
      )
    )

In [None]:
// Print the schema of the products DataFrame.
products.printSchema()

In [None]:
// Preview up to 10 rows without truncating long cell values.
products.show(numRows = 10, truncate = false)

In [None]:
// Number of product rows loaded from the CSV.
products.count()

In [None]:
// Load the orders CSV (with header row and inferred types) and keep three
// columns: orderID renamed to id, orderDate cast to a timestamp named date,
// and shipCountry unchanged.
val orders =
  spark.read
    .options(Map("inferSchema" -> "true", "header" -> "true"))
    .csv("desktop-csv-import/orders.csv")
    .selectExpr(
      "orderID AS id",
      "CAST(orderDate AS TIMESTAMP) AS date",
      "shipCountry"
    )

In [None]:
// Print the schema of the orders DataFrame.
orders.printSchema()

In [None]:
// Preview up to 10 rows without truncating long cell values.
orders.show(numRows = 10, truncate = false)

In [None]:
// Number of order rows loaded from the CSV.
orders.count()

### Write nodes via label option

In [None]:
// Write the products to Neo4j with the :Product label in append mode.
// NOTE(review): append mode with no node keys presumably creates new nodes on
// every run — confirm before re-executing this cell against the same database.
products.write
  .format("org.neo4j.spark.DataSource")
  .mode(SaveMode.Append)
  .option("labels", ":Product")
  .save()

In [None]:
// Write the orders to Neo4j with the :Order label in overwrite mode, keyed on
// `id`, with the NODE_CONSTRAINTS schema optimization requested.
orders.write
  .format("org.neo4j.spark.DataSource")
  .mode(SaveMode.Overwrite)
  .options(
    Map(
      "labels" -> ":Order",
      "schema.optimization.type" -> "NODE_CONSTRAINTS",
      // node.keys is required here: it names the field the constraint is built on
      "node.keys" -> "id"
    )
  )
  .save()

### Write relationships via relationship option

In [None]:
// Load the order-details CSV with a header row and inferred column types.
val orderDetails =
  spark.read
    .options(Map("inferSchema" -> "true", "header" -> "true"))
    .csv("desktop-csv-import/order-details.csv")

In [None]:
// Print the schema of the order-details DataFrame.
orderDetails.printSchema()

In [None]:
// Preview up to 10 rows without truncating long cell values.
orderDetails.show(numRows = 10, truncate = false)

In [None]:
// Number of order-detail rows loaded from the CSV.
orderDetails.count()

In [None]:
// Write one CONTAINS relationship per order-detail row, from a :Product node
// to an :Order node. Both endpoints use save mode "Match"; the key mappings
// pair DataFrame columns with node properties (productID:id, orderID:id), and
// the properties mapping is quantity:quantityOrdered.
orderDetails.write
  .format("org.neo4j.spark.DataSource")
  .mode(SaveMode.Overwrite)
  .options(
    Map(
      "relationship" -> "CONTAINS",
      "relationship.save.strategy" -> "keys",
      "relationship.source.labels" -> ":Product",
      "relationship.source.save.mode" -> "Match",
      "relationship.source.node.keys" -> "productID:id",
      "relationship.target.labels" -> ":Order",
      "relationship.target.save.mode" -> "Match",
      "relationship.target.node.keys" -> "orderID:id",
      "relationship.properties" -> "quantity:quantityOrdered"
    )
  )
  .save()

### Write custom graphs via Cypher Query

In [None]:
// Sample rows for the custom-graph write below. Each tuple holds:
// actor name, order id, product ids, quantities, and the order date as a
// "yyyy-MM-dd HH:mm:ss" string. Product ids and quantities are the same length
// in both rows, which the write query relies on when it indexes them in step.
val actorOrders: Seq[(String, Int, Array[Int], Array[Int], String)] = Seq(
  (
    "Cuba Gooding Jr.",
    1,
    Array(11, 42, 72),
    Array(1, 2, 3),
    "2022-06-07 00:00:00"
  ),
  (
    "Tom Hanks",
    2,
    Array(24, 55, 75),
    Array(3, 2, 1),
    "2022-06-06 00:00:00"
  )
)

In [None]:
// Turn the sample tuples into a DataFrame and give the five positional tuple
// columns (_1 … _5) descriptive names, in order.
val actorOrdersDF =
  spark
    .createDataFrame(actorOrders)
    .toDF("actor_name", "order_id", "products", "quantities", "order_date")

In [None]:
// Print the schema of the actor-orders DataFrame.
actorOrdersDF.printSchema()

In [None]:
// Preview up to 10 rows without truncating long cell values.
actorOrdersDF.show(numRows = 10, truncate = false)

In [None]:
// Write a custom graph with a Cypher query; the query refers to the current
// DataFrame row as `event`. For each row it matches the Person by name, merges
// an Order (id + date) and a CREATED relationship, then walks the products and
// quantities arrays by index to merge one CONTAINS relationship per product.
// Because the Person and Product lookups are MATCH clauses, a row or product
// with no matching node yields no writes for that part of the query.
actorOrdersDF.write
  .format("org.neo4j.spark.DataSource")
  .mode(SaveMode.Overwrite)
  .option("query", """
                     MATCH (person:Person {name: event.actor_name})
                     MERGE (order:Order {id: event.order_id, date: datetime(replace(event.order_date, ' ', 'T'))})
                     MERGE (person)-[:CREATED]->(order)
                     WITH event, order
                     UNWIND range(0, size(event.products) - 1) AS index
                     MATCH (product:Product {id: event.products[index]})
                     MERGE (product)-[:CONTAINS{quantityOrdered: event.quantities[index]}]->(order)
                    """)
  .save()