In [3]:
import networkx as nx
import json
from py2neo import Graph, Node, Relationship
from pyspark.sql import SparkSession
from pyspark.sql.functions import udf, lit
from pyspark.sql.types import DoubleType

# Load the data from the provided JSON file
data = []
with open('train.json', 'r') as f:
    for line in f:
        data.append(json.loads(line))

# Create the citation graph in Neo4j
graph = Graph("bolt://localhost:7687", auth=("neo4j", "password"))
for row in data:
    citing_paper = Node("Paper", id=row['paper'])
    graph.merge(citing_paper, "Paper", "id")
    for cited_paper in row['reference']:
        cited_paper_node = Node("Paper", id=cited_paper)
        graph.merge(cited_paper_node, "Paper", "id")
        relationship = Relationship(citing_paper, "CITES", cited_paper_node)
        graph.merge(relationship)

# Run SimRank algorithm using Apache Spark
spark = SparkSession.builder.appName("SimRank").getOrCreate()

# Convert the Neo4j graph to a Spark DataFrame
nodes = [record['id'] for record in graph.run("MATCH (p:Paper) RETURN p.id")]
node_df = spark.createDataFrame(nodes, "string").toDF("node")
edges = [(record['source']['id'], record['target']['id']) 
         for record in graph.run("MATCH (p1:Paper)-[r:CITES]->(p2:Paper) RETURN {source: p1, target: p2}")]
edge_df = spark.createDataFrame(edges, ("source", "target")).cache()

# Define the SimRank function
@udf(DoubleType())
def simrank(source, target, c=0.9, max_iter=100, tolerance=1e-4):
    if source == target:
        return 1.0
    
    in_neighbors_u = [record['source']['id'] 
                      for record in graph.run(f"MATCH (p1:Paper {{id: '{source}'}})<-[r:CITES]-(p2:Paper) RETURN p2.id")]
    in_neighbors_v = [record['source']['id'] 
                      for record in graph.run(f"MATCH (p1:Paper {{id: '{target}'}})<-[r:CITES]-(p2:Paper) RETURN p2.id")]
    scale = c / (len(in_neighbors_u) * len(in_neighbors_v)) if in_neighbors_u and in_neighbors_v else 0
    sim = 0.0
    for u in in_neighbors_u:
        for v in in_neighbors_v:
            sim += simrank(u, v, c, max_iter, tolerance)
    return scale * sim

# Run SimRank with different importance factors
for c in [0.7, 0.8, 0.9]:
    sim_df = edge_df.crossJoin(node_df.alias("target")) \
        .withColumn("similarity", simrank("source", "target", lit(c))) \
        .select("source", "target", "similarity")
    
    # Get the SimRank similarity for the query nodes
    query_nodes = ["2982615777", "1556418098"]
    for node in query_nodes:
        print(f"SimRank similarity for node {node} (c={c}):")
        sim_df.filter(sim_df.source == node).orderBy("similarity", ascending=False).show()

# Cleanup
spark.stop()


ConnectionUnavailable: Connection has been closed

In [1]:
from neo4j import GraphDatabase
import json

# Load JSON data
with open('train.json', 'r') as f:
    data = [json.loads(line) for line in f]

# Initialize Neo4j driver
driver = GraphDatabase.driver("bolt://localhost:7689", auth=("neo4j", "paras2003"))

def create_graph(tx, paper_id, references):
    # Create node for citing paper
    tx.run("MERGE (p:Paper {id: $paper_id})", paper_id=paper_id)
    
    # Create nodes for referenced papers and the citation relationships
    for ref in references:
        tx.run("""
        MERGE (cited:Paper {id: $ref})
        MERGE (citing)-[:CITES]->(cited)
        """, citing=paper_id, ref=ref)

# Create the citation graph
with driver.session() as session:
    for entry in data:
        paper_id = entry["paper"]
        references = entry["reference"]
        session.write_transaction(create_graph, paper_id, references)


  session.write_transaction(create_graph, paper_id, references)


KeyboardInterrupt: 

In [8]:
spark.stop()

ConnectionRefusedError: [Errno 61] Connection refused

In [9]:
from pyspark.sql import SparkSession
from graphframes import GraphFrame


# Initialize Spark session
spark = SparkSession.builder \
    .appName("SimRank") \
    .master("local[*]") \
    .config("spark.jars.packages", "graphframes:graphframes:0.8.1-spark3.0-s_2.12") \
    .getOrCreate()



# Load data from Neo4j
nodes_df = spark.read.format("neo4j").option("url", "bolt://localhost:7689").option("query", "MATCH (p:Paper) RETURN id(p) as id").load()
edges_df = spark.read.format("neo4j").option("url", "bolt://localhost:7689").option("query", "MATCH (a:Paper)-[:CITES]->(b:Paper) RETURN id(a) as src, id(b) as dst").load()

# Construct GraphFrame
graph = GraphFrame(nodes_df, edges_df)


ConnectionRefusedError: [Errno 61] Connection refused

In [None]:
from itertools import product
import numpy as np

def simrank(graph, query_node, C=0.8, max_iter=100, tolerance=0.0001):
    nodes = list(graph.vertices.select("id").toPandas()["id"])
    sim = {node: {other: (1.0 if node == other else 0.0) for other in nodes} for node in nodes}
    
    for i in range(max_iter):
        prev_sim = {u: sim[u].copy() for u in sim}
        for u, v in product(nodes, repeat=2):
            if u != v:
                in_neighbors_u = graph.edges.filter(f"dst = {u}").select("src").toPandas()["src"].tolist()
                in_neighbors_v = graph.edges.filter(f"dst = {v}").select("src").toPandas()["src"].tolist()
                
                if in_neighbors_u and in_neighbors_v:
                    scale = C / (len(in_neighbors_u) * len(in_neighbors_v))
                    sim[u][v] = scale * sum(prev_sim[w][x] for w, x in product(in_neighbors_u, in_neighbors_v))

        # Check for convergence
        diff = sum(abs(sim[u][v] - prev_sim[u][v]) for u in nodes for v in nodes)
        if diff < tolerance:
            break
    
    return {v: sim[query_node][v] for v in nodes}

# Calculate and display SimRank scores for each C value
query_nodes = [2982615777, 1556418098]
results = {}

for C in [0.7, 0.8, 0.9]:
    results[C] = {}
    for query in query_nodes:
        results[C][query] = simrank(graph, query_node=query, C=C)


In [None]:
from neo4j import GraphDatabase
import numpy as np
from itertools import product
import pandas as pd

class SimRankCalculator:
    def __init__(self, neo4j_uri, neo4j_user, neo4j_password):
        # Initialize Neo4j connection
        self.driver = GraphDatabase.driver(neo4j_uri, auth=(neo4j_user, neo4j_password))

    def get_graph_data_from_neo4j(self):
        """Extract graph data from Neo4j"""
        with self.driver.session() as session:
            # Get all nodes
            nodes = session.run("MATCH (p:Paper) RETURN p.id AS id")
            nodes = [record["id"] for record in nodes]
            
            # Get all edges
            edges = session.run("""
                MATCH (p1:Paper)-[:CITES]->(p2:Paper)
                RETURN p1.id AS source, p2.id AS target
            """)
            edges = [(record["source"], record["target"]) for record in edges]
            
            return nodes, edges

    def create_adjacency_matrix(self, nodes, edges):
        """Create adjacency matrix from graph data"""
        node_to_idx = {node: idx for idx, node in enumerate(nodes)}
        n = len(nodes)
        adj_matrix = np.zeros((n, n))
        
        for source, target in edges:
            source_idx = node_to_idx[source]
            target_idx = node_to_idx[target]
            adj_matrix[source_idx][target_idx] = 1
            
        return adj_matrix, node_to_idx

    def simrank_calculate(self, adj_matrix, C, max_iterations=100, tolerance=1e-4):
        """Calculate SimRank similarities"""
        n = len(adj_matrix)
        sim_old = np.identity(n)
        
        for iteration in range(max_iterations):
            sim_new = np.zeros((n, n))
            
            # Compute similarities
            for i, j in product(range(n), range(n)):
                if i == j:
                    sim_new[i][j] = 1.0
                    continue
                
                # Get incoming neighbors
                i_in = np.where(adj_matrix[:, i] > 0)[0]
                j_in = np.where(adj_matrix[:, j] > 0)[0]
                
                if len(i_in) == 0 or len(j_in) == 0:
                    continue
                
                # Calculate similarity
                sum_sim = sum(sim_old[u][v] for u, v in product(i_in, j_in))
                sim_new[i][j] = (C / (len(i_in) * len(j_in))) * sum_sim
            
            # Check convergence
            if np.all(np.abs(sim_new - sim_old) < tolerance):
                print(f"Converged after {iteration + 1} iterations")
                return sim_new
            
            sim_old = sim_new.copy()
        
        print("Warning: Maximum iterations reached")
        return sim_old

    def get_similar_nodes(self, query_node, sim_matrix, nodes, node_to_idx, top_k=10):
        """Get top-k similar nodes for a query node"""
        if query_node not in node_to_idx:
            return []
        
        query_idx = node_to_idx[query_node]
        similarities = [(nodes[i], sim_matrix[query_idx][i]) 
                       for i in range(len(nodes))
                       if i != query_idx]
        
        # Sort by similarity score
        similarities.sort(key=lambda x: x[1], reverse=True)
        return similarities[:top_k]

    def analyze_and_report(self, query_nodes, C_values):
        """Run complete analysis and generate report"""
        try:
            # Get graph data
            print("Extracting graph data from Neo4j...")
            nodes, edges = self.get_graph_data_from_neo4j()
            print(f"Found {len(nodes)} nodes and {len(edges)} edges")
            
            # Create adjacency matrix
            print("Creating adjacency matrix...")
            adj_matrix, node_to_idx = self.create_adjacency_matrix(nodes, edges)
            
            # Run SimRank for different C values
            for C in C_values:
                print(f"\n=== SimRank Analysis with C = {C} ===")
                
                # Calculate SimRank
                sim_matrix = self.simrank_calculate(adj_matrix, C)
                
                # Get similar nodes for each query node
                for query_node in query_nodes:
                    print(f"\nQuery Node: {query_node}")
                    similar_nodes = self.get_similar_nodes(
                        query_node, sim_matrix, nodes, node_to_idx
                    )
                    
                    print("Top 10 Most Similar Papers:")
                    print("Paper ID\t\tSimilarity Score")
                    print("-" * 40)
                    for node_id, score in similar_nodes:
                        print(f"{node_id}\t{score:.4f}")
                    
                    # Calculate statistics
                    scores = [score for _, score in similar_nodes]
                    if scores:
                        print("\nSimilarity Statistics:")
                        print(f"Average Similarity: {np.mean(scores):.4f}")
                        print(f"Maximum Similarity: {np.max(scores):.4f}")
                        print(f"Minimum Similarity: {np.min(scores):.4f}")
        
        finally:
            self.close()

    def close(self):
        """Clean up resources"""
        self.driver.close()

def main():
    # Initialize calculator
    calculator = SimRankCalculator(
        "bolt://localhost:7689",  # Note: Changed port to default Neo4j port
        "neo4j",
        "paras2003"
    )
    
    # Define query nodes and C values
    query_nodes = ['2982615777', '1556418098']
    C_values = [0.7, 0.8, 0.9]
    
    # Run analysis
    calculator.analyze_and_report(query_nodes, C_values)

if __name__ == "__main__":
    main()

Extracting graph data from Neo4j...
Found 614941 nodes and 11287 edges
Creating adjacency matrix...

=== SimRank Analysis with C = 0.7 ===


In [None]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import explode, col
import numpy as np
from itertools import product
import pandas as pd
from neo4j import GraphDatabase

class CitationSimRank:
    def __init__(self, neo4j_uri, neo4j_user, neo4j_password):
        # Initialize Neo4j connection
        self.driver = GraphDatabase.driver(neo4j_uri, auth=(neo4j_user, neo4j_password))
        
        # Initialize Spark session
        self.spark = SparkSession.builder \
            .appName("CitationSimRank") \
            .getOrCreate()

    def create_graph_schema(self):
        """Create Neo4j constraints and indexes"""
        with self.driver.session() as session:
            # Create constraints for Paper nodes
            session.run("CREATE CONSTRAINT paper_id IF NOT EXISTS FOR (p:Paper) REQUIRE p.id IS UNIQUE")

    def load_citation_data(self, data_df):
        """
        Load citation data into Neo4j
        data_df: DataFrame with columns 'paper' and 'reference' (list)
        """
        with self.driver.session() as session:
            # Create all paper nodes first
            session.run("""
                UNWIND $papers AS paper
                MERGE (p:Paper {id: paper})
            """, papers=data_df['paper'].unique().tolist())
            
            # Create nodes for all referenced papers
            referenced_papers = set()
            for refs in data_df['reference']:
                if refs:  # Check if reference list is not empty
                    referenced_papers.update(refs)
            
            session.run("""
                UNWIND $papers AS paper
                MERGE (p:Paper {id: paper})
            """, papers=list(referenced_papers))
            
            # Create citation relationships
            for _, row in data_df.iterrows():
                if row['reference']:  # Only process non-empty reference lists
                    session.run("""
                        MATCH (citing:Paper {id: $citing_id})
                        UNWIND $cited_ids AS cited_id
                        MATCH (cited:Paper {id: cited_id})
                        MERGE (citing)-[:CITES]->(cited)
                    """, citing_id=row['paper'], cited_ids=row['reference'])

    def get_graph_data(self):
        """Extract graph data from Neo4j"""
        with self.driver.session() as session:
            # Get all nodes
            nodes = session.run("MATCH (p:Paper) RETURN p.id AS id")
            nodes = [record["id"] for record in nodes]
            
            # Get all edges (citations)
            edges = session.run("""
                MATCH (p1:Paper)-[:CITES]->(p2:Paper)
                RETURN p1.id AS source, p2.id AS target
            """)
            edges = [(record["source"], record["target"]) for record in edges]
            
            return nodes, edges

    def create_adjacency_matrix(self, nodes, edges):
        """Create adjacency matrix using Spark"""
        # Convert to Spark DataFrame
        edges_df = self.spark.createDataFrame(edges, ["source", "target"])
        
        # Create node mapping
        node_to_idx = {node: idx for idx, node in enumerate(nodes)}
        n = len(nodes)
        
        # Broadcast node mapping
        node_map_broadcast = self.spark.sparkContext.broadcast(node_to_idx)
        
        # Convert to matrix using Spark
        adj_matrix = np.zeros((n, n))
        
        # Collect edges and update matrix
        for row in edges_df.collect():
            source_idx = node_map_broadcast.value[row.source]
            target_idx = node_map_broadcast.value[row.target]
            adj_matrix[source_idx][target_idx] = 1
            
        return adj_matrix, node_to_idx

    def simrank_calculate(self, adj_matrix, C, max_iterations=100, tolerance=1e-4):
        """Calculate SimRank similarities using Spark"""
        n = len(adj_matrix)
        sim_old = np.identity(n)
        
        # Convert to Spark DataFrame for distributed computation
        for iteration in range(max_iterations):
            sim_new = np.zeros((n, n))
            
            # Compute similarities
            for i, j in product(range(n), range(n)):
                if i == j:
                    sim_new[i][j] = 1.0
                    continue
                
                # Get incoming neighbors
                i_in = np.where(adj_matrix[:, i] > 0)[0]
                j_in = np.where(adj_matrix[:, j] > 0)[0]
                
                if len(i_in) == 0 or len(j_in) == 0:
                    continue
                
                # Calculate similarity
                sum_sim = sum(sim_old[u][v] for u, v in product(i_in, j_in))
                sim_new[i][j] = (C / (len(i_in) * len(j_in))) * sum_sim
            
            # Check convergence
            if np.all(np.abs(sim_new - sim_old) < tolerance):
                print(f"Converged after {iteration + 1} iterations")
                return sim_new
            
            sim_old = sim_new.copy()
        
        print("Warning: Maximum iterations reached")
        return sim_old

    def get_similar_papers(self, query_node, sim_matrix, nodes, node_to_idx, top_k=10):
        """Get top-k similar papers for a query paper"""
        if query_node not in node_to_idx:
            return []
        
        query_idx = node_to_idx[query_node]
        similarities = [(nodes[i], sim_matrix[query_idx][i]) 
                       for i in range(len(nodes))
                       if i != query_idx]
        
        # Sort by similarity score
        similarities.sort(key=lambda x: x[1], reverse=True)
        return similarities[:top_k]

    def analyze_citations(self, query_nodes, C_values, data_df):
        """Run complete citation analysis"""
        try:
            # Create graph schema
            print("Creating Neo4j schema...")
            self.create_graph_schema()
            
            # Load data into Neo4j
            print("Loading citation data into Neo4j...")
            self.load_citation_data(data_df)
            
            # Get graph data
            print("Extracting graph data...")
            nodes, edges = self.get_graph_data()
            print(f"Found {len(nodes)} papers and {len(edges)} citations")
            
            # Create adjacency matrix
            print("Creating adjacency matrix...")
            adj_matrix, node_to_idx = self.create_adjacency_matrix(nodes, edges)
            
            # Run SimRank for different C values
            results = {}
            for C in C_values:
                print(f"\n=== SimRank Analysis with C = {C} ===")
                
                # Calculate SimRank
                sim_matrix = self.simrank_calculate(adj_matrix, C)
                
                # Get similar papers for each query node
                C_results = {}
                for query_node in query_nodes:
                    print(f"\nQuery Paper ID: {query_node}")
                    similar_papers = self.get_similar_papers(
                        query_node, sim_matrix, nodes, node_to_idx
                    )
                    
                    print("Top 10 Most Similar Papers:")
                    print("Paper ID\t\tSimilarity Score")
                    print("-" * 40)
                    for paper_id, score in similar_papers:
                        print(f"{paper_id}\t{score:.4f}")
                    
                    # Calculate statistics
                    scores = [score for _, score in similar_papers]
                    if scores:
                        print("\nSimilarity Statistics:")
                        print(f"Average Similarity: {np.mean(scores):.4f}")
                        print(f"Maximum Similarity: {np.max(scores):.4f}")
                        print(f"Minimum Similarity: {np.min(scores):.4f}")
                    
                    C_results[query_node] = similar_papers
                
                results[C] = C_results
            
            return results
        
        finally:
            self.close()

    def close(self):
        """Clean up resources"""
        self.driver.close()
        self.spark.stop()

def main():
    # Sample usage
    neo4j_config = {
        "uri": "bolt://localhost:7687",
        "user": "neo4j",
        "password": "your_password"
    }
    
    # Initialize calculator
    calculator = CitationSimRank(**neo4j_config)
    
    # Define query parameters
    query_nodes = ['2982615777', '1556418098']
    C_values = [0.7, 0.8, 0.9]
    
    # Load your citation data into a pandas DataFrame
    # Example format:
    # data_df = pd.DataFrame({
    #     'paper': ['paper1', 'paper2'],
    #     'reference': [['ref1', 'ref2'], ['ref3']]
    # })
    
    # Run analysis
    results = calculator.analyze_citations(query_nodes, C_values, data_df)

if __name__ == "__main__":
    main()

In [12]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql.types import *
from neo4j import GraphDatabase
import numpy as np
from itertools import product

class SparkSimRankCalculator:
    def __init__(self, neo4j_uri, neo4j_user, neo4j_password):
        # Initialize Neo4j connection
        self.driver = GraphDatabase.driver(neo4j_uri, auth=(neo4j_user, neo4j_password))
        
        # Initialize Spark session
        self.spark = SparkSession.builder \
            .appName("CitationSimRank") \
            .config("spark.driver.memory", "4g") \
            .config("spark.executor.memory", "4g") \
            .getOrCreate()

    def get_graph_data_from_neo4j(self):
        """Extract graph data from Neo4j"""
        with self.driver.session() as session:
            # Get all nodes
            nodes = session.run("MATCH (p:Paper) RETURN p.id AS id")
            nodes = [record["id"] for record in nodes]
            
            # Get all edges
            edges = session.run("""
                MATCH (p1:Paper)-[:CITES]->(p2:Paper)
                RETURN p1.id AS source, p2.id AS target
            """)
            edges = [(record["source"], record["target"]) for record in edges]
            
            return nodes, edges

    def create_spark_dataframes(self, nodes, edges):
        """Convert graph data to Spark DataFrames"""
        # Create nodes DataFrame
        nodes_df = self.spark.createDataFrame(
            [(node,) for node in nodes],
            ["id"]
        )
        
        # Create edges DataFrame
        edges_df = self.spark.createDataFrame(
            edges,
            ["source", "target"]
        )
        
        return nodes_df, edges_df

    def simrank_spark(self, nodes_df, edges_df, C, max_iterations=100, tolerance=1e-4):
        """
        Implement SimRank using Spark
        """
        # Convert to pandas for efficient matrix operations
        nodes = [row.id for row in nodes_df.collect()]
        edges = [(row.source, row.target) for row in edges_df.collect()]
        
        # Create adjacency matrix
        node_to_idx = {node: idx for idx, node in enumerate(nodes)}
        n = len(nodes)
        adj_matrix = np.zeros((n, n))
        
        for source, target in edges:
            source_idx = node_to_idx[source]
            target_idx = node_to_idx[target]
            adj_matrix[source_idx][target_idx] = 1
        
        # Initialize similarity matrix
        sim_old = np.identity(n)
        
        # SimRank iterations
        for iteration in range(max_iterations):
            sim_new = np.zeros((n, n))
            
            # Compute similarities
            for i, j in product(range(n), range(n)):
                if i == j:
                    sim_new[i][j] = 1.0
                    continue
                
                # Get incoming neighbors
                i_in = np.where(adj_matrix[:, i] > 0)[0]
                j_in = np.where(adj_matrix[:, j] > 0)[0]
                
                if len(i_in) == 0 or len(j_in) == 0:
                    continue
                
                # Calculate similarity
                sum_sim = 0.0
                for u, v in product(i_in, j_in):
                    sum_sim += sim_old[u][v]
                
                sim_new[i][j] = (C / (len(i_in) * len(j_in))) * sum_sim
            
            # Check convergence
            if np.all(np.abs(sim_new - sim_old) < tolerance):
                print(f"Converged after {iteration + 1} iterations")
                return sim_new, nodes, node_to_idx
            
            sim_old = sim_new.copy()
        
        print("Warning: Maximum iterations reached")
        return sim_old, nodes, node_to_idx

    def get_similar_nodes(self, query_node, sim_matrix, nodes, node_to_idx, top_k=10):
        """Get top-k similar nodes for a query node"""
        if query_node not in node_to_idx:
            return []
        
        query_idx = node_to_idx[query_node]
        similarities = [(nodes[i], sim_matrix[query_idx][i]) 
                       for i in range(len(nodes))
                       if i != query_idx]
        
        # Sort by similarity score
        similarities.sort(key=lambda x: x[1], reverse=True)
        return similarities[:top_k]

    def close(self):
        """Clean up resources"""
        self.driver.close()
        self.spark.stop()


# Initialize calculator
calculator = SparkSimRankCalculator(
    "bolt://localhost:7689",
    "neo4j",
    "paras2003"
)

try:
    # Get graph data
    print("Extracting graph data from Neo4j...")
    nodes, edges = calculator.get_graph_data_from_neo4j()
    print(f"Found {len(nodes)} nodes and {len(edges)} edges")
    
    # Create Spark DataFrames
    nodes_df, edges_df = calculator.create_spark_dataframes(nodes, edges)
    
    # Query nodes to analyze
    query_nodes = ['2982615777', '1556418098']
    C_values = [0.7, 0.8, 0.9]
    
    # Run SimRank for different C values
    for C in C_values:
        print(f"\n=== SimRank Analysis with C = {C} ===")
        
        # Calculate SimRank
        sim_matrix, nodes, node_to_idx = calculator.simrank_spark(
            nodes_df, edges_df, C
        )
        
        # Get similar nodes for each query node
        for query_node in query_nodes:
            print(f"\nQuery Node: {query_node}")
            similar_nodes = calculator.get_similar_nodes(
                query_node, sim_matrix, nodes, node_to_idx
            )
            
            print("Top 10 Most Similar Papers:")
            print("Paper ID\t\tSimilarity Score")
            print("-" * 40)
            for node_id, score in similar_nodes:
                print(f"{node_id}\t{score:.4f}")
            
            # Calculate statistics
            scores = [score for _, score in similar_nodes]
            if scores:
                print("\nSimilarity Statistics:")
                print(f"Average Similarity: {np.mean(scores):.4f}")
                print(f"Maximum Similarity: {np.max(scores):.4f}")
                print(f"Minimum Similarity: {np.min(scores):.4f}")

finally:
    # Clean up
    calculator.close()

ConnectionRefusedError: [Errno 61] Connection refused

In [None]:
for C, query_results in results.items():
    print(f"\nSimRank Results for C={C}")
    for query, similarities in query_results.items():
        print(f"\nQuery Node ID: {query}")
        sorted_similarities = sorted(similarities.items(), key=lambda item: item[1], reverse=True)
        for node, score in sorted_similarities[:10]:  # Top 10 similar nodes
            print(f"Node: {node}, Similarity: {score}")


In [4]:
from neo4j import GraphDatabase
import json
from tqdm import tqdm

# Load JSON data
with open('train.json', 'r') as f:
    data = [json.loads(line) for line in f]

# Initialize Neo4j driver
driver = GraphDatabase.driver("bolt://localhost:7689", auth=("neo4j", "paras2003"))

def create_graph(tx, paper_id, references):
    # Create node for citing paper
    tx.run("MERGE (p:Paper {id: $paper_id})", paper_id=paper_id)
    
    # Create nodes for referenced papers and the citation relationships
    for ref in references:
        tx.run("""
        MERGE (cited:Paper {id: $ref})
        MERGE (citing)-[:CITES]->(cited)
        """, citing=paper_id, ref=ref)

# Create the citation graph with progress tracking
with driver.session() as session:
    for entry in tqdm(data, desc="Processing Papers"):
        paper_id = entry["paper"]
        references = entry["reference"]
        session.write_transaction(create_graph, paper_id, references)


  session.write_transaction(create_graph, paper_id, references)
Processing Papers: 100%|██████████| 564340/564340 [2:07:35<00:00, 73.72it/s]     


In [37]:
import os
from pyspark.sql import SparkSession

def setup_spark_neo4j_connection(
    neo4j_url="bolt://localhost:7687",
    neo4j_user="neo4j",
    neo4j_password="paras2003",
    spark_jars_dir="~/spark-jars"
):
    """
    Set up a Spark session with Neo4j connection configuration
    
    Parameters:
    -----------
    neo4j_url : str
        Neo4j bolt URL
    neo4j_user : str
        Neo4j username
    neo4j_password : str
        Neo4j password
    spark_jars_dir : str
        Directory containing the required JAR files
    """
    
    # Expand user directory path
    jars_dir = os.path.expanduser(spark_jars_dir)
    
    # Create a comma-separated list of JAR files
    jar_files = [
        os.path.join(jars_dir, f) for f in [
            "neo4j-connector-apache-spark_2.12-5.3.0.jar",
            "neo4j-java-driver-4.4.9.jar",
            "netty-all-4.1.97.Final.jar",
            "reactive-streams-1.0.4.jar",
            "netty-handler-4.1.97.Final.jar",
            "netty-buffer-4.1.97.Final.jar",
            "netty-common-4.1.97.Final.jar",
            "netty-transport-4.1.97.Final.jar",
            "netty-resolver-4.1.97.Final.jar",
            "netty-codec-4.1.97.Final.jar"
        ]
    ]
    
    # Join jar paths with comma
    jars = ",".join(jar_files)
    
    # Stop any existing Spark session
    SparkSession.builder.getOrCreate().stop()
    
    # Create new Spark session
    spark = (SparkSession.builder
        .appName("Neo4j Spark Connection")
        .config("spark.jars", jars)  # Use local JARs instead of packages
        .config("spark.neo4j.bolt.url", neo4j_url)
        .config("spark.neo4j.bolt.user", neo4j_user)
        .config("spark.neo4j.bolt.password", neo4j_password)
        .master("local[*]")
        .getOrCreate())
    
    return spark

def query_neo4j(spark, query, database="neo4j"):
    """
    Execute a Cypher query against Neo4j using Spark
    
    Parameters:
    -----------
    spark : SparkSession
        Active Spark session
    query : str
        Cypher query to execute
    database : str
        Neo4j database name
    """
    try:
        df = (spark.read
            .format("org.neo4j.spark.DataSource")
            .option("url", spark.conf.get("spark.neo4j.bolt.url"))
            .option("authentication.basic.username", 
                   spark.conf.get("spark.neo4j.bolt.user"))
            .option("authentication.basic.password", 
                   spark.conf.get("spark.neo4j.bolt.password"))
            .option("query", query)
            .option("database", database)
            .load())
        
        return df
    
    except Exception as e:
        print(f"Error executing query: {str(e)}")
        raise
    
if __name__ == "__main__":
    # Example usage
    try:
        # Initialize Spark session
        spark = setup_spark_neo4j_connection(
            neo4j_url="bolt://localhost:7687",  # Update port if needed
            neo4j_user="neo4j",
            neo4j_password="paras2003"
        )
        
        # Example query
        query = """
        MATCH (p:Paper)-[:CITES]->(cited:Paper)
        RETURN p.id AS citing_paper, cited.id AS cited_paper
        """
        
        # Execute query
        result_df = query_neo4j(spark, query)
        result_df.show()
        
    finally:
        spark.stop()

24/11/13 13:02:24 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.
24/11/13 13:02:24 WARN Utils: Service 'SparkUI' could not bind on port 4041. Attempting port 4042.
24/11/13 13:02:24 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.
24/11/13 13:02:24 WARN Utils: Service 'SparkUI' could not bind on port 4041. Attempting port 4042.
24/11/13 13:02:24 ERROR SparkContext: Failed to add /Users/parasdhiman/spark-jars/neo4j-connector-apache-spark_2.12-5.3.0.jar to Spark environment
java.io.FileNotFoundException: Jar /Users/parasdhiman/spark-jars/neo4j-connector-apache-spark_2.12-5.3.0.jar not found
	at org.apache.spark.SparkContext.addLocalJarFile$1(SparkContext.scala:2095)
	at org.apache.spark.SparkContext.addJar(SparkContext.scala:2151)
	at org.apache.spark.SparkContext.$anonfun$new$15(SparkContext.scala:521)
	at org.apache.spark.SparkContext.$anonfun$new$15$adapted(SparkContext.scala:521)
	at scala.collection.mutable.ResizableA

Error executing query: An error occurred while calling o504.load.
: org.apache.spark.SparkClassNotFoundException: [DATA_SOURCE_NOT_FOUND] Failed to find the data source: org.neo4j.spark.DataSource. Please find packages at `https://spark.apache.org/third-party-projects.html`.
	at org.apache.spark.sql.errors.QueryExecutionErrors$.dataSourceNotFoundError(QueryExecutionErrors.scala:725)
	at org.apache.spark.sql.execution.datasources.DataSource$.lookupDataSource(DataSource.scala:647)
	at org.apache.spark.sql.execution.datasources.DataSource$.lookupDataSourceV2(DataSource.scala:697)
	at org.apache.spark.sql.DataFrameReader.load(DataFrameReader.scala:208)
	at org.apache.spark.sql.DataFrameReader.load(DataFrameReader.scala:172)
	at java.base/jdk.internal.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
	at java.base/jdk.internal.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:77)
	at java.base/jdk.internal.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMetho

Py4JJavaError: An error occurred while calling o504.load.
: org.apache.spark.SparkClassNotFoundException: [DATA_SOURCE_NOT_FOUND] Failed to find the data source: org.neo4j.spark.DataSource. Please find packages at `https://spark.apache.org/third-party-projects.html`.
	at org.apache.spark.sql.errors.QueryExecutionErrors$.dataSourceNotFoundError(QueryExecutionErrors.scala:725)
	at org.apache.spark.sql.execution.datasources.DataSource$.lookupDataSource(DataSource.scala:647)
	at org.apache.spark.sql.execution.datasources.DataSource$.lookupDataSourceV2(DataSource.scala:697)
	at org.apache.spark.sql.DataFrameReader.load(DataFrameReader.scala:208)
	at org.apache.spark.sql.DataFrameReader.load(DataFrameReader.scala:172)
	at java.base/jdk.internal.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
	at java.base/jdk.internal.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:77)
	at java.base/jdk.internal.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
	at java.base/java.lang.reflect.Method.invoke(Method.java:568)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:374)
	at py4j.Gateway.invoke(Gateway.java:282)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.ClientServerConnection.waitForCommands(ClientServerConnection.java:182)
	at py4j.ClientServerConnection.run(ClientServerConnection.java:106)
	at java.base/java.lang.Thread.run(Thread.java:833)
Caused by: java.lang.ClassNotFoundException: org.neo4j.spark.DataSource.DefaultSource
	at java.base/java.net.URLClassLoader.findClass(URLClassLoader.java:445)
	at java.base/java.lang.ClassLoader.loadClass(ClassLoader.java:587)
	at java.base/java.lang.ClassLoader.loadClass(ClassLoader.java:520)
	at org.apache.spark.sql.execution.datasources.DataSource$.$anonfun$lookupDataSource$5(DataSource.scala:633)
	at scala.util.Try$.apply(Try.scala:213)
	at org.apache.spark.sql.execution.datasources.DataSource$.$anonfun$lookupDataSource$4(DataSource.scala:633)
	at scala.util.Failure.orElse(Try.scala:224)
	at org.apache.spark.sql.execution.datasources.DataSource$.lookupDataSource(DataSource.scala:633)
	... 15 more


In [30]:
from pyspark.sql import SparkSession
import os
from pathlib import Path

# Connection details
url = "bolt://localhost:7687"
username = "neo4j"
password = "paras2003"
dbname = "neo4j"

# Create directory for JAR if needed
home_dir = str(Path.home())
spark_jars_dir = os.path.join(home_dir, "spark_jars")
os.makedirs(spark_jars_dir, exist_ok=True)

# Specify the JAR path
jar_name = "neo4j-connector-apache-spark_2.12-5.3.2_for_spark_3.jar"
jar_path = os.path.join(spark_jars_dir, jar_name)

# Initialize Spark session with Neo4j connector
spark = (SparkSession.builder
    .appName("Neo4j Spark Connection")
    # Option 1: Using local JAR file (make sure the JAR is downloaded to the specified path)
    .config("spark.jars", jar_path)
    # Option 2: Alternative method using packages (uncomment if preferred)
    #.config("spark.jars.packages", "org.neo4j:neo4j-connector-apache-spark_2.12:5.3.2")
    # Neo4j connection configs
    .config("neo4j.url", url)
    .config("neo4j.authentication.basic.username", username)
    .config("neo4j.authentication.basic.password", password)
    .config("neo4j.database", dbname)
    .getOrCreate()
)

# Define the Cypher query
query = """
MATCH (p:Paper)-[:CITES]->(cited:Paper)
RETURN p.id AS citing_paper, cited.id AS cited_paper
"""

# Load data from Neo4j into Spark DataFrame
df = spark.read.format("org.neo4j.spark.DataSource") \
    .option("query", query) \
    .load()

# Show the data in the DataFrame
df.show()

Py4JJavaError: An error occurred while calling o111.load.
: org.apache.spark.SparkClassNotFoundException: [DATA_SOURCE_NOT_FOUND] Failed to find the data source: org.neo4j.spark.DataSource. Please find packages at `https://spark.apache.org/third-party-projects.html`.
	at org.apache.spark.sql.errors.QueryExecutionErrors$.dataSourceNotFoundError(QueryExecutionErrors.scala:725)
	at org.apache.spark.sql.execution.datasources.DataSource$.lookupDataSource(DataSource.scala:647)
	at org.apache.spark.sql.execution.datasources.DataSource$.lookupDataSourceV2(DataSource.scala:697)
	at org.apache.spark.sql.DataFrameReader.load(DataFrameReader.scala:208)
	at org.apache.spark.sql.DataFrameReader.load(DataFrameReader.scala:172)
	at java.base/jdk.internal.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
	at java.base/jdk.internal.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:77)
	at java.base/jdk.internal.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
	at java.base/java.lang.reflect.Method.invoke(Method.java:568)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:374)
	at py4j.Gateway.invoke(Gateway.java:282)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.ClientServerConnection.waitForCommands(ClientServerConnection.java:182)
	at py4j.ClientServerConnection.run(ClientServerConnection.java:106)
	at java.base/java.lang.Thread.run(Thread.java:833)
Caused by: java.lang.ClassNotFoundException: org.neo4j.spark.DataSource.DefaultSource
	at java.base/java.net.URLClassLoader.findClass(URLClassLoader.java:445)
	at java.base/java.lang.ClassLoader.loadClass(ClassLoader.java:587)
	at java.base/java.lang.ClassLoader.loadClass(ClassLoader.java:520)
	at org.apache.spark.sql.execution.datasources.DataSource$.$anonfun$lookupDataSource$5(DataSource.scala:633)
	at scala.util.Try$.apply(Try.scala:213)
	at org.apache.spark.sql.execution.datasources.DataSource$.$anonfun$lookupDataSource$4(DataSource.scala:633)
	at scala.util.Failure.orElse(Try.scala:224)
	at org.apache.spark.sql.execution.datasources.DataSource$.lookupDataSource(DataSource.scala:633)
	... 15 more


In [1]:
from neo4j import GraphDatabase
import json
from tqdm import tqdm

# Load JSON data
with open('train.json', 'r') as f:
    data = [json.loads(line) for line in f]

# Initialize Neo4j driver
driver = GraphDatabase.driver("bolt://localhost:7689", auth=("neo4j", "paras2003"))

# Function to create nodes and relationships
def create_graph_batch(tx, batch):
    for paper_id, references in batch:
        tx.run("MERGE (p:Paper {id: $paper_id})", paper_id=paper_id)
        for ref in references:
            tx.run("""
            MERGE (cited:Paper {id: $ref})
            MERGE (p)-[:CITES]->(cited)
            """, paper_id=paper_id, ref=ref)

# Batch processing with progress tracking
batch_size = 100  # Adjust as needed
with driver.session() as session:
    batch = []
    for entry in tqdm(data, desc="Processing Papers"):
        paper_id = entry["paper"]
        references = entry["reference"]
        batch.append((paper_id, references))
        
        if len(batch) >= batch_size:
            session.execute_write(create_graph_batch, batch)
            batch = []
    
    # Process remaining data
    if batch:
        session.execute_write(create_graph_batch, batch)

# Close the Neo4j driver
driver.close()


Processing Papers: 100%|██████████| 564340/564340 [05:36<00:00, 1674.74it/s]


In [None]:
from neo4j import GraphDatabase
import json
from tqdm import tqdm

# Load JSON data
with open('train.json', 'r') as f:
    data = [json.loads(line) for line in f]

# Initialize Neo4j driver
driver = GraphDatabase.driver("bolt://localhost:7689", auth=("neo4j", "paras2003"))

# Function to create nodes and relationships in batches
def create_graph_batch(tx, batch):
    for paper_id, references in batch:
        # Create the Paper node
        tx.run("MERGE (p:Paper {id: $paper_id})", paper_id=paper_id)
        
        # Create referenced papers and relationships
        for ref in references:
            tx.run("""
            MERGE (cited:Paper {id: $ref})
            MERGE (p)-[:CITES]->(cited)
            """, paper_id=paper_id, ref=ref)

# Batch processing with progress tracking
batch_size = 100  # Adjust as needed
with driver.session() as session:
    batch = []
    for entry in tqdm(data, desc="Processing Papers"):
        paper_id = entry["paper"]
        references = entry["reference"]
        batch.append((paper_id, references))
        
        if len(batch) >= batch_size:
            session.execute_write(create_graph_batch, batch)
            batch = []
    
    # Process remaining data
    if batch:
        session.execute_write(create_graph_batch, batch)

# Close the Neo4j driver
driver.close()



Processing Papers: 100%|██████████| 564340/564340 [06:00<00:00, 1565.92it/s]
24/11/13 10:17:00 WARN SparkSession: Using an existing Spark session; only runtime SQL configurations will take effect.


Py4JJavaError: An error occurred while calling o287.load.
: org.apache.spark.SparkClassNotFoundException: [DATA_SOURCE_NOT_FOUND] Failed to find the data source: neo4j. Please find packages at `https://spark.apache.org/third-party-projects.html`.
	at org.apache.spark.sql.errors.QueryExecutionErrors$.dataSourceNotFoundError(QueryExecutionErrors.scala:725)
	at org.apache.spark.sql.execution.datasources.DataSource$.lookupDataSource(DataSource.scala:647)
	at org.apache.spark.sql.execution.datasources.DataSource$.lookupDataSourceV2(DataSource.scala:697)
	at org.apache.spark.sql.DataFrameReader.load(DataFrameReader.scala:208)
	at org.apache.spark.sql.DataFrameReader.load(DataFrameReader.scala:172)
	at jdk.internal.reflect.GeneratedMethodAccessor19.invoke(Unknown Source)
	at java.base/jdk.internal.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
	at java.base/java.lang.reflect.Method.invoke(Method.java:568)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:374)
	at py4j.Gateway.invoke(Gateway.java:282)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.ClientServerConnection.waitForCommands(ClientServerConnection.java:182)
	at py4j.ClientServerConnection.run(ClientServerConnection.java:106)
	at java.base/java.lang.Thread.run(Thread.java:833)
Caused by: java.lang.ClassNotFoundException: neo4j.DefaultSource
	at java.base/java.net.URLClassLoader.findClass(URLClassLoader.java:445)
	at java.base/java.lang.ClassLoader.loadClass(ClassLoader.java:587)
	at java.base/java.lang.ClassLoader.loadClass(ClassLoader.java:520)
	at org.apache.spark.sql.execution.datasources.DataSource$.$anonfun$lookupDataSource$5(DataSource.scala:633)
	at scala.util.Try$.apply(Try.scala:213)
	at org.apache.spark.sql.execution.datasources.DataSource$.$anonfun$lookupDataSource$4(DataSource.scala:633)
	at scala.util.Failure.orElse(Try.scala:224)
	at org.apache.spark.sql.execution.datasources.DataSource$.lookupDataSource(DataSource.scala:633)
	... 14 more


In [35]:
from pyspark.sql import SparkSession

# Replace with the actual connection URI and credentials
url = "bolt://localhost:7689"
username = "neo4j"
password = "paras2003"
dbname = "neo4j"

spark = (
    SparkSession.builder.config("neo4j.url", url)
    .config("neo4j.authentication.basic.username", username)
    .config("neo4j.authentication.basic.password", password)
    # .config("neo4j.database", dbname)
    .getOrCreate()
)
# from pyspark.sql import SparkSession
# from graphframes import GraphFrame

# # Initialize Spark session with Neo4j Spark connector
# spark = (
#     SparkSession.builder
#     .appName("SimRank")
#     .config("spark.jars.packages", "org.neo4j:neo4j-spark-connector:5.3.2")  # Ensure the Neo4j Spark connector is included
#     .config("neo4j.url", "bolt://localhost:7689")
#     .config("neo4j.authentication.basic.username", "neo4j")
#     .config("neo4j.authentication.basic.password", "paras2003")
#     .config("neo4j.database", "SimRank")  # Optional, specify database if needed
#     .getOrCreate()
# )

# # Load data from Neo4j into DataFrames
# # 1. Nodes: Retrieve all Paper nodes
# nodes_df = spark.read.format("neo4j") \
#     .option("url", "bolt://localhost:7689") \
#     .option("query", "MATCH (p:Paper) RETURN p.id as id") \
#     .load()

# # 2. Edges: Retrieve all 'CITES' relationships
# edges_df = spark.read.format("neo4j") \
#     .option("url", "bolt://localhost:7689") \
#     .option("query", "MATCH (a:Paper)-[:CITES]->(b:Paper) RETURN a.id as src, b.id as dst") \
#     .load()

# # Construct the GraphFrame
# graph = GraphFrame(nodes_df, edges_df)

# # Show the graph structure (nodes and edges)
# print("Vertices (Papers):")
# graph.vertices.show()  # Show Paper nodes
# print("Edges (Cites relationships):")
# graph.edges.show()     # Show CITES relationships

# # Example: Calculate the number of neighbors for each Paper (degree)
# degree_df = graph.degrees
# degree_df.show()

# # Example: Find the shortest path between two nodes (papers) - if applicable
# # This will require you to define start and end papers (e.g., "Paper1", "Paper2")
# # Replace 'Paper1' and 'Paper2' with actual node IDs or paper IDs.
# shortest_paths_df = graph.shortestPaths(landmarks=["Paper1", "Paper2"])
# shortest_paths_df.show()

# # Example: SimRank computation (optional, assuming you have a similarity measure)
# # Note: You may need additional steps depending on the specific algorithm used for SimRank or other graph algorithms.
# # Placeholder for SimRank or other graph analysis functions.
# # GraphFrames does not have direct support for SimRank, but you can implement it based on node similarity, for example.

# # Example of filtering nodes with degree greater than a threshold (e.g., nodes with at least 3 neighbors)
# filtered_nodes = degree_df.filter(degree_df["degree"] >= 3)
# filtered_nodes.show()

# # Optionally, you can perform more graph-related tasks here such as finding connected components,
# # graph traversal, or clustering.


In [39]:
from pyspark.sql import SparkSession

url = "bolt://localhost:7689"
username = "neo4j"
password = "paras2003"
dbname = "neo4j"

spark = (
    SparkSession.builder
    .config("spark.jars.packages", "org.neo4j:neo4j-connector-apache-spark_2.12:4.0.0")
    .config("neo4j.url", url)
    .config("neo4j.authentication.basic.username", username)
    .config("neo4j.authentication.basic.password", password)
    .getOrCreate()
)


# Example query to retrieve data from Neo4j
# This query will match all nodes and return them; modify as needed
df = spark.read.format("org.neo4j.spark.DataSource") \
    .option("url", url) \
    .option("authentication.basic.username", username) \
    .option("authentication.basic.password", password) \
    .option("query", "MATCH (n) RETURN n LIMIT 10") \
    .load()

# Show the data
df.show()


24/11/13 10:29:16 WARN SparkSession: Using an existing Spark session; only runtime SQL configurations will take effect.


Py4JJavaError: An error occurred while calling o373.load.
: org.apache.spark.SparkClassNotFoundException: [DATA_SOURCE_NOT_FOUND] Failed to find the data source: org.neo4j.spark.DataSource. Please find packages at `https://spark.apache.org/third-party-projects.html`.
	at org.apache.spark.sql.errors.QueryExecutionErrors$.dataSourceNotFoundError(QueryExecutionErrors.scala:725)
	at org.apache.spark.sql.execution.datasources.DataSource$.lookupDataSource(DataSource.scala:647)
	at org.apache.spark.sql.execution.datasources.DataSource$.lookupDataSourceV2(DataSource.scala:697)
	at org.apache.spark.sql.DataFrameReader.load(DataFrameReader.scala:208)
	at org.apache.spark.sql.DataFrameReader.load(DataFrameReader.scala:172)
	at jdk.internal.reflect.GeneratedMethodAccessor19.invoke(Unknown Source)
	at java.base/jdk.internal.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
	at java.base/java.lang.reflect.Method.invoke(Method.java:568)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:374)
	at py4j.Gateway.invoke(Gateway.java:282)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.ClientServerConnection.waitForCommands(ClientServerConnection.java:182)
	at py4j.ClientServerConnection.run(ClientServerConnection.java:106)
	at java.base/java.lang.Thread.run(Thread.java:833)
Caused by: java.lang.ClassNotFoundException: org.neo4j.spark.DataSource.DefaultSource
	at java.base/java.net.URLClassLoader.findClass(URLClassLoader.java:445)
	at java.base/java.lang.ClassLoader.loadClass(ClassLoader.java:587)
	at java.base/java.lang.ClassLoader.loadClass(ClassLoader.java:520)
	at org.apache.spark.sql.execution.datasources.DataSource$.$anonfun$lookupDataSource$5(DataSource.scala:633)
	at scala.util.Try$.apply(Try.scala:213)
	at org.apache.spark.sql.execution.datasources.DataSource$.$anonfun$lookupDataSource$4(DataSource.scala:633)
	at scala.util.Failure.orElse(Try.scala:224)
	at org.apache.spark.sql.execution.datasources.DataSource$.lookupDataSource(DataSource.scala:633)
	... 14 more
