In [12]:
import json
from neo4j import GraphDatabase
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, array_contains
import networkx as nx
from pyspark.sql.types import StructType, StructField, StringType, IntegerType
from pyspark.ml.linalg import Vectors, SparseVector
from pyspark.sql.functions import col, udf
from pyspark.sql.types import DoubleType
from math import sqrt
# Connect to Neo4j
driver = GraphDatabase.driver("bolt://localhost:7687", auth=("neo4j", "paras2003"))

# # Create citation graph in Neo4j
# with driver.session() as session:
#     for row in data:
#         paper_id = row['paper']
#         references = row['reference']
        
#         # Create paper node
#         session.run("CREATE (p:Paper {id: $paper_id})", paper_id=paper_id)
        
#         # Create citation relationships
#         for ref in references:
#             session.run("MATCH (p1:Paper {id: $paper_id}) "
#                        "MATCH (p2:Paper {id: $ref}) "
#                        "CREATE (p1)-[:CITES]->(p2)", paper_id=paper_id, ref=ref)

# Load citation graph into NetworkX
G = nx.DiGraph()
# with driver.session() as session:
#     result = session.run("MATCH (p:Paper) RETURN p.id AS id, size([(p)-[:CITES]->() | 1]) AS in_degree, size([(()-[:CITES]->(p) | 1]) AS out_degree")
#     for record in result:
#         G.add_node(record["id"], in_degree=record["in_degree"], out_degree=record["out_degree"])
#     result = session.run("MATCH (p1:Paper)-[r:CITES]->(p2:Paper) RETURN p1.id, p2.id")
#     for record in result:
#         G.add_edge(record["p1.id"], record["p2.id"])
# Corrected Cypher query to fetch in-degree and out-degree for each paper
# Corrected Cypher query to fetch in-degree and out-degree for each paper
with driver.session() as session:
    result = session.run("MATCH (p:Paper) RETURN p.id AS id, COUNT((p)-[:CITES]->()) AS in_degree, COUNT(()-[:CITES]->(p)) AS out_degree")
    for record in result:
        G.add_node(record["id"], in_degree=record["in_degree"], out_degree=record["out_degree"])

    result = session.run("MATCH (p1:Paper)-[r:CITES]->(p2:Paper) RETURN p1.id, p2.id")
    for record in result:
        G.add_edge(record["p1.id"], record["p2.id"])

# Now proceed with the SimRank computation as before.


# Now proceed with the SimRank computation as before.


# Run SimRank algorithm on the citation graph
spark = SparkSession.builder.appName("SimRank").getOrCreate()
sc = spark.sparkContext

@udf(DoubleType())
def simrank_score(src_id, dst_id):
    """
    Calculate the SimRank score between two nodes.
    
    Parameters:
    src_id (str): Source node ID
    dst_id (str): Destination node ID
    
    Returns:
    float: SimRank score between the source and destination nodes
    """
    if src_id == dst_id:
        return 1.0
    
    src_neighbors = [n for n in G.neighbors(src_id)]
    dst_neighbors = [n for n in G.neighbors(dst_id)]
    
    if not src_neighbors or not dst_neighbors:
        return 0.0
    
    score = 0.0
    for src_neighbor in src_neighbors:
        for dst_neighbor in dst_neighbors:
            score += simrank_score(src_neighbor, dst_neighbor)
    
    return importance_factor * score / (len(src_neighbors) * len(dst_neighbors))

def simrank(G, source, importance_factor, max_iterations, tolerance):
    """
    Run the SimRank algorithm on the citation graph.
    
    Parameters:
    G (networkx.DiGraph): Citation graph
    source (list): List of source node IDs to compute similarity for
    importance_factor (float): SimRank importance factor
    max_iterations (int): Maximum number of iterations
    tolerance (float): Convergence tolerance
    
    Returns:
    list: List of SimRank scores for the source nodes
    """
    # Convert NetworkX graph to Spark DataFrame
    nodes = [(n, d["in_degree"], d["out_degree"]) for n, d in G.nodes(data=True)]
    edges = [(u, v) for u, v in G.edges()]
    node_schema = StructType([
        StructField("id", StringType(), True),
        StructField("in_degree", IntegerType(), True),
        StructField("out_degree", IntegerType(), True)
    ])
    edge_schema = StructType([
        StructField("src", StringType(), True),
        StructField("dst", StringType(), True)
    ])
    node_df = spark.createDataFrame(nodes, schema=node_schema)
    edge_df = spark.createDataFrame(edges, schema=edge_schema)
    
    # Compute SimRank scores
    sim_scores = node_df.crossJoin(node_df.alias("other"))
    sim_scores = sim_scores.withColumn("similarity", simrank_score(col("id"), col("other.id")))
    sim_scores = sim_scores.filter(array_contains(source, col("id")) | array_contains(source, col("other.id")))
    return sim_scores.collect()

# Run SimRank with different importance factors
source_nodes = [2982615777, 1556418098]
for importance_factor in [0.7, 0.8, 0.9]:
    print(f"SimRank with importance factor: {importance_factor}")
    sim_scores = simrank(G, source_nodes, importance_factor, 1000, 0.0001)
    for row in sim_scores:
        print(f"Similarity between {row['id']} and {row['other.id']}: {row['similarity']}")

CypherSyntaxError: {code: Neo.ClientError.Statement.SyntaxError} {message: A pattern expression should only be used in order to test the existence of a pattern. It should therefore only be used in contexts that evaluate to a boolean, e.g. inside the function exists() or in a WHERE-clause. No other uses are allowed, instead they should be replaced by a pattern comprehension. (line 1, column 42 (offset: 41))
"MATCH (p:Paper) RETURN p.id AS id, COUNT((p)-[:CITES]->()) AS in_degree, COUNT(()-[:CITES]->(p)) AS out_degree"
                                          ^}

In [26]:
from pyspark.sql import SparkSession
from neo4j import GraphDatabase
import pandas as pd
import os
from datetime import datetime
from tqdm.auto import tqdm
from collections import defaultdict
from itertools import product

class CitationGraphAnalyzer:
    def __init__(self, neo4j_uri="bolt://localhost:7687", 
                 neo4j_user="neo4j", neo4j_password="paras2003"):
        """Initialize with Neo4j and Spark connections"""
        # Initialize Neo4j connection
        self.driver = GraphDatabase.driver(neo4j_uri, 
                                         auth=(neo4j_user, neo4j_password))
        
        # Initialize Spark session
        self.spark = SparkSession.builder \
            .appName("Citation Graph Analysis") \
            .config("spark.driver.memory", "4g") \
            .config("spark.executor.memory", "4g") \
            .getOrCreate()
    
    def create_neo4j_graph(self, papers_data):
        """Create graph in Neo4j from citation data"""
        with self.driver.session() as session:
            # Clear existing data
            session.run("MATCH (n) DETACH DELETE n")
            
            # Create paper nodes
            for paper_id in papers_data.keys():
                session.run("""
                    CREATE (p:Paper {id: $paper_id})
                """, paper_id=paper_id)
            
            # Create citation relationships
            for citing_paper, cited_papers in papers_data.items():
                if cited_papers:  # Only create edges if there are references
                    for cited_paper in cited_papers:
                        session.run("""
                            MATCH (citing:Paper {id: $citing_id})
                            MATCH (cited:Paper {id: $cited_id})
                            CREATE (citing)-[:CITES]->(cited)
                        """, citing_id=citing_paper, cited_id=cited_paper)
                        
    def get_graph_data(self):
        """Extract graph data from Neo4j for Spark processing"""
        with self.driver.session() as session:
            # Get all citation relationships
            result = session.run("""
                MATCH (p1:Paper)-[:CITES]->(p2:Paper)
                RETURN p1.id as source, p2.id as target
            """)
            edges = [(record["source"], record["target"]) for record in result]
            
            return edges
    
    from pyspark.sql import functions as F

    def compute_simrank_similarity(self, query_node, target_node, in_neighbors_dict, C=0.9, max_iterations=10, tolerance=1e-4):
        """Calculate SimRank similarity between query and target nodes using Spark and Neo4j data"""
        if query_node == target_node:
            return 1.0  # self-similarity is always 1.0

        in_neighbors_a = in_neighbors_dict.get(query_node, [])
        in_neighbors_b = in_neighbors_dict.get(target_node, [])
        
        if not in_neighbors_a or not in_neighbors_b:
            return 0.0
        
        # Initial similarity score
        sim_scores = defaultdict(lambda: defaultdict(float))
        for node in set(in_neighbors_a + in_neighbors_b):
            sim_scores[node][node] = 1.0

        # Perform SimRank iterations
        for _ in range(max_iterations):
            new_scores = defaultdict(lambda: defaultdict(float))
            max_diff = 0.0
            
            # Compute similarity between in-neighbors
            for a_in in in_neighbors_a:
                for b_in in in_neighbors_b:
                    if a_in == b_in:
                        new_scores[a_in][b_in] = 1.0
                        continue
                    
                    in_a_in = in_neighbors_dict.get(a_in, [])
                    in_b_in = in_neighbors_dict.get(b_in, [])
                    
                    if not in_a_in or not in_b_in:
                        continue
                    
                    # Calculate new similarity score based on neighbors
                    similarity_sum = sum(sim_scores[i][j] for i, j in product(in_a_in, in_b_in))
                    new_sim = (C / (len(in_a_in) * len(b_in))) * similarity_sum
                    new_scores[a_in][b_in] = new_sim
                    new_scores[b_in][a_in] = new_sim
                    
                    # Track the maximum change for convergence
                    max_diff = max(max_diff, abs(new_sim - sim_scores[a_in][b_in]))
            
            # Update scores
            sim_scores = new_scores
            if max_diff < tolerance:
                break

        # Final similarity between query_node and target_node
        similarity_sum = sum(sim_scores[i][j] for i in in_neighbors_a for j in in_neighbors_b)
        return (C / (len(in_neighbors_a) * len(in_neighbors_b))) * similarity_sum


    def analyze_citation_graph(self, query_nodes, decay_factors, output_dir="simrank_results"):
        """Run SimRank analysis on citation graph"""
        os.makedirs(output_dir, exist_ok=True)
        assert os.path.isdir(output_dir), f"Could not create or access directory: {output_dir}"

        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
        
        # Get graph data from Neo4j
        edges = self.get_graph_data()
        edges_df = self.spark.createDataFrame(edges, ["source", "target"])
        
        # Cache in-neighbors
        in_neighbors_dict = self._cache_in_neighbors(edges_df)
        
        all_results = []
        
        for C in decay_factors:
            print(f"\nComputing SimRank with decay factor C = {C}")
            
            results = []
            # Get all unique nodes
            all_nodes = set(row['source'] for row in edges_df.select("source").distinct().collect())
            all_nodes.update(row['target'] for row in edges_df.select("target").distinct().collect())
            
            for query_node in tqdm(query_nodes, desc="Processing query nodes"):
                node_results = []
                for target_node in tqdm(all_nodes, desc=f"Computing similarities for node {query_node}", leave=False):
                    sim = self.compute_simrank_similarity(
                        query_node,
                        target_node,
                        in_neighbors_dict,
                        C
                    )
                    node_results.append((query_node, target_node, sim))
                results.extend(node_results)
            
            results_df = pd.DataFrame(results, columns=['query_node', 'target_node', 'similarity'])
            results_df['decay_factor'] = C
            all_results.append(results_df)
            
            # Save intermediate results
            output_path = f"{output_dir}/simrank_results_C{C}_{timestamp}.csv"
            results_df.to_csv(output_path, index=False)
        
        return self._save_and_summarize_results(all_results, query_nodes, decay_factors, timestamp, output_dir)

    
    def _cache_in_neighbors(self, edges_df):
        """Cache in-neighbors for all nodes"""
        in_neighbors = edges_df.groupBy('target').agg(F.collect_list('source').alias('in_neighbors'))
        return {row['target']: row['in_neighbors'] for row in in_neighbors.collect()}

    
    def _save_and_summarize_results(self, all_results, query_nodes, decay_factors, timestamp, output_dir):
        """Save and summarize final results"""
        final_results = pd.concat(all_results, ignore_index=True)
        
        # Save complete results
        final_path = f"{output_dir}/simrank_all_results_{timestamp}.csv"
        final_results.to_csv(final_path, index=False)
        
        # Generate top results
        top_results = []
        for C in decay_factors:
            for query in query_nodes:
                mask = (final_results['decay_factor'] == C) & (final_results['query_node'] == query)
                subset = final_results[mask].nlargest(10, 'similarity')
                subset = subset.copy()
                subset['rank'] = range(1, len(subset) + 1)
                top_results.append(subset)
        
        top_results_df = pd.concat(top_results, ignore_index=True)
        top_path = f"{output_dir}/simrank_top_results_{timestamp}.csv"
        top_results_df.to_csv(top_path, index=False)
        
        self._print_summary(top_results_df, decay_factors, query_nodes)
        
        return final_results, top_results_df
    
    def _print_summary(self, top_results_df, decay_factors, query_nodes):
        """Print summary of results"""
        print("\nTop 5 most similar nodes for each query node and decay factor:")
        for C in decay_factors:
            print(f"\nDecay factor C = {C}")
            for query in query_nodes:
                print(f"\nQuery node: {query}")
                mask = (top_results_df['decay_factor'] == C) & (top_results_df['query_node'] == query)
                top_5 = top_results_df[mask & (top_results_df['similarity'] > 0)].head(5)
                
                if not top_5.empty:
                    print(top_5[['target_node', 'similarity', 'rank']].to_string(index=False))
                else:
                    print("No similar nodes found.")
            
    def close(self):
        """Close the Neo4j and Spark sessions"""
        self.driver.close()
        self.spark.stop()


# Initialize analyzer
analyzer = CitationGraphAnalyzer()

try:
    # Create Neo4j graph
    print("Creating Neo4j graph...")
    analyzer.create_neo4j_graph(papers_data)
    
    # Run analysis
    query_nodes = [2982615777, 1556418098]
    decay_factors = [0.7, 0.8, 0.9]
    
    print("Running SimRank analysis...")
    final_results, top_results = analyzer.analyze_citation_graph(
        query_nodes=query_nodes,
        decay_factors=decay_factors
    )
    
finally:
    # Clean up connections
    analyzer.close()

Creating Neo4j graph...


KeyboardInterrupt: 

In [18]:

import json
from neo4j import GraphDatabase
from pyspark.sql import SparkSession
from itertools import product

# Step 1: Load JSON Data
# Open and read each line as a separate JSON object
with open('train.json', 'r') as file:
    data = [json.loads(line) for line in file]

# Step 2: Neo4j Database Connection
uri = "bolt://localhost:7687"  # update if different
username = "neo4j"  # replace with your Neo4j username
password = "paras2003"  # replace with your Neo4j password

# Connect to Neo4j
driver = GraphDatabase.driver(uri, auth=(username, password))

# Step 3: Create Graph in Neo4j
def create_graph(tx, paper_id, references):
    # Create a paper node
    tx.run("MERGE (p:Paper {id: $paper_id})", paper_id=paper_id)
    # For each reference, create a citation edge
    for ref_id in references:
        tx.run("""
            MERGE (p:Paper {id: $paper_id})
            MERGE (r:Paper {id: $ref_id})
            MERGE (p)-[:CITES]->(r)
            """, paper_id=paper_id, ref_id=ref_id)

# Add data to Neo4j
# with driver.session() as session:
#     for entry in data:
#         paper_id = entry['paper']
#         references = entry.get('reference', [])
#         session.write_transaction(create_graph, paper_id, references)

# Step 4: Export Neo4j Data to CSV
export_query = """
CALL apoc.export.csv.query("MATCH (p1:Paper)-[:CITES]->(p2:Paper) RETURN p1.id AS paper, p2.id AS reference", "citation_graph.csv", {})
"""
with driver.session() as session:
    session.run(export_query)

# Step 5: Initialize Spark Session
spark = SparkSession.builder.appName("SimRank").getOrCreate()

# Load citation graph CSV into a Spark DataFrame
df = spark.read.csv("citation_graph.csv", header=True, inferSchema=True)
df.show()

# Step 6: Define SimRank Algorithm
def simrank(df, query_nodes, C=0.8, max_iter=10, tol=1e-4):
    # Initialize similarity scores with 1.0 for self-similarity and 0.0 for all others
    sim = {(u, v): 1.0 if u == v else 0.0 for u in query_nodes for v in query_nodes}
    
    # Dictionary to store incoming neighbors for each node
    neighbors = df.rdd.map(lambda row: (row["reference"], row["paper"])) \
                      .groupByKey() \
                      .mapValues(list) \
                      .collectAsMap()
    
    for _ in range(max_iter):
        new_sim = {}
        for u, v in product(query_nodes, repeat=2):
            if u == v:
                new_sim[(u, v)] = 1.0
            else:
                u_neighbors = neighbors.get(u, [])
                v_neighbors = neighbors.get(v, [])
                if u_neighbors and v_neighbors:
                    scale = C / (len(u_neighbors) * len(v_neighbors))
                    new_sim[(u, v)] = scale * sum(sim.get((w, x), 0) for w in u_neighbors for x in v_neighbors)
                else:
                    new_sim[(u, v)] = 0.0
        
        # Check for convergence
        diff = sum(abs(new_sim[(u, v)] - sim[(u, v)]) for u, v in product(query_nodes, repeat=2))
        if diff < tol:
            break
        sim = new_sim
    
    return sim

# Step 7: Run SimRank Algorithm with Different Values of C
results = {}
for C_value in [0.7, 0.8, 0.9]:
    results[C_value] = simrank(df, query_nodes=[2982615777, 1556418098], C=C_value)

# Step 8: Display the Results
for C_value, sim_scores in results.items():
    print(f"Results for C = {C_value}:")
    for (u, v), score in sim_scores.items():
        print(f"Similarity between {u} and {v}: {score}")


ClientError: {code: Neo.ClientError.Procedure.ProcedureNotFound} {message: There is no procedure with the name `apoc.export.csv.query` registered for this database instance. Please ensure you've spelled the procedure name correctly and that the procedure is properly deployed.}

In [21]:
import json
from neo4j import GraphDatabase
from pyspark.sql import SparkSession
from itertools import product
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation

# Step 1: Load JSON Data
with open('train.json', 'r') as file:
    data = [json.loads(line) for line in file]

# Step 2: Extract paper abstracts or content for topic modeling
# Assuming that 'abstract' is a field in your data, adjust as needed
documents = [entry['abstract'] for entry in data if 'abstract' in entry]

# Step 3: Topic Modeling using LDA (Latent Dirichlet Allocation)
vectorizer = CountVectorizer(stop_words='english')
X = vectorizer.fit_transform(documents)

lda = LatentDirichletAllocation(n_components=5, random_state=42)
lda.fit(X)

# Print out the topics discovered by LDA
terms = vectorizer.get_feature_names_out()
for index, topic in enumerate(lda.components_):
    print(f"Topic {index + 1}:")
    print([terms[i] for i in topic.argsort()[-10:]])  # top 10 words for each topic
    print("\n")

# Step 4: Neo4j Database Connection
uri = "bolt://localhost:7687"  # update if different
username = "neo4j"  # replace with your Neo4j username
password = "paras2003"  # replace with your Neo4j password

# Connect to Neo4j
driver = GraphDatabase.driver(uri, auth=(username, password))

# Step 5: Create Graph in Neo4j
def create_graph(tx, paper_id, references, topics):
    # Create a paper node with topic information
    tx.run("MERGE (p:Paper {id: $paper_id, topics: $topics})", paper_id=paper_id, topics=topics)
    # For each reference, create a citation edge
    for ref_id in references:
        tx.run("""
            MERGE (p:Paper {id: $paper_id})
            MERGE (r:Paper {id: $ref_id})
            MERGE (p)-[:CITES]->(r)
            """, paper_id=paper_id, ref_id=ref_id)

# Step 6: Add data to Neo4j
with driver.session() as session:
    for i, entry in enumerate(data):
        paper_id = entry['paper']
        references = entry.get('reference', [])
        # For each paper, assign the topic distribution from LDA
        topic_distribution = lda.transform(vectorizer.transform([entry['abstract']])).flatten()
        topics = list(topic_distribution)  # Storing topic distribution for the paper
        session.write_transaction(create_graph, paper_id, references, topics)




KeyboardInterrupt: 

In [23]:
import json
from neo4j import GraphDatabase
from pyspark.sql import SparkSession
from graphframes import GraphFrame
from tqdm import tqdm
import time
from itertools import product

# Step 1: Load the data from the JSON file
with open('train.json', 'r') as file:
    data = [json.loads(line) for line in file]

# Step 2: Neo4j Database Connection
uri = "bolt://localhost:7687"  # Neo4j URI
username = "neo4j"  # Replace with your username
password = "paras2003"  # Replace with your password

# Connect to Neo4j
driver = GraphDatabase.driver(uri, auth=(username, password))

# Step 3: Create Graph in Neo4j
def create_graph(tx, paper_id, references):
    # Create a paper node
    tx.run("MERGE (p:Paper {id: $paper_id})", paper_id=paper_id)
    # Create a directed edge for each reference
    for ref_id in references:
        tx.run("""
            MERGE (p:Paper {id: $paper_id})
            MERGE (r:Paper {id: $ref_id})
            MERGE (p)-[:CITES]->(r)
            """, paper_id=paper_id, ref_id=ref_id)

# Add data to Neo4j with tqdm progress bar
with driver.session() as session:
    start_time = time.time()  # Track time for Neo4j insertion
    for entry in tqdm(data, desc="Inserting into Neo4j", unit="row"):
        paper_id = entry['paper']
        references = entry.get('reference', [])
        session.write_transaction(create_graph, paper_id, references)
    print(f"Neo4j insertion took {time.time() - start_time:.2f} seconds")

# Step 4: Initialize Spark session
spark = SparkSession.builder \
    .appName("SimRank") \
    .config("spark.jars", "/Users/parasdhiman/Desktop/assmt/BDA/assmt3/graphframes-0.8.1-spark3.0-s_2.12.jar") \
    .getOrCreate()

# Step 5: Load the graph data from Neo4j
nodes_df = spark.read.format("neo4j") \
    .option("url", "bolt://localhost:7687") \
    .option("authentication.basic.username", "neo4j") \
    .option("authentication.basic.password", "paras2003") \
    .option("query", "MATCH (p:Paper) RETURN p.id AS id") \
    .load()

edges_df = spark.read.format("neo4j") \
    .option("url", "bolt://localhost:7687") \
    .option("authentication.basic.username", "neo4j") \
    .option("authentication.basic.password", "paras2003") \
    .option("query", "MATCH (p:Paper)-[:CITES]->(r:Paper) RETURN p.id AS src, r.id AS dst") \
    .load()

# Step 6: Create a GraphFrame
g = GraphFrame(nodes_df, edges_df)

# Step 7: SimRank Algorithm Implementation (simplified version)
def simrank(g, C=0.7, max_iterations=1000, tolerance=0.0001):
    # This function computes SimRank for all node pairs
    sim_matrix = {}
    nodes = g.vertices.select("id").rdd.flatMap(lambda x: x).collect()

    # Initialize the similarity matrix
    for u in nodes:
        sim_matrix[u] = {}
        for v in nodes:
            sim_matrix[u][v] = 1.0 if u == v else 0.0

    # Iterate to update similarities
    for _ in tqdm(range(max_iterations), desc="SimRank Iterations", unit="iteration"):
        prev_sim_matrix = {u: sim_matrix[u].copy() for u in sim_matrix}
        for u in nodes:
            for v in nodes:
                if u != v:
                    in_neighbors_u = [row['src'] for row in g.edges.filter(g.edges.dst == u).collect()]
                    in_neighbors_v = [row['src'] for row in g.edges.filter(g.edges.dst == v).collect()]
                    scale = C / (len(in_neighbors_u) * len(in_neighbors_v)) if in_neighbors_u and in_neighbors_v else 0
                    sim_matrix[u][v] = scale * sum(sim_matrix.get(w, {}).get(x, 0) for w, x in product(in_neighbors_u, in_neighbors_v))

        # Check for convergence
        max_diff = max(abs(sim_matrix[u][v] - prev_sim_matrix[u][v]) for u in sim_matrix for v in sim_matrix[u])
        if max_diff < tolerance:
            break

    return sim_matrix

# Step 8: Compute SimRank for different values of C and query nodes
query_nodes = [2982615777, 1556418098]
C_values = [0.7, 0.8, 0.9]

for query in query_nodes:
    for C in C_values:
        print(f"\nRunning SimRank for query node {query} with C={C}...")
        start_time = time.time()
        sim_matrix = simrank(g, C)
        end_time = time.time()
        print(f"SimRank computation for C={C} took {end_time - start_time:.2f} seconds")
        
        # Print top-5 similar nodes (optional)
        top_similar_nodes = sorted(sim_matrix[query].items(), key=lambda x: x[1], reverse=True)[:5]
        print(f"Top 5 similar nodes for query {query} (C={C}):")
        for node, similarity in top_similar_nodes:
            print(f"Node {node}: Similarity = {similarity:.4f}")


KeyboardInterrupt: 

In [25]:
import json
import pandas as pd
from tqdm import tqdm
from neo4j import GraphDatabase
from pyspark.sql import SparkSession
from graphframes import GraphFrame
from itertools import product

# # Step 1: Load JSON Data into Neo4j
# with open('train.json', 'r') as file:
#     data = [json.loads(line) for line in file]

# Neo4j connection parameters
uri = "bolt://localhost:7687"
username = "neo4j"
password = "paras2003"

# # Create Graph in Neo4j
# def create_graph(tx, paper_id, references):
#     # Create a paper node
#     tx.run("MERGE (p:Paper {id: $paper_id})", paper_id=paper_id)
#     # For each reference, create a citation edge
#     for ref_id in references:
#         tx.run("""
#             MERGE (p:Paper {id: $paper_id})
#             MERGE (r:Paper {id: $ref_id})
#             MERGE (p)-[:CITES]->(r)
#             """, paper_id=paper_id, ref_id=ref_id)

# Initialize Neo4j connection
driver = GraphDatabase.driver(uri, auth=(username, password))

# # Add data to Neo4j
# with driver.session() as session:
#     for entry in tqdm(data, desc="Loading data into Neo4j", ncols=100):
#         paper_id = entry['paper']
#         references = entry.get('reference', [])
        # session.write_transaction(create_graph, paper_id, references)

# Step 2: Export graph from Neo4j to CSV
def export_to_csv():
    # Exporting Paper nodes to CSV
    query = "MATCH (p:Paper) RETURN p.id AS paper_id"
    with driver.session() as session:
        result = session.run(query)
        papers = pd.DataFrame([record['paper_id'] for record in result], columns=['paper_id'])
        papers.to_csv('papers.csv', index=False)

    # Exporting Citation edges to CSV
    query = "MATCH (p:Paper)-[:CITES]->(r:Paper) RETURN p.id AS citing_paper, r.id AS cited_paper"
    with driver.session() as session:
        result = session.run(query)
        citations = pd.DataFrame([(record['citing_paper'], record['cited_paper']) for record in result], columns=['citing_paper', 'cited_paper'])
        citations.to_csv('citations.csv', index=False)

# Export data to CSV
export_to_csv()

# Step 3: Initialize Spark session
spark = SparkSession.builder \
    .appName("Citation Graph Analysis") \
    .config("spark.driver.memory", "4g") \
    .config("spark.executor.memory", "4g") \
    .getOrCreate()

# Load citation graph into Spark
citations_df = spark.read.csv('citations.csv', header=True, inferSchema=True)
papers_df = spark.read.csv('papers.csv', header=True, inferSchema=True)

# Convert to GraphFrame
vertices = papers_df.selectExpr("paper_id as id")
edges = citations_df.selectExpr("citing_paper as src", "cited_paper as dst")

# Create GraphFrame
g = GraphFrame(vertices, edges)

# Step 4: SimRank Algorithm in Spark (without NetworkX)
def simrank_algorithm(graph, source, target, C=0.8, max_iter=1000, tol=0.0001):
    """
    Function to compute SimRank similarity between two nodes using Apache Spark
    """
    # Initialize a dictionary to store similarities for each node
    sim_cache = {}
    
    def recursive_similarity(u, v):
        if (u, v) in sim_cache:
            return sim_cache[(u, v)]
        
        # Get in-neighbors for both nodes
        in_neighbors_u = [row['src'] for row in graph.edges.filter(graph.edges.dst == u).collect()]
        in_neighbors_v = [row['src'] for row in graph.edges.filter(graph.edges.dst == v).collect()]
        
        if not in_neighbors_u or not in_neighbors_v:
            sim_cache[(u, v)] = 0
            return 0
        
        # Calculate the similarity based on neighbors
        scale = C / (len(in_neighbors_u) * len(in_neighbors_v)) if len(in_neighbors_u) > 0 and len(in_neighbors_v) > 0 else 0
        similarity = scale * sum(recursive_similarity(w, x) for w, x in product(in_neighbors_u, in_neighbors_v))
        
        sim_cache[(u, v)] = similarity
        return similarity
    
    return recursive_similarity(source, target)

# Step 5: Run SimRank for query nodes with different C values
def run_simrank():
    # Query nodes
    query_nodes = [2982615777, 1556418098]
    
    # Results for different C values
    results = {}
    for C in tqdm([0.7, 0.8, 0.9], desc="Running SimRank for different values of C", ncols=100):
        for query_node in query_nodes:
            similar_nodes = {}
            for row in g.vertices.collect():  # Loop through all vertices (papers)
                node = row['id']
                similarity = simrank_algorithm(g, query_node, node, C=C)
                similar_nodes[node] = similarity
            results[C] = similar_nodes
    
    return results

# Run SimRank and get the results
simrank_results = run_simrank()

# Step 6: Output results
for C, similar_nodes in simrank_results.items():
    print(f"SimRank results for C={C}:")
    sorted_similar_nodes = sorted(similar_nodes.items(), key=lambda x: x[1], reverse=True)
    for node, sim in sorted_similar_nodes[:10]:  # Top 10 most similar nodes
        print(f"Node {node} -> Similarity: {sim}")



24/11/15 14:26:04 WARN SparkSession: Using an existing Spark session; only runtime SQL configurations will take effect.


Py4JJavaError: An error occurred while calling o11072.loadClass.
: java.lang.ClassNotFoundException: org.graphframes.GraphFramePythonAPI
	at java.base/java.net.URLClassLoader.findClass(URLClassLoader.java:445)
	at java.base/java.lang.ClassLoader.loadClass(ClassLoader.java:587)
	at java.base/java.lang.ClassLoader.loadClass(ClassLoader.java:520)
	at java.base/jdk.internal.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
	at java.base/jdk.internal.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:77)
	at java.base/jdk.internal.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
	at java.base/java.lang.reflect.Method.invoke(Method.java:568)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:374)
	at py4j.Gateway.invoke(Gateway.java:282)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.ClientServerConnection.waitForCommands(ClientServerConnection.java:182)
	at py4j.ClientServerConnection.run(ClientServerConnection.java:106)
	at java.base/java.lang.Thread.run(Thread.java:833)
