In [2]:
!pip install pyspark
!pip install neo4j
!pip install pandas

import pandas as pd



In [3]:
from pyspark.sql import SparkSession

spark = SparkSession.builder \
    .appName("Ejemplo de PySpark en Jupyter Notebook") \
    .getOrCreate()

# Obtener el SparkContext
sc = spark.sparkContext

In [4]:
graph = [(1,11,2),(1,11,3),(2,11,3),(3,11,2),(3,11,4),(4,11,1),(4,11,2),(4,11,3),(4,12,5),(5,12,1),(5,12,2),(5,12,6)]
#graph = [(2,11,2), (1,11,2), (2,11,1)]

In [5]:
B = 2

In [6]:
def hash(x):
    return x % B

In [7]:
def map_pdm(x, arista, y):

    x_hash = hash(x)
    y_hash = hash(y)

    l = []
    for i in range(B):
        l.append(((x_hash, y_hash, i), (x, arista, y)))
        l.append(((i, x_hash, y_hash), (x, arista, y)))
        l.append(((y_hash, i, x_hash), (x, arista, y)))

    return set(l)

In [8]:
datos_rdd = sc.parallelize(graph)

### Pasarlo por map

In [9]:
rdd_map = datos_rdd.flatMap(lambda dato: map_pdm(*dato))

rdd_map.collect()[0:2]

[((1, 0, 0), (1, 11, 2)), ((0, 1, 1), (1, 11, 2))]

### Reduce


In [20]:
rdd_reduce = rdd_map.groupByKey().mapValues(list)
rdd_reduce.collect()[0]

((1, 0, 0),
 [(1, 11, 2),
  (2, 11, 3),
  (3, 11, 2),
  (3, 11, 4),
  (4, 11, 1),
  (4, 11, 2),
  (4, 11, 3),
  (4, 12, 5),
  (5, 12, 2),
  (5, 12, 6)])

In [28]:
def triangulo(nodos):

  triangulos_detectados = []
  for i in range(len(nodos)):
    nodo_actual = nodos[i]

    for j in range(i, len(nodos)):
      if j + 1 >= len(nodos):
        break
      nodo_sig = nodos[j]
      nodo_sub_sig = nodos[j + 1]


      if nodo_actual[2] == nodo_sig[0] and nodo_sig[2] == nodo_sub_sig[0] and nodo_actual[0] == nodo_sub_sig[2]:
        triangulos_detectados.append((nodo_actual[0], nodo_sig[0], nodo_sub_sig[0]))

      elif nodo_actual[2] == nodo_sub_sig[0] and nodo_sig[0] == nodo_sub_sig[2] and nodo_actual[0] == nodo_sig[2]:
        triangulos_detectados.append((nodo_actual[0], nodo_sig[0], nodo_sub_sig[0]))

  return triangulos_detectados

In [44]:
final = rdd_reduce.map(lambda nodos: (nodos[0], triangulo(nodos[1])))
final.collect()

[((1, 0, 0), []),
 ((0, 1, 0), []),
 ((0, 0, 1), []),
 ((1, 1, 1), []),
 ((0, 1, 1), [(1, 3, 4)]),
 ((1, 0, 1), [(1, 3, 4)]),
 ((1, 1, 0), [(1, 3, 4)]),
 ((0, 0, 0), [])]

In [41]:
dicc = {}
for i in final.collect():
    dicc[i[0]] = i[1]

In [46]:
dicc

{(1, 0, 0): [],
 (0, 1, 0): [],
 (0, 0, 1): [],
 (1, 1, 1): [],
 (0, 1, 1): [(1, 3, 4)],
 (1, 0, 1): [(1, 3, 4)],
 (1, 1, 0): [(1, 3, 4)],
 (0, 0, 0): []}

In [None]:
from neo4j import GraphDatabase

URI = "neo4j+s://06ae1fa1.databases.neo4j.io"
AUTH = ("neo4j","QjwVk3kN-OI5bTt-fg6LZy-F4LMHCpL9HFxuvSuq-OE")

driver = GraphDatabase.driver(URI, auth=AUTH)
with driver.session() as session:
    try:
        session.run("RETURN 1")
        print("Connection to Neo4j established successfully!")
    except Exception as e:
        print(f"Failed to connect to Neo4j: {e}")

In [None]:
cites = pd.read_csv('cora.cites',sep='\t',header=None,names=['target','source'])

column_names = ["paper_id"] + [f"word_{idx}" for idx in range(1433)] + ["subject"]
papers = pd.read_csv(
    'cora.content', sep="\t", names=column_names,
)
subjects = papers[["paper_id","subject"]]

In [None]:
def load_papers(session, subjects):
    query = """
    UNWIND $nodes AS row
    MERGE (p:Paper {id: row.paper_id})
    SET p.subject = row.subject;
    """
    session.run(query, nodes=subjects)

def load_relationships(session, cites):
    query = """
    UNWIND $edges AS row
    MATCH (p1:Paper {id: row.source})
    MATCH (p2:Paper {id: row.target})
    MERGE (p1)-[:CITES]->(p2);
    """
    session.run(query, edges=cites)

In [None]:
with driver.session() as session:
    load_papers(session,subjects.to_dict('records'))
    load_relationships(session,cites.to_dict('records'))
    query = "MATCH (n) RETURN n LIMIT 5"
    result = session.run(query)
    for record in result:
        print(record)

### Función a cambiar

In [None]:
def get_data_from_neo4j():
    with driver.session() as session:
        result = session.run("MATCH (n) RETURN n.name AS name, n.age AS age")
        return [{"id": record["name"], "age": record["age"]} for record in result]

# Obtiene los datos de Neo4j
neo4j_data = get_data_from_neo4j()

# Convierte los datos en un RDD de PySpark
rdd = spark.sparkContext.parallelize(neo4j_data)