In [1]:
!pip install pyspark
!pip install neo4j
!pip install pandas

import pandas as pd

Collecting pyspark
  Downloading pyspark-3.5.1.tar.gz (317.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m317.0/317.0 MB[0m [31m2.6 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25l[?25hdone
  Created wheel for pyspark: filename=pyspark-3.5.1-py2.py3-none-any.whl size=317488491 sha256=371be37e2bb83bb4d9d78454353b705ba3c6cf58c53b7c8f09c118d9f550055b
  Stored in directory: /root/.cache/pip/wheels/80/1d/60/2c256ed38dddce2fdd93be545214a63e02fbd8d74fb0b7f3a6
Successfully built pyspark
Installing collected packages: pyspark
Successfully installed pyspark-3.5.1
Collecting neo4j
  Downloading neo4j-5.20.0.tar.gz (202 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m203.0/203.0 kB[0m [31m4.3 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wh

In [2]:
from pyspark.sql import SparkSession

# Reuse the active Spark session if there is one; otherwise start a new one.
spark = (
    SparkSession.builder
    .appName("Ejemplo de PySpark en Jupyter Notebook")
    .getOrCreate()
)

# Keep a handle on the SparkContext, which later cells use to build RDDs.
sc = spark.sparkContext

# Parte 2
## Problemas a resolver

### Funciones para implementar PySpark

In [3]:
def hash(x, B):
    """Return the bucket for ``x``: the remainder of ``x`` divided by ``B``.

    NOTE(review): this definition shadows Python's built-in ``hash`` for the
    rest of the notebook; the name is kept because other cells call it with
    two arguments.

    Args:
        x: value to place in a bucket (must support ``%``).
        B: number of buckets; ``B == 0`` raises ``ZeroDivisionError``.

    Returns:
        ``x % B``.
    """
    bucket = x % B
    return bucket

In [4]:
def map_pdm(x, arista, y, B):
    """Replicate one edge to every reducer key that may need it.

    Both endpoints are bucketed with the notebook's two-argument ``hash``.
    For every bucket index ``i`` in ``range(B)`` the edge is emitted under
    three keys, with the pair of endpoint buckets rotated through the key
    positions: ``(hx, hy, i)``, ``(i, hx, hy)`` and ``(hy, i, hx)``.

    Args:
        x: source endpoint of the edge.
        arista: edge payload, carried through unchanged.
        y: target endpoint of the edge.
        B: number of buckets.

    Returns:
        A set of ``(key, (x, arista, y))`` pairs; building a set removes the
        duplicate pairs that arise when different rotations coincide.
    """
    edge = (x, arista, y)
    hx, hy = hash(x, B), hash(y, B)

    return {
        (key, edge)
        for i in range(B)
        for key in ((hx, hy, i), (i, hx, hy), (hy, i, hx))
    }

In [5]:
def triangulo(nodos):
    """Detect triangles among a collection of edges.

    Every combination of three edges (in input order) is tested against three
    patterns: the two possible directed 3-cycles through the edges, and one
    non-cyclic pattern in which two edges share a source and the third closes
    the triangle.

    Args:
        nodos: iterable of edges shaped ``(source, payload, target)``; only
            positions 0 and 2 are read.

    Returns:
        A list with one ``(v1, v2, v3)`` tuple of vertices per matching edge
        combination. The list is empty when fewer than three edges are given
        or when nothing matches. The same triangle can be reported more than
        once if duplicate edges are present.
    """
    # Local import keeps this cell self-contained when it is run on its own.
    from itertools import combinations

    triangulos_detectados = []

    # combinations() yields triples in the same i < j < k order as three
    # nested index loops would.
    for a, b, c in combinations(nodos, 3):
        if a[2] == b[0] and b[2] == c[0] and c[2] == a[0]:
            # Cycle a -> b -> c -> a: the three sources are the three vertices.
            triangulos_detectados.append((a[0], b[0], c[0]))

        elif a[2] == c[0] and c[2] == b[0] and b[2] == a[0]:
            # Cycle a -> c -> b -> a: again the sources are all distinct.
            triangulos_detectados.append((a[0], b[0], c[0]))

        elif a[0] == c[0] and a[2] == b[2] and c[2] == b[0]:
            # Non-cyclic triangle: a = x->y, c = x->z, b = z->y.
            # a and c share their source, so the third vertex has to be taken
            # from a's target; using c[0] here would report x twice and drop y.
            triangulos_detectados.append((a[0], b[0], a[2]))

    return triangulos_detectados

*Conexión a Neo4j*

In [6]:
import getpass
import os

from neo4j import GraphDatabase

# Connection settings are read from the environment so that no secret is
# stored in the notebook. If NEO4J_PASSWORD is unset the user is prompted.
# NOTE(review): the password that was previously hardcoded in this cell has
# been exposed wherever the notebook was shared and should be rotated.
URI = os.environ.get("NEO4J_URI", "neo4j+s://06ae1fa1.databases.neo4j.io")
AUTH = (
    os.environ.get("NEO4J_USER", "neo4j"),
    os.environ.get("NEO4J_PASSWORD") or getpass.getpass("Neo4j password: "),
)

driver = GraphDatabase.driver(URI, auth=AUTH)

# Run a trivial query as a smoke test; a failure is reported but not raised,
# so later cells will fail on their own if the connection is unusable.
with driver.session() as session:
    try:
        session.run("RETURN 1")
        print("Connection to Neo4j established successfully!")
    except Exception as e:
        print(f"Failed to connect to Neo4j: {e}")

Connection to Neo4j established successfully!


### Cargar el grafo




In [7]:
def get_data_from_neo4j():
    """Read every ``RELATED`` relationship between ``Node`` vertices.

    Uses the module-level ``driver`` to open a session and run one Cypher
    query.

    Returns:
        A list of ``(n1.id, r.weight, n2.id)`` tuples, one per matched
        relationship, i.e. ``(source id, weight, target id)``.
    """
    query = """
        MATCH (n1:Node)-[r:RELATED]->(n2:Node)
        RETURN n1.id AS id_form, n2.id AS id_to, r.weight AS weight
        """
    edges = []
    with driver.session() as session:
        for record in session.run(query):
            row = record.data()
            edges.append((row['id_form'], row['weight'], row['id_to']))
    return edges

# Fetch the edge tuples from Neo4j.
neo4j_data = get_data_from_neo4j()

In [8]:
# Turn the Neo4j edge tuples into a Spark RDD.
rdd = sc.parallelize(neo4j_data)

In [9]:
# Preview the first five edge tuples of the RDD.
rdd.take(5)

[(1, 1, 2), (1, 7, 3), (4, 3, 5), (6, 5, 7), (6, 14137, 8)]

Función para la búsqueda de triángulos

In [10]:
def buscar_triangulos(rdd, b):
    """Search an RDD of edges for triangles using ``b`` hash buckets.

    Each edge is expanded by ``map_pdm`` into ``(key, edge)`` pairs, the
    pairs are grouped by key, and ``triangulo`` is run on the list of edges
    collected under each key.

    Args:
        rdd: RDD whose elements are unpacked as the leading positional
            arguments of ``map_pdm``.
        b: number of buckets passed to ``map_pdm``.

    Returns:
        A dict mapping each key produced by ``map_pdm`` to the list of
        triangles that ``triangulo`` found for that key. Keys with no
        triangles map to an empty list. The whole result is collected to the
        driver.
    """
    por_clave = rdd.flatMap(lambda dato: map_pdm(*dato, b)).groupByKey()
    resultados = por_clave.map(lambda par: (par[0], triangulo(list(par[1]))))
    return dict(resultados.collect())


In [11]:
# Number of hash buckets passed to buscar_triangulos.
B = 100

In [12]:
# RDD of the Neo4j edge tuples that is fed to the triangle search.
test = sc.parallelize(neo4j_data)

In [13]:
# Run the triangle search; the result is a dict keyed by bucket triple.
triangulos_detectados = buscar_triangulos(test, B)

In [17]:
# Show the first five (key, triangles) entries of the result.
list(triangulos_detectados.items())[:5]

[((7, 76, 6), []),
 ((61, 7, 57), []),
 ((7, 10, 58), []),
 ((9, 7, 13), [(407, 409, 413)]),
 ((64, 24, 27), [(124, 127, 164)])]

In [18]:
# Close the Neo4j driver now that the notebook no longer queries the database.
driver.close()