<a href="https://colab.research.google.com/github/Pere91/SAC_Indirect/blob/main/SAC_Spark.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [109]:
# Environment setup via shell commands: refresh the apt package index, install a
# headless OpenJDK 17, then download and unpack the Spark 3.5.7 (Hadoop 3 build)
# binary distribution into the current working directory.
!apt-get update -qq
# Standard output of the installer is discarded to keep the cell output short.
!apt-get install openjdk-17-jdk-headless -qq > /dev/null
# NOTE(review): presumably this download host only keeps current releases, so the
# pinned 3.5.7 URL may stop resolving after a newer release — confirm before reuse.
!wget -q https://dlcdn.apache.org/spark/spark-3.5.7/spark-3.5.7-bin-hadoop3.tgz
!tar xf spark-3.5.7-bin-hadoop3.tgz

W: Skipping acquire of configured file 'main/source/Sources' as repository 'https://r2u.stat.illinois.edu/ubuntu jammy InRelease' does not seem to provide it (sources.list entry misspelt?)


In [110]:
import os
# Tell PySpark where the JDK and the unpacked Spark distribution live.
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-17-openjdk-amd64"
os.environ["SPARK_HOME"] = "/content/spark-3.5.7-bin-hadoop3"

# Put Spark's bin directory on PATH exactly once: appending unconditionally
# would add a duplicate entry every time this cell is re-run.
spark_bin_dir = os.path.join(os.environ["SPARK_HOME"], "bin")
current_path = os.environ.get("PATH", "")
if spark_bin_dir not in current_path.split(os.pathsep):
  # Avoid a leading separator (an empty PATH entry) when PATH was unset/empty.
  os.environ["PATH"] = (
      current_path + os.pathsep + spark_bin_dir if current_path else spark_bin_dir
  )

In [111]:
from pyspark import SparkConf, SparkContext
import networkx as nx

In [112]:
# Configure Spark with the "local" master and the application name "SAC_Spark",
# then reuse the running SparkContext if one exists or create a new one.
conf = SparkConf()
conf.setMaster("local")
conf.setAppName("SAC_Spark")
sc = SparkContext.getOrCreate(conf=conf)

In [113]:
# Directed, weighted example graph. Each triple is (source, target, weight).
# The edge ("A", "C", 3.0) used to be listed twice; the redundant second entry
# has been dropped (the resulting graph and its node order are unchanged).
G = nx.DiGraph()
G.add_weighted_edges_from([
    ("A", "C", 3.0), ("A", "F", 2.0),
    ("C", "D", 4.0), ("C", "F", 2.0), ("C", "E", 1.0),
    ("F", "E", 3.0), ("F", "G", 5.0), ("F", "B", 6.0),
    ("E", "B", 2.0),
    ("D", "B", 1.0),
    ("G", "B", 2.0)
])

In [114]:
def get_neighbors(node):
  """Return the neighbor list stored in a vertex record.

  Args:
    node: A vertex record shaped like ``(name, (neighbors, cost, visited, path))``.

  Returns:
    The first element of the record's state tuple, i.e. the neighbor list
    (the same object, not a copy).
  """
  state = node[1]
  return state[0]

In [115]:
def mark_visited(node):
  """Return a copy of a vertex record with its visited flag set to True.

  Args:
    node: A vertex record shaped like ``(name, (neighbors, cost, visited, path))``.

  Returns:
    A new ``(name, (neighbors, cost, True, path))`` tuple; the input record is
    left untouched and the neighbor/path objects are reused as-is.
  """
  name = node[0]
  neighbors, cost, path = node[1][0], node[1][1], node[1][3]
  return (name, (neighbors, cost, True, path))

In [127]:
def generate_path(src, dst):
  """Build the path that reaches ``dst`` by extending the path stored in ``src``.

  Args:
    src: Vertex record ``(name, (neighbors, cost, visited, path))`` whose path
      is the prefix of the result.
    dst: Vertex record whose name is appended to that prefix.

  Returns:
    A new list: ``src``'s path followed by ``dst``'s name. ``src``'s own path
    list is not modified.
  """
  path_to_src = src[1][3]
  dst_name = dst[0]
  return path_to_src + [dst_name]

In [116]:
# Name of the vertex the search starts from.
INIT_NODE = 'A'

# Vertex records are (name, (neighbors, cost, visited, path)). They are kept
# both as a list (handed to Spark later) and as a dict keyed by vertex name.
pyspark_graph = []
graph_dict = {}

for node in G.nodes():
  # Outgoing edges of this vertex as (neighbor name, edge weight) pairs.
  neighbors = [(nbr, attrs["weight"]) for nbr, attrs in G[node].items()]

  # The start vertex costs 0 and its path is just itself; every other vertex
  # starts with infinite cost and an empty path.
  weight, path = (0, [INIT_NODE]) if node == INIT_NODE else (float("inf"), [])

  record = (neighbors, weight, False, path)
  pyspark_graph.append((node, record))
  graph_dict[node] = record

In [117]:
# Print every vertex record, one per line, to inspect the initial state
# (name, (neighbors, cost, visited, path)) before handing it to Spark.
for city in pyspark_graph:
  print(city)

('A', ([('C', 3.0), ('F', 2.0)], 0, False, ['A']))
('C', ([('D', 4.0), ('F', 2.0), ('E', 1.0)], inf, False, []))
('F', ([('E', 3.0), ('G', 5.0), ('B', 6.0)], inf, False, []))
('D', ([('B', 1.0)], inf, False, []))
('E', ([('B', 2.0)], inf, False, []))
('G', ([('B', 2.0)], inf, False, []))
('B', ([], inf, False, []))


In [118]:
# Select starting node
# Distribute the vertex records, keep only those named INIT_NODE, and take the
# first match as the starting record.
vertices = sc.parallelize(pyspark_graph)
start_candidates = vertices.filter(lambda vertex: vertex[0] == INIT_NODE)
start_node = start_candidates.collect()[0]
start_node

('A', ([('C', 3.0), ('F', 2.0)], 0, False, ['A']))

In [119]:
# Mark starting node as visited
# mark_visited returns a new record with the visited flag set to True, so the
# name is rebound to that new record; running this cell again yields the same
# record.
start_node = mark_visited(start_node)
start_node

('A', ([('C', 3.0), ('F', 2.0)], 0, True, ['A']))

In [120]:
# Get starting node neighbors
# (neighbor name, edge weight) pairs reachable from the starting node.
neighbor_list = get_neighbors(start_node)
# Same data as a mapping: neighbor name -> weight of the edge leading to it.
neighbor_dict = dict(neighbor_list)
# Keep the full records of those neighbors. Membership is tested against the
# dict keys, instead of rebuilding and scanning a list of names for every vertex.
neighbors = vertices.filter(lambda x: x[0] in neighbor_dict).collect()
neighbors

[('C', ([('D', 4.0), ('F', 2.0), ('E', 1.0)], inf, False, [])),
 ('F', ([('E', 3.0), ('G', 5.0), ('B', 6.0)], inf, False, []))]

In [121]:
# Select neighbors that must be updated
# Cost already accumulated at the node being expanded (0 for the start node).
start_cost = start_node[1][1]
# Use a separate name for the RDD so `neighbors` stays a plain list and this
# cell can be run again right after itself without failing.
neighbors_rdd = sc.parallelize(neighbors)
# A neighbor is selected when reaching it through the current node (cost so far
# + edge weight) is no more expensive than the cost it currently stores.
# Comparing the bare edge weight is only correct while the current cost is 0.
to_be_updated = neighbors_rdd.filter(
    lambda x: start_cost + neighbor_dict[x[0]] <= x[1][1]
).collect()
to_be_updated


[('C', ([('D', 4.0), ('F', 2.0), ('E', 1.0)], inf, False, [])),
 ('F', ([('E', 3.0), ('G', 5.0), ('B', 6.0)], inf, False, []))]

In [128]:
# Update path costs
# Each selected neighbor keeps its neighbor list and visited flag; its cost
# becomes the cost accumulated at start_node plus the weight of the connecting
# edge (the bare edge weight is only right while start_node's cost is 0), and
# its path becomes start_node's path extended with the neighbor's own name.
neighbors = sc.parallelize(to_be_updated)
updated_neighbors = neighbors.map(
    lambda x: (
        x[0],
        (
            x[1][0],
            start_node[1][1] + neighbor_dict[x[0]],
            x[1][2],
            generate_path(start_node, x),
        ),
    )
).collect()
updated_neighbors

[('C', ([('D', 4.0), ('F', 2.0), ('E', 1.0)], 3.0, False, ['A', 'C'])),
 ('F', ([('E', 3.0), ('G', 5.0), ('B', 6.0)], 2.0, False, ['A', 'F']))]