## Install dependepncies

In [1]:
!pip install pyspark
!pip install graphframes



In [2]:
import pyspark
from delta import configure_spark_with_delta_pip

# Prepare the Spark builder with Delta extensions and set extra packages for GraphFrames
builder = pyspark.sql.SparkSession.builder.appName("project3") \
    .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension") \
    .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog")

# Start Spark with the extra GraphFrames package (version must match your Spark/Scala version)
spark = configure_spark_with_delta_pip(builder, extra_packages=["graphframes:graphframes:0.8.4-spark3.5-s_2.12"]).getOrCreate()
spark.conf.set("spark.sql.shuffle.partitions", spark._sc.defaultParallelism)
spark.conf.set("spark.sql.repl.eagerEval.enabled", True)
spark.conf.set("spark.sql.repl.eagerEval.truncate", 500)

## Load the data

In [3]:
df = spark.read.option("header", "true").csv("input/2009.csv")
df.printSchema()

root
 |-- FL_DATE: string (nullable = true)
 |-- OP_CARRIER: string (nullable = true)
 |-- OP_CARRIER_FL_NUM: string (nullable = true)
 |-- ORIGIN: string (nullable = true)
 |-- DEST: string (nullable = true)
 |-- CRS_DEP_TIME: string (nullable = true)
 |-- DEP_TIME: string (nullable = true)
 |-- DEP_DELAY: string (nullable = true)
 |-- TAXI_OUT: string (nullable = true)
 |-- WHEELS_OFF: string (nullable = true)
 |-- WHEELS_ON: string (nullable = true)
 |-- TAXI_IN: string (nullable = true)
 |-- CRS_ARR_TIME: string (nullable = true)
 |-- ARR_TIME: string (nullable = true)
 |-- ARR_DELAY: string (nullable = true)
 |-- CANCELLED: string (nullable = true)
 |-- CANCELLATION_CODE: string (nullable = true)
 |-- DIVERTED: string (nullable = true)
 |-- CRS_ELAPSED_TIME: string (nullable = true)
 |-- ACTUAL_ELAPSED_TIME: string (nullable = true)
 |-- AIR_TIME: string (nullable = true)
 |-- DISTANCE: string (nullable = true)
 |-- CARRIER_DELAY: string (nullable = true)
 |-- WEATHER_DELAY: strin

## Data Cleansing

In [4]:
from pyspark.sql.functions import col

# Cast CANCELLED to float
df = df.withColumn("CANCELLED", col("CANCELLED").cast("float"))

# Filter on 0.0
df_clean = (
    df
    .filter(col("CANCELLED") == 0.0)
    .dropna(subset=["ORIGIN", "DEST"])
    .filter(col("ORIGIN") != col("DEST"))
)
df_clean.show()

+----------+----------+-----------------+------+----+------------+--------+---------+--------+----------+---------+-------+------------+--------+---------+---------+-----------------+--------+----------------+-------------------+--------+--------+-------------+-------------+---------+--------------+-------------------+-----------+
|   FL_DATE|OP_CARRIER|OP_CARRIER_FL_NUM|ORIGIN|DEST|CRS_DEP_TIME|DEP_TIME|DEP_DELAY|TAXI_OUT|WHEELS_OFF|WHEELS_ON|TAXI_IN|CRS_ARR_TIME|ARR_TIME|ARR_DELAY|CANCELLED|CANCELLATION_CODE|DIVERTED|CRS_ELAPSED_TIME|ACTUAL_ELAPSED_TIME|AIR_TIME|DISTANCE|CARRIER_DELAY|WEATHER_DELAY|NAS_DELAY|SECURITY_DELAY|LATE_AIRCRAFT_DELAY|Unnamed: 27|
+----------+----------+-----------------+------+----+------------+--------+---------+--------+----------+---------+-------+------------+--------+---------+---------+-----------------+--------+----------------+-------------------+--------+--------+-------------+-------------+---------+--------------+-------------------+-----------+
|

### Create Graph

In [5]:
from graphframes import GraphFrame

vertices = df_clean.select(col("ORIGIN").alias("id")) \
    .union(df_clean.select(col("DEST").alias("id"))) \
    .distinct()

display(vertices)

id
DCA
ABQ
LBB
PVD
AVL
DSM
XNA
SYR
CAE
FAT


In [6]:
edges = df_clean.select(
    col("ORIGIN").alias("src"),
    col("DEST").alias("dst")
)

display(edges)

src,dst
DCA,EWR
EWR,IAD
EWR,DCA
DCA,EWR
IAD,EWR
ATL,EWR
CLE,ATL
DCA,EWR
EWR,DCA
EWR,DCA


In [7]:
graph = GraphFrame(vertices, edges)

vertices.cache()
edges.cache()

display(graph)

GraphFrame(v:[id: string], e:[src: string, dst: string])

## Query 1: Compute different statistics

In [8]:
from pyspark.sql.functions import count

# Compute out-degree: count flights leaving each airport.
out_degree = edges.groupBy("src").agg(count("*").alias("out_degree")) \
                  .withColumnRenamed("src", "id")

# Compute in-degree: count flights arriving at each airport.
in_degree = edges.groupBy("dst").agg(count("*").alias("in_degree")) \
                 .withColumnRenamed("dst", "id")

# Combine the results with vertices to include all airports,
degree_df = vertices.join(in_degree, on="id", how="left") \
                    .join(out_degree, on="id", how="left") \
                    .na.fill(0) \
                    .withColumn("total_degree", col("in_degree") + col("out_degree"))

# Display the degree statistics
display(degree_df)

id,in_degree,out_degree,total_degree
DCA,78016,77900,155916
ABQ,35419,35393,70812
LBB,7891,7873,15764
PVD,18112,18090,36202
AVL,4493,4470,8963
DSM,14787,14729,29516
XNA,13409,13343,26752
SYR,9207,9189,18396
CAE,9803,9796,19599
FAT,12255,12231,24486


### Triangle count

In [9]:
from pyspark.sql import functions as F
from pyspark.sql.functions import col

# Calculate triangle count (undirected):
def triangleCount(graph):
    edges = graph.edges.select(F.least("src", "dst").alias("src"), F.greatest("src", "dst").alias("dst")).distinct()

    # 1. Find neighbors for each node.
    neighbors = edges.groupBy("src").agg(F.collect_set("dst").alias("neighbors"))

    # 2. Join neighbors with edges to find pairs of neighbors for a node.
    neighbor_pairs = neighbors.alias("n1").join(edges.alias("e"), F.col("n1.src") == F.col("e.src")) \
        .join(neighbors.alias("n2"), F.col("e.dst") == F.col("n2.src")) \
        .select(F.col("n1.src").alias("node1"), F.col("n1.neighbors").alias("neighbors1"),
                F.col("n2.src").alias("node2"), F.col("n2.neighbors").alias("neighbors2"))

    # 3. Find intersections of neighbor sets and count.
    triangles = neighbor_pairs.withColumn("intersection_size", F.size(F.array_intersect("neighbors1", "neighbors2"))) \
        .select("node1", "node2", "intersection_size")

    # 4. Group by node1 to calculate total triangles.
    triangle_counts = triangles.groupBy("node1").agg(F.sum("intersection_size").alias("total_triangles")) \
        .withColumn("triangleCount", F.expr("total_triangles / 2")) \
        .select(F.col("node1").alias("vertex"), "triangleCount")

    return triangle_counts

triangle_counts = triangleCount(graph)
triangle_counts.show()

+------+-------------+
|vertex|triangleCount|
+------+-------------+
|   AVL|         10.0|
|   CAE|         17.5|
|   SAN|          8.0|
|   ABQ|        155.5|
|   DCA|        205.5|
|   AZO|          3.0|
|   DSM|         23.5|
|   ANC|         61.0|
|   FAT|          6.0|
|   LNK|          2.5|
|   HPN|          4.0|
|   MBS|          0.5|
|   BGM|          0.0|
|   LBB|          0.0|
|   CDV|          0.0|
|   BJI|          0.0|
|   DTW|        347.5|
|   LGA|         63.5|
|   BOI|         38.5|
|   AUS|        193.0|
+------+-------------+
only showing top 20 rows



## Query 2: Total number of triangles

In [10]:
total_triangle_count = triangle_counts.agg(F.sum("triangleCount").alias("total")).collect()[0]["total"]
print("Total number of triangles:", total_triangle_count)

Total number of triangles: 7995.5


## Query 3: Closeness Centrality

In [8]:
from pyspark.sql.functions import lit, col, when
from pyspark.sql import DataFrame

def compute_closeness_centrality(vertices, edges, max_depth=10):
    closeness_scores = []

    vertex_ids = [row["id"] for row in vertices.select("id").collect()]

    for source in vertex_ids:
        print(f"Processing: {source}")  # debug log

        visited = vertices.withColumn("distance", when(col("id") == source, lit(0)))
        frontier = visited.filter(col("id") == source)

        depth = 0
        while frontier.count() > 0 and depth < max_depth:
            print(f"Depth: {depth}, Frontier size: {frontier.count()}")  # debug log

            next_frontier = frontier.join(edges, frontier["id"] == edges["src"]) \
                                    .select(edges["dst"].alias("id")) \
                                    .distinct()

            next_frontier = next_frontier.join(visited, "id", "left_anti") \
                                         .withColumn("distance", lit(depth + 1))

            visited = visited.union(next_frontier).distinct()
            frontier = next_frontier
            depth += 1

        reachable = visited.filter((col("id") != source) & col("distance").isNotNull())
        reachable_count = reachable.count()

        if reachable_count > 0:
            total_distance = reachable.agg({"distance": "sum"}).collect()[0][0]
            avg_distance = total_distance / reachable_count
            closeness = 1 / avg_distance if avg_distance > 0 else 0
        else:
            closeness = 0

        closeness_scores.append((source, closeness))

    return spark.createDataFrame(closeness_scores, ["id", "closeness_centrality"])


In [9]:
from pyspark.sql import DataFrame
from pyspark.sql.functions import col

import gc
import time

def compute_all_closeness_in_batches(vertices: DataFrame, edges: DataFrame, batch_size: int = 10, max_depth: int = 3) -> DataFrame:
    from pyspark.sql.functions import col

    airport_ids = [row["id"] for row in vertices.collect()]
    total = len(airport_ids)
    all_results = []

    for i in range(0, total, batch_size):
        print(f"\n🧮 Batch {i // batch_size + 1}: Processing airports {i} to {min(i + batch_size, total)}...")

        batch_ids = airport_ids[i:i + batch_size]
        batch_vertices = vertices.filter(col("id").isin(batch_ids))

        # Try/catch for robustness
        try:
            batch_result = compute_closeness_centrality(batch_vertices, edges, max_depth=max_depth)
            all_results.append(batch_result)
        except Exception as e:
            print(f"❌ Error in batch {i // batch_size + 1}: {e}")
        
        # Give Spark a break (crucial in Docker)
        spark.catalog.clearCache()
        gc.collect()
        time.sleep(2)

    # Combine results safely
    if not all_results:
        return spark.createDataFrame([], schema="id STRING, closeness_centrality DOUBLE")

    combined = all_results[0]
    for df in all_results[1:]:
        combined = combined.union(df)

    return combined




In [11]:
closeness_df_batched = compute_all_closeness_in_batches(vertices.limit(5), edges, batch_size=5, max_depth=2)
display(closeness_df_batched.orderBy(col("closeness_centrality").desc()))


🧮 Batch 1: Processing airports 0 to 5...
Processing: DCA
Depth: 0, Frontier size: 1
Depth: 1, Frontier size: 47
Processing: ABQ
Depth: 0, Frontier size: 1
Depth: 1, Frontier size: 30
Processing: LBB
Depth: 0, Frontier size: 1
Depth: 1, Frontier size: 7
Processing: PVD
Depth: 0, Frontier size: 1
Depth: 1, Frontier size: 19
Processing: AVL
Depth: 0, Frontier size: 1
Depth: 1, Frontier size: 8

🧮 Batch 2: Processing airports 5 to 10...
Processing: DSM
Depth: 0, Frontier size: 1
Depth: 1, Frontier size: 16


ERROR:root:Exception while sending command.
Traceback (most recent call last):
  File "/usr/local/spark/python/lib/py4j-0.10.9.7-src.zip/py4j/clientserver.py", line 516, in send_command
    raise Py4JNetworkError("Answer from Java side is empty")
py4j.protocol.Py4JNetworkError: Answer from Java side is empty

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/usr/local/spark/python/lib/py4j-0.10.9.7-src.zip/py4j/java_gateway.py", line 1038, in send_command
    response = connection.send_command(command)
               ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/spark/python/lib/py4j-0.10.9.7-src.zip/py4j/clientserver.py", line 539, in send_command
    raise Py4JNetworkError(
py4j.protocol.Py4JNetworkError: Error while sending or receiving


❌ Error in batch 2: An error occurred while calling o939.count


ConnectionRefusedError: [Errno 111] Connection refused

ERROR:root:Exception while sending command.
Traceback (most recent call last):
  File "/usr/local/spark/python/lib/py4j-0.10.9.7-src.zip/py4j/clientserver.py", line 516, in send_command
    raise Py4JNetworkError("Answer from Java side is empty")
py4j.protocol.Py4JNetworkError: Answer from Java side is empty

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/usr/local/spark/python/lib/py4j-0.10.9.7-src.zip/py4j/java_gateway.py", line 1038, in send_command
    response = connection.send_command(command)
               ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/spark/python/lib/py4j-0.10.9.7-src.zip/py4j/clientserver.py", line 539, in send_command
    raise Py4JNetworkError(
py4j.protocol.Py4JNetworkError: Error while sending or receiving


In [15]:
print("Total edges:", edges.count())
edges.show(5)


Total edges: 6342300
+---+---+
|src|dst|
+---+---+
|DCA|EWR|
|EWR|IAD|
|EWR|DCA|
|DCA|EWR|
|IAD|EWR|
+---+---+
only showing top 5 rows



In [17]:
from pyspark.sql.functions import length, trim

# Check for edge values not in vertices
edges_not_in_vertices = edges.filter(~col("src").isin([row["id"] for row in vertices.collect()]) |
                                     ~col("dst").isin([row["id"] for row in vertices.collect()]))

print("Edges with unmatched nodes:", edges_not_in_vertices.count())
edges_not_in_vertices.show(5)

Edges with unmatched nodes: 0
+---+---+
|src|dst|
+---+---+
+---+---+



In [18]:
from collections import deque

airport = "ATL"  # a well-connected airport

frontier = deque([airport])
visited = set()
depth = 0
max_depth = 3

while frontier and depth < max_depth:
    next_level = deque()
    for node in frontier:
        neighbors = edges.filter(edges["src"] == node).select("dst").rdd.flatMap(lambda x: x).collect()
        for n in neighbors:
            if n not in visited:
                next_level.append(n)
                visited.add(n)
    frontier = next_level
    depth += 1

print(f"From {airport}, reachable nodes within depth {max_depth}: {len(visited)}")

From ATL, reachable nodes within depth 3: 296


In [23]:
del count

NameError: name 'count' is not defined