## Install dependencies

In [1]:
!pip install pyspark
!pip install graphframes



In [2]:
import pyspark
from delta import configure_spark_with_delta_pip

# Prepare the Spark builder with the Delta SQL extension and the Delta catalog.
builder = (
    pyspark.sql.SparkSession.builder.appName("project3")
    .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension")
    .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog")
)

# Start Spark with GraphFrames passed as an extra package
# (its coordinates must match your Spark/Scala version).
spark = configure_spark_with_delta_pip(
    builder,
    extra_packages=["graphframes:graphframes:0.8.4-spark3.5-s_2.12"],
).getOrCreate()

# Size shuffles to the context's default parallelism. The public `sparkContext`
# property is used here instead of the private `_sc` attribute.
spark.conf.set("spark.sql.shuffle.partitions", spark.sparkContext.defaultParallelism)

# Render DataFrames eagerly in the notebook, truncating cell values at 500 characters.
spark.conf.set("spark.sql.repl.eagerEval.enabled", True)
spark.conf.set("spark.sql.repl.eagerEval.truncate", 500)

## Load the data

In [3]:
# Read the 2009 flights CSV, taking column names from its header row.
# No schema is supplied or inferred, so every column is loaded as a string.
df = spark.read.csv("input/2009.csv", header=True)
df.printSchema()

root
 |-- FL_DATE: string (nullable = true)
 |-- OP_CARRIER: string (nullable = true)
 |-- OP_CARRIER_FL_NUM: string (nullable = true)
 |-- ORIGIN: string (nullable = true)
 |-- DEST: string (nullable = true)
 |-- CRS_DEP_TIME: string (nullable = true)
 |-- DEP_TIME: string (nullable = true)
 |-- DEP_DELAY: string (nullable = true)
 |-- TAXI_OUT: string (nullable = true)
 |-- WHEELS_OFF: string (nullable = true)
 |-- WHEELS_ON: string (nullable = true)
 |-- TAXI_IN: string (nullable = true)
 |-- CRS_ARR_TIME: string (nullable = true)
 |-- ARR_TIME: string (nullable = true)
 |-- ARR_DELAY: string (nullable = true)
 |-- CANCELLED: string (nullable = true)
 |-- CANCELLATION_CODE: string (nullable = true)
 |-- DIVERTED: string (nullable = true)
 |-- CRS_ELAPSED_TIME: string (nullable = true)
 |-- ACTUAL_ELAPSED_TIME: string (nullable = true)
 |-- AIR_TIME: string (nullable = true)
 |-- DISTANCE: string (nullable = true)
 |-- CARRIER_DELAY: string (nullable = true)
 |-- WEATHER_DELAY: strin

## Data Cleansing

In [4]:
from pyspark.sql.functions import col

# CANCELLED is loaded as a string; cast it to float so it can be compared numerically.
df = df.withColumn("CANCELLED", col("CANCELLED").cast("float"))

# Keep only flights that were not cancelled, have both endpoints present,
# and do not start and end at the same airport.
df_clean = (
    df
    .where(col("CANCELLED") == 0.0)
    .na.drop(subset=["ORIGIN", "DEST"])
    .where(col("ORIGIN") != col("DEST"))
)
df_clean.show()

+----------+----------+-----------------+------+----+------------+--------+---------+--------+----------+---------+-------+------------+--------+---------+---------+-----------------+--------+----------------+-------------------+--------+--------+-------------+-------------+---------+--------------+-------------------+-----------+
|   FL_DATE|OP_CARRIER|OP_CARRIER_FL_NUM|ORIGIN|DEST|CRS_DEP_TIME|DEP_TIME|DEP_DELAY|TAXI_OUT|WHEELS_OFF|WHEELS_ON|TAXI_IN|CRS_ARR_TIME|ARR_TIME|ARR_DELAY|CANCELLED|CANCELLATION_CODE|DIVERTED|CRS_ELAPSED_TIME|ACTUAL_ELAPSED_TIME|AIR_TIME|DISTANCE|CARRIER_DELAY|WEATHER_DELAY|NAS_DELAY|SECURITY_DELAY|LATE_AIRCRAFT_DELAY|Unnamed: 27|
+----------+----------+-----------------+------+----+------------+--------+---------+--------+----------+---------+-------+------------+--------+---------+---------+-----------------+--------+----------------+-------------------+--------+--------+-------------+-------------+---------+--------------+-------------------+-----------+
|

### Create Graph

In [5]:
from graphframes import GraphFrame

# Every airport code that appears as an origin or as a destination becomes one vertex.
vertices = (
    df_clean.select(df_clean["ORIGIN"].alias("id"))
    .union(df_clean.select(df_clean["DEST"].alias("id")))
    .distinct()
)

display(vertices)

id
DCA
ABQ
LBB
PVD
AVL
DSM
XNA
SYR
CAE
FAT


In [6]:
# One edge per cleaned flight record, ORIGIN -> DEST, renamed to src/dst.
# Repeated routes are kept as separate edges (no de-duplication here).
edges = df_clean.select("ORIGIN", "DEST").toDF("src", "dst")

display(edges)

src,dst
DCA,EWR
EWR,IAD
EWR,DCA
DCA,EWR
IAD,EWR
ATL,EWR
CLE,ATL
DCA,EWR
EWR,DCA
EWR,DCA


In [7]:
# Mark both inputs for caching, since the queries below reuse them.
vertices.cache()
edges.cache()

# Assemble the airport graph from the vertex and edge DataFrames.
graph = GraphFrame(vertices, edges)

display(graph)

GraphFrame(v:[id: string], e:[src: string, dst: string])

## Query 1: Compute different statistics

In [8]:
from pyspark.sql.functions import count

# Out-degree: number of flights leaving each airport.
out_degree = edges.groupBy("src").count().toDF("id", "out_degree")

# In-degree: number of flights arriving at each airport.
in_degree = edges.groupBy("dst").count().toDF("id", "in_degree")

# Left-join both counts onto the vertex list so that every airport is kept;
# an airport missing from one side gets a null there, which is replaced by 0
# before the two counts are added together.
degree_df = (
    vertices
    .join(in_degree, "id", "left")
    .join(out_degree, "id", "left")
    .fillna(0)
    .withColumn("total_degree", col("in_degree") + col("out_degree"))
)

# Display the degree statistics
display(degree_df)

id,in_degree,out_degree,total_degree
DCA,78016,77900,155916
ABQ,35419,35393,70812
LBB,7891,7873,15764
PVD,18112,18090,36202
AVL,4493,4470,8963
DSM,14787,14729,29516
XNA,13409,13343,26752
SYR,9207,9189,18396
CAE,9803,9796,19599
FAT,12255,12231,24486


### Triangle count

In [10]:
# Run GraphFrames' triangleCount on the airport graph.
# The result (shown below) has one row per vertex `id` with its triangle `count`.
trianglecount = graph.triangleCount()
display(trianglecount)

count,id
673,DCA
311,ABQ
23,LBB
161,PVD
27,AVL
109,DSM
89,XNA
82,SYR
46,CAE
31,FAT


## Query 2: Total number of triangles

In [11]:
# Compute the total triangle count.
# Each triangle is counted once at each of its three vertices, so the per-vertex
# counts sum to three times the number of distinct triangles.
corner_total = trianglecount.groupBy().sum("count").first()[0]

# The sum is NULL (None) when there are no vertices; treat that as zero.
# Integer division keeps the result an integer count rather than a float.
total_triangles = (corner_total or 0) // 3
print("Total number of triangles in the graph:", total_triangles)

Total number of triangles in the graph: 15991.0
