In [0]:
from graphframes import GraphFrame
from pyspark.sql.functions import desc

In [0]:
flights = spark.read.option("header","true").csv("dbfs:/FileStore/shared_uploads/tingfangwang12@gmail.com/T_T100D_SEGMENT_US_CARRIER_ONLY.csv")
flights.show(5)

+------+-----------------+----+-----------------+
|ORIGIN| ORIGIN_CITY_NAME|DEST|   DEST_CITY_NAME|
+------+-----------------+----+-----------------+
|   06A|     Kizhuyak, AK| A43|Kodiak Island, AK|
|   06A|     Kizhuyak, AK| A43|Kodiak Island, AK|
|   1G4|Peach Springs, AZ| BLD| Boulder City, NV|
|   1G4|Peach Springs, AZ| BLD| Boulder City, NV|
|   1NY|     Penn Yan, NY| ANP|    Annapolis, MD|
+------+-----------------+----+-----------------+
only showing top 5 rows



In [0]:
# Extract airports and routes form the data
# vertices
airports = flights.select("ORIGIN", "ORIGIN_CITY_NAME").toDF("id", "name").distinct()

# edges
airportEdges = flights.select("ORIGIN", "DEST").toDF("src","dst")

In [0]:
# Create the graph
airportGraph = GraphFrame(airports, airportEdges)
# save the graph in memory
airportGraph.cache()

GraphFrame(v:[id: string, name: string], e:[src: string, dst: string])

In [0]:
# Find the top 5 nodes with the highest outdegree and find the count of the number of outgoing edges in each
airportGraph.outDegrees.orderBy(desc("outDegree")).show(5)

+---+---------+
| id|outDegree|
+---+---------+
|ORD|     1146|
|DEN|     1080|
|ATL|      843|
|DFW|      754|
|LAS|      737|
+---+---------+
only showing top 5 rows



In [0]:
# Find the top 5 nodes with the highest indegree and find the count of the number of incoming edges in each
airportGraph.inDegrees.orderBy(desc("inDegree")).show(5)

+---+--------+
| id|inDegree|
+---+--------+
|ORD|    1156|
|DEN|    1082|
|ATL|     842|
|DFW|     768|
|LAS|     734|
+---+--------+
only showing top 5 rows



In [0]:
# Calculate PageRank for each of the nodes and output the top 5 nodes with the highest PageRank
ranks = airportGraph.pageRank(resetProbability=0.15, maxIter=5)
ranks.vertices.orderBy(desc("pagerank")).select("id", "pagerank").show(5)

+---+------------------+
| id|          pagerank|
+---+------------------+
|DEN| 19.68945365287849|
|ORD|18.735132643224823|
|DFW|13.689951530723238|
|ATL|13.297504852945211|
|ANC|12.812466462491926|
+---+------------------+
only showing top 5 rows



In [0]:
# Run the strongly connected components algorithm on it and find the top 5 components with the largest number of nodes.
# strongly connected components
sc = spark.sparkContext
sc.setCheckpointDir("/tmp")

# Define the size of the sample (number of vertices)
sample_size = 500

# Get a random sample of vertices from the original graph
sampled_vertices = airportGraph.vertices.limit(sample_size)

# Collect the IDs of the sampled vertices
sampled_vertex_ids = [row['id'] for row in sampled_vertices.collect()]

# Filter edges to keep only those connecting sampled vertices
edges = airportGraph.edges
sampled_edges = edges.filter(edges['src'].isin(sampled_vertex_ids) | edges['dst'].isin(sampled_vertex_ids))

# Create a graph from the sampled vertices and edges
sampled_graph = GraphFrame(sampled_vertices, sampled_edges)

# Get the strongly connected components
result = sampled_graph.stronglyConnectedComponents(maxIter = 5)

# Extract the sizes of each connected component, count: number of nodes in each component
component_sizes = result.groupby("component").count()

# Sort the component sizes in descending order to check the top 5 components with largest number of vertices
top_components = component_sizes.orderBy(desc("count"))
top_components.show(5)

+------------+-----+
|   component|count|
+------------+-----+
|           0|  398|
|           1|   11|
|           2|    3|
|644245094400|    3|
| 25769803778|    3|
+------------+-----+
only showing top 5 rows



In [0]:
# Run the triangle counts algorithm on each of the vertices and output the top 5 vertices with the largest triangle count.
results = airportGraph.triangleCount()
results.select("id", "name", "count").orderBy(desc("count")).show(5)

+---+--------------------+-----+
| id|                name|count|
+---+--------------------+-----+
|DFW|Dallas/Fort Worth...| 3258|
|ATL|         Atlanta, GA| 3115|
|ORD|         Chicago, IL| 3070|
|DEN|          Denver, CO| 2968|
|LAS|       Las Vegas, NV| 2956|
+---+--------------------+-----+
only showing top 5 rows

