In [2]:
# Install Java
!apt-get install openjdk-8-jdk-headless -qq > /dev/null

# Install PySpark
!pip install pyspark

# Install GraphFrames
!pip install graphframes

Collecting pyspark
  Downloading pyspark-3.5.3.tar.gz (317.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m317.3/317.3 MB[0m [31m4.5 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25l[?25hdone
  Created wheel for pyspark: filename=pyspark-3.5.3-py2.py3-none-any.whl size=317840625 sha256=e5b453538524200182479f8c51771ee9d8f9f133131d268ed8a0141d39b0b9e4
  Stored in directory: /root/.cache/pip/wheels/1b/3a/92/28b93e2fbfdbb07509ca4d6f50c5e407f48dce4ddbda69a4ab
Successfully built pyspark
Installing collected packages: pyspark
Successfully installed pyspark-3.5.3
Collecting graphframes
  Downloading graphframes-0.6-py2.py3-none-any.whl.metadata (934 bytes)
Collecting nose (from graphframes)
  Downloading nose-1.3.7-py3-none-any.whl.metadata (1.7 kB)
Downloading graphframes-0.6-py2.py3-none-any.whl (18 kB)
Downloading nose-1.3.7-py3-n

In [7]:
# Import necessary libraries
from pyspark.sql import SparkSession
from pyspark.sql.functions import desc, col, lit

# Create (or reuse) the SparkSession used for the whole analysis
spark = SparkSession.builder \
    .appName("Airline Routes Analysis") \
    .getOrCreate()

# Read data from airline_routes.csv; the first row holds the column names and
# the column types are inferred from the data
df = spark.read.csv('/content/airline_routes.csv', header=True, inferSchema=True)

# Show the first rows of the raw DataFrame as a sanity check
df.show(5)

# Create Edge DataFrame using source_airport as src and destination_airport as dst
# (one edge per route row, duplicates kept so they can be counted below)
edges = df.select(
    col("source_airport").alias("src"),
    col("destination_airport").alias("dst"),
)

# Create Vertices DataFrame with a single id column.
# Every airport that appears on either end of an edge has to be a vertex:
# taking the ids from source_airport alone drops airports that only ever
# appear as a destination, leaving edges whose dst matches no vertex id.
vertices = edges.select(col("src").alias("id")) \
    .union(edges.select(col("dst").alias("id"))) \
    .distinct()

# Show Vertices DataFrame
print("Vertices DataFrame:")
vertices.show()

# Show Edge DataFrame
print("Edges DataFrame:")
edges.show()

# Group the edges by (src, dst), keep only pairs that occur more than 5 times,
# sort the busiest pairs first, and attach constant colour columns for the
# source and destination ends of each pair
edges_grouped = edges.groupBy("src", "dst") \
    .count() \
    .filter("count > 5") \
    .orderBy(desc("count")) \
    .withColumn("source_color", lit("#3358FF")) \
    .withColumn("destination_color", lit("#FF3F33"))

# Show grouped data
print("Grouped Edges DataFrame:")
edges_grouped.show()

# Write data into new_data.csv using the overwrite mode and set the header to True
edges_grouped.write.mode("overwrite").option("header", True).csv('/content/new_data.csv')

# Stop the Spark session; nothing below this line can use `spark` or the
# DataFrames created from it
spark.stop()


+-------+----------+--------------+-----------------+-------------------+----------------------+---------+------+---------+
|airline|airline ID|source_airport|source_airport_id|destination_airport|destination_airport_id|codeshare| stops|equipment|
+-------+----------+--------------+-----------------+-------------------+----------------------+---------+------+---------+
|     5T|      1623|           YRT|              132|                YEK|                    50|     NULL|     1|      ATR|
|     AC|       330|           ABJ|              253|                BRU|                   302|     NULL|     1|      333|
|     AC|       330|           YVR|              156|                YBL|                    30|     NULL|     1|      BEH|
|     CU|      1936|           FCO|             1555|                HAV|                  1909|     NULL|     1|      767|
|     FL|      1316|           HOU|             3566|                SAT|                  3621|     NULL|     1|      735|
+-------