In [1]:
from pyspark.sql import SparkSession

# Create the Spark session for this notebook, or reuse one that is already running.
# NOTE(review): the app name looks like a typo for "RDDExamples"; it is left
# unchanged here because it is a runtime label.
spark = SparkSession.builder.appName("RDDEamples").getOrCreate()

# Handle to the underlying SparkContext; the RDD cells call sc.textFile,
# sc.accumulator and sc.broadcast on it.
sc = spark.sparkContext

In [3]:
# Load the CSV as an RDD of raw text lines.
rdd = sc.textFile("Flight_Schedule.csv")

# Treat the first line as the column header and keep only the lines that
# differ from it (every line equal to the header is dropped, not just the first).
header = rdd.first()
rdd_no_header = rdd.filter(lambda row: row != header)

def parse_flight(line):
    """Parse one CSV data line and return its first four fields as a tuple.

    The line is read with the ``csv`` module, so a quoted field that contains a
    comma (e.g. ``"Air, India"``) stays one field instead of being cut in two.
    Lines with fewer than four fields (including blank lines) are padded with
    empty strings rather than raising ``IndexError``.

    Args:
        line: a single text line from the CSV file, without the header.

    Returns:
        A 4-tuple of strings: the first four columns of the line. Judging by
        the sample output these are presumably flight number, airline, origin
        and destination -- confirm against the file's header.
    """
    # Imported locally so this cell does not rely on an import made elsewhere
    # in the notebook.
    import csv

    # csv.reader yields an empty list for a blank line; the default covers an
    # exhausted reader as well.
    fields = next(csv.reader([line]), [])

    # Pad short rows so the four positions below always exist.
    fields = fields + [""] * (4 - len(fields))
    return (fields[0], fields[1], fields[2], fields[3])

# Turn every data line into a 4-field tuple. Printing the result shows the RDD
# object itself, not its rows.
flights_rdd = rdd_no_header.map(parse_flight)
print("flights_rdd", flights_rdd)

# Flights whose third field (index 2) is exactly "Delhi".
delhi_flights = flights_rdd.filter(lambda flight: flight[2] == "Delhi")

# Distinct values of the second field (index 1) among those flights,
# collected back to the driver as a Python list.
airlines = (
    delhi_flights
    .map(lambda flight: flight[1])
    .distinct()
    .collect()
)
print(airlines)

flights_rdd PythonRDD[6] at RDD at PythonRDD.scala:53
['AirAsia India', 'Jet Airways', 'TestIndigo', 'Vistara', '', 'GoAir', 'Air India', 'Jetlite', 'SpiceJet']


In [4]:
# Accumulator starting at 0: count_nulls() adds 1 to it per matching line, and
# the total is read back through .value after the foreach action has run.
null_arrival_count = sc.accumulator(0)

def count_nulls(line):
    """Add 1 to ``null_arrival_count`` when the line's seventh field is empty.

    The line is split on commas. Lines with fewer than seven fields are
    ignored; otherwise the field at index 6 is compared with the empty string.
    (The report printed after the foreach labels this column
    ``scheduledArrivalTime``.)

    Args:
        line: one raw CSV line.

    Returns:
        None. The only effect is the accumulator update.
    """
    columns = line.split(",")

    # Nothing to inspect when the row has no index-6 column.
    if len(columns) <= 6:
        return

    if columns[6] == "":
        null_arrival_count.add(1)

# Run count_nulls on every data line. foreach is an action used here only for
# its side effect on the accumulator.
rdd_no_header.foreach(count_nulls)
# Read the accumulated total on the driver and report it.
print(f"Flights with null scheduledArrivalTime: {null_arrival_count.value}")

Flights with null scheduledArrivalTime: 4068


In [5]:
# Airlines to keep
target_airlines = ["GoAir", "SpiceJet"]

# Wrap the list in a broadcast variable; the filter reads it through .value
broadcast_airlines = sc.broadcast(target_airlines)

# Keep flights whose second field (index 1) appears in the broadcast list
filtered_flights = flights_rdd.filter(
    lambda record: record[1] in broadcast_airlines.value
)

# Print the first two matching flights
for flight in filtered_flights.take(2):
    print(flight)

('425', 'GoAir', 'Delhi', 'Hyderabad')
('423', 'GoAir', 'Delhi', 'Hyderabad')


In [2]:
# Load the raw lines and drop every line equal to the first (header) line.
rdd = sc.textFile("Flight_Schedule.csv")
header = rdd.first()
rdd_no_header = rdd.filter(lambda row: row != header)

# Key each line by its second comma-separated field, giving
# (field at index 1, full line) pairs. keyBy(f) builds (f(x), x) for each x.
rdd_pairs = rdd_no_header.keyBy(lambda row: row.split(",")[1])

def sort_airline(airline, num_partitions=4):
    """Map a key to a partition number in ``range(num_partitions)``.

    Args:
        airline: the key to place; any hashable value.
        num_partitions: how many partitions to spread keys over. Defaults to
            4, the value that used to be hard-coded, so ``sort_airline(key)``
            returns exactly what it did before.

    Returns:
        ``hash(airline) % num_partitions``.

    Raises:
        ValueError: if ``num_partitions`` is less than 1.
    """
    if num_partitions < 1:
        raise ValueError("num_partitions must be a positive integer")

    # NOTE: Python salts str hashes per interpreter process unless
    # PYTHONHASHSEED is fixed, so the bucket for a given key is only stable
    # under one hash seed.
    return hash(airline) % num_partitions

# Redistribute the (key, line) pairs into 4 partitions, with sort_airline(key)
# choosing the partition. Despite the variable name nothing is sorted here;
# rows are only grouped into partitions.
sorted_rdd = rdd_pairs.partitionBy(4, sort_airline)

def count_papers(box_number, papers):
    """Count the records of one partition.

    Args:
        box_number: index of the partition, as supplied by
            ``mapPartitionsWithIndex``.
        papers: iterable over the partition's records; it is consumed.

    Returns:
        A one-element list ``[(box_number, record_count)]``.
    """
    total = 0
    for _ in papers:
        total += 1
    return [(box_number, total)]

# One (partition index, row count) pair per partition, collected to the driver.
counts = sorted_rdd.mapPartitionsWithIndex(count_papers).collect()
print("Papers in each box:", counts)

# Save to see the groups: write only the original lines (the pair values).
# NOTE(review): saveAsTextFile is expected to fail if "rddoutput" already
# exists, so remove that directory before re-running this cell.
sorted_rdd.values().saveAsTextFile("rddoutput")

Papers in each box: [(0, 6002), (1, 4489), (2, 15823), (3, 8011)]


In [3]:
# Read the schedule with Spark's CSV reader, taking column names from the
# header row.
df = spark.read.option("header", "true").csv("Flight_Schedule.csv")

# Write the data as Parquet, split into one sub-directory per "airline" value.
# mode("overwrite") lets the cell be re-run: the default save mode raises when
# "output" already exists. Note that this replaces whatever a previous run
# left in "output".
df.write.mode("overwrite").partitionBy("airline").parquet("output")