In [81]:
from pyspark.sql import SparkSession

spark = (SparkSession.builder
    .appName("Python Spark create RDD example")
    .config("spark.some.config.option", "some-value")
    .getOrCreate()
)

In [78]:

sc = spark.sparkContext
rdd_lines = sc.textFile("employees.csv")
header = rdd_lines.first()
data_rdd = rdd_lines.filter(lambda line: line != header)

def parse(line):
    parts = line.split(",")
    return (int(parts[0]), parts[1], parts[2], int(parts[3]), int(parts[4]), parts[5])

employees_rdd = data_rdd.map(parse)
all_employees = employees_rdd.collect()
print(employees_rdd.toDebugString()) # will print the parent RDDs and partitioning info.
all_employees


b'(2) PythonRDD[128] at collect at /tmp/ipykernel_87469/3632553374.py:11 []\n |  employees.csv MapPartitionsRDD[126] at textFile at <unknown>:0 []\n |  employees.csv HadoopRDD[125] at textFile at <unknown>:0 []'


[(1, 'Alice', 'Engineering', 95000, 29, 'Boston'),
 (2, 'Bob', 'Engineering', 88000, 33, 'Seattle'),
 (3, 'Carol', 'HR', 72000, 41, 'New York'),
 (4, 'David', 'Finance', 85000, 38, 'Chicago'),
 (5, 'Eve', 'Engineering', 99000, 27, 'Austin'),
 (6, 'Frank', 'HR', 68000, 45, 'Boston'),
 (7, 'Grace', 'Finance', 79000, 31, 'New York'),
 (8, 'Heidi', 'Engineering', 92000, 35, 'Seattle'),
 (9, 'Ivan', 'Marketing', 66000, 30, 'Chicago'),
 (10, 'Judy', 'Marketing', 71000, 28, 'Austin'),
 (11, 'Mallory', 'Finance', 81000, 42, 'Boston'),
 (12, 'Oscar', 'HR', 70000, 36, 'Seattle')]

In [53]:
eng_salaries_rdd = employees_rdd.filter(lambda t: t[2] == "Engineering").map(lambda t: t[3])
total_eng_salary = eng_salaries_rdd.sum()
print(total_eng_salary)

374000


In [54]:
import os
print(os.getcwd())

/home/developer/Workspace_Projects/Data_Engineer/PySpark


In [55]:
sc = spark.sparkContext
r1 = sc.textFile("employees.csv")
r2 = r1.filter(lambda line: "Engineering" in line)
r3 = r2.map(parse)                      
r4 = r3.map(lambda t: (t[2], int(t[3]))) 
r5 = r4.reduceByKey(lambda a,b: a+b)         
result = r5.collect()
result

[('Engineering', 374000)]

In [56]:
#  parallelize() → creates an RDD of type ParallelCollectionRDD

sc = spark.sparkContext

data = [("Alice", 95000), ("Bob", 88000), ("Eve", 99000)]
rdd = sc.parallelize(data, numSlices=200)

print(rdd.collect())
print("Partitions:", rdd.getNumPartitions())


[('Alice', 95000), ('Bob', 88000), ('Eve', 99000)]
Partitions: 200


In [57]:
rdd2 = rdd.map(lambda x: (x[0], x[1] + 5000))
print(rdd2.collect())




[('Alice', 100000), ('Bob', 93000), ('Eve', 104000)]


                                                                                

- You define transformations → Spark creates a logical plan (lineage graph).

- No work happens yet — just metadata.
 
- You trigger an action → Spark’s DAG Scheduler:
 
- Divides operations into stages.
 
- Launches tasks on executors.
 
- Performs narrow and wide shuffles if needed.
 
- Result is returned to driver or saved to disk.

| Type      | Description                                                                       | Examples                      |
|-----------|-----------------------------------------------------------------------------------|--------                       |
|`Narrow`	|Each partition of parent RDD is used by at most one child partition (no shuffle).	|map, filter, flatMap
|`Wide`	    |Data from multiple parent partitions needed for one child partition → causes shuffle.	|reduceByKey, groupByKey, join

In [89]:

rdd = sc.textFile("students.csv")

header = rdd.first()
data_rdd = rdd.filter(lambda line: line != header)

parsed_rdd = data_rdd.map(lambda line: line.split(","))
parsed_rdd.collect()

[['1', 'Aisha', 'Math', '90'],
 ['2', 'Raj', 'Science', '80'],
 ['3', 'Neha', 'Math', '85'],
 ['4', 'Raj', 'Math', '70'],
 ['5', 'Aisha', 'English', '88']]

In [85]:
pair_rdd = parsed_rdd.map(lambda x: (x[1], int(x[3])))
total_marks = pair_rdd.reduceByKey(lambda a, b: a+b)
total_marks.collect()

[('Aisha', 178), ('Raj', 150), ('Neha', 85)]

In [90]:
grouped_marks = pair_rdd.groupByKey()
print([(name, list(marks)) for name, marks in grouped_marks.collect()])

[('Aisha', [90, 88]), ('Raj', [80, 70]), ('Neha', [85])]


In [95]:
# flatMap(func) is like map(), but each input item can be mapped to 0 or more output items.

subjects_rdd = parsed_rdd.map(lambda x: x[2])  # ['Math', 'Science', 'Math', 'Math', 'English']
flat_rdd = subjects_rdd.flatMap(lambda x: x.split(","))
print(flat_rdd.collect())


['Math', 'Science', 'Math', 'Math', 'English']


In [91]:

city_rdd = sc.parallelize([("Aisha", "Delhi"), ("Raj", "Mumbai"), ("Neha", "Pune")])

joined = pair_rdd.join(city_rdd)
print(joined.collect())


[('Raj', (80, 'Mumbai')), ('Raj', (70, 'Mumbai')), ('Aisha', (90, 'Delhi')), ('Aisha', (88, 'Delhi')), ('Neha', (85, 'Pune'))]


In [96]:
count = parsed_rdd.count()
print("Total records:", count)


Total records: 5


In [97]:
first_two = parsed_rdd.take(2)
print(first_two)


[['1', 'Aisha', 'Math', '90'], ['2', 'Raj', 'Science', '80']]


In [98]:
marks_rdd = parsed_rdd.map(lambda x: int(x[3]))
total_marks = marks_rdd.reduce(lambda a, b: a + b)
print("Total marks:", total_marks)


Total marks: 413


In [None]:
pair_rdd = parsed_rdd.map(lambda x: (x[1], int(x[3])))
pair_rdd.saveAsTextFile("output/student_marks")


In [108]:
pair_rdd.foreach(lambda x: print(f"Student: {x[0]}, Marks: {x[1]}"))


Student: Neha, Marks: 85
Student: Raj, Marks: 70
Student: Aisha, Marks: 88
Student: Aisha, Marks: 90
Student: Raj, Marks: 80


What is a **Lineage Graph**?

Every RDD in Spark remembers how it was derived from other RDDs.

This chain of dependencies between RDDs forms a **Lineage Graph** (also called DAG — Directed Acyclic Graph).

➡️ Instead of storing every intermediate dataset, Spark keeps:

-The sequence of transformations (map, filter, reduceByKey, etc.)

-The original data source path

-The functions applied

So, if a partition of data is lost (say, due to node failure), Spark can recompute it by replaying the transformations from the original source.

In [136]:
import time
start = time.time()
for _ in range(3):
    parsed_rdd.filter(lambda x: int(x[3]) > 80).count()
print("Without cache:", time.time() - start)


Without cache: 0.24838852882385254


In [144]:
cached_rdd = parsed_rdd.filter(lambda x: int(x[3]) > 80).cache()
cached_rdd.count()  # first computation (cached)
start = time.time()
for _ in range(3):
    cached_rdd.count()
print("With cache:", time.time() - start)


With cache: 0.19412803649902344
