In [1]:
from pyspark.sql import SparkSession

In [2]:
spark = SparkSession.builder.appName("Test").getOrCreate()
print(spark)

<pyspark.sql.session.SparkSession object at 0x75f20021e710>


25/03/04 11:22:54 WARN SparkSession: Using an existing Spark session; only runtime SQL configurations will take effect.


RDD

In [3]:
rdd = spark.sparkContext.parallelize([(1, "Alice", 25), (2, "Bob", 30), (3, "Charlie", 28)])
print(rdd.collect()) 

[(1, 'Alice', 25), (2, 'Bob', 30), (3, 'Charlie', 28)]


Dataframe

In [4]:
df = spark.createDataFrame([(1, "Alice", 25), (2, "Bob", 30), (3, "Charlie", 28)], ["ID", "Name", "Age"])
df.show()

+---+-------+---+
| ID|   Name|Age|
+---+-------+---+
|  1|  Alice| 25|
|  2|    Bob| 30|
|  3|Charlie| 28|
+---+-------+---+



In [5]:
# converting dataframe to rdd

rdd = df.rdd
print(rdd.collect())

[Row(ID=1, Name='Alice', Age=25), Row(ID=2, Name='Bob', Age=30), Row(ID=3, Name='Charlie', Age=28)]


In [6]:
# reading from a txt file

rdd = spark.sparkContext.textFile("data.txt")
print(rdd.collect()) 

['Alice,25,New York', 'Bob,30,San Francisco', 'Charlie,28,Los Angeles']


In [7]:
rdd = spark.sparkContext.textFile("data.csv")  
print(rdd.collect())

['name,age,city', 'Alice,25,New York', 'Bob,30,San Francisco', 'Charlie,28,Los Angeles', 'David,35,Chicago', 'Eve,22,Houston']


In [8]:
rdd_split = rdd.map(lambda line: line.split(","))  
print(rdd_split.collect())  

[['name', 'age', 'city'], ['Alice', '25', 'New York'], ['Bob', '30', 'San Francisco'], ['Charlie', '28', 'Los Angeles'], ['David', '35', 'Chicago'], ['Eve', '22', 'Houston']]


In [11]:
import json
rdd = spark.sparkContext.textFile("data.json")  
json_rdd = rdd.map(lambda x: json.loads(x))  
print(json_rdd.collect())  

[{'name': 'Alice', 'age': 25, 'city': 'New York'}, {'name': 'Bob', 'age': 30, 'city': 'San Francisco'}]


In [30]:
rdd = spark.sparkContext.textFile("test.txt")
print(rdd.collect())

['Project Gutenberg’s', 'Alice’s Adventures in Wonderland', 'by Lewis Carroll', 'This eBook is for the use', 'of anyone anywhere', 'at no cost and with', 'Alice’s Adventures in Wonderland', 'by Lewis Carroll', 'This eBook is for the use', 'of anyone anywhere', 'at no cost and with', 'This eBook is for the use', 'of anyone anywhere', 'at no cost and with', 'Project Gutenberg’s', 'Alice’s Adventures in Wonderland', 'by Lewis Carroll', 'This eBook is for the use', 'of anyone anywhere', 'at no cost and with', 'Alice’s Adventures in Wonderland', 'by Lewis Carroll', 'This eBook is for the use', 'of anyone anywhere', 'at no cost and with', 'This eBook is for the use', 'of anyone anywhere', 'at no cost and with', 'Project Gutenberg’s', 'Alice’s Adventures in Wonderland', 'by Lewis Carroll']


In [31]:
rdd2 = rdd.flatMap(lambda x: x.split(" "))
print(rdd2.collect())

['Project', 'Gutenberg’s', 'Alice’s', 'Adventures', 'in', 'Wonderland', 'by', 'Lewis', 'Carroll', 'This', 'eBook', 'is', 'for', 'the', 'use', 'of', 'anyone', 'anywhere', 'at', 'no', 'cost', 'and', 'with', 'Alice’s', 'Adventures', 'in', 'Wonderland', 'by', 'Lewis', 'Carroll', 'This', 'eBook', 'is', 'for', 'the', 'use', 'of', 'anyone', 'anywhere', 'at', 'no', 'cost', 'and', 'with', 'This', 'eBook', 'is', 'for', 'the', 'use', 'of', 'anyone', 'anywhere', 'at', 'no', 'cost', 'and', 'with', 'Project', 'Gutenberg’s', 'Alice’s', 'Adventures', 'in', 'Wonderland', 'by', 'Lewis', 'Carroll', 'This', 'eBook', 'is', 'for', 'the', 'use', 'of', 'anyone', 'anywhere', 'at', 'no', 'cost', 'and', 'with', 'Alice’s', 'Adventures', 'in', 'Wonderland', 'by', 'Lewis', 'Carroll', 'This', 'eBook', 'is', 'for', 'the', 'use', 'of', 'anyone', 'anywhere', 'at', 'no', 'cost', 'and', 'with', 'This', 'eBook', 'is', 'for', 'the', 'use', 'of', 'anyone', 'anywhere', 'at', 'no', 'cost', 'and', 'with', 'Project', 'Gutenberg

In [32]:
rdd3 = rdd2.map(lambda x: (x,1))
print(rdd3.collect())

[('Project', 1), ('Gutenberg’s', 1), ('Alice’s', 1), ('Adventures', 1), ('in', 1), ('Wonderland', 1), ('by', 1), ('Lewis', 1), ('Carroll', 1), ('This', 1), ('eBook', 1), ('is', 1), ('for', 1), ('the', 1), ('use', 1), ('of', 1), ('anyone', 1), ('anywhere', 1), ('at', 1), ('no', 1), ('cost', 1), ('and', 1), ('with', 1), ('Alice’s', 1), ('Adventures', 1), ('in', 1), ('Wonderland', 1), ('by', 1), ('Lewis', 1), ('Carroll', 1), ('This', 1), ('eBook', 1), ('is', 1), ('for', 1), ('the', 1), ('use', 1), ('of', 1), ('anyone', 1), ('anywhere', 1), ('at', 1), ('no', 1), ('cost', 1), ('and', 1), ('with', 1), ('This', 1), ('eBook', 1), ('is', 1), ('for', 1), ('the', 1), ('use', 1), ('of', 1), ('anyone', 1), ('anywhere', 1), ('at', 1), ('no', 1), ('cost', 1), ('and', 1), ('with', 1), ('Project', 1), ('Gutenberg’s', 1), ('Alice’s', 1), ('Adventures', 1), ('in', 1), ('Wonderland', 1), ('by', 1), ('Lewis', 1), ('Carroll', 1), ('This', 1), ('eBook', 1), ('is', 1), ('for', 1), ('the', 1), ('use', 1), ('of

In [33]:
rdd4 = rdd3.reduceByKey(lambda a,b: a+b)
print(rdd4.collect())

[('Project', 3), ('Gutenberg’s', 3), ('Alice’s', 5), ('in', 5), ('Lewis', 5), ('Carroll', 5), ('is', 6), ('use', 6), ('of', 6), ('anyone', 6), ('anywhere', 6), ('at', 6), ('no', 6), ('Adventures', 5), ('Wonderland', 5), ('by', 5), ('This', 6), ('eBook', 6), ('for', 6), ('the', 6), ('cost', 6), ('and', 6), ('with', 6)]


In [34]:
rdd5 = rdd4.map(lambda x: (x[1],x[0])).sortByKey()
print(rdd5.collect())

[(3, 'Project'), (3, 'Gutenberg’s'), (5, 'Alice’s'), (5, 'in'), (5, 'Lewis'), (5, 'Carroll'), (5, 'Adventures'), (5, 'Wonderland'), (5, 'by'), (6, 'is'), (6, 'use'), (6, 'of'), (6, 'anyone'), (6, 'anywhere'), (6, 'at'), (6, 'no'), (6, 'This'), (6, 'eBook'), (6, 'for'), (6, 'the'), (6, 'cost'), (6, 'and'), (6, 'with')]


In [35]:
rdd5 = rdd4.map(lambda x: (x[1], x[0])).sortByKey(ascending=False)
print(rdd5.collect())

[(6, 'is'), (6, 'use'), (6, 'of'), (6, 'anyone'), (6, 'anywhere'), (6, 'at'), (6, 'no'), (6, 'This'), (6, 'eBook'), (6, 'for'), (6, 'the'), (6, 'cost'), (6, 'and'), (6, 'with'), (5, 'Alice’s'), (5, 'in'), (5, 'Lewis'), (5, 'Carroll'), (5, 'Adventures'), (5, 'Wonderland'), (5, 'by'), (3, 'Project'), (3, 'Gutenberg’s')]


In [36]:
filtered_rdd = rdd4.filter(lambda x: x[1] > 3)
print(filtered_rdd.collect())

[('Alice’s', 5), ('in', 5), ('Lewis', 5), ('Carroll', 5), ('is', 6), ('use', 6), ('of', 6), ('anyone', 6), ('anywhere', 6), ('at', 6), ('no', 6), ('Adventures', 5), ('Wonderland', 5), ('by', 5), ('This', 6), ('eBook', 6), ('for', 6), ('the', 6), ('cost', 6), ('and', 6), ('with', 6)]


In [37]:
distinct_words = rdd2.distinct()
print(distinct_words.collect())

['Project', 'Gutenberg’s', 'Alice’s', 'in', 'Lewis', 'Carroll', 'is', 'use', 'of', 'anyone', 'anywhere', 'at', 'no', 'Adventures', 'Wonderland', 'by', 'This', 'eBook', 'for', 'the', 'cost', 'and', 'with']


In [38]:
total_words = rdd2.count()
print(total_words)

125


In [39]:
print("Count : "+str(rdd5.count()))

Count : 23


In [40]:
firstRec = rdd5.first()
print("First Record : "+str(firstRec[0]) + ","+ firstRec[1])

First Record : 6,is


In [41]:
totalWordCount = rdd5.reduce(lambda a,b: (a[0]+b[0],a[1]))
print("dataReduce Record : "+str(totalWordCount[0]))

dataReduce Record : 125


In [42]:
spark = SparkSession.builder.appName('SparkByExamples.com').getOrCreate()
data=[("Z", 1),("A", 20),("B", 30),("C", 40),("B", 30),("B", 60)]
inputRDD = spark.sparkContext.parallelize(data)
listRdd = spark.sparkContext.parallelize([1,2,3,4,5,3,2])

25/03/05 11:19:06 WARN SparkSession: Using an existing Spark session; only runtime SQL configurations will take effect.


In [43]:
seqOp = (lambda x, y: x + y)
combOp = (lambda x, y: x + y)
agg=listRdd.aggregate(0, seqOp, combOp)
print(agg)

20


In [2]:
spark = SparkSession.builder.appName("RDDOperationsExample").getOrCreate()
# Employee data (ID, Name, Department, Salary, Experience in Years)
employee_data = [
    (101, "Alice", "HR", 5000, 5),
    (102, "Bob", "IT", 7000, 3),
    (103, "Charlie", "Finance", 6500, 8),
    (104, "David", "HR", 4800, 2),
    (105, "Eve", "IT", 7200, 7),
    (106, "Frank", "Finance", 6400, 6),
    (107, "Grace", "IT", 7300, 10),
    (108, "Helen", "HR", 5200, 4),
    (109, "Ivan", "Finance", 6000, 3),
    (110, "Jack", "IT", 7500, 9),
]

rdd = spark.sparkContext.parallelize(employee_data)
print(rdd.collect())

25/03/05 21:19:41 WARN SparkSession: Using an existing Spark session; only runtime SQL configurations will take effect.


[(101, 'Alice', 'HR', 5000, 5), (102, 'Bob', 'IT', 7000, 3), (103, 'Charlie', 'Finance', 6500, 8), (104, 'David', 'HR', 4800, 2), (105, 'Eve', 'IT', 7200, 7), (106, 'Frank', 'Finance', 6400, 6), (107, 'Grace', 'IT', 7300, 10), (108, 'Helen', 'HR', 5200, 4), (109, 'Ivan', 'Finance', 6000, 3), (110, 'Jack', 'IT', 7500, 9)]


In [3]:
# increase salary by 10%
rdd_map = rdd.map(lambda x: (x[0], x[1], x[2], x[3] * 1.1, x[4]))
print(rdd_map.collect())

[(101, 'Alice', 'HR', 5500.0, 5), (102, 'Bob', 'IT', 7700.000000000001, 3), (103, 'Charlie', 'Finance', 7150.000000000001, 8), (104, 'David', 'HR', 5280.0, 2), (105, 'Eve', 'IT', 7920.000000000001, 7), (106, 'Frank', 'Finance', 7040.000000000001, 6), (107, 'Grace', 'IT', 8030.000000000001, 10), (108, 'Helen', 'HR', 5720.000000000001, 4), (109, 'Ivan', 'Finance', 6600.000000000001, 3), (110, 'Jack', 'IT', 8250.0, 9)]


                                                                                

In [4]:
# employees with more than 5 year experience
rdd_filter = rdd.filter(lambda x: x[4] > 5)
print(rdd_filter.collect())

[(103, 'Charlie', 'Finance', 6500, 8), (105, 'Eve', 'IT', 7200, 7), (106, 'Frank', 'Finance', 6400, 6), (107, 'Grace', 'IT', 7300, 10), (110, 'Jack', 'IT', 7500, 9)]


In [5]:
# flatten department into words
rdd_flatmap = rdd.flatMap(lambda x: x[2].split(" "))
print(rdd_flatmap.collect())

['HR', 'IT', 'Finance', 'HR', 'IT', 'Finance', 'IT', 'HR', 'Finance', 'IT']


In [6]:
# group by department
rdd_kv = rdd.map(lambda x: (x[2], x[1]))  
rdd_grouped = rdd_kv.groupByKey()
print({k: list(v) for k, v in rdd_grouped.collect()})

{'Finance': ['Charlie', 'Frank', 'Ivan'], 'HR': ['Alice', 'David', 'Helen'], 'IT': ['Bob', 'Eve', 'Grace', 'Jack']}


In [7]:
# total salary in each department
rdd_salary_kv = rdd.map(lambda x: (x[2], x[3])) 
rdd_salary_total = rdd_salary_kv.reduceByKey(lambda x, y: x + y)
print(rdd_salary_total.collect())

[('Finance', 18900), ('HR', 15000), ('IT', 29000)]


In [8]:
# sort by salary
rdd_sorted = rdd.map(lambda x: (x[3], x)).sortByKey(ascending=False)
print(rdd_sorted.collect())

[(7500, (110, 'Jack', 'IT', 7500, 9)), (7300, (107, 'Grace', 'IT', 7300, 10)), (7200, (105, 'Eve', 'IT', 7200, 7)), (7000, (102, 'Bob', 'IT', 7000, 3)), (6500, (103, 'Charlie', 'Finance', 6500, 8)), (6400, (106, 'Frank', 'Finance', 6400, 6)), (6000, (109, 'Ivan', 'Finance', 6000, 3)), (5200, (108, 'Helen', 'HR', 5200, 4)), (5000, (101, 'Alice', 'HR', 5000, 5)), (4800, (104, 'David', 'HR', 4800, 2))]


In [9]:
# unique department
rdd_distinct = rdd.map(lambda x: x[2]).distinct()
print(rdd_distinct.collect())

['Finance', 'HR', 'IT']


In [11]:
# add 2 employees
extra_employees = [(111, "Karen", "IT", 6800, 5), (112, "Liam", "HR", 5700, 6)]
rdd_extra = spark.sparkContext.parallelize(extra_employees)
rdd_union = rdd.union(rdd_extra)
print(rdd_union.collect())

[(101, 'Alice', 'HR', 5000, 5), (102, 'Bob', 'IT', 7000, 3), (103, 'Charlie', 'Finance', 6500, 8), (104, 'David', 'HR', 4800, 2), (105, 'Eve', 'IT', 7200, 7), (106, 'Frank', 'Finance', 6400, 6), (107, 'Grace', 'IT', 7300, 10), (108, 'Helen', 'HR', 5200, 4), (109, 'Ivan', 'Finance', 6000, 3), (110, 'Jack', 'IT', 7500, 9), (111, 'Karen', 'IT', 6800, 5), (112, 'Liam', 'HR', 5700, 6)]


In [23]:
# add bonus
bonus_data = [(101, 500), (102, 800), (103, 600), (104, 300)]
rdd_bonus = spark.sparkContext.parallelize(bonus_data)
rdd_emp_kv = rdd.map(lambda x: (x[0], (x[1], x[2], x[3])))  
rdd_joined = rdd_emp_kv.join(rdd_bonus)
print(rdd_joined.collect())



[(104, (('David', 'HR', 4800), 300)), (101, (('Alice', 'HR', 5000), 500)), (102, (('Bob', 'IT', 7000), 800)), (103, (('Charlie', 'Finance', 6500), 600))]


                                                                                

In [13]:
def salary_sum(iterator):
    yield sum(x[3] for x in iterator)

rdd_partitioned = rdd.mapPartitions(salary_sum)
print(rdd_partitioned.collect())

[12000, 11300, 13600, 26000]


In [24]:
names_rdd = employee_rdd.map(lambda x: x[1])  # Extracting Names
departments_rdd = employee_rdd.map(lambda x: x[2])  # Extracting Departments

# Zipping both RDDs
zipped_rdd = names_rdd.zip(departments_rdd)

# Collect and print the results
print(zipped_rdd.collect())

NameError: name 'employee_rdd' is not defined

In [14]:
print(rdd.count())

10


In [15]:
total_salary = rdd.map(lambda x: x[3]).reduce(lambda a, b: a + b)
print(total_salary)

62900


In [16]:
print(rdd.first())

(101, 'Alice', 'HR', 5000, 5)


In [17]:
rdd.saveAsTextFile("output_employee_data")

In [18]:
rdd.cache()
print(rdd.count())

10


In [21]:
rdd_coalesce = rdd.coalesce(2)
print(rdd_coalesce.getNumPartitions())

2


In [20]:
rdd_repartition = rdd.repartition(4)
print(rdd_repartition.getNumPartitions())

4
