In [1]:
import findspark
findspark.init()  # Automatically finds and sets SPARK_HOME

from pyspark.sql import SparkSession

# Create (or reuse, via getOrCreate) a session named "MyApp" that runs
# against the local[*] master.
spark = (
    SparkSession.builder
    .appName("MyApp")
    .master("local[*]")
    .getOrCreate()
)

# Keep a handle on the underlying SparkContext; the RDD cells below use it.
sc = spark.sparkContext

# Check Spark version
print("Spark version:", spark.version)

Spark version: 3.5.5


In [2]:
# Distribute a small Python list as an RDD, then bring every element back to
# the driver to confirm its contents.
rdd = sc.parallelize([1, 2, 3, 4, 5])
rdd.collect()

[1, 2, 3, 4, 5]

In [4]:
# Action: number of elements in the RDD.
rdd.count()

5

In [5]:
# Action: the first element of the RDD.
rdd.first()

1

In [6]:
# Action: the first three elements, returned as a Python list.
rdd.take(3)

[1, 2, 3]

In [7]:
# Transformation + action: double every element, then collect the results.
(
    rdd
    .map(lambda value: value * 2)
    .collect()
)

[2, 4, 6, 8, 10]

In [8]:
# Keep only the even elements, then collect them.
(
    rdd
    .filter(lambda value: value % 2 == 0)
    .collect()
)

[2, 4]

In [9]:
# flatMap: split each input string on single spaces and flatten the pieces
# into one RDD of words.
(
    sc.parallelize(["hello world"])
    .flatMap(lambda sentence: sentence.split(" "))
    .collect()
)

['hello', 'world']

In [10]:
# Action: fold the elements together with addition to obtain their sum.
rdd.reduce(lambda left, right: left + right)

15

In [11]:
# Drop duplicate elements (there are none in this RDD) and collect the rest.
(
    rdd
    .distinct()
    .collect()
)

[1, 2, 3, 4, 5]

In [12]:
# Sort on the negated value, which yields the elements in descending order.
(
    rdd
    .sortBy(lambda value: -value)
    .collect()
)

[5, 4, 3, 2, 1]

In [13]:
# Two small RDDs that share some values (2, 4, 5); used by the set-style
# operations in the following cells.
rdd1 = sc.parallelize([1, 2, 3, 4, 5])
rdd2 = sc.parallelize([5, 2, 8, 4, 7])

In [14]:
# Union concatenates both RDDs; values present in both appear twice.
(
    rdd1
    .union(rdd2)
    .collect()
)

[1, 2, 3, 4, 5, 5, 2, 8, 4, 7]

In [15]:
# Elements that occur in both rdd1 and rdd2.
(
    rdd1
    .intersection(rdd2)
    .collect()
)

[2, 4, 5]

In [16]:
# Elements of rdd1 that do not occur in rdd2.
(
    rdd1
    .subtract(rdd2)
    .collect()
)

[1, 3]

In [17]:
# Every (a, b) pair with a taken from rdd1 and b taken from rdd2.
(
    rdd1
    .cartesian(rdd2)
    .collect()
)

[(1, 5),
 (1, 2),
 (1, 8),
 (1, 4),
 (1, 7),
 (2, 5),
 (2, 2),
 (2, 8),
 (2, 4),
 (2, 7),
 (3, 5),
 (3, 2),
 (3, 8),
 (3, 4),
 (3, 7),
 (4, 5),
 (4, 2),
 (4, 8),
 (4, 4),
 (4, 7),
 (5, 5),
 (5, 2),
 (5, 8),
 (5, 4),
 (5, 7)]

In [19]:
# Key/value RDD with a repeated key ("a") for the by-key examples below.
pair_rdd = sc.parallelize(
    [
        ("a", 1),
        ("b", 2),
        ("a", 3),
    ]
)

In [20]:
# Gather the values of each key, turn each group into a plain list so it
# displays readably, then collect.
(
    pair_rdd
    .groupByKey()
    .mapValues(list)
    .collect()
)

[('b', [2]), ('a', [1, 3])]

In [22]:
# Sum the values that share a key.
(
    pair_rdd
    .reduceByKey(lambda left, right: left + right)
    .collect()
)

[('b', 2), ('a', 4)]

In [23]:
# Action: count how many times each distinct element occurs; the result is a
# dict-like mapping of element -> count on the driver.
rdd.countByValue()

defaultdict(int, {1: 1, 2: 1, 3: 1, 4: 1, 5: 1})

In [24]:
# A larger key/value RDD: string keys "1".."5", several of them repeated.
pair_rdd1 = sc.parallelize(
    [
        ("1", 2), ("1", 3),
        ("2", 3), ("2", 4),
        ("3", 2),
        ("4", 3),
        ("5", 9), ("5", 8), ("5", 5),
    ]
)

In [26]:
# Collect the values of each key into a list.
(
    pair_rdd1
    .groupByKey()
    .mapValues(list)
    .collect()
)

[('5', [9, 8, 5]), ('2', [3, 4]), ('3', [2]), ('4', [3]), ('1', [2, 3])]

In [27]:
# Total of the values for each key.
(
    pair_rdd1
    .reduceByKey(lambda left, right: left + right)
    .collect()
)

[('5', 22), ('2', 7), ('3', 2), ('4', 3), ('1', 5)]

In [47]:
# Load EVM.txt (path is relative to the notebook's working directory) as an
# RDD with one element per line of text.
new_rdd = sc.textFile("EVM.txt")

In [48]:
# Bring every line back to the driver to inspect the raw file contents.
new_rdd.collect()

['LDF BJP IUF ADMK TVK LDF BJP IUF ADMK TVK',
 'LDF BJP IUF ADMK TVK LDF BJP IUF ADMK TVK',
 'LDF BJP IUF ADMK TVK LDF BJP IUF ADMK TVK',
 'LDF BJP IUF ADMK TVK LDF BJP IUF ADMK TVK',
 'LDF BJP IUF ADMK TVK LDF BJP IUF ADMK TVK',
 'LDF BJP IUF ADMK TVK LDF BJP IUF ADMK TVK',
 'LDF BJP IUF ADMK TVK LDF BJP IUF ADMK TVK',
 'TVK ADMK IUF BJP LDF TVK ADMK IUF BJP LDF',
 'TVK ADMK IUF BJP LDF TVK ADMK IUF BJP LDF',
 'TVK ADMK IUF BJP LDF TVK ADMK IUF BJP LDF',
 'TVK ADMK IUF BJP LDF TVK ADMK IUF BJP LDF',
 'TVK ADMK IUF BJP LDF TVK ADMK IUF BJP LDF',
 'TVK ADMK IUF BJP LDF TVK ADMK IUF BJP LDF',
 'TVK ADMK IUF BJP LDF TVK ADMK IUF BJP LDF']

In [49]:
# Counts identical *lines* (each whole line is one value here), not the
# individual tokens inside them.
new_rdd.countByValue()

defaultdict(int,
            {'LDF BJP IUF ADMK TVK LDF BJP IUF ADMK TVK': 7,
             'TVK ADMK IUF BJP LDF TVK ADMK IUF BJP LDF': 7})

In [58]:
# Break every line of the file into individual whitespace-separated tokens.
# str.split() with no separator splits on any run of whitespace and never
# yields empty strings, so blank lines, tabs, or repeated/trailing spaces in
# the input cannot show up as a bogus '' entry in the counts (split(" ")
# would emit '' for each of those).
rdd_new = new_rdd.flatMap(lambda line: line.split())
rdd_new.collect()

['LDF',
 'BJP',
 'IUF',
 'ADMK',
 'TVK',
 'LDF',
 'BJP',
 'IUF',
 'ADMK',
 'TVK',
 'LDF',
 'BJP',
 'IUF',
 'ADMK',
 'TVK',
 'LDF',
 'BJP',
 'IUF',
 'ADMK',
 'TVK',
 'LDF',
 'BJP',
 'IUF',
 'ADMK',
 'TVK',
 'LDF',
 'BJP',
 'IUF',
 'ADMK',
 'TVK',
 'LDF',
 'BJP',
 'IUF',
 'ADMK',
 'TVK',
 'LDF',
 'BJP',
 'IUF',
 'ADMK',
 'TVK',
 'LDF',
 'BJP',
 'IUF',
 'ADMK',
 'TVK',
 'LDF',
 'BJP',
 'IUF',
 'ADMK',
 'TVK',
 'LDF',
 'BJP',
 'IUF',
 'ADMK',
 'TVK',
 'LDF',
 'BJP',
 'IUF',
 'ADMK',
 'TVK',
 'LDF',
 'BJP',
 'IUF',
 'ADMK',
 'TVK',
 'LDF',
 'BJP',
 'IUF',
 'ADMK',
 'TVK',
 'TVK',
 'ADMK',
 'IUF',
 'BJP',
 'LDF',
 'TVK',
 'ADMK',
 'IUF',
 'BJP',
 'LDF',
 'TVK',
 'ADMK',
 'IUF',
 'BJP',
 'LDF',
 'TVK',
 'ADMK',
 'IUF',
 'BJP',
 'LDF',
 'TVK',
 'ADMK',
 'IUF',
 'BJP',
 'LDF',
 'TVK',
 'ADMK',
 'IUF',
 'BJP',
 'LDF',
 'TVK',
 'ADMK',
 'IUF',
 'BJP',
 'LDF',
 'TVK',
 'ADMK',
 'IUF',
 'BJP',
 'LDF',
 'TVK',
 'ADMK',
 'IUF',
 'BJP',
 'LDF',
 'TVK',
 'ADMK',
 'IUF',
 'BJP',
 'LDF',
 'TVK',
 'ADMK',

In [60]:
# Tally how many times each token occurs across the whole file; the result is
# a dict-like mapping of token -> count on the driver.
rdd_new.countByValue()

defaultdict(int, {'LDF': 28, 'BJP': 28, 'IUF': 28, 'ADMK': 28, 'TVK': 28})

# Alternative method: word count with map and reduceByKey

In [63]:
# Reload EVM.txt as an RDD of lines (one element per line) and bring the
# lines back to the driver to inspect them.
new_rdd = sc.textFile("EVM.txt")
new_rdd.collect()

['LDF BJP IUF ADMK TVK LDF BJP IUF ADMK TVK',
 'LDF BJP IUF ADMK TVK LDF BJP IUF ADMK TVK',
 'LDF BJP IUF ADMK TVK LDF BJP IUF ADMK TVK',
 'LDF BJP IUF ADMK TVK LDF BJP IUF ADMK TVK',
 'LDF BJP IUF ADMK TVK LDF BJP IUF ADMK TVK',
 'LDF BJP IUF ADMK TVK LDF BJP IUF ADMK TVK',
 'LDF BJP IUF ADMK TVK LDF BJP IUF ADMK TVK',
 'TVK ADMK IUF BJP LDF TVK ADMK IUF BJP LDF',
 'TVK ADMK IUF BJP LDF TVK ADMK IUF BJP LDF',
 'TVK ADMK IUF BJP LDF TVK ADMK IUF BJP LDF',
 'TVK ADMK IUF BJP LDF TVK ADMK IUF BJP LDF',
 'TVK ADMK IUF BJP LDF TVK ADMK IUF BJP LDF',
 'TVK ADMK IUF BJP LDF TVK ADMK IUF BJP LDF',
 'TVK ADMK IUF BJP LDF TVK ADMK IUF BJP LDF']

In [65]:
# Flatten the lines into one RDD of whitespace-separated tokens.
# str.split() with no separator splits on any run of whitespace and drops
# empty strings, so blank lines or repeated/trailing spaces in the file do
# not produce '' tokens that would later be counted as a word
# (split(" ") would emit '' in those cases).
words_rdd = new_rdd.flatMap(lambda line: line.split())
words_rdd.collect()

['LDF',
 'BJP',
 'IUF',
 'ADMK',
 'TVK',
 'LDF',
 'BJP',
 'IUF',
 'ADMK',
 'TVK',
 'LDF',
 'BJP',
 'IUF',
 'ADMK',
 'TVK',
 'LDF',
 'BJP',
 'IUF',
 'ADMK',
 'TVK',
 'LDF',
 'BJP',
 'IUF',
 'ADMK',
 'TVK',
 'LDF',
 'BJP',
 'IUF',
 'ADMK',
 'TVK',
 'LDF',
 'BJP',
 'IUF',
 'ADMK',
 'TVK',
 'LDF',
 'BJP',
 'IUF',
 'ADMK',
 'TVK',
 'LDF',
 'BJP',
 'IUF',
 'ADMK',
 'TVK',
 'LDF',
 'BJP',
 'IUF',
 'ADMK',
 'TVK',
 'LDF',
 'BJP',
 'IUF',
 'ADMK',
 'TVK',
 'LDF',
 'BJP',
 'IUF',
 'ADMK',
 'TVK',
 'LDF',
 'BJP',
 'IUF',
 'ADMK',
 'TVK',
 'LDF',
 'BJP',
 'IUF',
 'ADMK',
 'TVK',
 'TVK',
 'ADMK',
 'IUF',
 'BJP',
 'LDF',
 'TVK',
 'ADMK',
 'IUF',
 'BJP',
 'LDF',
 'TVK',
 'ADMK',
 'IUF',
 'BJP',
 'LDF',
 'TVK',
 'ADMK',
 'IUF',
 'BJP',
 'LDF',
 'TVK',
 'ADMK',
 'IUF',
 'BJP',
 'LDF',
 'TVK',
 'ADMK',
 'IUF',
 'BJP',
 'LDF',
 'TVK',
 'ADMK',
 'IUF',
 'BJP',
 'LDF',
 'TVK',
 'ADMK',
 'IUF',
 'BJP',
 'LDF',
 'TVK',
 'ADMK',
 'IUF',
 'BJP',
 'LDF',
 'TVK',
 'ADMK',
 'IUF',
 'BJP',
 'LDF',
 'TVK',
 'ADMK',

In [66]:
# Pair every word with an initial count of 1 so the counts can be summed
# per key in the next step.
pair_rdd = words_rdd.map(lambda word: (word, 1))
pair_rdd.collect()

[('LDF', 1),
 ('BJP', 1),
 ('IUF', 1),
 ('ADMK', 1),
 ('TVK', 1),
 ('LDF', 1),
 ('BJP', 1),
 ('IUF', 1),
 ('ADMK', 1),
 ('TVK', 1),
 ('LDF', 1),
 ('BJP', 1),
 ('IUF', 1),
 ('ADMK', 1),
 ('TVK', 1),
 ('LDF', 1),
 ('BJP', 1),
 ('IUF', 1),
 ('ADMK', 1),
 ('TVK', 1),
 ('LDF', 1),
 ('BJP', 1),
 ('IUF', 1),
 ('ADMK', 1),
 ('TVK', 1),
 ('LDF', 1),
 ('BJP', 1),
 ('IUF', 1),
 ('ADMK', 1),
 ('TVK', 1),
 ('LDF', 1),
 ('BJP', 1),
 ('IUF', 1),
 ('ADMK', 1),
 ('TVK', 1),
 ('LDF', 1),
 ('BJP', 1),
 ('IUF', 1),
 ('ADMK', 1),
 ('TVK', 1),
 ('LDF', 1),
 ('BJP', 1),
 ('IUF', 1),
 ('ADMK', 1),
 ('TVK', 1),
 ('LDF', 1),
 ('BJP', 1),
 ('IUF', 1),
 ('ADMK', 1),
 ('TVK', 1),
 ('LDF', 1),
 ('BJP', 1),
 ('IUF', 1),
 ('ADMK', 1),
 ('TVK', 1),
 ('LDF', 1),
 ('BJP', 1),
 ('IUF', 1),
 ('ADMK', 1),
 ('TVK', 1),
 ('LDF', 1),
 ('BJP', 1),
 ('IUF', 1),
 ('ADMK', 1),
 ('TVK', 1),
 ('LDF', 1),
 ('BJP', 1),
 ('IUF', 1),
 ('ADMK', 1),
 ('TVK', 1),
 ('TVK', 1),
 ('ADMK', 1),
 ('IUF', 1),
 ('BJP', 1),
 ('LDF', 1),
 ('TVK', 1

In [69]:
# Add up the 1s of each word to get its total number of occurrences.
word_counts = pair_rdd.reduceByKey(lambda left, right: left + right)
word_counts.collect()

[('LDF', 28), ('IUF', 28), ('TVK', 28), ('BJP', 28), ('ADMK', 28)]

In [71]:
# Employee records, one list per employee, every field kept as a string.
# Field order: employee_ID, department_id, name, age, gender, salary, hire_date.
emp_data = [
    list(record)
    for record in (
        ("001", "101", "John Doe", "30", "Male", "50000", "2015-01-01"),
        ("002", "102", "Jane Smith", "28", "Female", "52000", "2016-03-15"),
        ("003", "103", "Robert Brown", "35", "Male", "60000", "2014-07-22"),
        ("004", "104", "Emily Johnson", "32", "Female", "58000", "2017-10-10"),
        ("005", "105", "Michael Davis", "40", "Male", "65000", "2013-12-05"),
        ("006", "106", "Sarah Wilson", "29", "Female", "54000", "2018-06-30"),
        ("007", "107", "David Martinez", "31", "Male", "57000", "2016-09-25"),
        ("008", "108", "Laura Garcia", "27", "Female", "51000", "2019-02-18"),
        ("009", "109", "James Thomas", "38", "Male", "62000", "2015-05-14"),
        ("010", "110", "Olivia Hernandez", "33", "Female", "59000", "2014-11-08"),
    )
]

In [73]:
# DDL-style schema string: every column is declared as a string, matching the
# all-string records in emp_data.
columns = (
    "employee_ID string, department_id string, name string, "
    "age string, gender string, salary string, hire_date string"
)

In [74]:
# Build a DataFrame from the in-memory employee rows using the schema string.
emp = spark.createDataFrame(
    data=emp_data,
    schema=columns,
)

In [75]:
# Print the DataFrame's rows as a text table.
emp.show()

+-----------+-------------+----------------+---+------+------+----------+
|employee_ID|department_id|            name|age|gender|salary| hire_date|
+-----------+-------------+----------------+---+------+------+----------+
|        001|          101|        John Doe| 30|  Male| 50000|2015-01-01|
|        002|          102|      Jane Smith| 28|Female| 52000|2016-03-15|
|        003|          103|    Robert Brown| 35|  Male| 60000|2014-07-22|
|        004|          104|   Emily Johnson| 32|Female| 58000|2017-10-10|
|        005|          105|   Michael Davis| 40|  Male| 65000|2013-12-05|
|        006|          106|    Sarah Wilson| 29|Female| 54000|2018-06-30|
|        007|          107|  David Martinez| 31|  Male| 57000|2016-09-25|
|        008|          108|    Laura Garcia| 27|Female| 51000|2019-02-18|
|        009|          109|    James Thomas| 38|  Male| 62000|2015-05-14|
|        010|          110|Olivia Hernandez| 33|Female| 59000|2014-11-08|
+-----------+-------------+----------------+---+------+------+----------+

In [76]:
# Display the active SparkSession object as the cell's result.
spark