# Question 1
# Apply Transformations & actions in pyspark

## Transformation

Creating RDD

In [1]:
from pyspark.sql import SparkSession
spark = SparkSession.builder.appName("Transformations_Actions_Assessment").getOrCreate()
data = [
    ("Amit", "IT", 60000),
    ("Priya", "HR", 50000),
    ("Ravi", "Finance", 70000),
    ("Neha", "IT", 65000),
    ("Vikram", "Finance", 55000),
    ("Anjali", "HR", 48000),
    ("Suresh", "IT", 72000),
    ("Kavya", "Finance", 62000),
    ("Rahul", "HR", 51000),
    ("Meena", "IT", 58000)
]

rdd = spark.sparkContext.parallelize(data)


1) map()

Write a transformation to increase every employee’s salary by 10%.

In [2]:
map_rdd = rdd.map(lambda x:(x[0], x[1], x[2]*1.10))
map_rdd.collect()

[('Amit', 'IT', 66000.0),
 ('Priya', 'HR', 55000.00000000001),
 ('Ravi', 'Finance', 77000.0),
 ('Neha', 'IT', 71500.0),
 ('Vikram', 'Finance', 60500.00000000001),
 ('Anjali', 'HR', 52800.00000000001),
 ('Suresh', 'IT', 79200.0),
 ('Kavya', 'Finance', 68200.0),
 ('Rahul', 'HR', 56100.00000000001),
 ('Meena', 'IT', 63800.00000000001)]

2) flatMap()

Split each employee record into individual words.


In [4]:
flat_map = rdd.flatMap(lambda x: [x[0], x[1], str(x[2])])
for element in flat_map.collect():
    print(element)

Amit
IT
60000
Priya
HR
50000
Ravi
Finance
70000
Neha
IT
65000
Vikram
Finance
55000
Anjali
HR
48000
Suresh
IT
72000
Kavya
Finance
62000
Rahul
HR
51000
Meena
IT
58000


3) filter()

Filter out employees whose salary is greater than 60,000.


In [5]:
filter_rdd = rdd.filter(lambda x: x[2] > 60000)
filter_rdd.collect()

[('Ravi', 'Finance', 70000),
 ('Neha', 'IT', 65000),
 ('Suresh', 'IT', 72000),
 ('Kavya', 'Finance', 62000)]

4) reduceByKey()

Find the total salary paid for each department.


In [8]:
total_sal = rdd.map(lambda x: (x[1], x[2]))
red_rdd =total_sal.reduceByKey(lambda a, b: a + b)
red_rdd.collect()

[('IT', 255000), ('HR', 149000), ('Finance', 187000)]

5) sortByKey()

Sort employees by department name alphabetically.

In [9]:
sorted_emp = rdd.map(lambda x: (x[1], x[0])).sortByKey()
sorted_emp.collect()

[('Finance', 'Ravi'),
 ('Finance', 'Vikram'),
 ('Finance', 'Kavya'),
 ('HR', 'Priya'),
 ('HR', 'Anjali'),
 ('HR', 'Rahul'),
 ('IT', 'Amit'),
 ('IT', 'Neha'),
 ('IT', 'Suresh'),
 ('IT', 'Meena')]

## Action

1) collect()

Collect all employee data

In [10]:
rdd.collect()

[('Amit', 'IT', 60000),
 ('Priya', 'HR', 50000),
 ('Ravi', 'Finance', 70000),
 ('Neha', 'IT', 65000),
 ('Vikram', 'Finance', 55000),
 ('Anjali', 'HR', 48000),
 ('Suresh', 'IT', 72000),
 ('Kavya', 'Finance', 62000),
 ('Rahul', 'HR', 51000),
 ('Meena', 'IT', 58000)]

2) count()

Count the Number of Employees in the dataset

In [11]:
rdd.count()

10

3) first()

Retrieve the first employee record.

In [12]:
rdd.first()

('Amit', 'IT', 60000)

4) take()

Take the top 3 employees

In [13]:
rdd.take(3)

[('Amit', 'IT', 60000), ('Priya', 'HR', 50000), ('Ravi', 'Finance', 70000)]

5) saveAsTextFile()

Save the RDD result of employees in HR department into a text file.




In [14]:
hr_dept = rdd.filter(lambda x: x[1] == 'HR')
save_file = hr_dept.saveAsTextFile("Hr_Employees")

#Question 2:

#Transformation using filter, Joins, groupBy() and aggregations, window functions

Creating sample data

In [15]:
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql.window import Window

spark = SparkSession.builder.appName("Transformations_Assessment").getOrCreate()

employee_data = [
    (1, "Amit", "IT", 60000),
    (2, "Priya", "HR", 50000),
    (3, "Ravi", "Finance", 70000),
    (4, "Neha", "IT", 65000),
    (5, "Vikram", "Finance", 55000),
    (6, "Anjali", "HR", 48000),
    (7, "Suresh", "IT", 72000),
    (8, "Kavya", "Finance", 62000),
    (9, "Rahul", "HR", 51000),
    (10, "Meena", "IT", 58000)
]

department_data = [
    ("IT", "Bangalore"),
    ("HR", "Mumbai"),
    ("Finance", "Delhi")
]

df_employees = spark.createDataFrame(employee_data, ["emp_id", "name", "department", "salary"])
df_departments = spark.createDataFrame(department_data, ["department", "location"])

df_employees.show()
df_departments.show()


+------+------+----------+------+
|emp_id|  name|department|salary|
+------+------+----------+------+
|     1|  Amit|        IT| 60000|
|     2| Priya|        HR| 50000|
|     3|  Ravi|   Finance| 70000|
|     4|  Neha|        IT| 65000|
|     5|Vikram|   Finance| 55000|
|     6|Anjali|        HR| 48000|
|     7|Suresh|        IT| 72000|
|     8| Kavya|   Finance| 62000|
|     9| Rahul|        HR| 51000|
|    10| Meena|        IT| 58000|
+------+------+----------+------+

+----------+---------+
|department| location|
+----------+---------+
|        IT|Bangalore|
|        HR|   Mumbai|
|   Finance|    Delhi|
+----------+---------+



1) filter

Filter employees whose salary is greater than 60,000 and department is IT or Finance

In [19]:
from pyspark.sql.functions import col
df1 = df_employees.filter((col("salary") > 60000) &
      ((col("department") == 'IT') | (col("department")=='Finance')))
df1.show()

+------+------+----------+------+
|emp_id|  name|department|salary|
+------+------+----------+------+
|     3|  Ravi|   Finance| 70000|
|     4|  Neha|        IT| 65000|
|     7|Suresh|        IT| 72000|
|     8| Kavya|   Finance| 62000|
+------+------+----------+------+



2) Join

Join employees with departments to get their location

In [21]:
df1 = df_employees.join(df_departments, on="department", how="inner")
df1.show()

+----------+------+------+------+---------+
|department|emp_id|  name|salary| location|
+----------+------+------+------+---------+
|   Finance|     3|  Ravi| 70000|    Delhi|
|   Finance|     5|Vikram| 55000|    Delhi|
|   Finance|     8| Kavya| 62000|    Delhi|
|        HR|     2| Priya| 50000|   Mumbai|
|        HR|     6|Anjali| 48000|   Mumbai|
|        HR|     9| Rahul| 51000|   Mumbai|
|        IT|     1|  Amit| 60000|Bangalore|
|        IT|     4|  Neha| 65000|Bangalore|
|        IT|     7|Suresh| 72000|Bangalore|
|        IT|    10| Meena| 58000|Bangalore|
+----------+------+------+------+---------+



3) GroupBy with Aggregations

Find the average salary of employees in each department.

In [24]:
from pyspark.sql.functions import avg
df1 = df_employees.groupBy("department").agg(avg("salary").alias("avg_salary"))
df1.show()

+----------+------------------+
|department|        avg_salary|
+----------+------------------+
|        HR|49666.666666666664|
|   Finance|62333.333333333336|
|        IT|           63750.0|
+----------+------------------+



4) Simple Aggregations

- Find the maximum salary and minimum salary across all employees.

- Find the avg salary

In [35]:
from pyspark.sql.functions import max, min
df1 = df_employees.agg(max("salary").alias("max_salary"), min("salary").alias("min_salary"), avg("salary").alias("avg_salary"))
df1.show()


+----------+----------+----------+
|max_salary|min_salary|avg_salary|
+----------+----------+----------+
|     72000|     48000|   59100.0|
+----------+----------+----------+



5) Window Functions

Find the rank of employees based on salary within each department.

In [34]:
from pyspark.sql.window import Window
from pyspark.sql.functions import rank, percent_rank,ntile
window_spec = Window.partitionBy("department").orderBy(F.col("salary").desc())
#rank
df1 = df_employees.withColumn("salary_rank", rank().over(window_spec))
df1.show()
#percent rank
df2 = df_employees.withColumn("salary_rank", percent_rank().over(window_spec))
df2.show()


+------+------+----------+------+-----------+
|emp_id|  name|department|salary|salary_rank|
+------+------+----------+------+-----------+
|     3|  Ravi|   Finance| 70000|          1|
|     8| Kavya|   Finance| 62000|          2|
|     5|Vikram|   Finance| 55000|          3|
|     9| Rahul|        HR| 51000|          1|
|     2| Priya|        HR| 50000|          2|
|     6|Anjali|        HR| 48000|          3|
|     7|Suresh|        IT| 72000|          1|
|     4|  Neha|        IT| 65000|          2|
|     1|  Amit|        IT| 60000|          3|
|    10| Meena|        IT| 58000|          4|
+------+------+----------+------+-----------+

+------+------+----------+------+------------------+
|emp_id|  name|department|salary|       salary_rank|
+------+------+----------+------+------------------+
|     3|  Ravi|   Finance| 70000|               0.0|
|     8| Kavya|   Finance| 62000|               0.5|
|     5|Vikram|   Finance| 55000|               1.0|
|     9| Rahul|        HR| 51000|    