In [2]:
from pyspark.sql import SparkSession

spark = SparkSession.builder.appName("PySpark-JSON-CSV-Example").getOrCreate()


In [3]:
employees_df = spark.read.csv("employees.csv", header=True, inferSchema=True)
employees_df.show()


+---+------------+----------+------+
| id|        name|department|salary|
+---+------------+----------+------+
|  1|Rahul Sharma|        IT| 55000|
|  2| Priya Singh|        HR| 60000|
|  3|  Aman Kumar|   Finance| 48000|
|  4| Sneha Reddy| Marketing| 52000|
|  5| Arjun Mehta|        IT| 75000|
+---+------------+----------+------+



In [4]:
departments_df = spark.read.json("departments.json")
departments_df.show()

+---------------+-------+-------+
|_corrupt_record|   dept|manager|
+---------------+-------+-------+
|              [|   NULL|   NULL|
|           NULL|     IT|  Karan|
|           NULL|     HR|  Divya|
|           NULL|Finance| Suresh|
|              ]|   NULL|   NULL|
+---------------+-------+-------+



In [6]:
employees_df.filter(employees_df.salary > 68000).show()



+---+-----------+----------+------+
| id|       name|department|salary|
+---+-----------+----------+------+
|  5|Arjun Mehta|        IT| 75000|
+---+-----------+----------+------+



In [8]:
employees_df.groupBy("department").avg("salary").show()


+----------+-----------+
|department|avg(salary)|
+----------+-----------+
|        HR|    60000.0|
|   Finance|    48000.0|
| Marketing|    52000.0|
|        IT|    65000.0|
+----------+-----------+



In [9]:
from pyspark.sql import functions as F

employees_df.agg(
    F.max("salary").alias("MaxSalary"),
    F.min("salary").alias("MinSalary")
).show()


+---------+---------+
|MaxSalary|MinSalary|
+---------+---------+
|    75000|    48000|
+---------+---------+



In [12]:
emp_dept_df = employees_df.join(departments_df, employees_df.department == departments_df.dept, "inner")
emp_dept_df.show()

+---+------------+----------+------+---------------+-------+-------+
| id|        name|department|salary|_corrupt_record|   dept|manager|
+---+------------+----------+------+---------------+-------+-------+
|  1|Rahul Sharma|        IT| 55000|           NULL|     IT|  Karan|
|  2| Priya Singh|        HR| 60000|           NULL|     HR|  Divya|
|  3|  Aman Kumar|   Finance| 48000|           NULL|Finance| Suresh|
|  5| Arjun Mehta|        IT| 75000|           NULL|     IT|  Karan|
+---+------------+----------+------+---------------+-------+-------+



In [16]:
# Register as temp views
employees_df.createOrReplaceTempView("employees")
departments_df.createOrReplaceTempView("departments")

spark.sql("SELECT name, department, salary FROM employees WHERE salary > 48000").show()

spark.sql("""
    SELECT department, AVG(salary) as avg_salary
    FROM employees
    GROUP BY department
""").show()

+------------+----------+------+
|        name|department|salary|
+------------+----------+------+
|Rahul Sharma|        IT| 55000|
| Priya Singh|        HR| 60000|
| Sneha Reddy| Marketing| 52000|
| Arjun Mehta|        IT| 75000|
+------------+----------+------+

+----------+----------+
|department|avg_salary|
+----------+----------+
|        HR|   60000.0|
|   Finance|   48000.0|
| Marketing|   52000.0|
|        IT|   65000.0|
+----------+----------+

