In [2]:
import os
os.environ['SPARK_HOME'] = "C:/spark"
os.environ['PYSPARK_DRIVER_PYTHON'] = 'jupyter'
os.environ['PYSPARK_DRIVER_PYTHON_OPTS'] = 'lab'
os.environ['PYSPARK_PYTHON'] = 'python'

In [16]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql.window import *

spark = SparkSession.builder.appName("sparkkss").getOrCreate()
spark


In [4]:
employee_data = [
    (1, "John"), (2, "Alice"), (3, "Bob"), (4, "Emily"),
    (5, "David"), (6, "Sarah"), (7, "Michael"), (8, "Lisa"),
    (9, "William")
]
employees = spark.createDataFrame(employee_data, ["id", "name"])

salary_data = [
    ("HR", 1, 60000), ("HR", 2, 55000), ("HR", 3, 58000),
    ("IT", 4, 70000), ("IT", 5, 72000), ("IT", 6, 68000),
    ("Sales", 7, 75000), ("Sales", 8, 78000), ("Sales", 9, 77000)
]
salaries = spark.createDataFrame(salary_data, ["department", "id", "salary"])

employees.show()

salaries.show()

+---+-------+
| id|   name|
+---+-------+
|  1|   John|
|  2|  Alice|
|  3|    Bob|
|  4|  Emily|
|  5|  David|
|  6|  Sarah|
|  7|Michael|
|  8|   Lisa|
|  9|William|
+---+-------+

+----------+---+------+
|department| id|salary|
+----------+---+------+
|        HR|  1| 60000|
|        HR|  2| 55000|
|        HR|  3| 58000|
|        IT|  4| 70000|
|        IT|  5| 72000|
|        IT|  6| 68000|
|     Sales|  7| 75000|
|     Sales|  8| 78000|
|     Sales|  9| 77000|
+----------+---+------+



In [5]:
employees.createOrReplaceTempView("employees")
salaries.createOrReplaceTempView("salaries")

In [7]:
view_exists = spark.catalog.tableExists("salaries")
view_exists

True

In [9]:
empQuery=spark.sql("Select * from employees")
empQuery.show()

+---+-------+
| id|   name|
+---+-------+
|  1|   John|
|  2|  Alice|
|  3|    Bob|
|  4|  Emily|
|  5|  David|
|  6|  Sarah|
|  7|Michael|
|  8|   Lisa|
|  9|William|
+---+-------+



In [12]:
salQ= spark.sql("Select department,AVG(salary) from salaries Group By(department)")
salQ.show()

+----------+------------------+
|department|       avg(salary)|
+----------+------------------+
|        HR|57666.666666666664|
|        IT|           70000.0|
|     Sales| 76666.66666666667|
+----------+------------------+



In [15]:
comb = spark.sql("SELECT name FROM employees WHERE id IN (SELECT id FROM salaries WHERE salary > (SELECT AVG(salary) FROM salaries))")
comb.show()

+-------+
|   name|
+-------+
|  Emily|
|  David|
|Michael|
|   Lisa|
|William|
+-------+



In [19]:
widow = Window.partitionBy("department").orderBy(desc("salary"))
employee_sal = spark.sql("Select * from salaries left join employees on salaries.id=employees.id")
employee_sal.show()
employee_sal.withColumn("rank",rank().over(widow)).show()

+----------+---+------+---+-------+
|department| id|salary| id|   name|
+----------+---+------+---+-------+
|        HR|  1| 60000|  1|   John|
|        HR|  2| 55000|  2|  Alice|
|        HR|  3| 58000|  3|    Bob|
|        IT|  4| 70000|  4|  Emily|
|        IT|  5| 72000|  5|  David|
|        IT|  6| 68000|  6|  Sarah|
|     Sales|  7| 75000|  7|Michael|
|     Sales|  8| 78000|  8|   Lisa|
|     Sales|  9| 77000|  9|William|
+----------+---+------+---+-------+

+----------+---+------+---+-------+----+
|department| id|salary| id|   name|rank|
+----------+---+------+---+-------+----+
|        HR|  1| 60000|  1|   John|   1|
|        HR|  3| 58000|  3|    Bob|   2|
|        HR|  2| 55000|  2|  Alice|   3|
|        IT|  5| 72000|  5|  David|   1|
|        IT|  4| 70000|  4|  Emily|   2|
|        IT|  6| 68000|  6|  Sarah|   3|
|     Sales|  8| 78000|  8|   Lisa|   1|
|     Sales|  9| 77000|  9|William|   2|
|     Sales|  7| 75000|  7|Michael|   3|
+----------+---+------+---+-------+----