In [1]:
from pyspark.sql import SparkSession

# Obtain the notebook's Spark session: getOrCreate() hands back an already
# running session if there is one, otherwise it starts a new one under the
# application name given here.
spark = (
    SparkSession.builder
    .appName("PySparkSteppingUp")
    .getOrCreate()
)

# Report which Spark release this notebook is running against
print("Apache Spark version:", spark.version)

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/02/01 19:01:45 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
25/02/01 19:01:45 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.
25/02/01 19:01:45 WARN Utils: Service 'SparkUI' could not bind on port 4041. Attempting port 4042.


Apache Spark version: 3.5.4


In [2]:
# Load the sample CSV into a DataFrame. The first line of the file supplies
# the column names, and Spark infers each column's type from the data.
df = (
    spark.read
    .option("header", True)
    .option("inferSchema", True)
    .csv("data/sample.csv")
)

# Preview the loaded rows (show() prints at most 20 rows by default)
df.show()

+---+-------+---+------+-----------+
| id|   name|age|salary| department|
+---+-------+---+------+-----------+
|  1|  Alice| 30| 70000|         HR|
|  2|    Bob| 35| 80000|Engineering|
|  3|Charlie| 25| 50000|  Marketing|
|  4|  David| 40| 90000|Engineering|
|  5|    Eva| 28| 60000|         HR|
|  6|  Frank| 32| 75000|  Marketing|
|  7|   Gina| 27| 55000|Engineering|
|  8|  Harry| 31| 70000|         HR|
|  9|    Ivy| 29| 60000|  Marketing|
| 10|   Jack| 33| 80000|Engineering|
| 11|   Kate| 26| 50000|         HR|
| 12|   Lily| 34| 75000|  Marketing|
| 13|   Mike| 28| 60000|Engineering|
| 14|  Nancy| 30| 70000|         HR|
| 15|  Oscar| 32| 80000|  Marketing|
+---+-------+---+------+-----------+



In [3]:
# Print the DataFrame's schema as a tree: one line per column with its type and nullability
df.printSchema()

root
 |-- id: integer (nullable = true)
 |-- name: string (nullable = true)
 |-- age: integer (nullable = true)
 |-- salary: integer (nullable = true)
 |-- department: string (nullable = true)



## Window Functions - More advanced "Aggregations"

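> Window functions allow you to perform calculations across a set of rows that are related to the current row. Unlike a `groupBy` aggregation, they return a value for every input row instead of collapsing each group into a single row.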

In [4]:
from pyspark.sql.window import Window
from pyspark.sql.functions import col, rank

# How the ranking below works:
#   * partitionBy("department") splits the rows into one group per department;
#   * orderBy(<salary descending>) sorts each group from highest to lowest salary;
#   * rank() numbers the rows in that order -- equal salaries share a rank,
#     and the rank(s) that would have followed a tie are skipped.
highest_salary_first = col("salary").desc()
window_spec = Window.partitionBy("department").orderBy(highest_salary_first)

# Attach each employee's within-department rank as a new 'salary_rank' column
salary_rank = rank().over(window_spec)
df_with_rank = df.withColumn("salary_rank", salary_rank)
df_with_rank.show()

+---+-------+---+------+-----------+-----------+
| id|   name|age|salary| department|salary_rank|
+---+-------+---+------+-----------+-----------+
|  4|  David| 40| 90000|Engineering|          1|
|  2|    Bob| 35| 80000|Engineering|          2|
| 10|   Jack| 33| 80000|Engineering|          2|
| 13|   Mike| 28| 60000|Engineering|          4|
|  7|   Gina| 27| 55000|Engineering|          5|
|  1|  Alice| 30| 70000|         HR|          1|
|  8|  Harry| 31| 70000|         HR|          1|
| 14|  Nancy| 30| 70000|         HR|          1|
|  5|    Eva| 28| 60000|         HR|          4|
| 11|   Kate| 26| 50000|         HR|          5|
| 15|  Oscar| 32| 80000|  Marketing|          1|
|  6|  Frank| 32| 75000|  Marketing|          2|
| 12|   Lily| 34| 75000|  Marketing|          2|
|  9|    Ivy| 29| 60000|  Marketing|          4|
|  3|Charlie| 25| 50000|  Marketing|          5|
+---+-------+---+------+-----------+-----------+

