In [0]:
# Create the sample DataFrame here; the cells below compute row_number, rank, and a running total on it.

from pyspark.sql import SparkSession
from pyspark.sql import Window
from pyspark.sql.functions import row_number,rank, sum as spark_sum,dense_rank

# Obtain (or reuse) a Spark session for this notebook.
spark = (
    SparkSession.builder
    .appName("window example")
    .getOrCreate()
)

# Sample records laid out as (name, date, sales), three consecutive days per person.
data = [
    ("Alice", "2023-01-01", 500), ("Alice", "2023-01-02", 600), ("Alice", "2023-01-03", 700),
    ("Bob", "2023-01-01", 400), ("Bob", "2023-01-02", 300), ("Bob", "2023-01-03", 200),
    ("Charlie", "2023-01-01", 700), ("Charlie", "2023-01-02", 800), ("Charlie", "2023-01-03", 900),
]

# Column names, in the same order as the tuple fields above.
columns = ["name", "date", "sales"]

# Build the DataFrame from the records and print it.
df = spark.createDataFrame(data, columns)
df.show()


+-------+----------+-----+
|   name|      date|sales|
+-------+----------+-----+
|  Alice|2023-01-01|  500|
|  Alice|2023-01-02|  600|
|  Alice|2023-01-03|  700|
|    Bob|2023-01-01|  400|
|    Bob|2023-01-02|  300|
|    Bob|2023-01-03|  200|
|Charlie|2023-01-01|  700|
|Charlie|2023-01-02|  800|
|Charlie|2023-01-03|  900|
+-------+----------+-----+



In [0]:
# 1. row_number
# Number each person's rows sequentially (1, 2, 3, ...) in date order.

window_spec = Window.partitionBy("name").orderBy("date")

# show() only prints the table and returns None, so keep the DataFrame in
# df_row_num and display it as a separate step; otherwise the variable
# would hold None and could not be reused.
df_row_num = df.withColumn("row_number", row_number().over(window_spec))
df_row_num.show()

+-------+----------+-----+----------+
|   name|      date|sales|row_number|
+-------+----------+-----+----------+
|  Alice|2023-01-01|  500|         1|
|  Alice|2023-01-02|  600|         2|
|  Alice|2023-01-03|  700|         3|
|    Bob|2023-01-01|  400|         1|
|    Bob|2023-01-02|  300|         2|
|    Bob|2023-01-03|  200|         3|
|Charlie|2023-01-01|  700|         1|
|Charlie|2023-01-02|  800|         2|
|Charlie|2023-01-03|  900|         3|
+-------+----------+-----+----------+



In [0]:
# 2. rank
# Rank each person's rows by date, using the window_spec defined in the
# row_number cell (partition by name, order by date).
# show() returns None, so assign the DataFrame first and print it afterwards;
# chaining .show() onto the assignment would leave df_rank set to None.
df_rank = df.withColumn("rank", rank().over(window_spec))
df_rank.show()

+-------+----------+-----+----+
|   name|      date|sales|rank|
+-------+----------+-----+----+
|  Alice|2023-01-01|  500|   1|
|  Alice|2023-01-02|  600|   2|
|  Alice|2023-01-03|  700|   3|
|    Bob|2023-01-01|  400|   1|
|    Bob|2023-01-02|  300|   2|
|    Bob|2023-01-03|  200|   3|
|Charlie|2023-01-01|  700|   1|
|Charlie|2023-01-02|  800|   2|
|Charlie|2023-01-03|  900|   3|
+-------+----------+-----+----+



In [0]:
# 3. running_total
# Cumulative sum of sales per person in date order, using the window_spec
# defined in the row_number cell.
#
# NOTE: an ordered window with no explicit frame uses Spark's default frame
# (range from unbounded preceding to the current row), so rows that share the
# same date within a name would all receive the same total. Add
# .rowsBetween(Window.unboundedPreceding, Window.currentRow) to the window if
# a strict row-by-row total is needed for data with duplicate dates.

# show() returns None, so assign the DataFrame first and print it afterwards;
# chaining .show() onto the assignment would leave df_running_total set to None.
df_running_total = df.withColumn("running_total", spark_sum("sales").over(window_spec))
df_running_total.show()

+-------+----------+-----+-------------+
|   name|      date|sales|running_total|
+-------+----------+-----+-------------+
|  Alice|2023-01-01|  500|          500|
|  Alice|2023-01-02|  600|         1100|
|  Alice|2023-01-03|  700|         1800|
|    Bob|2023-01-01|  400|          400|
|    Bob|2023-01-02|  300|          700|
|    Bob|2023-01-03|  200|          900|
|Charlie|2023-01-01|  700|          700|
|Charlie|2023-01-02|  800|         1500|
|Charlie|2023-01-03|  900|         2400|
+-------+----------+-----+-------------+



In [0]:
#4.lead and lag

from pyspark.sql import SparkSession
from pyspark.sql import Window
from pyspark.sql.functions import lead, lag

# Obtain (or reuse) a Spark session for the lead/lag example.
spark = (
    SparkSession.builder
    .appName("lead and lag function")
    .getOrCreate()
)

# Sample records laid out as (name, date, sales), three consecutive days per person.
data = [
    ("Alice", "2023-01-01", 500), ("Alice", "2023-01-02", 600), ("Alice", "2023-01-03", 700),
    ("Bob", "2023-01-01", 400), ("Bob", "2023-01-02", 300), ("Bob", "2023-01-03", 200),
    ("Charlie", "2023-01-01", 700), ("Charlie", "2023-01-02", 800), ("Charlie", "2023-01-03", 900),
]

# Column names, in the same order as the tuple fields above.
columns = ["name", "date", "sales"]

df = spark.createDataFrame(data, columns)

# Window: one partition per person, rows ordered by date.
window_spec = Window.partitionBy("name").orderBy("date")

# lead reads the sales value one row ahead in the window, lag one row behind;
# the last / first row of each partition has no such neighbour and gets null.
next_sales_col = lead("sales", 1).over(window_spec)
prev_sales_col = lag("sales", 1).over(window_spec)

df_with_lead_lag = (
    df
    .withColumn("next_day_sales", next_sales_col)
    .withColumn("previous_day_sales", prev_sales_col)
)

df_with_lead_lag.show()


+-------+----------+-----+--------------+------------------+
|   name|      date|sales|next_day_sales|previous_day_sales|
+-------+----------+-----+--------------+------------------+
|  Alice|2023-01-01|  500|           600|              null|
|  Alice|2023-01-02|  600|           700|               500|
|  Alice|2023-01-03|  700|          null|               600|
|    Bob|2023-01-01|  400|           300|              null|
|    Bob|2023-01-02|  300|           200|               400|
|    Bob|2023-01-03|  200|          null|               300|
|Charlie|2023-01-01|  700|           800|              null|
|Charlie|2023-01-02|  800|           900|               700|
|Charlie|2023-01-03|  900|          null|               800|
+-------+----------+-----+--------------+------------------+



In [0]:
#1. interview questions

