* to_timestamp()
* to_date()
* date_format()
* months_between()
* explode nested array into rows
* collect_list()
* collect_set()
* countDistinct()
* map_keys() / map_values()
* struct()
* row_number() Window function
* rank Window Function
* percent_rank Window Function
* lag Window Function
* lead Window Function

In [9]:
import findspark
findspark.init()
import pandas as pd
import pyspark
from pyspark.sql import SparkSession

from pyspark.sql.types import StructField, StructType, StringType, MapType
from pyspark.sql.functions import col,lit,expr,explode,when,concat,split,substring,translate,to_timestamp
from pyspark.sql.functions import *
from pyspark.sql.types import StructType,StructField, StringType, IntegerType


In [2]:
# Create (or reuse) the Spark session that every later cell relies on.
spark = (
    SparkSession.builder
    .appName("Train_Hard")
    .getOrCreate()
)

In [4]:
# One-row sample: an id and a timestamp held as a plain string.
df = spark.createDataFrame(
    [("1", "2019-06-24 12:01:19.000")],
    ["id", "input_timestamp"],
)
df.show()

+---+--------------------+
| id|     input_timestamp|
+---+--------------------+
|  1|2019-06-24 12:01:...|
+---+--------------------+



#### Converting to timestamp

In [6]:
# Parse the string column into a new "time" column using to_timestamp's default parsing.
(
    df
    .withColumn("time", to_timestamp(col("input_timestamp")))
    .show()
)

+---+--------------------+-------------------+
| id|     input_timestamp|               time|
+---+--------------------+-------------------+
|  1|2019-06-24 12:01:...|2019-06-24 12:01:19|
+---+--------------------+-------------------+



# Note
#### - Spark's default timestamp string format is 'yyyy-MM-dd HH:mm:ss.SSS'.

- Note that when a date/time string is not in this default format, Spark date functions return null unless a matching pattern is supplied.
- Hence, first convert such input strings to Spark TimestampType using the to_timestamp function with an explicit format.

In [10]:
# Parse a month-first literal by supplying a pattern that matches its layout.
df.select(
    to_timestamp(
        lit("06-24-2019 12:01:19.000"),
        "MM-dd-yyyy HH:mm:ss.SSSS",
    ).alias("DT")
).show()

+-------------------+
|                 DT|
+-------------------+
|2019-06-24 12:01:19|
+-------------------+



In [11]:
# Same literal, but with a day-first pattern that does not match it;
# the captured output below shows the result is null.
df.select(
    to_timestamp(
        lit("06-24-2019 12:01:19.000"),
        "dd-MM-yyyy HH:mm:ss.SSSS",
    ).alias("DT")
).show()

+----+
|  DT|
+----+
|null|
+----+



# to_date() – 
* Convert Timestamp to Date

In [12]:
# Rebuild the one-row sample (id + timestamp string) for the to_date examples.
df = spark.createDataFrame(
    [("1", "2019-06-24 12:01:19.000")],
    ["id", "input_timestamp"],
)
df.show()

+---+--------------------+
| id|     input_timestamp|
+---+--------------------+
|  1|2019-06-24 12:01:...|
+---+--------------------+



#### Converting time string to date

In [16]:
# Derive a date column from the timestamp string.
(
    df
    .withColumn("date_type", to_date(col("input_timestamp")))
    .show(truncate=False)
)

+---+-----------------------+----------+
|id |input_timestamp        |date_type |
+---+-----------------------+----------+
|1  |2019-06-24 12:01:19.000|2019-06-24|
+---+-----------------------+----------+



#### fetching current date and timestamp

In [17]:
# Append the current date as a new column.
(
    df
    .withColumn("Curr_date", current_date())
    .show(truncate=False)
)

+---+-----------------------+----------+
|id |input_timestamp        |Curr_date |
+---+-----------------------+----------+
|1  |2019-06-24 12:01:19.000|2022-10-31|
+---+-----------------------+----------+



In [19]:
# Append the current timestamp as a new column.
(
    df
    .withColumn("Curr_timestamp", current_timestamp())
    .show(truncate=False)
)

+---+-----------------------+--------------------------+
|id |input_timestamp        |Curr_timestamp            |
+---+-----------------------+--------------------------+
|1  |2019-06-24 12:01:19.000|2022-10-31 00:31:02.024465|
+---+-----------------------+--------------------------+



#### Convert the timestamp string to TimestampType (shown next to the current date)

In [20]:
# Add today's date and a parsed timestamp column beside the raw string.
(
    df
    .withColumn("Curr_date", current_date())
    .withColumn("timestamp", to_timestamp(col("input_timestamp")))
    .show(truncate=False)
)

+---+-----------------------+----------+-------------------+
|id |input_timestamp        |Curr_date |timestamp          |
+---+-----------------------+----------+-------------------+
|1  |2019-06-24 12:01:19.000|2022-10-31|2019-06-24 12:01:19|
+---+-----------------------+----------+-------------------+



### Here is another way to convert a timestamp string (or a TimestampType column) to DateType: the cast function.

In [21]:
# Cast the timestamp string column straight to a date.
(
    df
    .withColumn("date_type", df["input_timestamp"].cast("date"))
    .show(truncate=False)
)

+---+-----------------------+----------+
|id |input_timestamp        |date_type |
+---+-----------------------+----------+
|1  |2019-06-24 12:01:19.000|2019-06-24|
+---+-----------------------+----------+



In [22]:
# Parse the string to a timestamp first, then cast that timestamp to a date.
(
    df
    .withColumn(
        "date_type",
        to_timestamp(col("input_timestamp")).cast("date"),
    )
    .show(truncate=False)
)

+---+-----------------------+----------+
|id |input_timestamp        |date_type |
+---+-----------------------+----------+
|1  |2019-06-24 12:01:19.000|2019-06-24|
+---+-----------------------+----------+



# date_format() 
- Convert Date to String format

In [4]:
# Minimal one-row frame to hang the date_format examples on.
df = spark.createDataFrame(data=[["1"]], schema=["id"])
df.show()

+---+
| id|
+---+
|  1|
+---+



In [9]:
# Render the current date/time as strings in two layouts.
# "HH" is the 24-hour hour-of-day. The previous "hh" is the 12-hour clock,
# which is ambiguous without an am/pm marker ("a") in the pattern.
df.select(
    col("*"),
    current_date().alias("Today's Date"),
    date_format(current_timestamp(), "yyyy-MM-dd").alias("yr-mnth-day"),
    date_format(current_timestamp(), "MM/dd/yyyy HH:mm").alias("MM/dd/yyyy"),
).show()

+---+------------+-----------+----------------+
| id|Today's Date|yr-mnth-day|      MM/dd/yyyy|
+---+------------+-----------+----------------+
|  1|  2022-10-31| 2022-10-31|10/31/2022 01:38|
+---+------------+-----------+----------------+



# datediff()
* difference between two dates

In [10]:
# Three ids, each with a date stored as a yyyy-MM-dd string.
data = [
    ("1", "2019-07-01"),
    ("2", "2019-06-24"),
    ("3", "2019-08-24"),
]
df = spark.createDataFrame(data, ["id", "date"])
df.show()

+---+----------+
| id|      date|
+---+----------+
|  1|2019-07-01|
|  2|2019-06-24|
|  3|2019-08-24|
+---+----------+



In [14]:
# Days elapsed from each row's date to today.
(
    df
    .withColumn("Current_Date", current_date())
    .withColumn("Date_Diff", datediff(current_date(), col("date")))
    .show()
)

+---+----------+------------+---------+
| id|      date|Current_Date|Date_Diff|
+---+----------+------------+---------+
|  1|2019-07-01|  2022-10-31|     1218|
|  2|2019-06-24|  2022-10-31|     1225|
|  3|2019-08-24|  2022-10-31|     1164|
+---+----------+------------+---------+



# months_between()
* to get month and year differences between two dates

In [19]:
# Months between today and each date: raw, rounded to 2 decimals,
# and divided by 12 then rounded to give an approximate year difference.
# NOTE: `round` here is the Spark column function brought in by the wildcard import.
df.select(
    "*",
    current_date().alias("Current_Date"),
    months_between(current_date(), col("date")).alias("Month_Diff"),
    round(months_between(current_date(), col("date")), 2).alias("Round_Month_Diff"),
    round(months_between(current_date(), col("date")) / lit(12)).alias("Round_Year_Diff"),
).show()

+---+----------+------------+-----------+----------------+---------------+
| id|      date|Current_Date| Month_Diff|Round_Month_Diff|Round_Year_Diff|
+---+----------+------------+-----------+----------------+---------------+
|  1|2019-07-01|  2022-10-31|39.96774194|           39.97|            3.0|
|  2|2019-06-24|  2022-10-31|40.22580645|           40.23|            3.0|
|  3|2019-08-24|  2022-10-31|38.22580645|           38.23|            3.0|
+---+----------+------------+-----------+----------------+---------------+



## PySpark – explode nested array into rows
* Problem: How to explode & flatten nested array (Array of Array) DataFrame columns into rows using PySpark.

* Solution: PySpark explode function can be used to explode an Array of Array (nested Array) ArrayType(ArrayType(StringType)) columns to rows on PySpark DataFrame using python example.

In [21]:

# Each person has an array of arrays of subject names.
arrayArrayData = [
    ("James", [["Java", "Scala", "C++"], ["Spark", "Java"]]),
    ("Michael", [["Spark", "Java", "C++"], ["Spark", "Java"]]),
    ("Robert", [["CSharp", "VB"], ["Spark", "Python"]]),
]

df = spark.createDataFrame(arrayArrayData, ["name", "subjects"])
df.show(truncate=False)

+-------+-----------------------------------+
|name   |subjects                           |
+-------+-----------------------------------+
|James  |[[Java, Scala, C++], [Spark, Java]]|
|Michael|[[Spark, Java, C++], [Spark, Java]]|
|Robert |[[CSharp, VB], [Spark, Python]]    |
+-------+-----------------------------------+



In [22]:
# explode() emits one row per element of the outer array, so each inner array gets its own row.
df.select(
    col("name"),
    explode(col("subjects")),
).show(truncate=False)

+-------+------------------+
|name   |col               |
+-------+------------------+
|James  |[Java, Scala, C++]|
|James  |[Spark, Java]     |
|Michael|[Spark, Java, C++]|
|Michael|[Spark, Java]     |
|Robert |[CSharp, VB]      |
|Robert |[Spark, Python]   |
+-------+------------------+



#### If you want to flatten the arrays, use flatten function which converts array of array columns to a single array on DataFrame.

In [23]:
# flatten() merges the inner arrays into a single array per row.
df.select(
    col("name"),
    flatten(col("subjects")),
).show(truncate=False)

+-------+-------------------------------+
|name   |flatten(subjects)              |
+-------+-------------------------------+
|James  |[Java, Scala, C++, Spark, Java]|
|Michael|[Spark, Java, C++, Spark, Java]|
|Robert |[CSharp, VB, Spark, Python]    |
+-------+-------------------------------+



# array_contains() 
* sql function is used to check if array column contains a value. Returns null if the array is null, true if the array contains the value, and false otherwise.

In [34]:

from pyspark.sql.types import StringType, ArrayType, StructType, StructField

# Name, two string-array columns (languages) and two state codes per person.
data = [
    ("James,,Smith", ["Java", "Scala", "C++"], ["Spark", "Java"], "OH", "CA"),
    ("Michael,Rose,", ["Spark", "Java", "C++"], ["Spark", "Java"], "NY", "NJ"),
    ("Robert,,Williams", ["CSharp", "VB"], ["Spark", "Python"], "UT", "NV"),
]

# Explicit schema so both language columns are typed as arrays of strings.
schema = StructType(
    [
        StructField("name", StringType(), True),
        StructField("languagesAtSchool", ArrayType(StringType()), True),
        StructField("languagesAtWork", ArrayType(StringType()), True),
        StructField("currentState", StringType(), True),
        StructField("previousState", StringType(), True),
    ]
)

df = spark.createDataFrame(data, schema)
df.show()

+----------------+------------------+---------------+------------+-------------+
|            name| languagesAtSchool|languagesAtWork|currentState|previousState|
+----------------+------------------+---------------+------------+-------------+
|    James,,Smith|[Java, Scala, C++]|  [Spark, Java]|          OH|           CA|
|   Michael,Rose,|[Spark, Java, C++]|  [Spark, Java]|          NY|           NJ|
|Robert,,Williams|      [CSharp, VB]|[Spark, Python]|          UT|           NV|
+----------------+------------------+---------------+------------+-------------+



In [32]:
# Flag the rows whose languagesAtSchool array includes "Java".
(
    df
    .withColumn(
        "array_contains",
        array_contains(col("languagesAtSchool"), "Java"),
    )
    .show()
)

+----------------+------------------+---------------+------------+-------------+--------------+
|            name| languagesAtSchool|languagesAtWork|currentState|previousState|array_contains|
+----------------+------------------+---------------+------------+-------------+--------------+
|    James,,Smith|[Java, Scala, C++]|  [Spark, Java]|          OH|           CA|          true|
|   Michael,Rose,|[Spark, Java, C++]|  [Spark, Java]|          NY|           NJ|          true|
|Robert,,Williams|      [CSharp, VB]|[Spark, Python]|          UT|           NV|         false|
+----------------+------------------+---------------+------------+-------------+--------------+



# array() 
* function to create a new array column by merging the data from multiple columns. All input columns must have the same data type

In [39]:
# Wrap the two array columns into one array-of-arrays column.
df.select(
    array("languagesAtSchool", "languagesAtWork").alias("Combine")
).show(truncate=False)

+-----------------------------------+
|Combine                            |
+-----------------------------------+
|[[Java, Scala, C++], [Spark, Java]]|
|[[Spark, Java, C++], [Spark, Java]]|
|[[CSharp, VB], [Spark, Python]]    |
+-----------------------------------+



# collect_list() 
* function returns all values from an input column with duplicates.

In [40]:

# Employee rows (name, department, salary); includes a duplicated James/Sales/3000 row.
simpleData = [
    ("James", "Sales", 3000),
    ("Michael", "Sales", 4600),
    ("Robert", "Sales", 4100),
    ("Maria", "Finance", 3000),
    ("James", "Sales", 3000),
    ("Scott", "Finance", 3300),
    ("Jen", "Finance", 3900),
    ("Jeff", "Marketing", 3000),
    ("Kumar", "Marketing", 2000),
    ("Saif", "Sales", 4100),
]
schema = ["employee_name", "department", "salary"]
df = spark.createDataFrame(simpleData, schema)
df.show(truncate=False)

+-------------+----------+------+
|employee_name|department|salary|
+-------------+----------+------+
|James        |Sales     |3000  |
|Michael      |Sales     |4600  |
|Robert       |Sales     |4100  |
|Maria        |Finance   |3000  |
|James        |Sales     |3000  |
|Scott        |Finance   |3300  |
|Jen          |Finance   |3900  |
|Jeff         |Marketing |3000  |
|Kumar        |Marketing |2000  |
|Saif         |Sales     |4100  |
+-------------+----------+------+



In [43]:
# Gather every salary, duplicates included, into one list-valued cell.
df.select(
    collect_list(col("salary"))
).show(truncate=False)

+------------------------------------------------------------+
|collect_list(salary)                                        |
+------------------------------------------------------------+
|[3000, 4600, 4100, 3000, 3000, 3300, 3900, 3000, 2000, 4100]|
+------------------------------------------------------------+



In [44]:
# For comparison: the same salaries as ordinary rows.
df.select(df["salary"]).show()

+------+
|salary|
+------+
|  3000|
|  4600|
|  4100|
|  3000|
|  3000|
|  3300|
|  3900|
|  3000|
|  2000|
|  4100|
+------+



# collect_set() 
* function returns all values from an input column with duplicate values eliminated.



In [45]:
# Gather the salaries into one list-valued cell with duplicates removed.
df.select(
    collect_set(col("salary"))
).show(truncate=False)

+------------------------------------+
|collect_set(salary)                 |
+------------------------------------+
|[4600, 3000, 3900, 4100, 3300, 2000]|
+------------------------------------+



# countDistinct() 
* function returns the number of distinct elements in a column; when several columns are given, it counts the distinct combinations of their values

In [49]:
# Count the distinct (department, salary) pairs; the result is a one-row DataFrame.
df2 = df.select(
    countDistinct(col("department"), col("salary"))
)
df2.show(truncate=False)

+----------------------------------+
|count(DISTINCT department, salary)|
+----------------------------------+
|8                                 |
+----------------------------------+



In [51]:

# collect()[0] is the whole Row object, so the printed text includes the Row(...) wrapper.
print("Distinct Count of Department & Salary: ", df2.collect()[0], sep="")

Distinct Count of Department & Salary: Row(count(DISTINCT department, salary)=8)


In [52]:

# Index into the Row as well to print only the bare count.
print("Distinct Count of Department & Salary: ", df2.collect()[0][0], sep="")

Distinct Count of Department & Salary: 8


# first() 
* function returns the first element in a column. When ignoreNulls is set to true, it returns the first non-null element.



In [53]:
# First salary value in the column.
df.select(
    first(col("salary"))
).show()

+-------------+
|first(salary)|
+-------------+
|         3000|
+-------------+



# last()
* function returns the last element in a column. When ignoreNulls is set to true, it returns the last non-null element.

In [54]:
# Last salary value in the column.
df.select(
    last(col("salary"))
).show()

+------------+
|last(salary)|
+------------+
|        4100|
+------------+



# create_map() 
* is used to convert selected DataFrame columns to MapType

In [56]:

# Rows of (id, dept, salary, location) for the create_map examples.
data = [
    ("36636", "Finance", 3000, "USA"),
    ("40288", "Finance", 5000, "IND"),
    ("42114", "Sales", 3900, "USA"),
    ("39192", "Marketing", 2500, "CAN"),
    ("34534", "Sales", 6500, "USA"),
]
# salary is the only integer column; everything else is a string.
schema = StructType(
    [
        StructField("id", StringType(), True),
        StructField("dept", StringType(), True),
        StructField("salary", IntegerType(), True),
        StructField("location", StringType(), True),
    ]
)

df = spark.createDataFrame(data, schema)
df.show(truncate=False)

+-----+---------+------+--------+
|id   |dept     |salary|location|
+-----+---------+------+--------+
|36636|Finance  |3000  |USA     |
|40288|Finance  |5000  |IND     |
|42114|Sales    |3900  |USA     |
|39192|Marketing|2500  |CAN     |
|34534|Sales    |6500  |USA     |
+-----+---------+------+--------+



In [58]:
# Build a map column from literal keys paired with the salary and location values.
(
    df
    .withColumn(
        "propertiesMap",
        create_map(
            lit("salary"), col("salary"),
            lit("Location"), col("location"),
        ),
    )
    .show(truncate=False)
)

+-----+---------+------+--------+---------------------------------+
|id   |dept     |salary|location|propertiesMap                    |
+-----+---------+------+--------+---------------------------------+
|36636|Finance  |3000  |USA     |{salary -> 3000, Location -> USA}|
|40288|Finance  |5000  |IND     |{salary -> 5000, Location -> IND}|
|42114|Sales    |3900  |USA     |{salary -> 3900, Location -> USA}|
|39192|Marketing|2500  |CAN     |{salary -> 2500, Location -> CAN}|
|34534|Sales    |6500  |USA     |{salary -> 6500, Location -> USA}|
+-----+---------+------+--------+---------------------------------+



In [60]:
# Same map column, then drop the source columns that were folded into it.
(
    df
    .withColumn(
        "propertiesMap",
        create_map(
            lit("salary"), col("salary"),
            lit("Location"), col("location"),
        ),
    )
    .drop("location", "salary")
    .show(truncate=False)
)

+-----+---------+---------------------------------+
|id   |dept     |propertiesMap                    |
+-----+---------+---------------------------------+
|36636|Finance  |{salary -> 3000, Location -> USA}|
|40288|Finance  |{salary -> 5000, Location -> IND}|
|42114|Sales    |{salary -> 3900, Location -> USA}|
|39192|Marketing|{salary -> 2500, Location -> CAN}|
|34534|Sales    |{salary -> 6500, Location -> USA}|
+-----+---------+---------------------------------+



# map_keys() – 
* Get All Map Keys

In [4]:
# Each person carries a dict of properties; some values are None or an empty string.
dataDictionary = [
    ("James", {"hair": "black", "eye": "brown"}),
    ("Michael", {"hair": "brown", "eye": None}),
    ("Robert", {"hair": "red", "eye": "black"}),
    ("Washington", {"hair": "grey", "eye": "grey"}),
    ("Jefferson", {"hair": "brown", "eye": ""}),
]
df = spark.createDataFrame(dataDictionary, ["Name", "properties"])
df.show(truncate=False)

+----------+-----------------------------+
|Name      |properties                   |
+----------+-----------------------------+
|James     |{eye -> brown, hair -> black}|
|Michael   |{eye -> null, hair -> brown} |
|Robert    |{eye -> black, hair -> red}  |
|Washington|{eye -> grey, hair -> grey}  |
|Jefferson |{eye -> , hair -> brown}     |
+----------+-----------------------------+



In [5]:
# List the keys of each row's properties map.
df.select(
    col("Name"),
    map_keys(col("properties")),
).show()

+----------+--------------------+
|      Name|map_keys(properties)|
+----------+--------------------+
|     James|         [eye, hair]|
|   Michael|         [eye, hair]|
|    Robert|         [eye, hair]|
|Washington|         [eye, hair]|
| Jefferson|         [eye, hair]|
+----------+--------------------+



# map_values()
* Get All Map Values

In [6]:
# List the values of each row's properties map.
df.select(
    col("Name"),
    map_values(col("properties")),
).show()

+----------+----------------------+
|      Name|map_values(properties)|
+----------+----------------------+
|     James|        [brown, black]|
|   Michael|         [null, brown]|
|    Robert|          [black, red]|
|Washington|          [grey, grey]|
| Jefferson|             [, brown]|
+----------+----------------------+



# Struct()
* we can change the struct of the existing DataFrame and add a new StructType to it.

In [10]:

# Each row: a (first, middle, last) name tuple, then id, gender and salary.
structureData = [
    (("James", "", "Smith"), "36636", "M", 3100),
    (("Michael", "Rose", ""), "40288", "M", 4300),
    (("Robert", "", "Williams"), "42114", "M", 1400),
    (("Maria", "Anne", "Jones"), "39192", "F", 5500),
    (("Jen", "Mary", "Brown"), "", "F", -1),
]
# "name" is a nested struct; the remaining columns are flat.
structureSchema = StructType(
    [
        StructField(
            "name",
            StructType(
                [
                    StructField("firstname", StringType(), True),
                    StructField("middlename", StringType(), True),
                    StructField("lastname", StringType(), True),
                ]
            ),
        ),
        StructField("id", StringType(), True),
        StructField("gender", StringType(), True),
        StructField("salary", IntegerType(), True),
    ]
)

df2 = spark.createDataFrame(structureData, structureSchema)
df2.show(truncate=False)

+--------------------+-----+------+------+
|name                |id   |gender|salary|
+--------------------+-----+------+------+
|{James, , Smith}    |36636|M     |3100  |
|{Michael, Rose, }   |40288|M     |4300  |
|{Robert, , Williams}|42114|M     |1400  |
|{Maria, Anne, Jones}|39192|F     |5500  |
|{Jen, Mary, Brown}  |     |F     |-1    |
+--------------------+-----+------+------+



In [20]:
# Fold the flat id/gender/salary columns into one "OtherInfo" struct, adding a
# salary grade (LOW < 2000, MEDIUM < 4000, otherwise HIGH), then drop the flat columns.
# salary is already IntegerType in structureSchema, so no cast is needed.
updatedDF = (
    df2
    .withColumn(
        "OtherInfo",
        struct(
            col("id").alias("Identity"),
            col("gender").alias("SEX"),
            col("salary").alias("Kamai"),
            when(col("salary") < 2000, "LOW")
            .when(col("salary") < 4000, "MEDIUM")
            .otherwise("HIGH")
            .alias("Grade"),
        ),
    )
    .drop("id", "gender", "salary")
)
# show() returns None, so it is called separately to keep the DataFrame in updatedDF.
updatedDF.show(truncate=False)

+--------------------+------------------------+
|name                |OtherInfo               |
+--------------------+------------------------+
|{James, , Smith}    |{36636, M, 3100, MEDIUM}|
|{Michael, Rose, }   |{40288, M, 4300, HIGH}  |
|{Robert, , Williams}|{42114, M, 1400, LOW}   |
|{Maria, Anne, Jones}|{39192, F, 5500, HIGH}  |
|{Jen, Mary, Brown}  |{, F, -1, LOW}          |
+--------------------+------------------------+



## row_number Window Function
* row_number() window function is used to give the sequential row number starting from 1 to the result of each window partition.


In [21]:
# Employee rows (name, department, salary) for the window-function examples.
simpleData = (
    ("James", "Sales", 3000),
    ("Michael", "Sales", 4600),
    ("Robert", "Sales", 4100),
    ("Maria", "Finance", 3000),
    ("James", "Sales", 3000),
    ("Scott", "Finance", 3300),
    ("Jen", "Finance", 3900),
    ("Jeff", "Marketing", 3000),
    ("Kumar", "Marketing", 2000),
    ("Saif", "Sales", 4100),
)

columns = ["employee_name", "department", "salary"]
df = spark.createDataFrame(simpleData, columns)
df.show(truncate=False)

+-------------+----------+------+
|employee_name|department|salary|
+-------------+----------+------+
|James        |Sales     |3000  |
|Michael      |Sales     |4600  |
|Robert       |Sales     |4100  |
|Maria        |Finance   |3000  |
|James        |Sales     |3000  |
|Scott        |Finance   |3300  |
|Jen          |Finance   |3900  |
|Jeff         |Marketing |3000  |
|Kumar        |Marketing |2000  |
|Saif         |Sales     |4100  |
+-------------+----------+------+



In [23]:
from pyspark.sql.window import Window

# Window spec: one partition per department, ordered by ascending salary.
# The ranking, ntile, lag and lead cells below reuse it.
ws = (
    Window
    .partitionBy("department")
    .orderBy("salary")
)
(
    df
    .withColumn("row_number", row_number().over(ws))
    .show(truncate=False)
)

+-------------+----------+------+----------+
|employee_name|department|salary|row_number|
+-------------+----------+------+----------+
|Maria        |Finance   |3000  |1         |
|Scott        |Finance   |3300  |2         |
|Jen          |Finance   |3900  |3         |
|Kumar        |Marketing |2000  |1         |
|Jeff         |Marketing |3000  |2         |
|James        |Sales     |3000  |1         |
|James        |Sales     |3000  |2         |
|Robert       |Sales     |4100  |3         |
|Saif         |Sales     |4100  |4         |
|Michael      |Sales     |4600  |5         |
+-------------+----------+------+----------+



# rank Window Function
* rank() window function is used to provide a rank to the result within a window partition.

In [24]:
# rank(): tied salaries share a rank and the following rank is skipped.
(
    df
    .withColumn("rank", rank().over(ws))
    .show()
)

+-------------+----------+------+----+
|employee_name|department|salary|rank|
+-------------+----------+------+----+
|        Maria|   Finance|  3000|   1|
|        Scott|   Finance|  3300|   2|
|          Jen|   Finance|  3900|   3|
|        Kumar| Marketing|  2000|   1|
|         Jeff| Marketing|  3000|   2|
|        James|     Sales|  3000|   1|
|        James|     Sales|  3000|   1|
|       Robert|     Sales|  4100|   3|
|         Saif|     Sales|  4100|   3|
|      Michael|     Sales|  4600|   5|
+-------------+----------+------+----+



# dense_rank Window Function
* dense_rank() window function returns the rank of rows within a window partition without any gaps. It is similar to the rank() function; the difference is that rank() leaves gaps in the ranking when there are ties.



In [25]:
# dense_rank(): tied salaries share a rank and no rank is skipped.
# The output column keeps the name "rank".
(
    df
    .withColumn("rank", dense_rank().over(ws))
    .show()
)

+-------------+----------+------+----+
|employee_name|department|salary|rank|
+-------------+----------+------+----+
|        Maria|   Finance|  3000|   1|
|        Scott|   Finance|  3300|   2|
|          Jen|   Finance|  3900|   3|
|        Kumar| Marketing|  2000|   1|
|         Jeff| Marketing|  3000|   2|
|        James|     Sales|  3000|   1|
|        James|     Sales|  3000|   1|
|       Robert|     Sales|  4100|   2|
|         Saif|     Sales|  4100|   2|
|      Michael|     Sales|  4600|   3|
+-------------+----------+------+----+



## percent_rank Window Function

In [27]:
# percent_rank(): relative rank of each row within its department.
(
    df
    .withColumn("percent_rank", percent_rank().over(ws))
    .show()
)

+-------------+----------+------+------------+
|employee_name|department|salary|percent_rank|
+-------------+----------+------+------------+
|        Maria|   Finance|  3000|         0.0|
|        Scott|   Finance|  3300|         0.5|
|          Jen|   Finance|  3900|         1.0|
|        Kumar| Marketing|  2000|         0.0|
|         Jeff| Marketing|  3000|         1.0|
|        James|     Sales|  3000|         0.0|
|        James|     Sales|  3000|         0.0|
|       Robert|     Sales|  4100|         0.5|
|         Saif|     Sales|  4100|         0.5|
|      Michael|     Sales|  4600|         1.0|
+-------------+----------+------+------------+



# ntile Window Function
* ntile(n) window function splits the ordered rows of each window partition into n buckets and returns the bucket number (from 1 to n) for each row. In the example below, 2 is passed to ntile, so every row is assigned to bucket 1 or 2.



In [29]:
# Split each department's ordered rows into 2 buckets.
(
    df
    .withColumn("ntile", ntile(2).over(ws))
    .show()
)

+-------------+----------+------+-----+
|employee_name|department|salary|ntile|
+-------------+----------+------+-----+
|        Maria|   Finance|  3000|    1|
|        Scott|   Finance|  3300|    1|
|          Jen|   Finance|  3900|    2|
|        Kumar| Marketing|  2000|    1|
|         Jeff| Marketing|  3000|    2|
|        James|     Sales|  3000|    1|
|        James|     Sales|  3000|    1|
|       Robert|     Sales|  4100|    1|
|         Saif|     Sales|  4100|    2|
|      Michael|     Sales|  4600|    2|
+-------------+----------+------+-----+



In [31]:
# Ask for 30 buckets, more than any department has rows.
(
    df
    .withColumn("ntile", ntile(30).over(ws))
    .show()
)

+-------------+----------+------+-----+
|employee_name|department|salary|ntile|
+-------------+----------+------+-----+
|        Maria|   Finance|  3000|    1|
|        Scott|   Finance|  3300|    2|
|          Jen|   Finance|  3900|    3|
|        Kumar| Marketing|  2000|    1|
|         Jeff| Marketing|  3000|    2|
|        James|     Sales|  3000|    1|
|        James|     Sales|  3000|    2|
|       Robert|     Sales|  4100|    3|
|         Saif|     Sales|  4100|    4|
|      Michael|     Sales|  4600|    5|
+-------------+----------+------+-----+



# lag Window Function

In [33]:
# lag(): salary from two rows earlier in the same window (null when there is none).
(
    df
    .withColumn("lag", lag("salary", 2).over(ws))
    .show()
)

+-------------+----------+------+----+
|employee_name|department|salary| lag|
+-------------+----------+------+----+
|        Maria|   Finance|  3000|null|
|        Scott|   Finance|  3300|null|
|          Jen|   Finance|  3900|3000|
|        Kumar| Marketing|  2000|null|
|         Jeff| Marketing|  3000|null|
|        James|     Sales|  3000|null|
|        James|     Sales|  3000|null|
|       Robert|     Sales|  4100|3000|
|         Saif|     Sales|  4100|3000|
|      Michael|     Sales|  4600|4100|
+-------------+----------+------+----+



# lead Window Function

In [34]:
# lead(): salary from two rows later in the same window (null when there is none).
(
    df
    .withColumn("lead", lead("salary", 2).over(ws))
    .show()
)

+-------------+----------+------+----+
|employee_name|department|salary|lead|
+-------------+----------+------+----+
|        Maria|   Finance|  3000|3900|
|        Scott|   Finance|  3300|null|
|          Jen|   Finance|  3900|null|
|        Kumar| Marketing|  2000|null|
|         Jeff| Marketing|  3000|null|
|        James|     Sales|  3000|4100|
|        James|     Sales|  3000|4100|
|       Robert|     Sales|  4100|4600|
|         Saif|     Sales|  4100|null|
|      Michael|     Sales|  4600|null|
+-------------+----------+------+----+

