In [77]:
import findspark



In [78]:
# Run before the pyspark imports in the next cell: findspark locates the local
# Spark installation and makes the pyspark package importable.
findspark.init()

In [1]:
from pyspark.sql.functions import unix_timestamp, from_unixtime
from pyspark.sql import SparkSession,Window,DataFrame
import datetime
from pyspark.sql.functions import  col, lit, udf, datediff, lead, explode
from pyspark.sql.types import StringType,BooleanType,DateType,LongType,ArrayType
from typing import List
from pyspark.sql.functions import to_date 

# Shared session for every cell below; getOrCreate() reuses an already-running
# session instead of starting a second one.
spark = (
    SparkSession.builder
    .appName('SparkByExamples.com')
    .getOrCreate()
)

In [12]:
from pyspark.sql.functions import unix_timestamp, from_unixtime
# Sample timestamps packed as MMddyyyyHHmmss (month, day, year, hour, minute, second).
df = spark.createDataFrame(
    [("11251991123445",), ("11241991123445",), ("11301991123445",)],
    ['date_str']
)

# 'HH' (hour-of-day, 0-23) replaces the original 'hh' (clock-hour, 1-12): the
# input has no AM/PM marker, so the 12-hour field cannot represent afternoon
# values correctly.
# The output pattern is passed to from_unixtime so the 'yyyy-MM-dd' string is
# produced directly rather than by slicing the default 'yyyy-MM-dd HH:mm:ss'
# rendering. The `date` column is still a string, as before.
df2 = df.select(
    'date_str',
    from_unixtime(
        unix_timestamp('date_str', 'MMddyyyyHHmmss'),
        'yyyy-MM-dd',
    ).alias('date')
)

In [6]:
# Check the column types: `date` is a formatted string here, not a DateType column.
df2.printSchema()

root
 |-- date_str: string (nullable = true)
 |-- date: string (nullable = true)



In [13]:
# Display the raw timestamp strings next to the date extracted from each.
df2.show()

+--------------+----------+
|      date_str|      date|
+--------------+----------+
|11251991123445|1991-11-25|
|11241991123445|1991-11-24|
|11301991123445|1991-11-30|
+--------------+----------+



In [80]:
# Job start dates written day-first (dd-MM-yyyy), each with a salary value.
# 3 and 6 January are deliberately absent from the sequence.
simpleData = [
    ("01-01-2006", 1),
    ("02-01-2006", 2),
    ("04-01-2006", 3),
    ("05-01-2006", 4),
    ("07-01-2006", 5),
]

columns = ["jobStartDate", "salary"]
df = spark.createDataFrame(data=simpleData, schema=columns)
df.printSchema()
df.show(truncate=False)

# Parse the strings with an explicit pattern. 'MM' is month-of-year; the
# lowercase 'mm' used originally is minute-of-hour, which only appeared to work
# because every sample row falls in January (the month silently defaulted to 01).
# NOTE(review): a plain .cast(DateType()) is not used because the strings are
# not in ISO yyyy-MM-dd order.
df1 = df.select(
    to_date(col("jobStartDate"), 'dd-MM-yyyy').alias('jobStartDate'),
    "salary",
)
df1.printSchema()
df1.show(truncate=False)

root
 |-- jobStartDate: string (nullable = true)
 |-- salary: long (nullable = true)

+------------+------+
|jobStartDate|salary|
+------------+------+
|01-01-2006  |1     |
|02-01-2006  |2     |
|04-01-2006  |3     |
|05-01-2006  |4     |
|07-01-2006  |5     |
+------------+------+

root
 |-- jobStartDate: date (nullable = true)
 |-- salary: long (nullable = true)

+------------+------+
|jobStartDate|salary|
+------------+------+
|2006-01-01  |1     |
|2006-01-02  |2     |
|2006-01-04  |3     |
|2006-01-05  |4     |
|2006-01-07  |5     |
+------------+------+



In [81]:
def _get_next_dates(start_date, diff):
    """Return the dates strictly between ``start_date`` and ``start_date + diff`` days.

    Args:
        start_date: A ``datetime.date`` (or None when the source column is null).
        diff: Number of days from ``start_date`` to the next known date
            (or None when there is no next row).

    Returns:
        A list of ``start_date + 1 day`` .. ``start_date + (diff - 1) days``.
        The list is empty when ``diff`` is 1 or less, or when either argument
        is None, so a null input cannot crash the call with a TypeError.
    """
    # lead() yields null on the last row of the window; tolerate that here so the
    # helper does not rely on callers filtering nulls out first.
    if start_date is None or diff is None:
        return []
    return [start_date + datetime.timedelta(days=days) for days in range(1, diff)]

# Expose the Python helper to Spark; it returns an array of dates per row.
get_next_dates_udf = udf(_get_next_dates, ArrayType(DateType()))

# Order all rows by date so lead() can look at the following row.
# NOTE: no partitionBy is given, so Spark evaluates this window on a single
# partition -- fine for this small sample, costly on large data.
window = Window.orderBy("jobStartDate")

# _diff = number of days from each row's date to the next row's date
# (null on the last row, which has no successor).
df3 = df1.withColumn("_diff", datediff(lead("jobStartDate", 1).over(window), "jobStartDate"))

# Keep only rows followed by a gap (> 1 day) and expand each into one row per
# missing date. Built from df3 so the _diff expression is not repeated.
# salary is a long literal so the filler rows keep df1's column type; the
# original string literal "0" silently turned the column into a string.
df2 = (
    df3
    .filter(col("_diff") > 1)
    .withColumn("_next_dates", get_next_dates_udf("jobStartDate", "_diff"))
    .withColumn("salary", lit(0).cast(LongType()))
    .withColumn("jobStartDate", explode("_next_dates"))
    .drop("_diff", "_next_dates")
)

In [82]:
# Filler rows: one per date missing from df1, each with salary 0.
df2.show()

+------------+------+
|jobStartDate|salary|
+------------+------+
|  2006-01-03|     0|
|  2006-01-06|     0|
+------------+------+



In [83]:
# Original rows with _diff, the day gap to the next row (null on the last row).
df3.show()

+------------+------+-----+
|jobStartDate|salary|_diff|
+------------+------+-----+
|  2006-01-01|     1|    1|
|  2006-01-02|     2|    2|
|  2006-01-04|     3|    1|
|  2006-01-05|     4|    2|
|  2006-01-07|     5| null|
+------------+------+-----+

