## 取当前日期 current_date

In [1]:
from pyspark.sql.functions import current_date

# Three-row frame (ids 0..2) with the current date attached to every row.
(
    spark.range(3)
    .withColumn('date', current_date())
    .show()
)

+---+----------+
| id|      date|
+---+----------+
|  0|2021-08-16|
|  1|2021-08-16|
|  2|2021-08-16|
+---+----------+



## 获取当前日期和时间 current_timestamp

In [2]:
from pyspark.sql.functions import current_timestamp

# Same three-row frame, but the added column carries date *and* time.
# show() truncates long cells by default, hence the "..." in the output.
(
    spark.range(3)
    .withColumn('date', current_timestamp())
    .show()
)

+---+--------------------+
| id|                date|
+---+--------------------+
|  0|2021-08-16 10:15:...|
|  1|2021-08-16 10:15:...|
|  2|2021-08-16 10:15:...|
+---+--------------------+



## 日期格式转换 date_format

In [3]:
from pyspark.sql.functions import date_format

df = spark.createDataFrame([('2015-04-08', )], ['a'])

# Render column `a` as month/day/year. The year field uses the conventional
# four-letter 'yyyy' pattern (the previous 'yyy' was a typo; it only pads the
# year to three digits, which happens to look the same for 4-digit years).
df.select(date_format('a', 'MM/dd/yyyy').alias('date')).show()

+----------+
|      date|
+----------+
|04/08/2015|
+----------+



## 字符串转日期 to_date / to_timestamp

In [4]:
from pyspark.sql.functions import to_date, to_timestamp

# A single sample row holding a date-time string; every conversion below
# reads from this same frame.
df = spark.createDataFrame([('1997-02-28 10:30:00', )], ['t'])

# 1. String -> date (the time-of-day part is dropped).
#    Equivalent collect() result: [Row(date=datetime.date(1997, 2, 28))]
df.select(to_date('t').alias('date')).show()

# 2. String -> timestamp (date plus time-of-day).
#    Equivalent collect() result: [Row(dt=datetime.datetime(1997, 2, 28, 10, 30))]
df.select(to_timestamp('t').alias('dt')).show()

# 3. String -> timestamp with an explicit parsing pattern.
#    Equivalent collect() result: [Row(dt=datetime.datetime(1997, 2, 28, 10, 30))]
df.select(
    to_timestamp('t', 'yyyy-MM-dd HH:mm:ss').alias('dt')
).show()

+----------+
|      date|
+----------+
|1997-02-28|
+----------+

+-------------------+
|                 dt|
+-------------------+
|1997-02-28 10:30:00|
+-------------------+

+-------------------+
|                 dt|
+-------------------+
|1997-02-28 10:30:00|
+-------------------+



## 获取日期中的年月日

In [None]:
from pyspark.sql.functions import year, month, dayofmonth

df = spark.createDataFrame([('2015-04-08', )], ['a'])

# Split the date string in column `a` into its year, month and
# day-of-month components, one output column each.
df.select(year(df.a).alias('year'),
          month(df.a).alias('month'),
          dayofmonth(df.a).alias('day')).show()

## 获取时分秒

In [5]:
from pyspark.sql.functions import hour, minute, second

df = spark.createDataFrame([('2015-04-08 13:08:15', )], ['a'])

# Pull the time-of-day components out of the date-time string in column `a`.
df.select(hour(df.a).alias('hour'),
          minute(df.a).alias('minute'),
          second(df.a).alias('second')).show()

+----+------+------+
|hour|minute|second|
+----+------+------+
|  13|     8|    15|
+----+------+------+



## 获取日期对应的季度

In [6]:
from pyspark.sql.functions import quarter

df = spark.createDataFrame([('2015-04-08', )], ['a'])

# Quarter of the year for the date in column `a` (April -> 2 in the output).
df.select(
    quarter(df.a).alias('quarter')
).show()

+-------+
|quarter|
+-------+
|      2|
+-------+



## 日期加减 date_add 和 date_sub

In [7]:
from pyspark.sql.functions import date_add, date_sub

df = spark.createDataFrame([('2015-04-08', )], ['d'])

# Shift the date in column `d` one day forward and one day backward.
df.select(
    date_add('d', 1).alias('d-add'),
    date_sub('d', 1).alias('d-sub'),
).show()

+----------+----------+
|     d-add|     d-sub|
+----------+----------+
|2015-04-09|2015-04-07|
+----------+----------+



## add_months 月份加减

In [8]:
from pyspark.sql.functions import add_months

df = spark.createDataFrame([('2015-04-08', )], ['d'])

# Move the date in column `d` forward by one calendar month.
df.select(
    add_months('d', 1).alias('d')
).show()

+----------+
|         d|
+----------+
|2015-05-08|
+----------+



## 日期差、月份差 datediff / months_between

In [9]:
from pyspark.sql.functions import datediff, months_between

# 1. Difference in days: datediff(end, start) counts days from d1 to d2.
df = spark.createDataFrame([('2015-04-08', '2015-05-10')], ['d1', 'd2'])
df.select(
    datediff('d2', 'd1').alias('diff')
).show()

# 2. Difference in months: the result is fractional when the two values do
#    not fall on the same day of the month (see the output below).
df = spark.createDataFrame([('1997-02-28 10:30:00', '1996-10-30')], ['t', 'd'])
df.select(
    months_between('t', 'd').alias('months')
).show()

+----+
|diff|
+----+
|  32|
+----+

+----------+
|    months|
+----------+
|3.94959677|
+----------+



## 计算下一个指定星期几的日期 next_day
计算给定日期之后第一个指定星期几（周一至周日）所对应的具体日期，属于实用函数。

In [10]:
from pyspark.sql.functions import next_day

# Day-of-week argument values:
# "Mon", "Tue", "Wed", "Thu", "Fri", "Sat", "Sun".
df = spark.createDataFrame([('2015-07-27', )], ['d'])

# First Sunday that falls after the date in column `d`.
df.select(
    next_day('d', 'Sun').alias('date')
).show()

+----------+
|      date|
+----------+
|2015-08-02|
+----------+



## 本月的最后一个日期

In [11]:
from pyspark.sql.functions import last_day

df = spark.createDataFrame([('1997-02-10', )], ['d'])

# Last calendar day of the month that the date in column `d` belongs to.
df.select(
    last_day('d').alias('date')
).show()

+----------+
|      date|
+----------+
|1997-02-28|
+----------+



## 另一个示例

In [None]:
# Import everything this cell uses so it runs on a fresh kernel. `col`,
# `trunc`, `weekofyear`, `dayofweek` and `dayofyear` were not imported by any
# earlier cell, and the remaining names previously leaked in from cells above.
from pyspark.sql.functions import (
    add_months, col, current_date, current_timestamp, date_add, date_format,
    date_sub, datediff, dayofmonth, dayofweek, dayofyear, hour, minute, month,
    months_between, next_day, second, to_date, to_timestamp, trunc,
    weekofyear, year)

# Sample frame: an id plus a date string in yyyy-MM-dd form.
data = [["1", "2020-02-01"], ["2", "2019-03-01"], ["3", "2021-03-01"]]
df = spark.createDataFrame(data, ["id", "input"])
df.show()

# current_date(): show a single row, since every row carries the same value.
df.select(current_date().alias("current_date")).show(1)

# date_format(): re-render the date string as MM-dd-yyyy.
df.select(col("input"),
          date_format(col("input"), "MM-dd-yyyy").alias("date_format")).show()

# to_date(): parse the string with an explicit pattern. The year field uses
# the conventional four-letter 'yyyy' (the previous 'yyy' was a typo).
df.select(col("input"),
          to_date(col("input"), "yyyy-MM-dd").alias("to_date")).show()

# datediff(): days elapsed between each input date and today.
df.select(col("input"),
          datediff(current_date(), col("input")).alias("datediff")).show()

# months_between(): months elapsed between each input date and today.
df.select(col("input"),
          months_between(current_date(),
                         col("input")).alias("months_between")).show()

# trunc(): truncate to the first day of the month / of the year.
# Each truncation appears once and is labelled after the unit it truncates to
# (the year column used to be called "Month_Year", and the month column was
# emitted twice under the same name).
df.select(col("input"),
          trunc(col("input"), "Month").alias("Month_Trunc"),
          trunc(col("input"), "Year").alias("Year_Trunc")).show()

# add_months(), date_add(), date_sub(): a negative month count subtracts.
df.select(col("input"),
          add_months(col("input"), 3).alias("add_months"),
          add_months(col("input"), -3).alias("sub_months"),
          date_add(col("input"), 4).alias("date_add"),
          date_sub(col("input"), 4).alias("date_sub")).show()

# year(), month(), next_day(), weekofyear()
df.select(col("input"),
          year(col("input")).alias("year"),
          month(col("input")).alias("month"),
          next_day(col("input"), "Sunday").alias("next_day"),
          weekofyear(col("input")).alias("weekofyear")).show()

# dayofweek(), dayofmonth(), dayofyear()
df.select(
    col("input"),
    dayofweek(col("input")).alias("dayofweek"),
    dayofmonth(col("input")).alias("dayofmonth"),
    dayofyear(col("input")).alias("dayofyear"),
).show()

# Second sample frame: timestamps written as space-separated fields
# (MM-dd-yyyy HH mm ss SSS), which need an explicit pattern to parse.
data = [["1", "02-01-2020 11 01 19 06"], ["2", "03-01-2019 12 01 19 406"],
        ["3", "03-01-2021 12 01 19 406"]]
df2 = spark.createDataFrame(data, ["id", "input"])
df2.show(truncate=False)

# current_timestamp(): truncate=False keeps the full value visible.
df2.select(current_timestamp().alias("current_timestamp")).show(1,
                                                                truncate=False)

# to_timestamp(): parse the custom layout with a matching pattern.
df2.select(
    col("input"),
    to_timestamp(
        col("input"),
        "MM-dd-yyyy HH mm ss SSS").alias("to_timestamp")).show(truncate=False)

# hour(), minute(), second(): third sample frame with timestamp strings in
# yyyy-MM-dd HH:mm:ss.SSS form, passed to the extractors as plain strings.
data = [["1", "2020-02-01 11:01:19.06"], ["2", "2019-03-01 12:01:19.406"],
        ["3", "2021-03-01 12:01:19.406"]]
df3 = spark.createDataFrame(data, ["id", "input"])

df3.select(col("input"),
           hour(col("input")).alias("hour"),
           minute(col("input")).alias("minute"),
           second(col("input")).alias("second")).show(truncate=False)