### 导入依赖并创建 SparkSession

In [2]:
from pyspark.sql import SparkSession
from pyspark.conf import SparkConf
# The same JVM option is applied to both the driver and the executors.
# NOTE(review): this looks like the Netty reflection flag that Arrow-based pandas
# functions need on Java 11+ — confirm against the Spark version in use.
netty_reflection_flag = "-Dio.netty.tryReflectionSetAccessible=true"
conf = SparkConf().setAll([
    ("spark.driver.extraJavaOptions", netty_reflection_flag),
    ("spark.executor.extraJavaOptions", netty_reflection_flag),
])
# Build the session from the configuration above (or pick up an existing one).
spark = SparkSession.builder.config(conf=conf).getOrCreate()

In [3]:
from pyspark.sql import functions as f
from pyspark.sql import types as t
from pyspark.sql import Window as w
from pyspark.sql.functions import pandas_udf, PandasUDFType
import numpy as np
import datetime

### 建立测试数据

In [4]:
# Minute offsets (within 2021-01-01 00:xx) at which each series has an observation.
sequence = [1, 2, 4, 7, 11, 16, 22]
sequence2 = [3, 5, 6, 8, 10, 18, 20]

# Seeded generator so that Restart & Run All regenerates exactly the same sample values.
rng = np.random.default_rng(42)


def _series_rows(series_id, minutes):
    """Build the (id, time, value) rows of one example series.

    One row is produced per minute offset; the value is the squared offset
    scaled by a uniform random draw from [0, 1).
    """
    return [
        (
            series_id,
            datetime.datetime(2021, 1, 1, 0, minute),
            float(minute**2 * rng.uniform(0, 1)),
        )
        for minute in minutes
    ]


# create two example time series with different ticks and inconsistent spacing
df = spark.createDataFrame(
  _series_rows('series 1', sequence) + _series_rows('series 2', sequence2),
  schema=t.StructType([
    t.StructField('id', t.StringType()),
    t.StructField('time', t.TimestampType()),
    t.StructField('value', t.FloatType())
  ])
)

In [5]:
# Print the generated rows as a text table to sanity-check both example series.
df.show()

+--------+-------------------+-----------+
|      id|               time|      value|
+--------+-------------------+-----------+
|series 1|2021-01-01 00:01:00|  0.2630323|
|series 1|2021-01-01 00:02:00|   2.379535|
|series 1|2021-01-01 00:04:00|    8.14125|
|series 1|2021-01-01 00:07:00|  12.041094|
|series 1|2021-01-01 00:11:00|  43.983074|
|series 1|2021-01-01 00:16:00|  100.91391|
|series 1|2021-01-01 00:22:00|  240.42404|
|series 2|2021-01-01 00:03:00|  5.8425837|
|series 2|2021-01-01 00:05:00|   9.124744|
|series 2|2021-01-01 00:06:00|   8.709988|
|series 2|2021-01-01 00:08:00|0.100244395|
|series 2|2021-01-01 00:10:00|  33.438625|
|series 2|2021-01-01 00:18:00|  51.383762|
|series 2|2021-01-01 00:20:00|  24.027822|
+--------+-------------------+-----------+



### 数据清洗：为每个序列生成间隔一分钟的规则时间刻度，并与原始数据左连接，保证时间点不缺失

In [6]:
# Upsample every series onto a regular one-minute grid spanning its first to last tick.
series_bounds = df.groupBy('id').agg(
  f.min('time').alias('min_time'),
  f.max('time').alias('max_time'),
)
one_minute = f.lit('1 minute').cast('Interval')
minute_grid = series_bounds.select(
  'id',
  f.explode(f.sequence('min_time', 'max_time', one_minute)).alias('time'),
)
# Left join keeps every grid tick; ticks without an observation end up with a null value.
regular_ticks = minute_grid.join(df, on=['id', 'time'], how='left')

In [8]:
# Print up to 200 rows of the upsampled frame; ticks with no observation show a null value.
regular_ticks.show(200)

+--------+-------------------+-----------+
|      id|               time|      value|
+--------+-------------------+-----------+
|series 2|2021-01-01 00:19:00|       null|
|series 1|2021-01-01 00:22:00|  240.42404|
|series 2|2021-01-01 00:16:00|       null|
|series 1|2021-01-01 00:12:00|       null|
|series 1|2021-01-01 00:14:00|       null|
|series 2|2021-01-01 00:18:00|  51.383762|
|series 2|2021-01-01 00:05:00|   9.124744|
|series 1|2021-01-01 00:19:00|       null|
|series 1|2021-01-01 00:05:00|       null|
|series 1|2021-01-01 00:10:00|       null|
|series 2|2021-01-01 00:09:00|       null|
|series 2|2021-01-01 00:20:00|  24.027822|
|series 2|2021-01-01 00:10:00|  33.438625|
|series 1|2021-01-01 00:21:00|       null|
|series 1|2021-01-01 00:13:00|       null|
|series 1|2021-01-01 00:03:00|       null|
|series 2|2021-01-01 00:06:00|   8.709988|
|series 1|2021-01-01 00:17:00|       null|
|series 2|2021-01-01 00:14:00|       null|
|series 1|2021-01-01 00:07:00|  12.041094|
|series 1|2

### 空值填充：如果遇到空值，用上一个非空值

In [9]:
# fill method 1: carry the last known value forward

# Per-series window, walked in ascending time order (null timestamps sorted last).
series_in_time_order = w.partitionBy('id').orderBy(f.col('time').asc_nulls_last())

# Most recent non-null 'value' seen so far within the window.
last_known_value = f.last('value', ignorenulls=True).over(series_in_time_order)

fill_with_last_known = regular_ticks.withColumn('value_interp', last_known_value)

In [10]:
# Print up to 200 rows to compare the raw 'value' column with the forward-filled 'value_interp'.
fill_with_last_known.show(200)

+--------+-------------------+-----------+------------+
|      id|               time|      value|value_interp|
+--------+-------------------+-----------+------------+
|series 2|2021-01-01 00:03:00|  5.8425837|   5.8425837|
|series 2|2021-01-01 00:04:00|       null|   5.8425837|
|series 2|2021-01-01 00:05:00|   9.124744|    9.124744|
|series 2|2021-01-01 00:06:00|   8.709988|    8.709988|
|series 2|2021-01-01 00:07:00|       null|    8.709988|
|series 2|2021-01-01 00:08:00|0.100244395| 0.100244395|
|series 2|2021-01-01 00:09:00|       null| 0.100244395|
|series 2|2021-01-01 00:10:00|  33.438625|   33.438625|
|series 2|2021-01-01 00:11:00|       null|   33.438625|
|series 2|2021-01-01 00:12:00|       null|   33.438625|
|series 2|2021-01-01 00:13:00|       null|   33.438625|
|series 2|2021-01-01 00:14:00|       null|   33.438625|
|series 2|2021-01-01 00:15:00|       null|   33.438625|
|series 2|2021-01-01 00:16:00|       null|   33.438625|
|series 2|2021-01-01 00:17:00|       null|   33.

### 线性插值

In [11]:
# fill method 2: linear interpolation

def interpolate(timestamp_col, method=None, **kwargs):
    """
    Build a per-group pandas function that fills missing values in a time series.

    The returned callable is meant to be applied to grouped data, where each
    group is one time series and the timestamp column is already regularly
    spaced.

    Example usage:
        month_series.groupBy(series_id).applyInPandas(
            interpolate("month", {'method': 'linear'}), month_series.schema)
    ``series_id`` here refers to the key/keys of the time series WITHOUT the
    time column.

    Parameters
    ----------
    timestamp_col : str
        Name of the timestamp column; rows are sorted by it before filling.
    method : dict, optional
        Fill options. The ``'method'`` entry selects the strategy:
        ``'fill_zero'`` replaces every missing value with 0; any other value
        (pad / fill forward, linear, nearest, and other scipy methods) is
        forwarded, together with the remaining entries of the dict, to
        ``pandas.DataFrame.interpolate``. Defaults to ``{'method': 'fill_zero'}``.
    **kwargs
        Accepted for backward compatibility; currently unused.

    Returns
    -------
    callable
        A function ``pandas.DataFrame -> pandas.DataFrame``. The result has the
        timestamp column first, followed by the other columns in input order.
        The input frame is left unmodified.
    """
    # Snapshot the options: avoids a shared mutable default and makes the
    # closure immune to the caller mutating its dict afterwards.
    options = {'method': 'fill_zero'} if method is None else dict(method)

    def _fill_group(pdf):
        # Work on new frames instead of mutating the caller's frame in place.
        ordered = pdf.set_index(timestamp_col).sort_index(axis=0)
        if options['method'] == 'fill_zero':
            filled = ordered.fillna(0)
        else:
            filled = ordered
            # Interpolate numeric columns only: running DataFrame.interpolate
            # over object/string columns (e.g. the series id) is deprecated
            # in recent pandas releases.
            numeric_cols = ordered.select_dtypes(include='number').columns
            if len(numeric_cols) > 0:
                filled = ordered.copy()
                filled[numeric_cols] = ordered[numeric_cols].interpolate(**options)
            # Forward-fill whatever is still missing (in any column).
            filled = filled.ffill()
        return filled.reset_index(drop=False)

    return _fill_group

# Linear fill applied to each series independently; the result keeps the input schema.
fill_linear = interpolate('time', {'method': 'linear'})
linear_interpolate = regular_ticks.groupBy('id').applyInPandas(
  fill_linear,
  schema=regular_ticks.schema,
)

In [12]:
# Print up to 200 rows of the linearly interpolated series.
linear_interpolate.show(200)

+--------+-------------------+-----------+
|      id|               time|      value|
+--------+-------------------+-----------+
|series 2|2021-01-01 00:03:00|  5.8425837|
|series 2|2021-01-01 00:04:00|   7.483664|
|series 2|2021-01-01 00:05:00|   9.124744|
|series 2|2021-01-01 00:06:00|   8.709988|
|series 2|2021-01-01 00:07:00|   4.405116|
|series 2|2021-01-01 00:08:00|0.100244395|
|series 2|2021-01-01 00:09:00|  16.769434|
|series 2|2021-01-01 00:10:00|  33.438625|
|series 2|2021-01-01 00:11:00|  35.681767|
|series 2|2021-01-01 00:12:00|   37.92491|
|series 2|2021-01-01 00:13:00|  40.168053|
|series 2|2021-01-01 00:14:00|  42.411194|
|series 2|2021-01-01 00:15:00|  44.654335|
|series 2|2021-01-01 00:16:00|  46.897476|
|series 2|2021-01-01 00:17:00|   49.14062|
|series 2|2021-01-01 00:18:00|  51.383762|
|series 2|2021-01-01 00:19:00|   37.70579|
|series 2|2021-01-01 00:20:00|  24.027822|
|series 1|2021-01-01 00:01:00|  0.2630323|
|series 1|2021-01-01 00:02:00|   2.379535|
|series 1|2

### 二次插值

In [13]:
# fill method 3: quadratic interpolation, applied to each series independently
fill_quadratic = interpolate('time', {'method': 'quadratic'})
quadratic_interpolate = regular_ticks.groupBy('id').applyInPandas(
  fill_quadratic,
  schema=regular_ticks.schema,
)

In [14]:
# Print the first rows of the quadratically interpolated series.
quadratic_interpolate.show()

+--------+-------------------+-----------+
|      id|               time|      value|
+--------+-------------------+-----------+
|series 2|2021-01-01 00:03:00|  5.8425837|
|series 2|2021-01-01 00:04:00|   7.918719|
|series 2|2021-01-01 00:05:00|   9.124744|
|series 2|2021-01-01 00:06:00|   8.709988|
|series 2|2021-01-01 00:07:00|  2.1704123|
|series 2|2021-01-01 00:08:00|0.100244395|
|series 2|2021-01-01 00:09:00|  13.844383|
|series 2|2021-01-01 00:10:00|  33.438625|
|series 2|2021-01-01 00:11:00|   48.91877|
|series 2|2021-01-01 00:12:00|  60.284813|
|series 2|2021-01-01 00:13:00|   67.53676|
|series 2|2021-01-01 00:14:00|   70.67461|
|series 2|2021-01-01 00:15:00|  70.279526|
|series 2|2021-01-01 00:16:00|  66.932686|
|series 2|2021-01-01 00:17:00|    60.6341|
|series 2|2021-01-01 00:18:00|  51.383762|
|series 2|2021-01-01 00:19:00|  39.181667|
|series 2|2021-01-01 00:20:00|  24.027822|
|series 1|2021-01-01 00:01:00|  0.2630323|
|series 1|2021-01-01 00:02:00|   2.379535|
+--------+-