In [None]:
import pyspark
from pyspark.sql import SparkSession

In [None]:
# Obtain the Spark session used by every later cell; getOrCreate() hands back
# an already-running session when one exists instead of starting a second one.
spark = (
    SparkSession.builder
    .appName('TimeSeries Analysis with PySpark')
    .getOrCreate()
)

In [None]:
# Load the server-utilization records from JSON into a DataFrame and preview
# them. The path points at a mounted Google Drive folder, so the drive has to
# be mounted before this cell runs.
data_path = '/content/drive/MyDrive/Data/'
file_path = f'{data_path}utilization.json'
df = spark.read.json(path=file_path)
df.show()

+---------------+-------------------+-----------+---------+-------------+
|cpu_utilization|     event_datetime|free_memory|server_id|session_count|
+---------------+-------------------+-----------+---------+-------------+
|           0.57|03/05/2019 08:06:14|       0.51|      100|           47|
|           0.47|03/05/2019 08:11:14|       0.62|      100|           43|
|           0.56|03/05/2019 08:16:14|       0.57|      100|           62|
|           0.57|03/05/2019 08:21:14|       0.56|      100|           50|
|           0.35|03/05/2019 08:26:14|       0.46|      100|           43|
|           0.41|03/05/2019 08:31:14|       0.58|      100|           48|
|           0.57|03/05/2019 08:36:14|       0.35|      100|           58|
|           0.41|03/05/2019 08:41:14|        0.4|      100|           58|
|           0.53|03/05/2019 08:46:14|       0.35|      100|           62|
|           0.51|03/05/2019 08:51:14|        0.6|      100|           45|
|           0.32|03/05/2019 08:56:14| 

In [None]:
# Expose the DataFrame to Spark SQL under the view name 'utilization', which
# the spark.sql(...) queries in the following cells select from.
df.createOrReplaceTempView(name='utilization')

In [None]:
# Per-server summary of CPU utilization (min, max, sample standard deviation).
# server_id is projected next to the aggregates so each output row can be
# attributed to the server it describes; without it the grouped rows are
# anonymous. ORDER BY makes the displayed rows stable from run to run, since
# GROUP BY alone gives no ordering guarantee.
# A triple-quoted string replaces backslash continuations, which silently
# embed the source indentation in the SQL and break on trailing whitespace.
spark.sql("""
    SELECT server_id,
           min(cpu_utilization),
           max(cpu_utilization),
           stddev(cpu_utilization)
    FROM utilization
    GROUP BY server_id
    ORDER BY server_id
""").show()

+--------------------+--------------------+-----------------------+
|min(cpu_utilization)|max(cpu_utilization)|stddev(cpu_utilization)|
+--------------------+--------------------+-----------------------+
|                0.52|                0.92|    0.11528867845082576|
|                0.58|                0.98|    0.11544345150353694|
|                0.48|                0.88|    0.11542612970702058|
|                0.35|                0.75|    0.11533251724450215|
|                0.22|                0.62|    0.11516031929842008|
|                 0.3|                 0.7|    0.11506079722349302|
|                0.24|                0.64|    0.11579377614906383|
|                0.45|                0.85|    0.11597417369783877|
|                0.56|                0.96|    0.11617507884178278|
|                0.33|                0.73|    0.11510268816097273|
|                0.44|                0.84|    0.11569664615014985|
|                0.51|                0.91|    0

In [None]:
# Attach each server's overall mean CPU utilization to every one of its rows.
# The window has PARTITION BY but no ORDER BY or frame, so the average is taken
# over the whole partition and is the same value on every row of a server.
per_server_avg_sql = 'SELECT server_id, event_datetime, cpu_utilization, \
                          avg(cpu_utilization) OVER(PARTITION BY server_id) avg_cpu_util \
                          FROM utilization'
window_spark = spark.sql(per_server_avg_sql)
window_spark.show()

+---------+-------------------+---------------+------------------+
|server_id|     event_datetime|cpu_utilization|      avg_cpu_util|
+---------+-------------------+---------------+------------------+
|      103|03/05/2019 08:06:19|           0.64|0.7614389999999969|
|      103|03/05/2019 08:11:19|           0.76|0.7614389999999969|
|      103|03/05/2019 08:16:19|           0.61|0.7614389999999969|
|      103|03/05/2019 08:21:19|           0.95|0.7614389999999969|
|      103|03/05/2019 08:26:19|           0.87|0.7614389999999969|
|      103|03/05/2019 08:31:19|           0.61|0.7614389999999969|
|      103|03/05/2019 08:36:19|            0.8|0.7614389999999969|
|      103|03/05/2019 08:41:19|           0.73|0.7614389999999969|
|      103|03/05/2019 08:46:19|           0.85|0.7614389999999969|
|      103|03/05/2019 08:51:19|           0.79|0.7614389999999969|
|      103|03/05/2019 08:56:19|           0.57|0.7614389999999969|
|      103|03/05/2019 09:01:19|           0.72|0.7614389999999

In [None]:
# For every reading, report the server's overall mean CPU utilization and how
# far this reading deviates from it (reading minus mean).
# The per-server window is declared once in a named WINDOW clause and reused by
# both expressions, so the two can no longer drift apart if the partitioning is
# edited later. Output columns and values are the same as with the window
# definition written out twice.
window_spark2 = spark.sql("""
    SELECT server_id, event_datetime, cpu_utilization,
           avg(cpu_utilization) OVER server_window AS avg_cpu_util,
           cpu_utilization - avg(cpu_utilization) OVER server_window AS delta_cpu_util
    FROM utilization
    WINDOW server_window AS (PARTITION BY server_id)
""")
window_spark2.show()

+---------+-------------------+---------------+------------------+--------------------+
|server_id|     event_datetime|cpu_utilization|      avg_cpu_util|      delta_cpu_util|
+---------+-------------------+---------------+------------------+--------------------+
|      103|03/05/2019 08:06:19|           0.64|0.7614389999999969|-0.12143899999999686|
|      103|03/05/2019 08:11:19|           0.76|0.7614389999999969|-0.00143899999999...|
|      103|03/05/2019 08:16:19|           0.61|0.7614389999999969|-0.15143899999999688|
|      103|03/05/2019 08:21:19|           0.95|0.7614389999999969|  0.1885610000000031|
|      103|03/05/2019 08:26:19|           0.87|0.7614389999999969| 0.10856100000000313|
|      103|03/05/2019 08:31:19|           0.61|0.7614389999999969|-0.15143899999999688|
|      103|03/05/2019 08:36:19|            0.8|0.7614389999999969|0.038561000000003176|
|      103|03/05/2019 08:41:19|           0.73|0.7614389999999969|-0.03143899999999...|
|      103|03/05/2019 08:46:19| 

In [None]:
# Centered moving average of CPU utilization per server: each row is averaged
# with up to two readings before and two after it (the frame is simply shorter
# at the start and end of a partition).
#
# event_datetime is stored as text, so ordering by the raw column sorts rows
# lexicographically rather than chronologically; that stops matching time order
# as soon as the data crosses a year boundary. The window is therefore ordered
# by the parsed timestamp instead.
# NOTE(review): the pattern assumes month-first dates (MM/dd/yyyy HH:mm:ss);
# the visible sample ('03/05/2019 08:06:14') is ambiguous - confirm against the
# full dataset and switch to dd/MM/yyyy if the source is day-first.
window_spark3 = spark.sql("""
    SELECT event_datetime, server_id, cpu_utilization,
           avg(cpu_utilization) OVER (
               PARTITION BY server_id
               ORDER BY to_timestamp(event_datetime, 'MM/dd/yyyy HH:mm:ss')
               ROWS BETWEEN 2 PRECEDING AND 2 FOLLOWING
           ) AS avg_server_util
    FROM utilization
""")
window_spark3.show()

+-------------------+---------+---------------+------------------+
|     event_datetime|server_id|cpu_utilization|   avg_server_util|
+-------------------+---------+---------------+------------------+
|03/05/2019 08:06:19|      103|           0.64|0.6699999999999999|
|03/05/2019 08:11:19|      103|           0.76|              0.74|
|03/05/2019 08:16:19|      103|           0.61|             0.766|
|03/05/2019 08:21:19|      103|           0.95|              0.76|
|03/05/2019 08:26:19|      103|           0.87|             0.768|
|03/05/2019 08:31:19|      103|           0.61|0.7919999999999999|
|03/05/2019 08:36:19|      103|            0.8|             0.772|
|03/05/2019 08:41:19|      103|           0.73|             0.756|
|03/05/2019 08:46:19|      103|           0.85|             0.748|
|03/05/2019 08:51:19|      103|           0.79|             0.732|
|03/05/2019 08:56:19|      103|           0.57|             0.726|
|03/05/2019 09:01:19|      103|           0.72|0.7300000000000