In [11]:
import pyspark
from pyspark.sql import SparkSession

In [3]:
spark = SparkSession.builder.appName('Data Analysis with PySpark').getOrCreate()

In [4]:
data_path = '/content/drive/MyDrive/Data/'
file_path = data_path + 'utilization.json'
df = spark.read.json(file_path)
df.show()

+---------------+-------------------+-----------+---------+-------------+
|cpu_utilization|     event_datetime|free_memory|server_id|session_count|
+---------------+-------------------+-----------+---------+-------------+
|           0.57|03/05/2019 08:06:14|       0.51|      100|           47|
|           0.47|03/05/2019 08:11:14|       0.62|      100|           43|
|           0.56|03/05/2019 08:16:14|       0.57|      100|           62|
|           0.57|03/05/2019 08:21:14|       0.56|      100|           50|
|           0.35|03/05/2019 08:26:14|       0.46|      100|           43|
|           0.41|03/05/2019 08:31:14|       0.58|      100|           48|
|           0.57|03/05/2019 08:36:14|       0.35|      100|           58|
|           0.41|03/05/2019 08:41:14|        0.4|      100|           58|
|           0.53|03/05/2019 08:46:14|       0.35|      100|           62|
|           0.51|03/05/2019 08:51:14|        0.6|      100|           45|
|           0.32|03/05/2019 08:56:14| 

In [5]:
df.createOrReplaceTempView('utilization')

In [6]:
df.describe().show()

+-------+------------------+-------------------+-------------------+------------------+-----------------+
|summary|   cpu_utilization|     event_datetime|        free_memory|         server_id|    session_count|
+-------+------------------+-------------------+-------------------+------------------+-----------------+
|  count|            500000|             500000|             500000|            500000|           500000|
|   mean|0.6205177399999616|               NULL|0.37912809999999375|             124.5|         69.59616|
| stddev|0.1587517387291305|               NULL|0.15830931278376148|14.430884120552617|14.85067669635284|
|    min|              0.22|03/05/2019 08:06:14|                0.0|               100|               32|
|    max|               1.0|04/09/2019 01:22:46|               0.78|               149|              105|
+-------+------------------+-------------------+-------------------+------------------+-----------------+



In [7]:
df.stat.corr("cpu_utilization", "free_memory")

-0.47047715730807493

In [8]:
df.stat.freqItems(('server_id','session_count')).show()

+--------------------+-----------------------+
| server_id_freqItems|session_count_freqItems|
+--------------------+-----------------------+
|[146, 137, 101, 1...|   [92, 101, 83, 104...|
+--------------------+-----------------------+



In [9]:
df_sample = df.sample(0.05, False)
df_sample.count()

24842

In [10]:
spark.sql('SELECT min(cpu_utilization), max(cpu_utilization), stddev(cpu_utilization) FROM utilization').show()

+--------------------+--------------------+-----------------------+
|min(cpu_utilization)|max(cpu_utilization)|stddev(cpu_utilization)|
+--------------------+--------------------+-----------------------+
|                0.22|                 1.0|     0.1587517387291305|
+--------------------+--------------------+-----------------------+



In [12]:
spark.sql('SELECT server_id, min(cpu_utilization), max(cpu_utilization), stddev(cpu_utilization) \
          FROM utilization \
          GROUP BY server_id').show()

+---------+--------------------+--------------------+-----------------------+
|server_id|min(cpu_utilization)|max(cpu_utilization)|stddev(cpu_utilization)|
+---------+--------------------+--------------------+-----------------------+
|      112|                0.52|                0.92|    0.11528867845082576|
|      113|                0.58|                0.98|    0.11544345150353694|
|      126|                0.48|                0.88|    0.11542612970702058|
|      110|                0.35|                0.75|    0.11533251724450215|
|      119|                0.22|                0.62|    0.11516031929842008|
|      116|                 0.3|                 0.7|    0.11506079722349302|
|      124|                0.24|                0.64|    0.11579377614906383|
|      107|                0.45|                0.85|    0.11597417369783877|
|      103|                0.56|                0.96|    0.11617507884178278|
|      114|                0.33|                0.73|    0.11510

In [13]:
spark.sql('SELECT count(*), FLOOR(cpu_utilization*100/10) bucket \
          FROM utilization \
          GROUP BY bucket \
          ORDER BY bucket').show()

+--------+------+
|count(1)|bucket|
+--------+------+
|    8186|     2|
|   37029|     3|
|   68046|     4|
|  104910|     5|
|  116725|     6|
|   88242|     7|
|   56598|     8|
|   20207|     9|
|      57|    10|
+--------+------+

