In [2]:
from pyspark.sql import SparkSession

In [3]:
# Create the Spark session used by every later cell, or reuse one that is
# already running in this kernel.
spark = (
    SparkSession.builder
    .appName("SQL queries")
    .getOrCreate()
)

In [4]:
# Load the headerless CSV. With no header row Spark assigns the default
# column names _c0.._c4, and inferSchema asks it to work out the column types.
df = (
    spark.read.format("csv")
    .options(header="false", inferSchema="true")
    .load("utilization.csv")
)
# Preview the first 10 rows.
df.show(n=10)

+-------------------+---+----+----+---+
|                _c0|_c1| _c2| _c3|_c4|
+-------------------+---+----+----+---+
|03/05/2019 08:06:14|100|0.57|0.51| 47|
|03/05/2019 08:11:14|100|0.47|0.62| 43|
|03/05/2019 08:16:14|100|0.56|0.57| 62|
|03/05/2019 08:21:14|100|0.57|0.56| 50|
|03/05/2019 08:26:14|100|0.35|0.46| 43|
|03/05/2019 08:31:14|100|0.41|0.58| 48|
|03/05/2019 08:36:14|100|0.57|0.35| 58|
|03/05/2019 08:41:14|100|0.41| 0.4| 58|
|03/05/2019 08:46:14|100|0.53|0.35| 62|
|03/05/2019 08:51:14|100|0.51| 0.6| 45|
+-------------------+---+----+----+---+
only showing top 10 rows



In [5]:
# Replace Spark's positional default names with meaningful column names.
df = (
    df.withColumnRenamed("_c0", "event_datetime")
      .withColumnRenamed("_c1", "server_id")
      .withColumnRenamed("_c2", "free_memory")
      .withColumnRenamed("_c3", "cpu_utilization")
      .withColumnRenamed("_c4", "session_count")
)

In [6]:
# Confirm the renamed columns by previewing the first 10 rows.
df.show(n=10)

+-------------------+---------+-----------+---------------+-------------+
|     event_datetime|server_id|free_memory|cpu_utilization|session_count|
+-------------------+---------+-----------+---------------+-------------+
|03/05/2019 08:06:14|      100|       0.57|           0.51|           47|
|03/05/2019 08:11:14|      100|       0.47|           0.62|           43|
|03/05/2019 08:16:14|      100|       0.56|           0.57|           62|
|03/05/2019 08:21:14|      100|       0.57|           0.56|           50|
|03/05/2019 08:26:14|      100|       0.35|           0.46|           43|
|03/05/2019 08:31:14|      100|       0.41|           0.58|           48|
|03/05/2019 08:36:14|      100|       0.57|           0.35|           58|
|03/05/2019 08:41:14|      100|       0.41|            0.4|           58|
|03/05/2019 08:46:14|      100|       0.53|           0.35|           62|
|03/05/2019 08:51:14|      100|       0.51|            0.6|           45|
+-------------------+---------+-------

In [8]:
# Summary statistics (count, mean, stddev, min, max) for every column.
# event_datetime is a string here, so its min/max are lexicographic and
# its mean/stddev are null.
df.describe().show()

+-------+-------------------+------------------+-------------------+-------------------+------------------+
|summary|     event_datetime|         server_id|        free_memory|    cpu_utilization|     session_count|
+-------+-------------------+------------------+-------------------+-------------------+------------------+
|  count|             500000|            500000|             500000|             500000|            500000|
|   mean|               null|             124.5| 0.6205177399999874|  0.379128099999989|          69.59616|
| stddev|               null|14.430884120553118|0.15875173872912948|0.15830931278376223|14.850676696352853|
|    min|03/05/2019 08:06:14|               100|               0.22|                0.0|                32|
|    max|04/09/2019 01:22:46|               149|                1.0|               0.78|               105|
+-------+-------------------+------------------+-------------------+-------------------+------------------+



In [9]:
# Correlation coefficient between CPU utilization and free memory
# (df.stat.corr uses the Pearson method by default).
df.stat.corr('cpu_utilization','free_memory')

-0.4704771573080703

In [11]:
# Frequently occurring values in server_id and session_count.
# freqItems is an approximate algorithm, so treat the lists as indicative.
df.stat.freqItems(cols=['server_id', 'session_count']).show()

+--------------------+-----------------------+
| server_id_freqItems|session_count_freqItems|
+--------------------+-----------------------+
|[146, 137, 101, 1...|   [92, 101, 83, 104...|
+--------------------+-----------------------+



In [13]:
# Register the DataFrame as the temp view `utilization` so the following
# cells can query it with spark.sql(...).
df.createOrReplaceTempView(name='utilization')

In [16]:
# Per-server spread of CPU utilization: minimum, maximum and standard deviation.
spark.sql(
    'select server_id, min(cpu_utilization), max(cpu_utilization), '
    'stddev(cpu_utilization) from utilization group by server_id'
).show()

+---------+--------------------+--------------------+----------------------------+
|server_id|min(cpu_utilization)|max(cpu_utilization)|stddev_samp(cpu_utilization)|
+---------+--------------------+--------------------+----------------------------+
|      148|                0.06|                0.46|         0.11526861409915351|
|      137|                0.06|                0.46|         0.11545515304323323|
|      133|                0.05|                0.44|         0.11550985292871052|
|      108|                0.05|                0.45|         0.11611846086516736|
|      101|                 0.0|                 0.4|          0.1145876847560405|
|      115|                0.16|                0.56|         0.11532173368982139|
|      126|                0.12|                0.52|          0.1152584492131119|
|      103|                0.04|                0.44|         0.11533907440762893|
|      128|                0.22|                0.62|         0.11583499380834354|
|   

In [17]:
# Assign each reading a bucket: floor(cpu_utilization*100/10), i.e. a
# utilization of 0.57 falls in bucket 5.
spark.sql(
    'select server_id, floor(cpu_utilization*100/10) bucket '
    'from utilization'
).show()

+---------+------+
|server_id|bucket|
+---------+------+
|      100|     5|
|      100|     6|
|      100|     5|
|      100|     5|
|      100|     4|
|      100|     5|
|      100|     3|
|      100|     4|
|      100|     3|
|      100|     6|
|      100|     3|
|      100|     5|
|      100|     7|
|      100|     5|
|      100|     4|
|      100|     6|
|      100|     6|
|      100|     5|
|      100|     6|
|      100|     5|
+---------+------+
only showing top 20 rows



In [18]:
# Histogram of CPU utilization: number of readings in each bucket,
# listed in ascending bucket order.
spark.sql(
    'select count(*),floor(cpu_utilization*100/10) bucket '
    'from utilization group by bucket order by bucket'
).show()

+--------+------+
|count(1)|bucket|
+--------+------+
|   15701|     0|
|   54184|     1|
|   85082|     2|
|  114940|     3|
|  109069|     4|
|   70822|     5|
|   40385|     6|
|    9817|     7|
+--------+------+



In [21]:
# For every reading, attach the server's overall average CPU utilization
# (window partitioned by server_id) and the reading's deviation from it.
sql_window = spark.sql(
    'select event_datetime,server_id,cpu_utilization, '
    'avg(cpu_utilization) over (partition by server_id) avg_server_util,'
    '(cpu_utilization -  avg(cpu_utilization) over (partition by server_id) )delta_server_util '
    'from utilization'
)
sql_window.show()

+-------------------+---------+---------------+------------------+--------------------+
|     event_datetime|server_id|cpu_utilization|   avg_server_util|   delta_server_util|
+-------------------+---------+---------------+------------------+--------------------+
|03/05/2019 08:07:41|      148|           0.23|0.2612669999999993|-0.03126699999999...|
|03/05/2019 08:12:41|      148|           0.27|0.2612669999999993|0.008733000000000712|
|03/05/2019 08:17:41|      148|            0.2|0.2612669999999993|-0.06126699999999...|
|03/05/2019 08:22:41|      148|           0.11|0.2612669999999993|-0.15126699999999932|
|03/05/2019 08:27:41|      148|           0.32|0.2612669999999993|  0.0587330000000007|
|03/05/2019 08:32:41|      148|           0.34|0.2612669999999993| 0.07873300000000072|
|03/05/2019 08:37:41|      148|           0.16|0.2612669999999993| -0.1012669999999993|
|03/05/2019 08:42:41|      148|           0.44|0.2612669999999993|  0.1787330000000007|
|03/05/2019 08:47:41|      148| 

In [22]:
# Centred moving average of CPU utilization per server: each row is averaged
# with the reading immediately before and immediately after it.
#
# event_datetime is stored as a 'MM/dd/yyyy HH:mm:ss' string. Ordering the
# window by the raw string is only chronological while every row shares the
# same year (e.g. '12/31/2019' sorts after '01/01/2020'), so the frame is
# ordered by the parsed timestamp instead. The selected event_datetime column
# is still the original string, so the output schema is unchanged.
sql_window2 = spark.sql("""
    select event_datetime,
           server_id,
           cpu_utilization,
           avg(cpu_utilization) over (
               partition by server_id
               order by to_timestamp(event_datetime, 'MM/dd/yyyy HH:mm:ss')
               rows between 1 preceding and 1 following
           ) avg_server_util
    from utilization
""")
sql_window2.show()

+-------------------+---------+---------------+-------------------+
|     event_datetime|server_id|cpu_utilization|    avg_server_util|
+-------------------+---------+---------------+-------------------+
|03/05/2019 08:07:41|      148|           0.23|               0.25|
|03/05/2019 08:12:41|      148|           0.27| 0.2333333333333333|
|03/05/2019 08:17:41|      148|            0.2|0.19333333333333336|
|03/05/2019 08:22:41|      148|           0.11|               0.21|
|03/05/2019 08:27:41|      148|           0.32|0.25666666666666665|
|03/05/2019 08:32:41|      148|           0.34| 0.2733333333333334|
|03/05/2019 08:37:41|      148|           0.16| 0.3133333333333333|
|03/05/2019 08:42:41|      148|           0.44|               0.25|
|03/05/2019 08:47:41|      148|           0.15|0.33666666666666667|
|03/05/2019 08:52:41|      148|           0.42|0.25666666666666665|
|03/05/2019 08:57:41|      148|            0.2| 0.2833333333333333|
|03/05/2019 09:02:41|      148|           0.23|0

In [23]:
from pyspark.ml.linalg import Vectors
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.clustering import KMeans

In [25]:
# Re-display the base DataFrame (20 rows, the default) before building ML features.
df.show(n=20)

+-------------------+---------+-----------+---------------+-------------+
|     event_datetime|server_id|free_memory|cpu_utilization|session_count|
+-------------------+---------+-----------+---------------+-------------+
|03/05/2019 08:06:14|      100|       0.57|           0.51|           47|
|03/05/2019 08:11:14|      100|       0.47|           0.62|           43|
|03/05/2019 08:16:14|      100|       0.56|           0.57|           62|
|03/05/2019 08:21:14|      100|       0.57|           0.56|           50|
|03/05/2019 08:26:14|      100|       0.35|           0.46|           43|
|03/05/2019 08:31:14|      100|       0.41|           0.58|           48|
|03/05/2019 08:36:14|      100|       0.57|           0.35|           58|
|03/05/2019 08:41:14|      100|       0.41|            0.4|           58|
|03/05/2019 08:46:14|      100|       0.53|           0.35|           62|
|03/05/2019 08:51:14|      100|       0.51|            0.6|           45|
|03/05/2019 08:56:14|      100|       

In [24]:
# Assembler that packs the three numeric metrics into a single `features`
# vector column, as required by the clustering estimator.
# NOTE(review): session_count is on a much larger scale than the two ratio
# columns, so it probably dominates the distance metric -- confirm whether
# the features should be scaled before clustering.
vector_Assembler = VectorAssembler(
    inputCols=["cpu_utilization", "free_memory", "session_count"],
    outputCol="features",
)

In [27]:
# Append the assembled `features` vector column to the data and preview it.
vcluster_df = vector_Assembler.transform(dataset=df)
vcluster_df.show(n=20)

+-------------------+---------+-----------+---------------+-------------+----------------+
|     event_datetime|server_id|free_memory|cpu_utilization|session_count|        features|
+-------------------+---------+-----------+---------------+-------------+----------------+
|03/05/2019 08:06:14|      100|       0.57|           0.51|           47|[0.51,0.57,47.0]|
|03/05/2019 08:11:14|      100|       0.47|           0.62|           43|[0.62,0.47,43.0]|
|03/05/2019 08:16:14|      100|       0.56|           0.57|           62|[0.57,0.56,62.0]|
|03/05/2019 08:21:14|      100|       0.57|           0.56|           50|[0.56,0.57,50.0]|
|03/05/2019 08:26:14|      100|       0.35|           0.46|           43|[0.46,0.35,43.0]|
|03/05/2019 08:31:14|      100|       0.41|           0.58|           48|[0.58,0.41,48.0]|
|03/05/2019 08:36:14|      100|       0.57|           0.35|           58|[0.35,0.57,58.0]|
|03/05/2019 08:41:14|      100|       0.41|            0.4|           58| [0.4,0.41,58.0]|

In [29]:
# K-means estimator configured to find 3 clusters.
kmeans = KMeans(k=3)

In [30]:
# Fix the random seed so the centroid initialisation is repeatable.
kmeans = kmeans.setSeed(value=1)

In [31]:
# Fit the clustering model on the assembled feature vectors.
kmodel = kmeans.fit(dataset=vcluster_df)

In [32]:
# Cluster centroids; each array is ordered like the assembler's inputCols:
# [cpu_utilization, free_memory, session_count].
kmodel.clusterCenters()

[array([ 0.47836303,  0.52047775, 51.79927162]),
 array([ 0.28104316,  0.71931575, 88.23965784]),
 array([ 0.37094643,  0.62881549, 70.43030159])]

In [33]:
from pyspark.ml.regression import LinearRegression

In [34]:
# Rebind the assembler for the regression: cpu_utilization is the only
# predictor placed in the `features` vector.
# NOTE(review): this reuses the name of the clustering assembler, so
# re-running earlier cells after this one would use the wrong feature set.
vector_Assembler = VectorAssembler(
    inputCols=["cpu_utilization"],
    outputCol="features",
)

In [36]:
# Build the regression input by appending the single-feature vector column,
# then preview it.
vreg_df = vector_Assembler.transform(dataset=df)
vreg_df.show(n=20)

+-------------------+---------+-----------+---------------+-------------+--------+
|     event_datetime|server_id|free_memory|cpu_utilization|session_count|features|
+-------------------+---------+-----------+---------------+-------------+--------+
|03/05/2019 08:06:14|      100|       0.57|           0.51|           47|  [0.51]|
|03/05/2019 08:11:14|      100|       0.47|           0.62|           43|  [0.62]|
|03/05/2019 08:16:14|      100|       0.56|           0.57|           62|  [0.57]|
|03/05/2019 08:21:14|      100|       0.57|           0.56|           50|  [0.56]|
|03/05/2019 08:26:14|      100|       0.35|           0.46|           43|  [0.46]|
|03/05/2019 08:31:14|      100|       0.41|           0.58|           48|  [0.58]|
|03/05/2019 08:36:14|      100|       0.57|           0.35|           58|  [0.35]|
|03/05/2019 08:41:14|      100|       0.41|            0.4|           58|   [0.4]|
|03/05/2019 08:46:14|      100|       0.53|           0.35|           62|  [0.35]|
|03/

In [37]:
# Linear regression estimator: predict session_count from the `features` vector.
lr = LinearRegression(
    featuresCol="features",
    labelCol="session_count",
)

In [38]:
# Fit the regression on the full dataset (no train/test split is made here).
lr_model = lr.fit(dataset=vreg_df)

In [39]:
# Fitted slope: change in session_count per unit change in cpu_utilization.
lr_model.coefficients

DenseVector([-46.982])

In [40]:
# Fitted intercept: predicted session_count when cpu_utilization is 0.
lr_model.intercept

87.40837372965713

In [42]:
# Root-mean-squared error from the model's training summary, i.e. measured
# on the same data the model was fitted on, not on a held-out set.
lr_model.summary.rootMeanSquaredError

12.853908178018267