In [1]:
from pyspark.sql import SparkSession
# Create (or reuse) the Spark session for this notebook, registered under the app name "pivot".
spark = (
    SparkSession.builder
    .appName("pivot")
    .getOrCreate()
)

In [2]:
# Sample log records as (loglevel, logtime) tuples.
# The timestamps are plain strings whose month/day parts are not zero-padded.
logs_data = [
    ("DEBUG", "2014-6-22 21:30:49"),
    ("WARN", "2013-12-6 17:54:15"),
    ("DEBUG", "2017-1-12 10:47:02"),
    ("DEBUG", "2016-6-25 11:06:42"),
    ("ERROR", "2015-6-28 19:25:05"),
    ("DEBUG", "2012-6-24 01:06:37"),
    ("INFO", "2014-12-9 09:53:54"),
    ("DEBUG", "2015-11-8 19:20:08"),
    ("INFO", "2017-12-21 18:34:18"),
]

In [3]:
# Build the raw DataFrame; column types are inferred from the tuples,
# the column names are supplied explicitly.
logs_df = spark.createDataFrame(
    logs_data,
    schema=["loglevel", "logtime"],
)

In [5]:
# Preview the raw rows; logtime is still a plain string at this point.
logs_df.show()

+--------+-------------------+
|loglevel|            logtime|
+--------+-------------------+
|   DEBUG| 2014-6-22 21:30:49|
|    WARN| 2013-12-6 17:54:15|
|   DEBUG| 2017-1-12 10:47:02|
|   DEBUG| 2016-6-25 11:06:42|
|   ERROR| 2015-6-28 19:25:05|
|   DEBUG| 2012-6-24 01:06:37|
|    INFO| 2014-12-9 09:53:54|
|   DEBUG| 2015-11-8 19:20:08|
|    INFO|2017-12-21 18:34:18|
+--------+-------------------+



In [6]:
# Inspect the inferred column types of the raw DataFrame.
logs_df.printSchema()

root
 |-- loglevel: string (nullable = true)
 |-- logtime: string (nullable = true)



In [7]:
# Import only the column functions this notebook uses. A wildcard import of
# pyspark.sql.functions would shadow Python builtins such as sum, max, min,
# round and abs with their Spark counterparts.
from pyspark.sql.functions import date_format, month, to_timestamp

In [8]:
# Replace the string logtime column with a parsed timestamp column.
log_df = logs_df.withColumn(
    "logtime",
    to_timestamp(logs_df["logtime"]),
)

In [9]:
# Check the schema after the to_timestamp conversion.
log_df.printSchema()

root
 |-- loglevel: string (nullable = true)
 |-- logtime: timestamp (nullable = true)



In [11]:
# Preview each log level next to the abbreviated month name ("MMM") of its timestamp.
(
    log_df
    .select(
        "loglevel",
        date_format("logtime", "MMM").alias("month"),
    )
    .show()
)

+--------+-----+
|loglevel|month|
+--------+-----+
|   DEBUG|  Jun|
|    WARN|  Dec|
|   DEBUG|  Jan|
|   DEBUG|  Jun|
|   ERROR|  Jun|
|   DEBUG|  Jun|
|    INFO|  Dec|
|   DEBUG|  Nov|
|    INFO|  Dec|
+--------+-----+



In [20]:
# Tag every row with its abbreviated month name and its numeric month.
df1 = log_df.select(
    "loglevel",
    date_format("logtime", "MMM").alias("month"),
    month("logtime").alias("month_number"),
)

# Count rows per (loglevel, month). month_number is carried along only so the
# result can be sorted in calendar order rather than alphabetically by name.
df2 = (
    df1.groupBy("loglevel", "month", "month_number")
    .count()
    .withColumnRenamed("count", "total_occurance")
    .orderBy("month_number")
)

# The helper column is dropped just for display.
df2.drop("month_number").show()

+--------+-----+---------------+
|loglevel|month|total_occurance|
+--------+-----+---------------+
|   DEBUG|  Jan|              1|
|   DEBUG|  Jun|              3|
|   ERROR|  Jun|              1|
|   DEBUG|  Nov|              1|
|    WARN|  Dec|              1|
|    INFO|  Dec|              2|
+--------+-----+---------------+



In [10]:
# Register log_df as the temp view "logs" so it can be queried through spark.sql.
log_df.createOrReplaceTempView("logs")

In [21]:
# Count log entries per (month, loglevel) in SQL, sorted in calendar order.
#
# - month_number is selected and grouped as an explicit expression instead of
#   being picked with first(), which is a non-deterministic aggregate.
# - GROUP BY lists the expressions themselves rather than the select alias, so
#   the query does not depend on Spark's group-by-alias resolution.
# - loglevel is a secondary sort key so rows within one month have a stable order.
# - String literals use single quotes, the standard SQL form.
results_df = spark.sql("""
    select
        date_format(logtime, 'MMMM') as month,
        date_format(logtime, 'MM')   as month_number,
        loglevel,
        count(*)                     as total_occurance
    from logs
    group by
        date_format(logtime, 'MMMM'),
        date_format(logtime, 'MM'),
        loglevel
    order by month_number asc, loglevel asc
""")

results_df.show()

+--------+------------+--------+---------------+
|   month|month_number|loglevel|total_occurance|
+--------+------------+--------+---------------+
| January|          01|   DEBUG|              1|
|    June|          06|   DEBUG|              3|
|    June|          06|   ERROR|              1|
|November|          11|   DEBUG|              1|
|December|          12|    INFO|              2|
|December|          12|    WARN|              1|
+--------+------------+--------+---------------+



In [22]:
# month_number was only needed for sorting; remove it from the final result.
final_df = results_df.drop("month_number")
final_df.show()

+--------+--------+---------------+
|   month|loglevel|total_occurance|
+--------+--------+---------------+
| January|   DEBUG|              1|
|    June|   DEBUG|              3|
|    June|   ERROR|              1|
|November|   DEBUG|              1|
|December|    INFO|              2|
|December|    WARN|              1|
+--------+--------+---------------+



In [None]:
# Variant of the monthly-count pipeline that derives month_number as a
# zero-padded string via date_format(..., "MM") instead of month().
# NOTE(review): the next cell rebinds df1/df2/df3 with an equivalent pipeline,
# so this cell looks redundant - confirm whether it can be removed.
df1 = log_df.select(
    "loglevel",
    date_format("logtime", "MMM").alias("month"),
    date_format("logtime", "MM").alias("month_number"),
)
df2 = (
    df1.groupBy("loglevel", "month", "month_number")
    .count()
    .withColumnRenamed("count", "total_occurance")
    .orderBy("month_number")
)
df3 = df2.drop("month_number")

In [23]:
# Per-row month name plus the numeric month used as a sort key.
df1 = log_df.select(
    "loglevel",
    date_format("logtime", "MMM").alias("month"),
    month("logtime").alias("month_number"),
)

# Occurrences per (loglevel, month), ordered by calendar month.
df2 = (
    df1.groupBy("loglevel", "month", "month_number")
    .count()
    .withColumnRenamed("count", "total_occurance")
    .orderBy("month_number")
)

# Long-format result without the sort helper; this is the input to the pivot.
df3 = df2.drop("month_number")

In [24]:
# Display the per-loglevel, per-month counts in long format.
df3.show()

+--------+-----+---------------+
|loglevel|month|total_occurance|
+--------+-----+---------------+
|   DEBUG|  Jan|              1|
|   DEBUG|  Jun|              3|
|   ERROR|  Jun|              1|
|   DEBUG|  Nov|              1|
|    WARN|  Dec|              1|
|    INFO|  Dec|              2|
+--------+-----+---------------+



In [27]:
# Pivot the long-format counts: one row per loglevel, one column per month.
# No pivot values are given, so Spark discovers the distinct months itself
# and the columns come out sorted by name rather than in calendar order.
(
    df3.groupBy("loglevel")
    .pivot("month")
    .sum("total_occurance")
    .show()
)

+--------+----+----+----+----+
|loglevel| Dec| Jan| Jun| Nov|
+--------+----+----+----+----+
|    INFO|   2|null|null|null|
|   ERROR|null|null|   1|null|
|    WARN|   1|null|null|null|
|   DEBUG|null|   1|   3|   1|
+--------+----+----+----+----+



In [28]:
# Show the parsed log rows that feed the pivots below.
log_df.show()

+--------+-------------------+
|loglevel|            logtime|
+--------+-------------------+
|   DEBUG|2014-06-22 21:30:49|
|    WARN|2013-12-06 17:54:15|
|   DEBUG|2017-01-12 10:47:02|
|   DEBUG|2016-06-25 11:06:42|
|   ERROR|2015-06-28 19:25:05|
|   DEBUG|2012-06-24 01:06:37|
|    INFO|2014-12-09 09:53:54|
|   DEBUG|2015-11-08 19:20:08|
|    INFO|2017-12-21 18:34:18|
+--------+-------------------+



In [33]:
# Pivot straight from the parsed logs: count rows per loglevel with one
# column per full month name ("MMMM").
(
    log_df
    .select(
        "loglevel",
        date_format("logtime", "MMMM").alias("month_name"),
    )
    .groupBy("loglevel")
    .pivot("month_name")
    .count()
    .show()
)

+--------+--------+-------+----+--------+
|loglevel|December|January|June|November|
+--------+--------+-------+----+--------+
|    INFO|       2|   null|null|    null|
|   ERROR|    null|   null|   1|    null|
|    WARN|       1|   null|null|    null|
|   DEBUG|    null|      1|   3|       1|
+--------+--------+-------+----+--------+



In [34]:
# Same pivot keyed on the zero-padded month number ("MM"), so the
# name-sorted columns also happen to be in calendar order.
(
    log_df
    .select(
        "loglevel",
        date_format("logtime", "MM").alias("month_number"),
    )
    .groupBy("loglevel")
    .pivot("month_number")
    .count()
    .show()
)

+--------+----+----+----+----+
|loglevel|  01|  06|  11|  12|
+--------+----+----+----+----+
|    INFO|null|null|null|   2|
|   ERROR|null|   1|null|null|
|    WARN|null|null|null|   1|
|   DEBUG|   1|   3|   1|null|
+--------+----+----+----+----+



In [29]:
# Read the same rows back through the "logs" temp view.
spark.sql("select * from logs").show()

+--------+-------------------+
|loglevel|            logtime|
+--------+-------------------+
|   DEBUG|2014-06-22 21:30:49|
|    WARN|2013-12-06 17:54:15|
|   DEBUG|2017-01-12 10:47:02|
|   DEBUG|2016-06-25 11:06:42|
|   ERROR|2015-06-28 19:25:05|
|   DEBUG|2012-06-24 01:06:37|
|    INFO|2014-12-09 09:53:54|
|   DEBUG|2015-11-08 19:20:08|
|    INFO|2017-12-21 18:34:18|
+--------+-------------------+



In [31]:
# Project loglevel and full month name in SQL, then pivot with the DataFrame API.
(
    spark.sql("""select loglevel,
          date_format(logtime , "MMMM") as month_name
          from logs""")
    .groupBy("loglevel")
    .pivot("month_name")
    .count()
    .show()
)

+--------+--------+-------+----+--------+
|loglevel|December|January|June|November|
+--------+--------+-------+----+--------+
|    INFO|       2|   null|null|    null|
|   ERROR|    null|   null|   1|    null|
|    WARN|       1|   null|null|    null|
|   DEBUG|    null|      1|   3|       1|
+--------+--------+-------+----+--------+



In [32]:
# SQL projection of loglevel and zero-padded month number, pivoted per loglevel.
(
    spark.sql("""select loglevel,
          date_format(logtime , "MM") as month_number
          from logs""")
    .groupBy("loglevel")
    .pivot("month_number")
    .count()
    .show()
)

+--------+----+----+----+----+
|loglevel|  01|  06|  11|  12|
+--------+----+----+----+----+
|    INFO|null|null|null|   2|
|   ERROR|null|   1|null|null|
|    WARN|null|null|null|   1|
|   DEBUG|   1|   3|   1|null|
+--------+----+----+----+----+



In [35]:
# Explicit pivot values, listed in calendar order; passed to pivot() below
# to control which month columns appear and in what order.
month_list = [
    "January",
    "June",
    "November",
    "December",
]

In [36]:
# Pivot on the full month name, passing month_list so the columns follow
# the listed (calendar) order instead of being sorted by name.
(
    log_df
    .select(
        "loglevel",
        date_format("logtime", "MMMM").alias("month_name"),
    )
    .groupBy("loglevel")
    .pivot("month_name", month_list)
    .count()
    .show()
)

+--------+-------+----+--------+--------+
|loglevel|January|June|November|December|
+--------+-------+----+--------+--------+
|    INFO|   null|null|    null|       2|
|   ERROR|   null|   1|    null|    null|
|    WARN|   null|null|    null|       1|
|   DEBUG|      1|   3|       1|    null|
+--------+-------+----+--------+--------+



In [37]:
# SQL projection followed by a pivot restricted to, and ordered by, month_list.
(
    spark.sql("""select loglevel,
          date_format(logtime , "MMMM") as month_name
          from logs""")
    .groupBy("loglevel")
    .pivot("month_name", month_list)
    .count()
    .show()
)

+--------+-------+----+--------+--------+
|loglevel|January|June|November|December|
+--------+-------+----+--------+--------+
|    INFO|   null|null|    null|       2|
|   ERROR|   null|   1|    null|    null|
|    WARN|   null|null|    null|       1|
|   DEBUG|      1|   3|       1|    null|
+--------+-------+----+--------+--------+



In [38]:
# Stop the Spark session now that the notebook is finished.
spark.stop()