In [1]:
from pyspark.sql import SparkSession
# Start (or reuse) the Spark session for this notebook; the application name is "logs".
spark = (
    SparkSession.builder
    .appName("logs")
    .getOrCreate()
)

In [2]:
# Sample log records as (level, timestamp-string) pairs.
# Single-digit months and days are written without a leading zero.
logs_data = [
    ("DEBUG", "2014-6-22 21:30:49"),
    ("WARN", "2013-12-6 17:54:15"),
    ("DEBUG", "2017-1-12 10:47:02"),
    ("DEBUG", "2016-6-25 11:06:42"),
    ("ERROR", "2015-6-28 19:25:05"),
    ("DEBUG", "2012-6-24 01:06:37"),
    ("INFO", "2014-12-9 09:53:54"),
    ("DEBUG", "2015-11-8 19:20:08"),
    ("INFO", "2017-12-21 18:34:18"),
]

In [3]:
# Turn the in-memory tuples into a DataFrame and name its two columns.
logs_df = (
    spark.createDataFrame(logs_data)
    .toDF("loglevel", "logtime")
)

In [4]:
# Preview the raw rows; logtime still holds the original, unparsed strings here.
logs_df.show()

+--------+-------------------+
|loglevel|            logtime|
+--------+-------------------+
|   DEBUG| 2014-6-22 21:30:49|
|    WARN| 2013-12-6 17:54:15|
|   DEBUG| 2017-1-12 10:47:02|
|   DEBUG| 2016-6-25 11:06:42|
|   ERROR| 2015-6-28 19:25:05|
|   DEBUG| 2012-6-24 01:06:37|
|    INFO| 2014-12-9 09:53:54|
|   DEBUG| 2015-11-8 19:20:08|
|    INFO|2017-12-21 18:34:18|
+--------+-------------------+



In [5]:
# Check the inferred column types before any conversion.
logs_df.printSchema()

root
 |-- loglevel: string (nullable = true)
 |-- logtime: string (nullable = true)



In [6]:
# NOTE(review): wildcard import. Of the names it brings in, only to_timestamp is
# called in the cells visible here. pyspark.sql.functions also defines names such
# as max, min and sum, which this import binds over the Python builtins; an
# explicit import would avoid that and make the dependency obvious.
from pyspark.sql.functions import *

In [8]:
# Replace the string `logtime` column with its parsed timestamp.
# The result is a new DataFrame (log_df); logs_df keeps the original strings.
log_df = logs_df.withColumn(
    "logtime",
    to_timestamp("logtime"),
)

In [9]:
# Confirm that logtime is now a timestamp column rather than a string.
log_df.printSchema()

root
 |-- loglevel: string (nullable = true)
 |-- logtime: timestamp (nullable = true)



In [10]:
# Register the converted DataFrame as the temp view "logs" so the cells below
# can query it with Spark SQL.
log_df.createOrReplaceTempView("logs")

In [11]:
# Sanity-check the view: read every row and column back through Spark SQL.
spark.sql("select * from logs").show()

+--------+-------------------+
|loglevel|            logtime|
+--------+-------------------+
|   DEBUG|2014-06-22 21:30:49|
|    WARN|2013-12-06 17:54:15|
|   DEBUG|2017-01-12 10:47:02|
|   DEBUG|2016-06-25 11:06:42|
|   ERROR|2015-06-28 19:25:05|
|   DEBUG|2012-06-24 01:06:37|
|    INFO|2014-12-09 09:53:54|
|   DEBUG|2015-11-08 19:20:08|
|    INFO|2017-12-21 18:34:18|
+--------+-------------------+



In [45]:
# Tour of date_format pattern letters: one output column per pattern, so the
# effect of each letter can be compared side by side on the same timestamps.
#
# The combined MM/dd/yyyy column spells the year with the conventional four
# letters; the three-letter form only happened to give four digits for these dates.
#
# NOTE(review): in the recorded output `ndow` ('F') is 1 for the 22nd, 6 for the
# 6th and 5 for the 12th, i.e. ((day - 1) % 7) + 1. That is not a weekday number,
# so the alias is misleading.
# NOTE(review): `quater` is a misspelling of `quarter`; the alias is left as is so
# the column name keeps matching the recorded output header.
spark.sql("""select loglevel ,logtime,
          date_format(logtime , 'MMMM') as full_month,
          date_format(logtime , 'MMM') as half_month,
          date_format(logtime , 'MM') as num_month,
          date_format(logtime , 'M') as short_month,
          date_format(logtime , 'y') as year,
          date_format(logtime , 'd') as date,
          date_format(logtime , 'H') as hour24f,
          date_format(logtime , 'h') as hour12f,
          date_format(logtime , 'm') as minute,
          date_format(logtime , 's') as second,
          date_format(logtime , 'MM/dd/yyyy') as required_format,
          date_format(logtime , 'D') as doy,
          date_format(logtime , 'E') as dow,
          date_format(logtime , 'F') as ndow,
          date_format(logtime , 'a') as ampm,
          date_format(logtime , 'q') as quater
          from logs""").show()

+--------+-------------------+----------+----------+---------+-----------+----+----+-------+-------+------+------+---------------+---+---+----+----+------+
|loglevel|            logtime|full_month|half_month|num_month|short_month|year|date|hour24f|hour12f|minute|second|required_format|doy|dow|ndow|ampm|quater|
+--------+-------------------+----------+----------+---------+-----------+----+----+-------+-------+------+------+---------------+---+---+----+----+------+
|   DEBUG|2014-06-22 21:30:49|      June|       Jun|       06|          6|2014|  22|     21|      9|    30|    49|     06/22/2014|173|Sun|   1|  PM|     2|
|    WARN|2013-12-06 17:54:15|  December|       Dec|       12|         12|2013|   6|     17|      5|    54|    15|     12/06/2013|340|Fri|   6|  PM|     4|
|   DEBUG|2017-01-12 10:47:02|   January|       Jan|       01|          1|2017|  12|     10|     10|    47|     2|     01/12/2017| 12|Thu|   5|  AM|     1|
|   DEBUG|2016-06-25 11:06:42|      June|       Jun|       06|  

In [41]:
# Just the level and the full month name for each row.
# NOTE(review): the output recorded under this cell has the columns
# loglevel/logtime/new, which this query cannot produce. It is left over from a
# different query; re-run the cell to refresh it.
spark.sql("select loglevel ,date_format(logtime , 'MMMM') as month from logs").show()

+--------+-------------------+---+
|loglevel|            logtime|new|
+--------+-------------------+---+
|   DEBUG|2014-06-22 21:30:49|  2|
|    WARN|2013-12-06 17:54:15|  4|
|   DEBUG|2017-01-12 10:47:02|  1|
|   DEBUG|2016-06-25 11:06:42|  2|
|   ERROR|2015-06-28 19:25:05|  2|
|   DEBUG|2012-06-24 01:06:37|  2|
|    INFO|2014-12-09 09:53:54|  4|
|   DEBUG|2015-11-08 19:20:08|  4|
|    INFO|2017-12-21 18:34:18|  4|
+--------+-------------------+---+



In [50]:
# Count log entries per (month name, loglevel).
# Ordering by the month *name* sorts alphabetically (December, January, June,
# November in the recorded output), not in calendar order. The following cells
# try different sort keys to fix that.
spark.sql("""
          select 
          date_format(logtime ,"MMMM") as month,
          loglevel,
          count(*) as total_occurance
          from logs
          group by month,loglevel
          order by month asc
          """).show()

+--------+--------+---------------+
|   month|loglevel|total_occurance|
+--------+--------+---------------+
|December|    WARN|              1|
|December|    INFO|              2|
| January|   DEBUG|              1|
|    June|   DEBUG|              3|
|    June|   ERROR|              1|
|November|   DEBUG|              1|
+--------+--------+---------------+



In [54]:
# Pattern 'M', grouped, no cast: the result is NOT in calendar order.
# 'M' renders the month number without zero padding and date_format returns a
# string, so ORDER BY compares text: '1' < '11' < '12' < '6'. June ends up after
# December in the recorded output.
spark.sql("""
          select 
          date_format(logtime ,"MMMM") as month,
          date_format(logtime ,"M") as month_number,
          loglevel,
          count(*) as total_occurance
          from logs
          group by month,month_number,loglevel
          order by month_number asc
          """).show()

+--------+------------+--------+---------------+
|   month|month_number|loglevel|total_occurance|
+--------+------------+--------+---------------+
| January|           1|   DEBUG|              1|
|November|          11|   DEBUG|              1|
|December|          12|    WARN|              1|
|December|          12|    INFO|              2|
|    June|           6|   DEBUG|              3|
|    June|           6|   ERROR|              1|
+--------+------------+--------+---------------+



In [55]:
# Pattern 'M', aggregated with max(), no cast: still NOT in calendar order.
# Taking max() of the unpadded month strings keeps month_number out of GROUP BY,
# but the value is still text, so the sort is lexicographic exactly as in the
# grouped variant (1, 11, 12, 6).
spark.sql("""
          select 
          date_format(logtime ,"MMMM") as month,
          max(date_format(logtime ,"M")) as month_number,
          loglevel,
          count(*) as total_occurance
          from logs
          group by month,loglevel
          order by month_number asc
          """).show()

+--------+------------+--------+---------------+
|   month|month_number|loglevel|total_occurance|
+--------+------------+--------+---------------+
| January|           1|   DEBUG|              1|
|November|          11|   DEBUG|              1|
|December|          12|    INFO|              2|
|December|          12|    WARN|              1|
|    June|           6|   DEBUG|              3|
|    June|           6|   ERROR|              1|
+--------+------------+--------+---------------+



In [56]:
# Pattern 'M', grouped, cast to int: calendar order.
# Wrapping the formatted month in int(...) makes ORDER BY compare numbers, so the
# rows come out as 1, 6, 11, 12.
spark.sql("""
          select 
          date_format(logtime ,"MMMM") as month,
          int(date_format(logtime ,"M")) as month_number,
          loglevel,
          count(*) as total_occurance
          from logs
          group by month,month_number,loglevel
          order by month_number asc
          """).show()

+--------+------------+--------+---------------+
|   month|month_number|loglevel|total_occurance|
+--------+------------+--------+---------------+
| January|           1|   DEBUG|              1|
|    June|           6|   DEBUG|              3|
|    June|           6|   ERROR|              1|
|November|          11|   DEBUG|              1|
|December|          12|    WARN|              1|
|December|          12|    INFO|              2|
+--------+------------+--------+---------------+



In [57]:
# Pattern 'M', aggregated with max() over the int cast: calendar order.
# Same numeric sort as the grouped int variant, without having to list
# month_number in GROUP BY.
spark.sql("""
          select 
          date_format(logtime ,"MMMM") as month,
          max(int(date_format(logtime ,"M"))) as month_number,
          loglevel,
          count(*) as total_occurance
          from logs
          group by month,loglevel
          order by month_number asc
          """).show()

+--------+------------+--------+---------------+
|   month|month_number|loglevel|total_occurance|
+--------+------------+--------+---------------+
| January|           1|   DEBUG|              1|
|    June|           6|   DEBUG|              3|
|    June|           6|   ERROR|              1|
|November|          11|   DEBUG|              1|
|December|          12|    WARN|              1|
|December|          12|    INFO|              2|
+--------+------------+--------+---------------+



In [58]:
# Pattern 'MM', aggregated with max(), no cast: calendar order.
# 'MM' zero-pads the month ('01', '06', '11', '12'), so even a plain text
# comparison sorts the months correctly and no cast is needed.
spark.sql("""
          select 
          date_format(logtime ,"MMMM") as month,
          max(date_format(logtime ,"MM")) as month_number,
          loglevel,
          count(*) as total_occurance
          from logs
          group by month,loglevel
          order by month_number asc
          """).show()

+--------+------------+--------+---------------+
|   month|month_number|loglevel|total_occurance|
+--------+------------+--------+---------------+
| January|          01|   DEBUG|              1|
|    June|          06|   DEBUG|              3|
|    June|          06|   ERROR|              1|
|November|          11|   DEBUG|              1|
|December|          12|    INFO|              2|
|December|          12|    WARN|              1|
+--------+------------+--------+---------------+



In [59]:
# Pattern 'MM', grouped, no cast: calendar order.
# Same zero-padded sort key as the max() variant, but carried through GROUP BY
# instead of an aggregate.
spark.sql("""
          select 
          date_format(logtime ,"MMMM") as month,
          date_format(logtime ,"MM") as month_number,
          loglevel,
          count(*) as total_occurance
          from logs
          group by month,month_number,loglevel
          order by month_number asc
          """).show()

+--------+------------+--------+---------------+
|   month|month_number|loglevel|total_occurance|
+--------+------------+--------+---------------+
| January|          01|   DEBUG|              1|
|    June|          06|   DEBUG|              3|
|    June|          06|   ERROR|              1|
|November|          11|   DEBUG|              1|
|December|          12|    WARN|              1|
|December|          12|    INFO|              2|
+--------+------------+--------+---------------+



In [61]:
# Pattern 'MM', aggregated with first(): calendar order.
# first() takes one 'MM' value per (month, loglevel) group. month and
# month_number are both derived from the same logtime month, so every row in a
# group carries the same value and it does not matter which one is picked.
spark.sql("""
          select 
          date_format(logtime ,"MMMM") as month,
          first(date_format(logtime ,"MM")) as month_number,
          loglevel,
          count(*) as total_occurance
          from logs
          group by month,loglevel
          order by month_number asc
          """).show()

+--------+------------+--------+---------------+
|   month|month_number|loglevel|total_occurance|
+--------+------------+--------+---------------+
| January|          01|   DEBUG|              1|
|    June|          06|   DEBUG|              3|
|    June|          06|   ERROR|              1|
|November|          11|   DEBUG|              1|
|December|          12|    INFO|              2|
|December|          12|    WARN|              1|
+--------+------------+--------+---------------+



In [62]:
# Monthly count of log entries per level, in calendar order.
#
# month_number ('MM', zero-padded) exists only as a sort key. Every row of a
# (month, loglevel) group has the same month number, so first() is safe here.
#
# loglevel is a secondary sort key: ordering by month_number alone leaves the
# rows of one month in whatever order the shuffle produced (the recorded outputs
# of the earlier cells show INFO/WARN swapping places between runs).
results_df = spark.sql("""
                    select
                    date_format(logtime ,"MMMM") as month,
                    first(date_format(logtime ,"MM")) as month_number,
                    loglevel,
                    count(*) as total_occurance
                    from logs
                    group by month,loglevel
                    order by month_number asc, loglevel asc
                    """)

results_df.show()

+--------+------------+--------+---------------+
|   month|month_number|loglevel|total_occurance|
+--------+------------+--------+---------------+
| January|          01|   DEBUG|              1|
|    June|          06|   DEBUG|              3|
|    June|          06|   ERROR|              1|
|November|          11|   DEBUG|              1|
|December|          12|    INFO|              2|
|December|          12|    WARN|              1|
+--------+------------+--------+---------------+



In [63]:
# month_number was only needed as a sort key; drop it for the final report.
# In the recorded output the rows keep the order they had in results_df.
final_df = results_df.drop("month_number")
final_df.show()

+--------+--------+---------------+
|   month|loglevel|total_occurance|
+--------+--------+---------------+
| January|   DEBUG|              1|
|    June|   DEBUG|              3|
|    June|   ERROR|              1|
|November|   DEBUG|              1|
|December|    INFO|              2|
|December|    WARN|              1|
+--------+--------+---------------+

