In [1]:
# from pyspark.context import SparkContext
# from pyspark.sql.context import SQLContext
from pyspark.sql.session import SparkSession
from pyspark.conf import SparkConf

# load up other dependencies
import re
import pandas as pd
from pyspark.sql import functions as F
from pyspark.sql.functions import udf
from pyspark.sql.functions import regexp_extract

In [2]:
# Spark configuration for a single-machine run.
# Memory limits are Spark *properties*, so they must be passed through .set().
# setExecutorEnv() only exports environment variables to executor processes and
# would leave both limits at their defaults.
# NOTE: spark.driver.memory is only honoured when the JVM has not been started
# yet, so restart the kernel after changing it.
conf = (
    SparkConf()
    .setMaster("local[*]")
    .setAppName("LogAnalytics")
    .set("spark.executor.memory", "4g")
    .set("spark.driver.memory", "4g")
)


# Create the SparkSession (or reuse the one already running in this kernel)
spark = SparkSession.builder \
    .config(conf = conf) \
    .getOrCreate()

23/05/28 16:01:06 WARN Utils: Your hostname, Tanmays-MacBook-Pro.local resolves to a loopback address: 127.0.0.1; using 10.0.0.123 instead (on interface en0)
23/05/28 16:01:06 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
23/05/28 16:01:07 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
23/05/28 16:01:08 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.


In [3]:
# Raw server log; spark.read.text loads it as one row per line in a single
# string column.
raw_log_path = "/Users/tanmaysingla/Downloads/17GBBigServerLog.log"
df = spark.read.text(raw_log_path)

In [4]:
# Print the schema of the raw frame (a single string column named `value`).
df.printSchema()

root
 |-- value: string (nullable = true)



In [5]:
# Number of rows in the raw frame, i.e. number of log lines that were read.
df.count()

                                                                                

119999999

In [6]:
# Regular expressions used to pull individual fields out of one raw log line.
# Each pattern exposes the interesting part as a capture group.

# Host: the non-whitespace token at the start of the line (must contain a dot).
host_pattern = r'(^\S+\.[\S+\.]+\S+)\s'
# Timestamp: bracketed dd/Mon/yyyy:hh:mm:ss plus a signed 4-digit UTC offset.
# Both '+' and '-' offsets are accepted so lines logged west of UTC still match.
ts_pattern = r'\[(\d{2}/\w{3}/\d{4}:\d{2}:\d{2}:\d{2} [+\-]\d{4})\]'
# Request line in double quotes: group 1 = method, 2 = URI, 3 = protocol (may be empty).
method_uri_protocol_pattern = r'\"(\S+)\s(\S+)\s*(\S*)\"'
# Status: first three-digit number surrounded by whitespace.
status_pattern = r'\s(\d{3})\s'
# Content size: trailing run of digits at the very end of the line.
content_size_pattern = r'\s(\d+)$'

In [7]:
# Split every raw line into named columns. regexp_extract returns '' when a
# pattern does not match, so unmatched fields show up as empty strings
# (or as failed integer casts for status / content_size).
# cache() is lazy, so it is requested *before* the first action: the full-file
# count below then fills the cache instead of being thrown away.
logs_df = df.select(regexp_extract('value', host_pattern, 1).alias('host'),
                         regexp_extract('value', ts_pattern, 1).alias('timestamp'),
                         regexp_extract('value', method_uri_protocol_pattern, 1).alias('method'),
                         regexp_extract('value', method_uri_protocol_pattern, 2).alias('endpoint'),
                         regexp_extract('value', method_uri_protocol_pattern, 3).alias('protocol'),
                         regexp_extract('value', status_pattern, 1).cast('integer').alias('status'),
                         regexp_extract('value', content_size_pattern, 1).cast('integer').alias('content_size')).cache()
logs_df.show(10, truncate=True)
# (row count, column count)
print((logs_df.count(), len(logs_df.columns)))
# Echo the frame's repr as the cell result.
logs_df

+---------------+--------------------+------+--------------------+--------+------+------------+
|           host|           timestamp|method|            endpoint|protocol|status|content_size|
+---------------+--------------------+------+--------------------+--------+------+------------+
|197.217.102.239|26/Dec/2118:12:00...|  POST|/Archives/edgar/d...|HTTP/1.0|   200|       17286|
| 67.198.123.159|26/Dec/2118:12:00...|DELETE|/Archives/edgar/d...|HTTP/1.0|   500|       25023|
| 93.136.152.153|26/Dec/2118:12:00...|DELETE|/Archives/edgar/d...|HTTP/1.0|   304|       51136|
|187.198.113.102|26/Dec/2118:12:00...|  POST|/Archives/edgar/d...|HTTP/1.0|   200|        1266|
| 34.180.234.250|26/Dec/2118:12:00...|  POST|/Archives/edgar/d...|HTTP/1.0|   304|       54427|
|    63.1.139.91|26/Dec/2118:12:00...|   GET|/Archives/edgar/d...|HTTP/1.0|   502|       45037|
| 181.39.225.239|26/Dec/2118:12:00...|   GET|/Archives/edgar/d...|HTTP/1.0|   303|       52160|
|   78.154.11.19|26/Dec/2118:12:00...|DE



(119999999, 7)


                                                                                

DataFrame[host: string, timestamp: string, method: string, endpoint: string, protocol: string, status: int, content_size: int]

In [9]:
# Three-letter English month abbreviation -> month number
month_map = {
  'Jan': 1, 'Feb': 2, 'Mar':3, 'Apr':4, 'May':5, 'Jun':6, 'Jul':7,
  'Aug':8,  'Sep': 9, 'Oct':10, 'Nov': 11, 'Dec': 12
}

def parse_clf_time(text):
    """ Convert a Common Log Format time into a 'yyyy-mm-dd hh:mm:ss' string
    Args:
        text (str): date and time in Apache time format [dd/mmm/yyyy:hh:mm:ss (+/-)zzzz]
    Returns:
        a string suitable for passing to CAST('timestamp'), or None when the
        input is missing, empty or not in the expected layout
    """
    # regexp_extract hands over '' for lines whose timestamp did not match, and
    # nulls arrive as None; neither may crash the whole job.
    if not text:
        return None
    # NOTE: We're ignoring the time zones here, might need to be handled depending on the problem you are solving
    try:
        return "{0:04d}-{1:02d}-{2:02d} {3:02d}:{4:02d}:{5:02d}".format(
          int(text[7:11]),
          month_map[text[3:6]],
          int(text[0:2]),
          int(text[12:14]),
          int(text[15:17]),
          int(text[18:20])
        )
    except (KeyError, ValueError, TypeError):
        # Unknown month name, non-numeric or truncated field, or non-string input
        return None

In [10]:
# Still defined so any cell that references the UDF keeps working; the
# conversion below no longer routes rows through Python.
udf_parse_time = udf(parse_clf_time)

# Parse the leading 'dd/MMM/yyyy:HH:mm:ss' (first 20 characters) with Spark's
# built-in parser instead of a row-at-a-time Python UDF. As before, the UTC
# offset that follows is ignored. The new 'time' column is appended last and
# the raw 'timestamp' string column is dropped.
logs_df_with_time = (
    logs_df
    .withColumn('time', F.to_timestamp(F.substring('timestamp', 1, 20), 'dd/MMM/yyyy:HH:mm:ss'))
    .drop('timestamp')
)
logs_df_with_time.show(10, truncate=True)

[Stage 10:>                                                         (0 + 1) / 1]

+---------------+------+--------------------+--------+------+------------+-------------------+
|           host|method|            endpoint|protocol|status|content_size|               time|
+---------------+------+--------------------+--------+------+------------+-------------------+
|197.217.102.239|  POST|/Archives/edgar/d...|HTTP/1.0|   200|       17286|2118-12-26 12:00:00|
| 67.198.123.159|DELETE|/Archives/edgar/d...|HTTP/1.0|   500|       25023|2118-12-26 12:00:00|
| 93.136.152.153|DELETE|/Archives/edgar/d...|HTTP/1.0|   304|       51136|2118-12-26 12:00:00|
|187.198.113.102|  POST|/Archives/edgar/d...|HTTP/1.0|   200|        1266|2118-12-26 12:00:00|
| 34.180.234.250|  POST|/Archives/edgar/d...|HTTP/1.0|   304|       54427|2118-12-26 12:00:00|
|    63.1.139.91|   GET|/Archives/edgar/d...|HTTP/1.0|   502|       45037|2118-12-26 12:00:00|
| 181.39.225.239|   GET|/Archives/edgar/d...|HTTP/1.0|   303|       52160|2118-12-26 12:00:00|
|   78.154.11.19|DELETE|/Archives/edgar/d...|HTTP/

                                                                                

In [11]:
# Mark the time-parsed frame for caching; cache() returns the same DataFrame,
# which is echoed as the cell result.
logs_df_with_time.cache()

DataFrame[host: string, method: string, endpoint: string, protocol: string, status: int, content_size: int, time: timestamp]

In [15]:
# Day-of-week number -> English day name, with 1 = Sunday ... 7 = Saturday
week_map = {
    1: "Sunday",
    2: "Monday",
    3: "Tuesday",
    4: "Wednesday",
    5: "Thursday",
    6: "Friday",
    7: "Saturday"
}

def parse_day_of_week(dayOfWeek):
    """Translate a day-of-week number into its English name.

    Args:
        dayOfWeek (int): 1 (Sunday) through 7 (Saturday)
    Returns:
        the day name, or None when the value is missing or outside 1..7
    """
    # .get() instead of [] so a null day (row without a parsed time) yields
    # None rather than raising KeyError inside the UDF.
    return week_map.get(dayOfWeek)

In [17]:
# Still defined so any cell that references the UDF keeps working; the
# projection below uses a built-in function instead.
udf_parse_day = udf(parse_day_of_week)
# 'EEEE' formats the timestamp as the full day name ("Monday", ...) without a
# Python round-trip; rows with a null time get a null day instead of failing.
enpoint_day_of_week_df = logs_df_with_time.select(
    'endpoint',
    F.date_format('time', 'EEEE').alias("dayOfWeek"),
)
enpoint_day_of_week_df.show(5, truncate=False)

+---------------------------------------------------------------------------+---------+
|endpoint                                                                   |dayOfWeek|
+---------------------------------------------------------------------------+---------+
|/Archives/edgar/data/0001179929/000117992922000054/a2022definitiveproxy.htm|Monday   |
|/Archives/edgar/data/1935285/0001445546-23-001552-index.htm                |Monday   |
|/Archives/edgar/data/0001179929/000117992923000043/moh-20230320_g26.jpg    |Monday   |
|/Archives/edgar/data/0000019957/000110465921115352/tm2127556d2_424b3.htm   |Monday   |
|/Archives/edgar/data/0001798682/000121390021017167/s131143_10k.htm         |Monday   |
+---------------------------------------------------------------------------+---------+
only showing top 5 rows



In [18]:
# Requests per (endpoint, day of week), most frequent first.
# Columns are referenced by name via F.col rather than through the
# pre-aggregation DataFrame object, whose column handles do not belong to the
# grouped result.
# The aggregate is cached so later show() calls reuse it instead of re-running
# the whole group-by over the log.
highest_invocations_df = (enpoint_day_of_week_df
                          .groupBy("endpoint", "dayOfWeek")
                          .count()
                          .sort("count", ascending=False)
                          .select(F.col("dayOfWeek").alias("Day in a Week"), F.col("endpoint"), "count")
                          .cache())
highest_invocations_df.show(5, truncate=False)

23/05/28 16:24:28 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
23/05/28 16:24:28 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
23/05/28 16:24:28 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
23/05/28 16:24:28 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
23/05/28 16:24:28 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
23/05/28 16:24:28 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
23/05/28 16:24:28 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
23/05/28 16:24:28 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
23/05/28 16:24:28 WARN RowBasedKeyValueBatch: Calling spill() on

23/05/28 16:24:43 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
23/05/28 16:24:43 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
23/05/28 16:24:44 WARN MemoryStore: Not enough space to cache rdd_46_30 in memory! (computed 17.6 MiB so far)
23/05/28 16:24:44 WARN MemoryStore: Not enough space to cache rdd_46_24 in memory! (computed 17.6 MiB so far)
23/05/28 16:24:44 WARN MemoryStore: Not enough space to cache rdd_46_25 in memory! (computed 17.6 MiB so far)
23/05/28 16:24:44 WARN MemoryStore: Not enough space to cache rdd_46_31 in memory! (computed 17.6 MiB so far)
23/05/28 16:24:45 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
23/05/28 16:24:45 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
23/05/28 16:24:45 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will

23/05/28 16:25:02 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
23/05/28 16:25:02 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
23/05/28 16:25:02 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
23/05/28 16:25:02 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
23/05/28 16:25:04 WARN MemoryStore: Not enough space to cache rdd_46_56 in memory! (computed 17.6 MiB so far)
23/05/28 16:25:04 WARN MemoryStore: Not enough space to cache rdd_46_59 in memory! (computed 17.6 MiB so far)
23/05/28 16:25:04 WARN MemoryStore: Not enough space to cache rdd_46_61 in memory! (computed 17.6 MiB so far)
23/05/28 16:25:04 WARN MemoryStore: Not enough space to cache rdd_46_57 in memory! (computed 17.6 MiB so far)
23/05/28 16:25:05 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will

23/05/28 16:25:20 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
23/05/28 16:25:20 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
23/05/28 16:25:22 WARN MemoryStore: Not enough space to cache rdd_46_93 in memory! (computed 17.6 MiB so far)
23/05/28 16:25:22 WARN MemoryStore: Not enough space to cache rdd_46_91 in memory! (computed 17.6 MiB so far)
23/05/28 16:25:22 WARN MemoryStore: Not enough space to cache rdd_46_95 in memory! (computed 17.6 MiB so far)
23/05/28 16:25:22 WARN MemoryStore: Not enough space to cache rdd_46_88 in memory! (computed 17.6 MiB so far)
23/05/28 16:25:23 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
23/05/28 16:25:23 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
23/05/28 16:25:23 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will

23/05/28 16:25:37 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
23/05/28 16:25:37 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
23/05/28 16:25:37 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
23/05/28 16:25:37 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
23/05/28 16:25:37 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
23/05/28 16:25:37 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
23/05/28 16:25:37 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
23/05/28 16:25:39 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
23/05/28 16:25:39 WARN RowBasedKeyValueBatch: Calling spill() on

+-------------+-----------------------------------------------------------------------------+------+
|Day in a Week|endpoint                                                                     |count |
+-------------+-----------------------------------------------------------------------------+------+
|Monday       |/Archives/edgar/data/0001724128/000121390021017151/s131120_10k.htm           |802074|
|Monday       |/Archives/edgar/data/0000205007/000114554923015369/0001145549-23-015369.txt  |801862|
|Monday       |/Archives/edgar/data/1826671/000121390023023294/0001213900-23-023294.txt     |801791|
|Monday       |/Archives/edgar/data/0001866295/000110465921080403/tm2117388d5_ex3-32g008.jpg|801664|
|Monday       |/Archives/edgar/data/0001330427/000093247112005362/0000932471-12-005362.txt  |801652|
+-------------+-----------------------------------------------------------------------------+------+
only showing top 5 rows



                                                                                

In [19]:
# Show only the top row of the count-descending result, i.e. the single most
# requested (day, endpoint) pair.
highest_invocations_df.show(1, truncate=False)

23/05/28 16:26:14 WARN MemoryStore: Not enough space to cache rdd_46_0 in memory! (computed 17.6 MiB so far)
23/05/28 16:26:14 WARN MemoryStore: Not enough space to cache rdd_46_3 in memory! (computed 17.6 MiB so far)
23/05/28 16:26:14 WARN MemoryStore: Not enough space to cache rdd_46_7 in memory! (computed 17.6 MiB so far)
23/05/28 16:26:17 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
23/05/28 16:26:17 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
23/05/28 16:26:17 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
23/05/28 16:26:18 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
23/05/28 16:26:18 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
23/05/28 16:26:18 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. 

23/05/28 16:26:49 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
23/05/28 16:26:49 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
23/05/28 16:26:49 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
23/05/28 16:26:49 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
23/05/28 16:26:49 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
23/05/28 16:26:49 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
23/05/28 16:26:49 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
23/05/28 16:26:49 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
23/05/28 16:26:49 WARN RowBasedKeyValueBatch: Calling spill() on

23/05/28 16:27:04 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
23/05/28 16:27:04 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
23/05/28 16:27:04 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
23/05/28 16:27:04 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
23/05/28 16:27:04 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
23/05/28 16:27:04 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
23/05/28 16:27:04 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
23/05/28 16:27:04 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
23/05/28 16:27:04 WARN RowBasedKeyValueBatch: Calling spill() on

23/05/28 16:27:21 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
23/05/28 16:27:21 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
23/05/28 16:27:21 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
23/05/28 16:27:24 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
23/05/28 16:27:24 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
23/05/28 16:27:24 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
23/05/28 16:27:24 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
23/05/28 16:27:25 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
23/05/28 16:27:25 WARN RowBasedKeyValueBatch: Calling spill() on

+-------------+------------------------------------------------------------------+------+
|Day in a Week|endpoint                                                          |count |
+-------------+------------------------------------------------------------------+------+
|Monday       |/Archives/edgar/data/0001724128/000121390021017151/s131120_10k.htm|802074|
+-------------+------------------------------------------------------------------+------+
only showing top 1 row



                                                                                

In [21]:
# Rows answered with HTTP 404. The predicate refers to the column by name so it
# is resolved against the frame being filtered, not against a different
# DataFrame object.
not_found_df = logs_df_with_time.filter(F.col("status") == 404).cache()
# 404 count per calendar year, smallest count first, at most 10 years.
yearly_404_sorted_df = (not_found_df
                        .select(F.year("time").alias("Year"))
                        .groupBy("Year")
                        .count()
                        .sort("count", ascending=True).limit(10))
yearly_404_sorted_df.show(10, truncate=False)

23/05/28 16:29:01 WARN MemoryStore: Not enough space to cache rdd_46_3 in memory! (computed 17.6 MiB so far)
23/05/28 16:29:01 WARN MemoryStore: Not enough space to cache rdd_46_5 in memory! (computed 17.6 MiB so far)
23/05/28 16:29:01 WARN MemoryStore: Not enough space to cache rdd_46_6 in memory! (computed 17.6 MiB so far)
23/05/28 16:29:01 WARN MemoryStore: Not enough space to cache rdd_46_1 in memory! (computed 17.6 MiB so far)
23/05/28 16:29:01 WARN MemoryStore: Not enough space to cache rdd_46_7 in memory! (computed 17.6 MiB so far)
23/05/28 16:29:04 WARN MemoryStore: Not enough space to cache rdd_46_15 in memory! (computed 17.6 MiB so far)
23/05/28 16:29:04 WARN MemoryStore: Not enough space to cache rdd_46_12 in memory! (computed 9.0 MiB so far)
23/05/28 16:29:04 WARN MemoryStore: Not enough space to cache rdd_46_10 in memory! (computed 9.0 MiB so far)
23/05/28 16:29:04 WARN MemoryStore: Not enough space to cache rdd_46_11 in memory! (computed 9.0 MiB so far)
23/05/28 16:29:04 

23/05/28 16:29:24 WARN MemoryStore: Not enough space to cache rdd_46_90 in memory! (computed 9.0 MiB so far)
23/05/28 16:29:24 WARN MemoryStore: Not enough space to cache rdd_46_91 in memory! (computed 17.6 MiB so far)
23/05/28 16:29:24 WARN MemoryStore: Not enough space to cache rdd_46_94 in memory! (computed 17.6 MiB so far)
23/05/28 16:29:24 WARN MemoryStore: Not enough space to cache rdd_46_95 in memory! (computed 9.0 MiB so far)
23/05/28 16:29:24 WARN MemoryStore: Not enough space to cache rdd_46_89 in memory! (computed 9.0 MiB so far)
23/05/28 16:29:24 WARN MemoryStore: Not enough space to cache rdd_46_88 in memory! (computed 9.0 MiB so far)
23/05/28 16:29:24 WARN MemoryStore: Not enough space to cache rdd_46_93 in memory! (computed 17.6 MiB so far)
23/05/28 16:29:24 WARN MemoryStore: Not enough space to cache rdd_46_92 in memory! (computed 17.6 MiB so far)
23/05/28 16:29:26 WARN MemoryStore: Not enough space to cache rdd_46_102 in memory! (computed 17.6 MiB so far)
23/05/28 16:2

+----+--------+
|Year|count   |
+----+--------+
|2118|17141234|
+----+--------+



                                                                                