In [1]:
import os
import pathlib

import findspark

# findspark.init() has to run BEFORE the first `import pyspark`: it locates the
# Spark installation and adds its Python libraries to sys.path, so on a fresh
# environment the pyspark imports below only resolve after this call.
findspark.init()

from pyspark import SparkConf
from pyspark import SparkContext
from pyspark.sql import SparkSession
from pyspark.streaming import StreamingContext

from apache_log_parser import ApacheAccessLog

# apache log parser example

In [2]:
# Spark configuration for this notebook: local mode with 2 threads, app name
# 'TextUpdater', and 2g of executor memory.
# Note the spelling of the key: it must be 'spark.executor.memory'. A misspelled
# key (e.g. 'spark.executer.memory') is stored by SparkConf.set() but never
# applied as the executor memory setting.
conf = (
    SparkConf()
    .setMaster('local[2]')
    .setAppName('TextUpdater')
    .set('spark.executor.memory', '2g')
)

# One SparkContext for the notebook, wrapped in a StreamingContext that cuts
# the incoming data into batches of 30 seconds.
sc = SparkContext(conf=conf)
ssc = StreamingContext(sparkContext=sc, batchDuration=30)

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/03/26 14:02:21 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
24/03/26 14:02:22 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.


In [3]:
# Directory (relative to the working directory) where the streaming context
# writes its checkpoint data.
checkpoint_dir = 'MyCheckPoint'
ssc.checkpoint(checkpoint_dir)

In [4]:
# Absolute path of the current working directory (the folder the notebook runs in).
curr = pathlib.Path.cwd().resolve()

# The `logs` sub-folder of that directory, converted from a Path to a plain str.
logs_directory = os.fspath(curr.joinpath('logs'))

In [5]:
# Display the resolved logs path to confirm which directory the stream will watch.
logs_directory

'/Users/hso/Documents/WORKSPACE/LEARNING/spark_streaming_using_x/src/DStream_transform/logs'

In [6]:
# Watch the logs directory; text files that show up there are read line by line.
log_data = ssc.textFileStream(logs_directory)

# `log_data` is a DStream, i.e. a sequence of RDDs: everything received during one
# batch interval forms a single RDD holding zero or more items (here: lines of text).
# `map` therefore receives one text line at a time, and each line is turned into an
# ApacheAccessLog instance -- or None, which the filter step below discards.
parsed_lines = log_data.map(ApacheAccessLog.parse_from_log_line)
access_log_dstream = parsed_lines.filter(lambda entry: entry is not None)

# Print up to 30 parsed entries of every batch.
access_log_dstream.pprint(30)

In [7]:
def map_ip_values(rdd):
    """Turn a batch of parsed log entries into ``(ip, 1)`` pairs.

    Args:
        rdd: the RDD of one batch; every element must expose an ``ip`` attribute.

    Returns:
        The result of ``rdd.map``: one ``(entry.ip, 1)`` tuple per input element.
    """
    def to_ip_pair(entry):
        # Pair each IP with a count of 1.
        return entry.ip, 1

    return rdd.map(to_ip_pair)

### Functions supplied to `transform` are called once in every batch interval, with that batch's RDD as input.
### This means we can apply RDD operations that vary over time (from one batch to the next).

# BUT WHAT IS THE DIFFERENCE BETWEEN THESE TWO LINES BELOW?

### This (transform) is useful when you need to perform operations that involve the entire batch, such as aggregations, sorting, or any operation that requires context across multiple elements.

### map produces a one-to-one transformation, meaning each input element produces exactly one output element.
### transform allows you to produce a different number of output elements than the number of input elements. This can be useful in various scenarios.

### here's an example:
```
def complex_transform(rdd):
    # Perform a complex transformation on the entire batch
    # This function can involve filtering, sorting, or any custom logic
    return rdd.filter(lambda x: x > 5).map(lambda x: (x, x * 2))

transformed_dstream = original_dstream.transform(complex_transform)

```

In [8]:
# Whole-batch form: `transform` hands each batch RDD to map_ip_values.
# It's like saying: "For the entire set of log entries, do this specific operation."
transformed_access_log_dstream = access_log_dstream.transform(map_ip_values)

# Per-element alternative that yields the same (ip, 1) pairs.
# It's like saying: "For each log entry, give me a tuple with the IP and a 1."
# transformed_access_log_dstream = access_log_dstream.map(lambda parsed_line: (parsed_line.ip, 1))

# Print up to 30 (ip, 1) pairs of every batch.
transformed_access_log_dstream.pprint(30)

In [9]:
# Start the streaming computation defined on `ssc` (the textFileStream pipeline
# and its pprint outputs). The call returns immediately; a later cell waits on it.
ssc.start()

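##### After starting `ssc`, move (or copy) your log files into the `logs` folder. `textFileStream` only picks up files that appear in the directory after the stream has started, so files already there are not processed.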

In [None]:
# Keep the cell blocked for up to 100 seconds while the stream processes batches.
# The `finally` clause guarantees that the streaming context AND its SparkContext
# are shut down afterwards -- also when the cell is interrupted. Without it they
# keep running in the background, holding the Spark UI port, and a fresh
# SparkContext cannot be created cleanly on the next run.
# NOTE: a stopped StreamingContext cannot be restarted; re-run the notebook from
# the context-creation cell to stream again.
try:
    ssc.awaitTerminationOrTimeout(timeout=100)
finally:
    ssc.stop(stopSparkContext=True, stopGraceFully=False)

-------------------------------------------
Time: 2024-03-26 14:04:00
-------------------------------------------

-------------------------------------------
Time: 2024-03-26 14:04:00
-------------------------------------------



Cannot parse logline: h194n2fls308o1033.telia.com - - [09/Mar/2004:13:49:05 -0800] "-" 408 -
                                                                                

-------------------------------------------
Time: 2024-03-26 14:04:30
-------------------------------------------
64.242.88.10 - - [07/Mar/2004:16:05:49 -0800] "GET /twiki/bin/edit/Main/Double_bounce_sender?topicparent=Main.ConfigurationVariables HTTP/1.1" 401 
64.242.88.10 - - [07/Mar/2004:16:06:51 -0800] "GET /twiki/bin/rdiff/TWiki/NewUserTemplate?rev1=1.3&rev2=1.2 HTTP/1.1" 200 
64.242.88.10 - - [07/Mar/2004:16:10:02 -0800] "GET /mailman/listinfo/hsdivision HTTP/1.1" 200 
64.242.88.10 - - [07/Mar/2004:16:11:58 -0800] "GET /twiki/bin/view/TWiki/WikiSyntax HTTP/1.1" 200 
64.242.88.10 - - [07/Mar/2004:16:20:55 -0800] "GET /twiki/bin/view/Main/DCCAndPostFix HTTP/1.1" 200 
64.242.88.10 - - [07/Mar/2004:16:23:12 -0800] "GET /twiki/bin/oops/TWiki/AppendixFileSystem?template=oopsmore&param1=1.12&param2=1.12 HTTP/1.1" 200 
64.242.88.10 - - [07/Mar/2004:16:24:16 -0800] "GET /twiki/bin/view/Main/PeterThoeny HTTP/1.1" 200 
64.242.88.10 - - [07/Mar/2004:16:29:16 -0800] "GET /twiki/bin/edit/Main/

Cannot parse logline: h194n2fls308o1033.telia.com - - [09/Mar/2004:13:49:05 -0800] "-" 408 -
