In [1]:
import os
import sys
import re
import random
import pathlib
import findspark

from pyspark import SparkContext
from pyspark.sql import SparkSession
from pyspark.streaming import StreamingContext
from pyspark import SparkConf

from apache_log_parser import ApacheAccessLog


# Point PySpark at the local Spark distribution.
# NOTE(review): this is a machine-specific absolute path; adjust it (or export
# SPARK_HOME before launching Jupyter) when running on another machine.
os.environ['SPARK_HOME'] = '/Users/audioworkstation/Documents/WORKSPACE/LEARNING/spark_streaming_using_x/spark-3.5.0-bin-hadoop3'
# Fixed typo: the variable Spark reads is PYSPARK_DRIVER_PYTHON (was 'PYSPARK_DEIVER_PYTHON',
# which nothing reads, so the setting was silently ignored).
os.environ['PYSPARK_DRIVER_PYTHON'] = 'jupyter'
os.environ['PYSPARK_DRIVER_PYTHON_OPTS'] = 'lab'
os.environ['PYSPARK_PYTHON'] = 'python'


# Locate the Spark installation via SPARK_HOME and make it importable.
findspark.init()
# Last expression of the cell: displays the Spark home that findspark resolved.
findspark.find()

'/Users/audioworkstation/Documents/WORKSPACE/LEARNING/spark_streaming_using_x/spark-3.5.0-bin-hadoop3'

# apache log parser example

In [2]:
# Seed Python's RNG so any random behaviour in the notebook is repeatable.
random.seed(15)

# Local master with 2 threads. Fixed typo in the config key: 'spark.executer.memory'
# is not a Spark property and was silently ignored; the real key is 'spark.executor.memory'.
conf = (
    SparkConf()
    .setMaster('local[2]')
    .setAppName('TextUpdater')
    .set('spark.executor.memory', '2g')
)
sc = SparkContext(conf=conf)

# Streaming context that groups incoming data into 30-second batches.
ssc = StreamingContext(sparkContext=sc, batchDuration=30)

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
23/11/15 14:00:41 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [3]:
# Set the directory in which this streaming context stores its checkpoint data.
ssc.checkpoint('MyCheckPoint')

In [4]:
# Absolute path of the directory the notebook is running from.
curr = pathlib.Path().resolve()
# The stream's input folder lives next to the notebook; keep it as a plain string.
logs_directory = str(curr.joinpath('logs'))

In [5]:
# Display the resolved logs path so the monitored directory can be verified.
logs_directory

'/Users/audioworkstation/Documents/WORKSPACE/LEARNING/spark_streaming_using_x/Transform Operations/logs'

In [6]:
# Stream of raw text lines read from files in the logs directory.
log_data = ssc.textFileStream(logs_directory)

# Parse every raw line, then keep only the lines the parser did not return None for.
parsed_log_dstream = log_data.map(ApacheAccessLog.parse_from_log_line)
access_log_dstream = parsed_log_dstream.filter(lambda entry: entry is not None)

# Print up to 30 parsed entries from each batch.
access_log_dstream.pprint(num=30)

In [7]:
def map_ip_values(rdd):
    """Pair the ``ip`` of every parsed log entry in ``rdd`` with a count of 1.

    Args:
        rdd: A collection exposing ``map`` whose elements have an ``ip`` attribute.

    Returns:
        The result of ``rdd.map`` producing ``(ip, 1)`` tuples, one per element.
    """
    def _ip_with_one(entry):
        return entry.ip, 1

    return rdd.map(_ip_with_one)

### Functions supplied to `transform` are called once in every batch interval,
### which means we can apply RDD operations that vary from batch to batch.

# But what is the difference between the two lines in the next cell (`transform` vs. the commented-out `map`)?

### `transform` is useful when you need to perform operations that involve the entire batch, such as aggregations, sorting, or any other operation that requires context across multiple elements.

### `map` produces a one-to-one transformation: each input element yields exactly one output element.
### `transform` lets you produce a different number of output elements than input elements (for example, by filtering inside the function), which is useful in many scenarios.

### Here's an example:
```
def complex_transform(rdd):
    # Perform a complex transformation on the entire batch
    # This function can involve filtering, sorting, or any custom logic
    return rdd.filter(lambda x: x > 5).map(lambda x: (x, x * 2))

transformed_dstream = original_dstream.transform(complex_transform)

```

In [8]:
# `transform` passes each batch's RDD to `map_ip_values`. It's like saying:
# "For the entire set of log entries, do this specific operation."
transformed_access_log_dstream = access_log_dstream.transform(map_ip_values)
# Equivalent per-element version using `map`, kept for comparison. It's like saying:
# "For each log entry, give me a tuple with the IP and a 1."
# transformed_access_log_dstream = access_log_dstream.map(lambda parsed_line: (parsed_line.ip, 1))
# Print up to 30 (ip, 1) pairs from each batch.
transformed_access_log_dstream.pprint(num=30)

In [None]:
# Start the streaming computation; batch output is printed below from this point on.
ssc.start()

                                                                                

-------------------------------------------
Time: 2023-11-15 14:03:30
-------------------------------------------
64.242.88.10 - - [07/Mar/2004:21:14:32 -0800] "GET /twiki/bin/rdiff/TWiki/FileAttribute HTTP/1.1" 200 
h24-70-56-49.ca.shawcable.net - - [07/Mar/2004:21:16:17 -0800] "GET /twiki/view/Main/WebHome HTTP/1.1" 404 

-------------------------------------------
Time: 2023-11-15 14:03:30
-------------------------------------------
('64.242.88.10', 1)
('h24-70-56-49.ca.shawcable.net', 1)

-------------------------------------------
Time: 2023-11-15 14:04:00
-------------------------------------------

-------------------------------------------
Time: 2023-11-15 14:04:00
-------------------------------------------



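##### After starting `ssc`, move your log files into the `logs` folder: `textFileStream` only picks up files that appear in the directory after the stream has started.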

In [None]:
# Stop streaming and the underlying SparkContext; the graceful flag asks Spark
# to finish processing data that was already received before shutting down.
ssc.stop(stopSparkContext=True, stopGraceFully=True)