In [6]:
import re
import os
import argparse
import logging
import datetime
import pandas as pd
from datetime import date
from dataclasses import dataclass
import xml.etree.ElementTree as file
from pyspark import SparkContext
from pyspark.sql import SparkSession


# custom event frame definition
@dataclass
class extracted_event:
    """One parsed XES event; each instance becomes one row of the output frame.

    All fields hold the raw string values read from the XML attributes.
    """

    # value of the event's "operationType" attribute (e.g. MESinput / MESoutput)
    eventtype: str
    # part of "org:resource" before the first dot
    resource: str
    # value of the "operationName" attribute
    operation: str
    # value of the "concept:name" attribute
    operationDetail: str
    # "StationIdOrigin" for shuttle MESoutput events, otherwise "-"
    stationOrigin: str
    # "StationIdDestination" for shuttle MESoutput events, otherwise "-"
    stationDestination: str
    # OperationStarted/OperationFinished value for MESinput events, otherwise "-"
    start: str
    # raw "time:timestamp" attribute text; it is stored unparsed, so the
    # annotation is str (the previous annotation named the datetime *module*)
    timestamp: str

                
# Root logger: timestamped, INFO and above.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
)
logging.info("------------------------------------------------------------------------")
logging.info("---------------------Starting the log parsing process-------------------")
logging.info("----------Seting up the environment...")

# Only --batchID is recognised; any other command-line arguments are
# collected in `unknown` instead of causing an error.
parser = argparse.ArgumentParser(description='Job Arguments')
parser.add_argument('--batchID')
args, unknown = parser.parse_known_args()

# Without an explicit batch ID, one is derived from today's date.
# NOTE(review): date objects carry no time, so %H always renders as "00";
# confirm whether the hour was really meant to be part of the batch ID.
workingBatch = (
    args.batchID
    if args.batchID is not None
    else date.today().strftime("%H%m%d%Y")
)

# Local directory holding the .xes logs, and the HDFS target for this batch.
sourceDir = '/home/akorobeinykov/-MSc-QSE-Korobeinykov-/Data'
workingPathHDFS = f'/user/akorobeinykov/korobeinykov-{workingBatch}'
# Filled later with [name-without-extension, extension] pairs.
originalFileNames = []



# Scan the source directory and remember every XES log as a
# [name-without-extension, extension] pair; all other entries are skipped.
for fileName in os.listdir(sourceDir):
    if not fileName.endswith(".xes"):
        continue
    originalFileNames.append(list(os.path.splitext(fileName)))

# Obtain (or reuse) a Spark session with a local master, plus a SparkContext
# handle via getOrCreate().
spark = (
    SparkSession.builder
    .appName("AK-CleanAndInsert")
    .master('local')
    .getOrCreate()
)
sc = SparkContext.getOrCreate()

def _parse_event(event):
    """Turn one XES ``<event>`` element into an ``extracted_event`` row.

    Returns ``None`` when the event carries no ``operationType`` attribute
    (such events are skipped). Raises ``AttributeError`` when one of the
    other mandatory attributes (``org:resource``, ``operationName``,
    ``concept:name``, ``time:timestamp``) is missing; the caller handles that
    as a failure of the whole file.
    """
    type_node = event.find('./string[@key="operationType"]')
    if type_node is None:
        return None
    operationType = type_node.get('value')
    # searching with regex for the first part before the dot - the resourceID
    resourceName = re.search(r"^[^.]*", event.find('./string[@key="org:resource"]').get('value')).group(0)
    operation = event.find('./string[@key="operationName"]').get('value')
    detail = event.find('./string[@key="concept:name"]').get('value')
    timestamp = event.find('./date[@key="time:timestamp"]').get('value')

    so = '-'
    sd = '-'
    if 'shuttle' in resourceName and operationType == 'MESoutput':
        try:
            # origin and destination of the shuttle movement, when they are
            # direct children of the event
            so = event.find('./string[@key="StationIdOrigin"]').get('value')
            sd = event.find('./string[@key="StationIdDestination"]').get('value')
        except AttributeError:
            # find() returned None for one of them: fall back to any nested
            # string whose key mentions the destination
            for attr in event.findall('.//string'):
                if 'StationIdDestination' in attr.get('key', ''):
                    sd = attr.get('value')

    # Reset for every event, so a MESinput event without a matching key does
    # not inherit the value of a previous event (or hit an undefined name).
    start = '-'
    # only responses from MES are inspected; output is not relevant here
    if operationType == 'MESinput':
        for attr in event.findall('.//string'):
            key = attr.get('key', '')
            if 'OperationStarted' in key or 'OperationFinished' in key:
                start = attr.get('value')

    return extracted_event(operationType, resourceName, operation, detail, so, sd, start, timestamp)


# iterating over all files in the log directory
for fileName in originalFileNames:
    logging.info(f"----------Processing log file {fileName} ...")
    logging.info("----------Parsing the data from XES log file into the dataframe...")
    extracted_events = []
    try:
        # Parsing happens inside the try so a malformed XES file is logged and
        # excluded like any other failure instead of aborting the whole run.
        tree = file.parse(f'{sourceDir}/{fileName[0]}{fileName[1]}')
        root = tree.getroot()
        for event in root.findall('.//trace/event'):
            row = _parse_event(event)
            if row is not None:
                extracted_events.append(row)
        # converting the list into the dataframe
        df = pd.DataFrame(extracted_events)
        if not df.empty:
            # only when the log file was parsable and produced at least one
            # valid event is it written to the parsed events on HDFS
            logging.info("----------Saving the data to CSV on HDFS...")
            spark.createDataFrame(df).coalesce(1).write.mode('overwrite').option('header','true').csv(f'{workingPathHDFS}/parsed-{fileName[0]}')
        else:
            logging.info(f"----------No logs found in file: {fileName} ! Excluding file...")
    except Exception as error:
        # Top-level boundary for one file: report it and continue with the next.
        logging.info(f"----------No logs found in file: {fileName}! Excluding while: {error}")


# Placeholder pass over the original log files: each one is only logged and
# its full file name rebuilt; nothing is done with it yet.
for fileName in originalFileNames:
    logging.info(f"----------Cleaning the original Log file: {fileName}")
    w = f"{fileName[0]}{fileName[1]}"

#!echo "----------Listing the created directories..."
#!hadoop fs -ls -R $workingPathHDFS/ 

# Closing banner.
logging.info("---------------------------Process Finished!!!--------------------------")
logging.info("------------------------------------------------------------------------")

2021-03-01 19:41:11,025 - INFO - ------------------------------------------------------------------------
2021-03-01 19:41:11,026 - INFO - ---------------------Starting the log parsing process-------------------
2021-03-01 19:41:11,027 - INFO - ----------Seting up the environment...
2021-03-01 19:41:11,032 - INFO - ----------Processing log file ['example', '.xes'] ...
2021-03-01 19:41:11,033 - INFO - ----------Parsing the data from XES log file into the dataframe...
2021-03-01 19:41:11,040 - INFO - ----------Saving the data to CSV on HDFS...
2021-03-01 19:41:11,492 - INFO - ----------Processing log file ['EventLogTestbedMain_2020-10-23_10-15-52', '.xes'] ...
2021-03-01 19:41:11,495 - INFO - ----------Parsing the data from XES log file into the dataframe...
2021-03-01 19:41:11,502 - INFO - ----------Saving the data to CSV on HDFS...
2021-03-01 19:41:11,916 - INFO - ----------Processing log file ['example-empty', '.xes'] ...
2021-03-01 19:41:11,918 - INFO - ----------Parsing the data fro

----------Starting the log parsing process-----------
----------Seting up the environment
/home/akorobeinykov/-MSc-QSE-Korobeinykov-/example.xes
/home/akorobeinykov/-MSc-QSE-Korobeinykov-/example-1.xes
