In [8]:
import findspark
findspark.init()

In [9]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col

In [10]:
import pandas as pd

In [11]:
import re

In [12]:
from pyspark.sql.types import IntegerType, StringType, StructType, StructField


In [13]:
from collections import OrderedDict


In [14]:
spark = SparkSession.builder\
        .appName("LogMining")\
        .master("local")\
        .config("spark.executor.cores","6")\
        .config("spark.executor.memory","2g")\
        .getOrCreate()

In [15]:
#StructField("EventId", StringType(), True),
#StructField("EventTemplate", StringType(), True)
# file = open(,"r")

In [16]:
from pyspark.sql import Row


In [17]:
APACHE_ACCESS_LOG_PATTERN = '^(\S+) (\S+) (\S+) (\S+) (\S+) (.*)'


In [18]:
logline = "081109 203518 35 INFO dfs.FSNamesystem: BLOCK* NameSystem.allocateBlock: /mnt/hadoop/mapred/system/job_200811092030_0001/job.jar. blk_-1608999687919862906"

In [19]:
type(logline)

str

In [20]:
match = re.findall(APACHE_ACCESS_LOG_PATTERN, logline)


In [21]:
match[0]

('081109',
 '203518',
 '35',
 'INFO',
 'dfs.FSNamesystem:',
 'BLOCK* NameSystem.allocateBlock: /mnt/hadoop/mapred/system/job_200811092030_0001/job.jar. blk_-1608999687919862906')

In [22]:
def parse_hdfs_log_line(logline):
    match = re.findall(APACHE_ACCESS_LOG_PATTERN, logline)
    if match is None:
        raise Error("Invalid logline: %s" % logline)
    return Row(
        Date =  match[0][0],
        Time    = match[0][1],
        Pid =   match[0][2],
        Level =  match[0][3],
        Component        = match[0][4],
        Content      = match[0][5])

In [23]:
abc = parse_hdfs_log_line(logline)

In [24]:
abc

Row(Component='dfs.FSNamesystem:', Content='BLOCK* NameSystem.allocateBlock: /mnt/hadoop/mapred/system/job_200811092030_0001/job.jar. blk_-1608999687919862906', Date='081109', Level='INFO', Pid='35', Time='203518')

In [25]:
# fo = open("abc.txt", "r")
# file_line = fo.readlines()


In [26]:
# file_line

In [27]:
# file_line[1]

In [28]:
customSchema = StructType([
    StructField("Date", IntegerType(), True),
    StructField("Time", IntegerType(), True),
    StructField("Pid", IntegerType(), True),
    StructField("Pid", IntegerType(), True),
    StructField("Component", StringType(), True),
    StructField("Content", StringType(), True),
#     StructField("EventId", StringType(), True),
#     StructField("EventTemplate", StringType(), True)
    
])

In [29]:
def parse_hdfs_file(file):
    ab = []
    file_line = file.readlines()
    for line in range(len(file_line)):
        match = re.findall(APACHE_ACCESS_LOG_PATTERN, file_line[line])
        for mat in range(len(match)):
            ab.append(Row(Date =  match[mat][0],Time = match[mat][1],Pid =match[mat][2],Level =  match[mat][3],Component   = match[mat][4],Content  = match[mat][5]))
    return ab 
    

In [30]:
fo = open("abc.txt", "r")


In [31]:
orkuh = parse_hdfs_file(fo)

In [32]:
orkuh

[Row(Component='dfs.DataNode$DataXceiver:', Content='Receiving block blk_-1608999687919862906 src: /10.250.19.102:54106 dest: /10.250.19.102:50010', Date='081109', Level='INFO', Pid='143', Time='203518'),
 Row(Component='dfs.DataNode$DataXceiver:', Content='Receiving block blk_-1608999687919862906 src: /10.250.10.6:40524 dest: /10.250.10.6:50010', Date='081109', Level='INFO', Pid='143', Time='203519'),
 Row(Component='dfs.DataNode$DataXceiver:', Content='Receiving block blk_-1608999687919862906 src: /10.250.14.224:42420 dest: /10.250.14.224:50010', Date='081109', Level='INFO', Pid='145', Time='203519'),
 Row(Component='dfs.DataNode$PacketResponder:', Content='PacketResponder 1 for block blk_-1608999687919862906 terminating', Date='081109', Level='INFO', Pid='145', Time='203519'),
 Row(Component='dfs.DataNode$PacketResponder:', Content='PacketResponder 2 for block blk_-1608999687919862906 terminating', Date='081109', Level='INFO', Pid='145', Time='203519')]

In [51]:
customSchema = StructType([
    StructField("Date", StringType(), True),
    StructField("Time", StringType(), True),
    StructField("Pid", StringType(), True),
    StructField("Pid", StringType(), True),
    StructField("Component", StringType(), True),
    StructField("Content", StringType(), True),
#   StructField("EventId", StringType(), True),
#     StructField("EventTemplate", StringType(), True)
    
])

In [52]:
df_train = spark.createDataFrame(orkuh,schema = customSchema)

In [53]:
df_train.show(5)

+------+------+---+---+--------------------+--------------------+
|  Date|  Time|Pid|Pid|           Component|             Content|
+------+------+---+---+--------------------+--------------------+
|081109|203518|143|143|dfs.DataNode$Data...|Receiving block b...|
|081109|203519|143|143|dfs.DataNode$Data...|Receiving block b...|
|081109|203519|145|145|dfs.DataNode$Data...|Receiving block b...|
|081109|203519|145|145|dfs.DataNode$Pack...|PacketResponder 1...|
|081109|203519|145|145|dfs.DataNode$Pack...|PacketResponder 2...|
+------+------+---+---+--------------------+--------------------+



In [78]:
content = "Receiving block blk_-1608999687919862906 src: /10.250.19.102:54106 dest: /10.250.19.102:50010"


In [64]:
 def generate_logformat_regex(logformat):
    ''' 
    Function to generate regular expression to split log messages
    '''
    headers = []
    splitters = re.split(r'(<[^<>]+>)', logformat)
    regex = ''
    for k in range(len(splitters)):
        if k % 2 == 0:
            splitter = re.sub(' +', '\s+', splitters[k])
            regex += splitter
        else:
            header = splitters[k].strip('<').strip('>')
            regex += '(?P<%s>.*?)' % header
            headers.append(header)
    regex = re.compile('^' + regex + '$')
    return headers, regex

In [75]:
names = df_train.schema.Content


AttributeError: 'StructType' object has no attribute 'Content'

In [76]:
with open('slct_input.log', 'w') as fw:
    for line in df_train['Content'].():
        if rex:
            for currentRex in rex:
                line = re.sub(currentRex, '<*>', line)
            fw.write(line + '\n')


TypeError: 'Column' object is not callable

In [85]:
rex = r'blk_-?\d+', r'(\d+\.){3}\d+(:\d+)?' 
lil = re.sub(rex,'<*>', content)


TypeError: first argument must be string or compiled pattern

In [82]:
lil

'<*>'

In [65]:
generate_logformat_regex(content)

error: bad escape \s at position 0

In [58]:
 def read_file(file):
    #struct_log = spark.read.format("csv").option("header", "true").load(file)
    data_dict = OrderedDict()
    for idx, row in enumerate(file):  
        blkId_list = re.findall(r'(blk_-?\d+)', row['Content'])
        blkId_set = set(blkId_list)
        for blk_Id in blkId_set:
            print(blk_Id)
            if not blk_Id in data_dict:
                data_dict[blk_Id] = []
            data_dict[blk_Id].append(row['EventId'])
    data_df = spark.createDataFrame(list(data_dict.items()), schema=['BlockId', 'EventSequence'])
    
    return data_df

LineId,Date,Time,Pid,Level,Component,Content,EventId,EventTemplate


In [59]:
df_train_append = read_file(orkuh)

blk_-1608999687919862906
blk_-1608999687919862906
blk_-1608999687919862906
blk_-1608999687919862906
blk_-1608999687919862906


ValueError: Some of types cannot be determined after inferring

In [None]:
# abc_log = spark.read.format("csv").option("header", "false").schema(customSchema).load("log.csv")


In [None]:
# abc_log.show(5)

In [None]:
# ab = read_file("log.csv")

In [None]:
# type(ab)

In [None]:
# ab.head(10)

In [None]:
schema 

In [None]:
ab = read_file("log.csv")

In [None]:
type(ab)

In [None]:
ab.show(10)

In [None]:
label_csv = spark.read.format("csv").option("header", "true").load("anomaly_label.csv")


In [None]:
label_csv.show(10)

In [None]:
# label_data = label_csv.set_index('BlockId')
# label_dict = label_data['Label'].to_dict()


In [None]:
# ab['Label'] = ab[ab['BlockId']].apply(lambda x: 1 if x == 'Anomaly' else 0)


In [None]:
# (x_train, y_train), (x_test, y_test) = _split_data(data_df['EventSequence'].values, data_df['Label'].values, train_ratio, split_type)

In [None]:
# label_data = pd.read_csv("anomaly_label.csv", engine='c', na_filter=False, memory_map=True)
# label_data = label_data.set_index('BlockId')
# label_dict = label_data['Label'].to_dict()
# ab.select[ab['BlockId'].rdd.flatmap(lambda x: 1 if label_dict[x] == 'Anomaly' else 0).toDF()
# # df.select("_c0").rdd.flatMap(lambda x: x + ("anything", )).toDF()


# # # Split trgiain and test data
# # (x_train, y_train), (x_test, y_test) = _split_data(data_df['EventSequence'].values, data_df['Label'].values, train_ratio, split_type)

In [None]:
train_x, test_x = ab.randomSplit([0.8, 0.2], seed=12345)

In [None]:
train_y, test_y = label_csv.randomSplit([0.8, 0.2], seed=12345)

In [None]:
train_x.show(10)