# Configuration
## Import needed librairies and dependencies

In [1]:
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier

from sklearn.feature_extraction.text import CountVectorizer

from sklearn.metrics import precision_recall_fscore_support
from sklearn.metrics import precision_recall_curve
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import re
from sklearn.utils import shuffle
from collections import OrderedDict
from collections import Counter

In [2]:
## Import data
data_df = pd.read_csv('result/HDFS/HDFS_ReadToExploitData.csv', engine='c', na_filter=False, memory_map=True)

# Spliting the data into training and testing subsets
We split the data into 70% training data and 30% testing data.

In [3]:
def _split_data(x_data, y_data, train_ratio=0.5):
    pos_idx = y_data > 0
    x_pos = x_data[pos_idx]
    y_pos = y_data[pos_idx]
    x_neg = x_data[~pos_idx]
    y_neg = y_data[~pos_idx]
    train_pos = int(train_ratio * x_pos.shape[0])
    train_neg = int(train_ratio * x_neg.shape[0])
    x_train = np.hstack([x_pos[0:train_pos], x_neg[0:train_neg]])
    y_train = np.hstack([y_pos[0:train_pos], y_neg[0:train_neg]])
    x_test = np.hstack([x_pos[train_pos:], x_neg[train_neg:]])
    y_test = np.hstack([y_pos[train_pos:], y_neg[train_neg:]])

    return (x_train, y_train), (x_test, y_test)

Suffle and split the data into 70% training and 30% testing data.

In [4]:
# Shuffle the data
data_df = data_df.sample(frac=1).reset_index(drop=True)
data_df.head(5)

Unnamed: 0,BlockId,EventSequence,Label
0,blk_4969054556300945467,"['09a53393', '09a53393', '3d91fa85', '09a53393...",0
1,blk_-7263169389307688606,"['09a53393', '3d91fa85', '09a53393', '09a53393...",0
2,blk_9046758658810461319,"['3d91fa85', '09a53393', '09a53393', '09a53393...",0
3,blk_-6769102211997257430,"['09a53393', '09a53393', '09a53393', '3d91fa85...",0
4,blk_4747591041845686405,"['09a53393', '3d91fa85', '09a53393', '09a53393...",0


In [5]:
# Split train and test data
train_ratio = 0.7
(x_train, y_train), (x_test, y_test) = _split_data(data_df['EventSequence'].values,
    data_df['Label'].values, train_ratio)

In [6]:
num_train = x_train.shape[0]
num_test = x_test.shape[0]
num_total = num_train + num_test
num_train_pos = sum(y_train)
num_test_pos = sum(y_test)
num_pos = num_train_pos + num_test_pos

print('Total: {} instances, {} anomaly, {} normal' \
      .format(num_total, num_pos, num_total - num_pos))
print('Train: {} instances, {} anomaly, {} normal' \
      .format(num_train, num_train_pos, num_train - num_train_pos))
print('Test: {} instances, {} anomaly, {} normal\n' \
      .format(num_test, num_test_pos, num_test - num_test_pos))

#print(type(x_train))
print('====== x_train (first five lines) ======')
print(x_train[:5])

print('====== y_train (first five lines) ======')
print(y_train[:5])

Total: 575061 instances, 16838 anomaly, 558223 normal
Train: 402542 instances, 11786 anomaly, 390756 normal
Test: 172519 instances, 5052 anomaly, 167467 normal

["['09a53393', '3d91fa85', '09a53393', '09a53393', '0567184d', '0567184d', 'd38aa58d', 'e3df2680', 'd38aa58d', 'e3df2680', '0567184d', 'd38aa58d', 'e3df2680', '5d5de21c', '5d5de21c', '5d5de21c', 'd63ef163', 'd63ef163', 'd63ef163', 'dba996ef', 'dba996ef', 'dba996ef']"
 "['3d91fa85', '09a53393', '09a53393', '09a53393', '5d5de21c', '5d5de21c', 'd38aa58d', 'e3df2680', 'd38aa58d', 'e3df2680', 'd38aa58d', 'e3df2680', '5d5de21c', '626085d5', '626085d5', '81cee340', '626085d5', '626085d5', '81cee340', '81cee340', '626085d5', '626085d5', '81cee340', '626085d5', '626085d5', 'd63ef163', 'd63ef163', 'd63ef163', 'dba996ef', 'dba996ef', '8f2bc724', '5d5de21c', 'dba996ef']"
 "['09a53393', '3d91fa85', '09a53393', '09a53393', 'd38aa58d', 'e3df2680', 'd38aa58d', 'e3df2680', 'd38aa58d', 'e3df2680', '5d5de21c', '5d5de21c', '728076ac', '09a53393', 

The **transform_train_data** function takes a collection of log sequences X_seq as input. It counts the occurrences of each event in each log sequence and stores these counts in a structured format. It then returns two things: a feature matrix X where rows represent log sequences and columns represent event counts, and a list of event names events. This transformation allows the log data to be used in machine learning models for anomaly detection.

Row = block_id
Colum = Event Type
x_train[i][j] = nb of event #j in block_id #i 

# Feature extraction

## Feature extraction : Message Count Matrix = Message Count Vector for each block_id

In [15]:
def transform_train_data(X_seq):
    X_counts = []
    for i in range(X_seq.shape[0]):
        event_counts = Counter(X_seq[i])
        X_counts.append(event_counts)
    X_df = pd.DataFrame(X_counts)
    X_df = X_df.fillna(0)
    events = X_df.columns
    X = X_df.values
    return (X, events)

transformed = transform_train_data(x_train)
x_train_mcv = transformed[0]
# The events in the training data,
# used later to ingore the events in the test data that is unseen in the training data
events_mcv = transformed[1]

print('====== Transformed train data summary ======')
print('Train data shape: {}-by-{}\n'.format(x_train_mcv.shape[0], x_train_mcv.shape[1]))
print(x_train_mcv[:5])

Train data shape: 402542-by-21

[[ 1. 44.  9. 13. 13. 16. 22. 21. 21. 25. 10. 10. 13. 15.  3.  3. 12.  6.
   3.  3.  1.]
 [ 1. 66. 18. 13. 13. 31. 26. 32. 32. 32. 12. 11. 23. 28.  1.  5. 21. 17.
   9.  4.  1.]
 [ 1. 68. 12. 23. 19. 25. 32. 33. 33. 32. 12. 12. 12. 22. 12.  6. 19. 13.
  12.  9.  1.]
 [ 1. 60. 14. 16. 15. 18. 34. 29. 29. 24. 12. 12. 18. 18. 12.  7. 16. 12.
   6.  6.  1.]
 [ 1.  8.  3.  6.  3.  3.  7.  3.  3.  2.  2.  2.  1.  0.  0.  0.  0.  0.
   2.  1.  1.]]
Index(['[', ''', '0', '9', 'a', '5', '3', ',', ' ', 'd', '1', 'f', '8', '6',
       '7', '4', 'e', '2', 'c', 'b', ']'],
      dtype='object')


In [9]:
def transform_test_data(X_seq, events):
    X_counts = []
    for i in range(X_seq.shape[0]):
        event_counts = Counter(X_seq[i])
        X_counts.append(event_counts)
    X_df = pd.DataFrame(X_counts)
    X_df = X_df.fillna(0)
    # treat the counts of the missing events as 0s
    empty_events = set(events) - set(X_df.columns)
    for event in empty_events:
        X_df[event] = [0] * len(X_df)
    X = X_df[events].values
    return X

x_test_mcv = transform_test_data(x_test, events_mcv)

print('====== Transformed test data summary ======')
print('Test data shape: {}-by-{}\n'.format(x_test_mcv.shape[0], x_test_mcv.shape[1]))
print(x_test_mcv[:5])

Test data shape: 172519-by-21

[[ 1.  8.  3.  6.  3.  3.  7.  3.  3.  2.  2.  2.  1.  0.  0.  0.  0.  0.
   2.  1.  1.]
 [ 1. 52.  9. 18. 16. 18. 27. 25. 25. 26.  9. 11. 11. 17.  6.  3. 15.  9.
   7.  6.  1.]
 [ 1.  8.  3.  6.  3.  3.  7.  3.  3.  2.  2.  2.  1.  0.  0.  0.  0.  0.
   2.  1.  1.]
 [ 1. 40.  6. 13. 13. 13. 23. 19. 19. 22.  7. 10. 11. 13.  0.  0. 13.  7.
   6.  3.  1.]
 [ 1. 52.  9. 18. 16. 18. 27. 25. 25. 26.  9. 11. 11. 17.  6.  3. 15.  9.
   7.  6.  1.]]


## Feature extraction : TF-IDF

In [16]:
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf_vectorizer = TfidfVectorizer() 

def transform_train_data_tfidf(X_seq):
    tokenized_sequences = [eval(x) for x in X_seq]

    # Join the tokens back into space-separated strings (required for TF-IDF)
    document_strings = [' '.join(x) for x in tokenized_sequences]

    # Initialize the TF-IDF vectorizer
#     tfidf_vectorizer = TfidfVectorizer()

    # Fit and transform the event sequences into TF-IDF vectors
    tfidf_matrix = tfidf_vectorizer.fit_transform(document_strings)

    # Convert the TF-IDF matrix to a DataFrame for further analysis if needed
    tfidf_df = pd.DataFrame(tfidf_matrix.toarray(), columns=tfidf_vectorizer.get_feature_names_out())

    return tfidf_df

# Assuming x_train contains your training data
x_train_tf_idf = transform_train_data_tfidf(x_train)

print('====== Transformed test data summary -- TF-IDF ======')
print('Test data shape: {}-by-{}\n'.format(x_train_tf_idf.shape[0], x_train_tf_idf.shape[1]))
print(x_train_tf_idf[:5])

["['09a53393', '3d91fa85', '09a53393', '09a53393', '0567184d', '0567184d', 'd38aa58d', 'e3df2680', 'd38aa58d', 'e3df2680', '0567184d', 'd38aa58d', 'e3df2680', '5d5de21c', '5d5de21c', '5d5de21c', 'd63ef163', 'd63ef163', 'd63ef163', 'dba996ef', 'dba996ef', 'dba996ef']"
 "['3d91fa85', '09a53393', '09a53393', '09a53393', '5d5de21c', '5d5de21c', 'd38aa58d', 'e3df2680', 'd38aa58d', 'e3df2680', 'd38aa58d', 'e3df2680', '5d5de21c', '626085d5', '626085d5', '81cee340', '626085d5', '626085d5', '81cee340', '81cee340', '626085d5', '626085d5', '81cee340', '626085d5', '626085d5', 'd63ef163', 'd63ef163', 'd63ef163', 'dba996ef', 'dba996ef', '8f2bc724', '5d5de21c', 'dba996ef']"
 "['09a53393', '3d91fa85', '09a53393', '09a53393', 'd38aa58d', 'e3df2680', 'd38aa58d', 'e3df2680', 'd38aa58d', 'e3df2680', '5d5de21c', '5d5de21c', '728076ac', '09a53393', '40651754', 'd6b7b743', '73c2ec69', '5d5de21c', '5d5de21c', 'dba996ef', '728076ac', '40651754', '09a53393', '73c2ec69', 'd6b7b743', '5d5de21c', '5d5de21c', 'dba9

In [17]:
def transform_test_data_tfidf(X_seq, tfidf_vectorizer):
    tokenized_sequences = [eval(x) for x in X_seq]

    # Join the tokens back into space-separated strings (required for TF-IDF)
    document_strings = [' '.join(x) for x in tokenized_sequences]

    # Transform the test data using the same TF-IDF vectorizer as used for training
    tfidf_matrix = tfidf_vectorizer.transform(document_strings)

    # Convert the TF-IDF matrix to a DataFrame for further analysis if needed
    tfidf_df = pd.DataFrame(tfidf_matrix.toarray(), columns=tfidf_vectorizer.get_feature_names_out())

    return tfidf_df

# Assuming x_test contains your test data
x_test_tf_idf = transform_test_data_tfidf(x_test, tfidf_vectorizer)

print('====== Transformed test data summary -- TF-IDF ======')
print('Test data shape: {}-by-{}\n'.format(x_test_tf_idf.shape[0], x_test_tf_idf.shape[1]))
print(x_test_tf_idf[:5])

Test data shape: 172519-by-48

   0567184d  06d16156  09a53393  0f86472a  124068c6  13eb7010  234302e6  \
0       0.0       0.0  0.303902       0.0       0.0       0.0       0.0   
1       0.0       0.0  0.270860       0.0       0.0       0.0       0.0   
2       0.0       0.0  0.303902       0.0       0.0       0.0       0.0   
3       0.0       0.0  0.305725       0.0       0.0       0.0       0.0   
4       0.0       0.0  0.270860       0.0       0.0       0.0       0.0   

   2e68ccc3  2ecc047e  2f85639c  ...  d63ef163  d6b7b743  dba996ef  e024fa48  \
0  0.000000       0.0       0.0  ...  0.000000  0.000000  0.000000       0.0   
1  0.000000       0.0       0.0  ...  0.246328  0.385244  0.327483       0.0   
2  0.000000       0.0       0.0  ...  0.000000  0.000000  0.000000       0.0   
3  0.579311       0.0       0.0  ...  0.370714  0.000000  0.369638       0.0   
4  0.000000       0.0       0.0  ...  0.246328  0.385244  0.327483       0.0   

   e3df2680  f0d1ff15  f266840a  f798

# Classique Machine learning

## Constructing a logistic regression model

In [20]:
lr_classifier = LogisticRegression(penalty='l2', C=100, tol=0.01, class_weight=None, max_iter=1000)

lr_classifier.fit(x_train_mcv, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


LogisticRegression(C=100, max_iter=1000, tol=0.01)

## Evaluate the performance of the logistic regression model

We evaluate the model's performance on the testing set.

In [None]:
## Precision, recall, f1-score, and precission-recall cu