# Configuration
## Import needed librairies and dependencies

In [26]:
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier

from sklearn.feature_extraction.text import CountVectorizer

from sklearn.metrics import precision_recall_fscore_support
from sklearn.metrics import precision_recall_curve
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve
from sklearn.metrics import accuracy_score, f1_score, precision_score
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import re
from sklearn.utils import shuffle
from collections import OrderedDict
from collections import Counter

In [2]:
## Import data
data_df = pd.read_csv('result/HDFS/HDFS_ReadToExploitData.csv', engine='c', na_filter=False, memory_map=True)

# Spliting the data into training and testing subsets
We split the data into 70% training data and 30% testing data.

In [3]:
def _split_data(x_data, y_data, train_ratio=0.5):
    pos_idx = y_data > 0
    x_pos = x_data[pos_idx]
    y_pos = y_data[pos_idx]
    x_neg = x_data[~pos_idx]
    y_neg = y_data[~pos_idx]
    train_pos = int(train_ratio * x_pos.shape[0])
    train_neg = int(train_ratio * x_neg.shape[0])
    x_train = np.hstack([x_pos[0:train_pos], x_neg[0:train_neg]])
    y_train = np.hstack([y_pos[0:train_pos], y_neg[0:train_neg]])
    x_test = np.hstack([x_pos[train_pos:], x_neg[train_neg:]])
    y_test = np.hstack([y_pos[train_pos:], y_neg[train_neg:]])

    return (x_train, y_train), (x_test, y_test)

Suffle and split the data into 70% training and 30% testing data.

In [4]:
# Shuffle the data
data_df = data_df.sample(frac=1).reset_index(drop=True)
data_df.head(5)

Unnamed: 0,BlockId,EventSequence,Label
0,blk_-3242034010863403271,"['09a53393', '3d91fa85', '09a53393', '09a53393...",0
1,blk_-7280517479581614900,"['3d91fa85', '09a53393', '09a53393', '09a53393...",0
2,blk_1698958929618729205,"['09a53393', '09a53393', '09a53393', '3d91fa85...",0
3,blk_5048469409603984999,"['09a53393', '09a53393', '09a53393', '3d91fa85...",0
4,blk_-4442849309709851367,"['09a53393', '09a53393', '09a53393', '3d91fa85...",0


In [5]:
# Split train and test data
train_ratio = 0.7
(x_train, y_train), (x_test, y_test) = _split_data(data_df['EventSequence'].values,
    data_df['Label'].values, train_ratio)

In [6]:
num_train = x_train.shape[0]
num_test = x_test.shape[0]
num_total = num_train + num_test
num_train_pos = sum(y_train)
num_test_pos = sum(y_test)
num_pos = num_train_pos + num_test_pos

print('Total: {} instances, {} anomaly, {} normal' \
      .format(num_total, num_pos, num_total - num_pos))
print('Train: {} instances, {} anomaly, {} normal' \
      .format(num_train, num_train_pos, num_train - num_train_pos))
print('Test: {} instances, {} anomaly, {} normal\n' \
      .format(num_test, num_test_pos, num_test - num_test_pos))

#print(type(x_train))
print('====== x_train (first five lines) ======')
print(x_train[:5])

print('====== y_train (first five lines) ======')
print(y_train[:5])

Total: 575061 instances, 16838 anomaly, 558223 normal
Train: 402542 instances, 11786 anomaly, 390756 normal
Test: 172519 instances, 5052 anomaly, 167467 normal

["['09a53393', '09a53393', '09a53393', '3d91fa85', 'd38aa58d', 'e3df2680', 'd38aa58d', 'e3df2680', 'd38aa58d', 'e3df2680', '5d5de21c', '5d5de21c', '5d5de21c', 'd63ef163', 'd63ef163', 'd63ef163', 'dba996ef', 'dba996ef', '2e68ccc3', 'dba996ef']"
 "['3d91fa85', '09a53393', '09a53393', '09a53393', 'd38aa58d', 'e3df2680', 'd38aa58d', 'e3df2680', '5d5de21c', 'e3df2680', '5d5de21c', '5d5de21c', 'd38aa58d', '81cee340', '626085d5', '81cee340', '32777b38', 'd63ef163', 'd63ef163', 'd63ef163', 'dba996ef', 'dba996ef', '8f2bc724', '5d5de21c', 'dba996ef']"
 "['09a53393', '3d91fa85', '09a53393', 'bcc910df']"
 "['09a53393', '09a53393', '3d91fa85', 'bcc910df']"
 "['09a53393', '3d91fa85', '09a53393', 'bcc910df']"]
[1 1 1 1 1]


The **transform_train_data** function takes a collection of log sequences X_seq as input. It counts the occurrences of each event in each log sequence and stores these counts in a structured format. It then returns two things: a feature matrix X where rows represent log sequences and columns represent event counts, and a list of event names events. This transformation allows the log data to be used in machine learning models for anomaly detection.

Row = block_id
Colum = Event Type
x_train[i][j] = nb of event #j in block_id #i 

# Feature extraction

## Feature extraction : Message Count Matrix = Message Count Vector for each block_id

In [7]:
def transform_train_data(X_seq):
    X_counts = []
    for i in range(X_seq.shape[0]):
        event_counts = Counter(X_seq[i])
        X_counts.append(event_counts)
    X_df = pd.DataFrame(X_counts)
    X_df = X_df.fillna(0)
    events = X_df.columns
    X = X_df.values
    return (X, events)

transformed = transform_train_data(x_train)
x_train_mcv = transformed[0]
# The events in the training data,
# used later to ingore the events in the test data that is unseen in the training data
events_mcv = transformed[1]

print('====== Transformed train data summary ======')
print('Train data shape: {}-by-{}\n'.format(x_train_mcv.shape[0], x_train_mcv.shape[1]))
print(x_train_mcv[:5])

Train data shape: 402542-by-21

[[ 1. 40.  6. 13. 13. 13. 23. 19. 19. 22.  7. 10. 11. 13.  7. 13.  6.  3.
   1.  0.  0.]
 [ 1. 50.  9. 13. 13. 17. 26. 24. 24. 25. 10. 11. 15. 17. 11. 14.  7.  5.
   1.  3.  4.]
 [ 1.  8.  3.  6.  3.  3.  7.  3.  3.  2.  2.  2.  1.  0.  0.  0.  2.  1.
   1.  0.  0.]
 [ 1.  8.  3.  6.  3.  3.  7.  3.  3.  2.  2.  2.  1.  0.  0.  0.  2.  1.
   1.  0.  0.]
 [ 1.  8.  3.  6.  3.  3.  7.  3.  3.  2.  2.  2.  1.  0.  0.  0.  2.  1.
   1.  0.  0.]]


In [8]:
def transform_test_data(X_seq, events):
    X_counts = []
    for i in range(X_seq.shape[0]):
        event_counts = Counter(X_seq[i])
        X_counts.append(event_counts)
    X_df = pd.DataFrame(X_counts)
    X_df = X_df.fillna(0)
    # treat the counts of the missing events as 0s
    empty_events = set(events) - set(X_df.columns)
    for event in empty_events:
        X_df[event] = [0] * len(X_df)
    X = X_df[events].values
    return X

x_test_mcv = transform_test_data(x_test, events_mcv)

print('====== Transformed test data summary ======')
print('Test data shape: {}-by-{}\n'.format(x_test_mcv.shape[0], x_test_mcv.shape[1]))
print(x_test_mcv[:5])

Test data shape: 172519-by-21

[[ 1. 58. 12. 18. 16. 20. 29. 28. 28. 27. 11. 11. 14. 19. 10. 19.  9.  6.
   1.  5.  6.]
 [ 1.  4.  1.  3.  2.  2.  4.  1.  1.  1.  1.  1.  1.  0.  0.  0.  0.  0.
   1.  0.  0.]
 [ 1. 54.  9. 18. 16. 18. 29. 26. 26. 27. 10. 12. 11. 16.  9. 19.  7.  6.
   1.  3.  6.]
 [ 1.  4.  1.  3.  2.  2.  4.  1.  1.  1.  1.  1.  1.  0.  0.  0.  0.  0.
   1.  0.  0.]
 [ 1. 40.  7. 14. 13. 13. 22. 19. 19. 23.  7. 11. 10. 12.  8. 12.  4.  3.
   1.  1.  0.]]


## Feature extraction : TF-IDF

In [9]:
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf_vectorizer = TfidfVectorizer() 

def transform_train_data_tfidf(X_seq):
    tokenized_sequences = [eval(x) for x in X_seq]

    # Join the tokens back into space-separated strings (required for TF-IDF)
    document_strings = [' '.join(x) for x in tokenized_sequences]

    # Initialize the TF-IDF vectorizer
#     tfidf_vectorizer = TfidfVectorizer()

    # Fit and transform the event sequences into TF-IDF vectors
    tfidf_matrix = tfidf_vectorizer.fit_transform(document_strings)

    # Convert the TF-IDF matrix to a DataFrame for further analysis if needed
    tfidf_df = pd.DataFrame(tfidf_matrix.toarray(), columns=tfidf_vectorizer.get_feature_names_out())

    return tfidf_df

# Assuming x_train contains your training data
x_train_tf_idf = transform_train_data_tfidf(x_train)

print('====== Transformed test data summary -- TF-IDF ======')
print('Test data shape: {}-by-{}\n'.format(x_train_tf_idf.shape[0], x_train_tf_idf.shape[1]))
print(x_train_tf_idf[:5])

Test data shape: 402542-by-48

   0567184d  06d16156  09a53393  0f86472a  124068c6  13eb7010  234302e6  \
0       0.0       0.0  0.305852       0.0       0.0       0.0       0.0   
1       0.0       0.0  0.236314       0.0       0.0       0.0       0.0   
2       0.0       0.0  0.305223       0.0       0.0       0.0       0.0   
3       0.0       0.0  0.305223       0.0       0.0       0.0       0.0   
4       0.0       0.0  0.305223       0.0       0.0       0.0       0.0   

   2e68ccc3  2ecc047e  2f85639c  ...  d63ef163  d6b7b743  dba996ef  e024fa48  \
0  0.578839       0.0       0.0  ...  0.370805       0.0  0.369753       0.0   
1  0.000000       0.0       0.0  ...  0.286500       0.0  0.285688       0.0   
2  0.000000       0.0       0.0  ...  0.000000       0.0  0.000000       0.0   
3  0.000000       0.0       0.0  ...  0.000000       0.0  0.000000       0.0   
4  0.000000       0.0       0.0  ...  0.000000       0.0  0.000000       0.0   

   e3df2680  f0d1ff15  f266840a  f798

In [21]:
def transform_test_data_tfidf(X_seq, tfidf_vectorizer):
    tokenized_sequences = [eval(x) for x in X_seq]

    # Join the tokens back into space-separated strings (required for TF-IDF)
    document_strings = [' '.join(x) for x in tokenized_sequences]

    # Transform the test data using the same TF-IDF vectorizer as used for training
    tfidf_matrix = tfidf_vectorizer.transform(document_strings)

    # Convert the TF-IDF matrix to a DataFrame for further analysis if needed
    tfidf_df = pd.DataFrame(tfidf_matrix.toarray(), columns=tfidf_vectorizer.get_feature_names_out())

    return tfidf_df

# Assuming x_test contains your test data
x_test_tf_idf = transform_test_data_tfidf(x_test, tfidf_vectorizer)

print('====== Transformed test data summary -- TF-IDF ======')
print('Test data shape: {}-by-{}\n'.format(x_test_tf_idf.shape[0], x_test_tf_idf.shape[1]))
print(x_test_tf_idf[:5])

Test data shape: 172519-by-48

   0567184d  06d16156  09a53393  0f86472a  124068c6  13eb7010  234302e6  \
0       0.0       0.0  0.253114       0.0       0.0       0.0       0.0   
1       0.0       0.0  0.707107       0.0       0.0       0.0       0.0   
2       0.0       0.0  0.264222       0.0       0.0       0.0       0.0   
3       0.0       0.0  0.707107       0.0       0.0       0.0       0.0   
4       0.0       0.0  0.275701       0.0       0.0       0.0       0.0   

   2e68ccc3  2ecc047e  2f85639c  ...  d63ef163  d6b7b743  dba996ef  e024fa48  \
0       0.0       0.0       0.0  ...  0.230150  0.361142  0.305997       0.0   
1       0.0       0.0       0.0  ...  0.000000  0.000000  0.000000       0.0   
2       0.0       0.0       0.0  ...  0.320335  0.376992  0.319426       0.0   
3       0.0       0.0       0.0  ...  0.000000  0.000000  0.000000       0.0   
4       0.0       0.0       0.0  ...  0.334252  0.000000  0.333304       0.0   

   e3df2680  f0d1ff15  f266840a  f798

In [22]:
x_train = x_train_mcv
x_test = x_test_mcv

# Classique Machine learning

## Logistic regression model

In [11]:
# lr_classifier = LogisticRegression(penalty='l2', C=100, tol=0.01, class_weight=None, max_iter=1000)

# lr_classifier.fit(x_train_mcv, y_train)

### Evaluate the performance of the logistic regression model

We evaluate the model's performance on the testing set.

In [12]:
# print('Test validation:')
# print('====== Evaluation summary ======')
# y_test_pred_lr = lr_classifier.predict(x_test_mcv)

# precision, recall, f1, _ = precision_recall_fscore_support(y_test, y_test_pred_lr, average='binary')
# print('Precision: {:.3f}, recall: {:.3f}, F1-measure: {:.3f}\n'.format(precision, recall, f1))

In [13]:
# y_test_pred_proba_lr = lr_classifier.predict_proba(x_test_mcv)[:,1] # predicted probabilities for being "anomaly"

# precision, recall, thresholds = precision_recall_curve(y_test, y_test_pred_proba_lr)

# plt.step(recall, precision, color='b', alpha=0.2,
#          where='post')
# plt.fill_between(recall, precision, step='post', alpha=0.2,
#                  color='b')

# plt.xlabel('Recall')
# plt.ylabel('Precision')
# plt.ylim([0.0, 1.05])
# plt.xlim([0.0, 1.0])
# plt.title('2-class Precision-Recall curve')

In [14]:
# y_test_pred_proba_lr = lr_classifier.predict_proba(x_test_mcv)[:,1] # predicted probabilities for being "anomaly"

In [15]:
# def plot_roc_curve(fpr, tpr):
#     plt.plot(fpr, tpr, color='orange', label='ROC')
#     plt.plot([0, 1], [0, 1], color='darkblue', linestyle='--')
#     plt.xlabel('False Positive Rate')
#     plt.ylabel('True Positive Rate')
#     plt.title('Receiver Operating Characteristic (ROC) Curve')
#     plt.legend()
#     plt.show()

# fpr, tpr, thresholds = roc_curve(y_test, y_test_pred_proba_lr)
# plot_roc_curve(fpr, tpr)

In [16]:
# roc_auc = roc_auc_score(y_test, y_test_pred_proba_lr)
# print('AUC: %.3f\n' % roc_auc)

## Support Vector Machine (SVM)

In [17]:
# svm_classifier = SVC(kernel='linear', C=100, random_state=42, max_iter=1000, class_weight=None, penalty=12)

# # Fit the SVM model on the training data
# svm_classifier.fit(x_train_mcv, y_train)

In [18]:
# print('Test validation:')
# print('====== Evaluation summary ======')
# y_pred = svm_classifier.predict(x_test_mcv)

# precision, recall, f1, _ = precision_recall_fscore_support(y_test, y_test_pred_lr, average='binary')
# print('Precision: {:.3f}, recall: {:.3f}, F1-measure: {:.3f}\n'.format(precision, recall, f1))

## Decision Tree

In [19]:
# dt_classifier = DecisionTreeClassifier(criterion='gini', max_depth=None,
#                           max_features=None, class_weight=None)

# # Fit the Decision Tree model on the training data
# # dt_classifier.fit(x_train_mcv, y_train)

In [20]:
# print('Test validation:')
# print('====== Evaluation summary ======')
# y_test_pred_dt = dt_classifier.predict(x_test_mcv)

# precision, recall, f1, _ = precision_recall_fscore_support(y_test, y_test_pred_dt, average='binary')
# print('Precision: {:.3f}, recall: {:.3f}, F1-measure: {:.3f}\n'.format(precision, recall, f1))

## Random Forest

## Classical models

In [None]:
models = {
    'RandomForest': RandomForestClassifier(n_estimators=100, random_state=42),
    'SVM': SVC(kernel='linear', C=1.0, random_state=42),
    'DecisionTree': DecisionTreeClassifier(criterion='gini', max_depth=None,
                          max_features=None, class_weight=None),
    'LogisticRegression': LogisticRegression(penalty='l2', C=100, tol=0.01, class_weight=None, max_iter=1000)
}

results = []

# Iterate over models
for model_name, model in models.items():
    # Fit the model
    model.fit(x_train, y_train)
    
    # Make predictions on the validation set
    y_pred = model.predict(x_test)
    
    # Calculate evaluation metrics
    accuracy = accuracy_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    auc = roc_auc_score(y_test, model.predict_proba(x_test)[:, 1])
    
    # Append results to the list
    results.append({'Model': model_name, 'Accuracy': accuracy, 'F1 Score': f1, 'Precision': precision, 'AUC': auc})

# Create a DataFrame from the results
results_df = pd.DataFrame(results)

# Save the DataFrame to a CSV file
results_df.to_csv('result/HDFS/model_metrics.csv', index=False)