In [12]:
import pandas as pd
import numpy as np
import re
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Embedding, Dropout
from tensorflow import keras

In [13]:
import re
import pandas as pd
import os

# Fonction pour analyser une ligne de log
def parse_log_line(line):
    # Définir plusieurs patterns pour différents formats de logs
    patterns = [
        re.compile(r'(?P<date>\w{3} \d{2} \d{2}:\d{2}:\d{2}) (?P<host>\S+) (?P<service>\S+)\[(?P<pid>\d+)\]: (?P<message>.+)'),
        re.compile(r'(?P<date>\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}\+\d{4}) (?P<host>\S+) (?P<service>\S+)\[(?P<pid>\d+)\]: (?P<message>.+)'),
        re.compile(r'(?P<date>\w{3} \d{2} \d{2}:\d{2}:\d{2}) (?P<host>\S+) (?P<service>\S+): (?P<message>.+)'),
        re.compile(r'\[(?P<date>[\d\.]+)\] (?P<message>.+)'),
        re.compile(r'(?P<host>\S+) (?P<service>\S+)\[(?P<pid>\d+)\]: (?P<message>.+)'),
        #re.compile(r'(?P<message>.+)'),
        re.compile(r'(?P<date>\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}\+\d{4}) (?P<host>\S+) (?P<service>\S+): (?P<message>.+)'),
        re.compile(r'(?P<date>\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}\.\d+Z) level=(?P<level>\w+) msg="(?P<message>.+)"'),
        re.compile(r'(?P<host>\S+) (?P<service>\S+)\[(?P<pid>\d+)\]: (?P<message>.+)')
    ]
    
    for pattern in patterns:
        match = pattern.match(line)
        if match:
            log_dict = match.groupdict()
            
            # Nettoyage spécifique pour certains services
            if 'service' in log_dict:
                if log_dict['service'].startswith('balena-engine-daemon'):
                    message = log_dict['message']
                    message_cleaned = re.sub(r'time="[^"]+" ', '', message)  # Suppression des timestamps
                    message_cleaned = re.sub(r'namespace=[^ ]+ ', '', message_cleaned)  # Suppression des namespaces
                    log_dict['message'] = message_cleaned
                
                elif log_dict['service'].startswith('python3'):
                    message = log_dict['message']
                    message_cleaned = re.sub(r'\[pid \d+\] ', '', message)  # Suppression des PIDs
                    log_dict['message'] = message_cleaned

                elif log_dict['service'].startswith('chilli'):
                    message = log_dict['message']
                    message_cleaned = re.sub(r'chilli\[\d+\]: ', '', message)  # Suppression des PIDs et du nom du service
                    log_dict['message'] = message_cleaned
            
            return log_dict
    
    return None

def save_logs_to_csv(logs, output_file):
    # Convertir la liste de dictionnaires en DataFrame
    df = pd.DataFrame(logs)
    # Sauvegarder le DataFrame dans un fichier CSV
    df.to_csv(output_file, index=False)

# Fonction pour lire tous les fichiers de logs dans un dossier et les étiqueter
def process_log_files(directory, label):
    logs = []
    for filename in os.listdir(directory):
        if filename.endswith('.txt'):  # Filtrer les fichiers de log
            filepath = os.path.join(directory, filename)
            with open(filepath, 'r', encoding='utf-8') as file:
                log_lines = file.readlines()
            parsed_logs = [parse_log_line(line) for line in log_lines if parse_log_line(line) is not None]
            for log in parsed_logs:
                log['label'] = label
            logs.extend(parsed_logs)
    return logs

# Exemple d'utilisation
log_directory_1 = 'label1'  # Dossier contenant les fichiers de logs à étiqueter avec 1
log_directory_0 = 'label0'  # Dossier contenant les fichiers de logs à étiqueter avec 0

logs_label_1 = process_log_files(log_directory_1, 1)
logs_label_0 = process_log_files(log_directory_0, 0)

all_logs = logs_label_1 + logs_label_0
save_logs_to_csv(all_logs, 'all_logs.csv')


In [14]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from imblearn.over_sampling import SMOTE

# Charger les données
df = pd.read_csv('all_logs.csv')

# Préparation des données
vectorizer = TfidfVectorizer(max_features=1000)
X = vectorizer.fit_transform(df['message']).toarray()
y = df['label']

# Appliquer SMOTE pour suréchantillonner les données
smote = SMOTE(sampling_strategy='auto', random_state=42)
X_res, y_res = smote.fit_resample(X, y)

# Convertir les tableaux numpy en DataFrames
X_res_df = pd.DataFrame(X_res, columns=vectorizer.get_feature_names_out())
y_res_df = pd.DataFrame(y_res, columns=['label'])

# Recombiner les données suréchantillonnées en DataFrame
df_res = pd.concat([X_res_df, y_res_df], axis=1)

# Sauvegarder les données suréchantillonnées dans un nouveau fichier CSV
df_res.to_csv('logs_resampled.csv', index=False)


In [15]:
import pandas as pd
from sklearn.model_selection import train_test_split
from tensorflow.keras import layers, models
from sklearn.metrics import classification_report, confusion_matrix

# Charger les données suréchantillonnées
df_res = pd.read_csv('logs_resampled.csv')

# Préparation des données
X = df_res.drop('label', axis=1).values
y = df_res['label'].values

# Diviser les données en ensembles d'entraînement et de test
X_train_val, X_test, y_train_val, y_test = train_test_split(X, y, test_size=0.1, random_state=42)

X_train, X_val, y_train, y_val = train_test_split(X_train_val, y_train_val, test_size=0.2, random_state=42)

# Construire le modèle de réseau de neurones
model = models.Sequential()
model.add(layers.Dense(128, activation='relu', input_shape=(X_train.shape[1],)))
model.add(layers.Dropout(0.5))
model.add(layers.Dense(64, activation='relu'))
model.add(layers.Dropout(0.5))
model.add(layers.Dense(1, activation='sigmoid'))

# Compiler le modèle
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Entraîner le modèle
history = model.fit(X_train, y_train, epochs=10, batch_size=32, validation_data=(X_val, y_val))

# Évaluer les performances du modèle
loss, accuracy = model.evaluate(X_test, y_test)
print(f'Loss: {loss}')
print(f'Accuracy: {accuracy}')


Epoch 1/10


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m232/232[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - accuracy: 0.9158 - loss: 0.3052 - val_accuracy: 0.9989 - val_loss: 0.0037
Epoch 2/10
[1m232/232[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.9976 - loss: 0.0117 - val_accuracy: 0.9995 - val_loss: 0.0029
Epoch 3/10
[1m232/232[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.9989 - loss: 0.0059 - val_accuracy: 0.9995 - val_loss: 0.0033
Epoch 4/10
[1m232/232[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.9981 - loss: 0.0058 - val_accuracy: 0.9995 - val_loss: 0.0032
Epoch 5/10
[1m232/232[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.9982 - loss: 0.0072 - val_accuracy: 0.9989 - val_loss: 0.0038
Epoch 6/10
[1m232/232[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.9987 - loss: 0.0055 - val_accuracy: 0.9995 - val_loss: 0.0034
Epoch 7/10
[1m232/232[0m [32m━━━━━━━

In [16]:
# Faire des prédictions
y_pred = (model.predict(X_test) > 0.5).astype("int32")

# Afficher le rapport de classification
print(classification_report(y_test, y_pred))

# Afficher la matrice de confusion
print(confusion_matrix(y_test, y_pred))

[1m33/33[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step 
              precision    recall  f1-score   support

           0       1.00      1.00      1.00       525
           1       1.00      1.00      1.00       505

    accuracy                           1.00      1030
   macro avg       1.00      1.00      1.00      1030
weighted avg       1.00      1.00      1.00      1030

[[524   1]
 [  0 505]]


# Save The Model

In [18]:
import joblib

# Sauvegarder le modèle
model.save('log_model.h5')

# Sauvegarder le vectorizer
joblib.dump(vectorizer, 'log_vectorizer.pkl')



['log_vectorizer.pkl']