In [1]:
import pandas as pd
import re
import datefinder
import nltk
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import normalize

In [2]:
# Fetch the NLTK stop-word corpus that preprocess_log reads via
# stopwords.words("english").
nltk.download("stopwords")

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Welcome\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.


True

In [3]:
# Load the merged log dataset from a path relative to the notebook's
# working directory.
df = pd.read_csv(filepath_or_buffer="csv/merged_csv.csv")

In [4]:
# Display the loaded frame as the cell's output (pandas truncates long frames).
df

Unnamed: 0,category,log,dataset_variant,entropy_p
0,authentication-failed,[Tue Apr 11 14:36:11 2000] [error] [client 28....,output_0.1.log,0.1
1,authentication-failed,[Tue Jan 21 17:01:07 2020] [error] [client 108...,output_0.1.log,0.1
2,authentication-failed,[Thu Oct 12 01:17:44 2023] [ malfunction error...,output_0.1.log,0.1
3,authentication-failed,[Tue Jul 30 16:18:08 2013] [error] [client 217...,output_0.1.log,0.1
4,authentication-failed,[Thu Sep 10 05:09:58 2015] [error] [client 2.5...,output_0.1.log,0.1
...,...,...,...,...
169946,user-session-open,Jan 10 09:18:57 localhost su[6578]: (pam_unix)...,output_0.log,0.0
169947,user-session-open,Jun 03 12:18:54 localhost su[6220]: (pam_unix)...,output_0.log,0.0
169948,user-session-open,Apr 22 17:55:54 localhost su[5883]: (pam_unix)...,output_0.log,0.0
169949,user-session-open,Feb 06 05:17:26 localhost su[5934]: (pam_unix)...,output_0.log,0.0


In [6]:
# Patterns are compiled once when the cell runs rather than on every call.
_IP_PATTERN = re.compile(r'\b\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}\b')
_URL_PATTERN = re.compile(r'https?://\S+|www\.\S+')
_PATH_PATTERN = re.compile(r'[a-zA-Z]:[\\/](?:[a-zA-Z0-9_ -]+[\\/])*(?:[a-zA-Z0-9_ -]+\.\w+)')
_NON_ALPHA_PATTERN = re.compile(r'[^a-zA-Z\s]')

# Filled lazily by _get_nlp_resources() so the stop-word list and the stemmer
# are built once instead of once per log line.
_NLP_RESOURCES = {}


def _get_nlp_resources():
    """Return the (stop-word set, stemmer) pair, building it on first use."""
    if not _NLP_RESOURCES:
        # Build both before storing either, so a failure leaves the cache empty
        # and the next call retries from scratch.
        stop_words = frozenset(stopwords.words("english"))
        stemmer = SnowballStemmer("english")
        _NLP_RESOURCES["stop_words"] = stop_words
        _NLP_RESOURCES["stemmer"] = stemmer
    return _NLP_RESOURCES["stop_words"], _NLP_RESOURCES["stemmer"]


def preprocess_log(log_text):
    """Normalize one raw log line into a space-separated string of stemmed tokens.

    Steps, in order:
      1. Replace every date/time substring reported by ``datefinder`` with a space.
      2. Remove IPv4-looking addresses, http(s)/www URLs and drive-letter paths.
      3. Drop every character that is not an ASCII letter or whitespace, lowercase.
      4. Split on whitespace, drop English stop words, Snowball-stem the rest.

    Args:
        log_text: The raw log line. Anything that is not a ``str`` (for example
            the NaN pandas uses for an empty cell) is treated as an empty log.

    Returns:
        The stemmed tokens joined by single spaces; ``""`` when the input is not
        a string or no alphabetic token is left after cleaning.
    """
    # The regex calls below only accept strings; without this guard a missing
    # value in the column aborts the whole DataFrame.apply with a TypeError.
    if not isinstance(log_text, str):
        return ""

    try:
        # Materialize all matches first so that a failure while scanning leaves
        # the text untouched instead of partially rewritten.
        matches = list(datefinder.find_dates(log_text, source=True))
        for _, match_string in matches:
            # str.replace("", " ") would insert a space between every character.
            if match_string:
                log_text = log_text.replace(match_string, " ")
    except Exception:
        # Date stripping is deliberately best-effort: if datefinder fails on a
        # line, continue with the text as-is rather than failing the whole run.
        pass

    log_text = _IP_PATTERN.sub('', log_text)
    log_text = _URL_PATTERN.sub('', log_text)
    log_text = _PATH_PATTERN.sub('', log_text)

    log_text = _NON_ALPHA_PATTERN.sub('', log_text).lower()

    tokens = log_text.split()
    if not tokens:
        return ""

    stop_words, stemmer = _get_nlp_resources()
    return " ".join(stemmer.stem(token) for token in tokens if token not in stop_words)

In [7]:
# Run the cleaning pipeline on every raw log line and store the result in a
# new "processed_log" column.
df["processed_log"] = df["log"].map(preprocess_log)



In [8]:
# Print a status line followed by the first two rows as a quick sanity check.
print("Log preprocessing complete. Sample preprocessed_log:", df.head(2), sep="\n")

Log preprocessing complete. Sample preprocessed_log:
                category                                                log  \
0  authentication-failed  [Tue Apr 11 14:36:11 2000] [error] [client 28....   
1  authentication-failed  [Tue Jan 21 17:01:07 2020] [error] [client 108...   

  dataset_variant  entropy_p  \
0  output_0.1.log        0.1   
1  output_0.1.log        0.1   

                                       processed_log  
0  error client user jessicakais authent failur h...  
1  error client user mejianathan authent failur p...  


In [9]:
# Persist the frame, including the new processed_log column, as CSV text.
# NOTE(review): the target name has no ".csv" extension, and index=True writes
# the row index as an unnamed first column; readers presumably need
# index_col=0 to avoid an "Unnamed: 0" column. Confirm against downstream code.
df.to_csv(path_or_buf="preprocessed_df", index=True)