In [1]:
import numpy as np
import pandas as pd

In [2]:
df = pd.read_csv(r'./data/Linux_v2_2.csv')
df.head()

Unnamed: 0,Date,Time,Hostname,Message,Severity,Process,PID,User,IP,Len
0,2019-06-09,06:06:20,combo,restart.,0,syslogd 1.4.1,,root,127.0.0.1,8
1,2019-06-09,06:06:20,combo,syslogd startup succeeded,0,syslog,,root,127.0.0.1,25
2,2019-06-09,06:06:20,combo,klogd startup succeeded,0,syslog,,root,127.0.0.1,23
3,2019-06-09,06:06:20,combo,"klogd 1.4.1, log source = /proc/kmsg started.",0,kernel,,root,127.0.0.1,45
4,2019-06-09,06:06:20,combo,Linux version 2.6.5-1.358 (bhcompile@bugs.buil...,0,kernel,,root,127.0.0.1,143


In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 25706 entries, 0 to 25705
Data columns (total 10 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   Date      25706 non-null  object
 1   Time      25706 non-null  object
 2   Hostname  25706 non-null  object
 3   Message   25613 non-null  object
 4   Severity  25706 non-null  int64 
 5   Process   25688 non-null  object
 6   PID       11176 non-null  object
 7   User      25706 non-null  object
 8   IP        25706 non-null  object
 9   Len       25706 non-null  int64 
dtypes: int64(2), object(8)
memory usage: 2.0+ MB


In [4]:
# convert the 'Date' column to datetime format
df['Date'] = df['Date'].astype('datetime64[ns]')
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 25706 entries, 0 to 25705
Data columns (total 10 columns):
 #   Column    Non-Null Count  Dtype         
---  ------    --------------  -----         
 0   Date      25706 non-null  datetime64[ns]
 1   Time      25706 non-null  object        
 2   Hostname  25706 non-null  object        
 3   Message   25613 non-null  object        
 4   Severity  25706 non-null  int64         
 5   Process   25688 non-null  object        
 6   PID       11176 non-null  object        
 7   User      25706 non-null  object        
 8   IP        25706 non-null  object        
 9   Len       25706 non-null  int64         
dtypes: datetime64[ns](1), int64(2), object(7)
memory usage: 2.0+ MB


In [5]:
print(df.shape)

(25706, 10)


## Handling Null values

In [6]:
df.isnull().sum()

Date            0
Time            0
Hostname        0
Message        93
Severity        0
Process        18
PID         14530
User            0
IP              0
Len             0
dtype: int64

In [7]:
df.dropna(subset=['Message'], inplace=True)

## Data Cleaning

In [8]:
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer

en_stopwords = set(stopwords.words('english')) 

def clean_content(message: str):

    try:
        # Converting content to lowercase
        message = message.lower()
        # Replace date time string to word TIMESTAMP
        message = re.sub(r"[a-z]{3} [a-z]{3} [0-9]{2} [0-9]{2}:[0-9]{2}:[0-9]{2} [0-9]{4}", 'TIMESTAMP', message)
        # Replace ip-address string to word IPADDRESS
        message = re.sub(r"\b\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}\b", 'IPADDRESS', message)
        # Replace hexa decimal string to word HEXCODE
        message = re.sub(r"\b0x[0-9A-F]+\b", 'HEXCODE', message)
        # Replace numbers to word NUMBER
        message = re.sub(r"\b\d{1,3}\b", 'NUMBER', message)
        # Replace URL with word URLORDOMAIN
        message = re.sub(r'(?:https?:\/\/)?([a-zA-Z][a-zA-Z0-9._\-]+.[a-zA-Z0-9]+(\.[a-zA-Z0-9._\-]{2,3})+)', 'URLORDOMAIN', message)
        # Remove characters apart from alphabets
        message = re.sub('([^A-Za-z ])+', ' ', message)
        # Remove words less than 3 characters
        message = re.sub(r'(\b.{1,2}\s)+', ' ', message)
        # Remove mulitiple whitespace
        message = re.sub(r' +', ' ', message)

        # Removing stopwords & perform stemming
        porter = PorterStemmer()

        clean_str = []
        word_tokens = nltk.word_tokenize(message)
        for word in word_tokens: 
            if word not in en_stopwords:
                clean_str.append(porter.stem(word))
        
        message = ' '.join(clean_str)

        

    except:
        print('Error')
    
    
    return message

In [9]:
df['Message'] = df['Message'].apply(clean_content)
df.sample(10)

Unnamed: 0,Date,Time,Hostname,Message,Severity,Process,PID,User,IP,Len
18486,2019-12-03,21:54:08,combo,memori kill process python,1,kernel,,root,127.0.0.1,45
3388,2019-08-02,11:24:51,combo,check pass user unknown,0,sshd(pam_unix),[25508],root,127.0.0.1,24
3530,2019-08-07,06:52:07,combo,connect ipaddress urlordomain sun aug number n...,0,ftpd,[16260],root,82.53.83.190,96
17804,2019-12-03,10:46:21,combo,memori kill process httpd,1,kernel,,root,127.0.0.1,44
939,2019-06-13,20:13:11,combo,authent failur lognam uid number euid number t...,1,sshd(pam_unix),[17428],root,127.0.0.1,97
14241,2019-11-24,06:15:28,combo,memori kill process python,1,kernel,,root,127.0.0.1,44
16842,2019-11-30,07:05:36,combo,memori kill process httpd,1,kernel,,root,127.0.0.1,43
10522,2019-11-21,00:31:40,combo,memori kill process httpd,1,kernel,,root,127.0.0.1,44
11659,2019-11-21,09:33:32,combo,authent failur lognam uid number euid number t...,1,sshd(pam_unix),[28386],root,203.251.225.152,98
5297,2019-09-19,05:57:41,combo,charg number unknown,0,apmd,[1748],root,127.0.0.1,27


### Examples

In [10]:
df.loc[22339]['Message']

'connect ipaddress timestamp'

In [11]:
message = 'User unknown timed out after 900 seconds at Sat Jun 18 02:23:10 2005 '
clean_content(message)

'user unknown time number second timestamp'

In [12]:
message = 'User unknown timed out after 900 seconds at Sat Jun 18 02:23:10 2005 '
re.sub(r"[A-Z][a-z]{2} [A-Z][a-z]{2} [0-9]{2} [0-9]{2}:[0-9]{2}:[0-9]{2} [0-9]{4}", 'TIMESTAMP', message)

'User unknown timed out after 900 seconds at TIMESTAMP '

### Remove duplicate messages

In [15]:
clean_df = df.drop_duplicates(subset='Message', keep = 'first')
clean_df.sample(10)

Unnamed: 0,Date,Time,Hostname,Message,Severity,Process,PID,User,IP,Len
23655,2020-01-26,12:22:14,combo,urlordomainb avc deni search pid number exe us...,1,kernel,,root,127.0.0.1,195
7335,2019-10-23,12:40:02,combo,authent fail ipaddress urlordomain softwar cau...,1,klogind,[22519],root,81.181.213.88,107
5992,2019-09-28,09:10:48,combo,bring interfac eth succeed,0,network,,root,127.0.0.1,39
23954,2020-01-26,12:23:04,combo,urlordomainb avc deni name bind pid exe usr sb...,1,kernel,,root,127.0.0.1,184
23684,2020-01-26,12:22:18,combo,urlordomainb syscal number exit number fefff c...,0,kernel,,root,127.0.0.1,176
205,2019-06-09,06:06:34,combo,bluetooth hci socket layer initi,0,kernel,,root,127.0.0.1,39
18696,2019-12-04,03:49:00,combo,clear page tabl f,1,kernel,,root,127.0.0.1,41
35,2019-06-09,06:06:20,combo,selinux initi,0,kernel,,root,127.0.0.1,23
8220,2019-11-10,16:00:31,combo,hda dma timeout retri,0,kernel,,root,127.0.0.1,22
23792,2020-01-26,12:22:30,combo,urlordomainb avc deni read pid exe usr sbin ku...,1,kernel,,root,127.0.0.1,206


In [17]:
clean_df.shape

(858, 10)