In [38]:
import pandas as pd
import re

# Regex patterns for the fields pulled out of each log line.
regex_patterns = {
    # ISO-8601 style timestamp with microseconds and a numeric UTC offset.
    # The offset sign may be '+' or '-': accepting only '+' silently dropped
    # every line written with a negative offset.
    "timestamp": r'\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}\.\d{6}[+-]\d{4}',
    # One of the known event names.
    "event_type": r'(Session|IncomingRequest|SuccessfulLogin|FailedLogin|AuthorisedAccess)',
    # One of the known account names.
    "user": r'(admin|user_dl|user_hp|phone1)',
    # Dotted-quad IPv4 address (octet ranges are not validated).
    "ip": r'\b(?:[0-9]{1,3}\.){3}[0-9]{1,3}\b'
}

# Define a function to parse each line of the text
def parse_line(line):
    """Extract timestamp, event type, user and IP address from one log line.

    Returns a dict with the keys 'Timestamp', 'Event_Type', 'User' and 'IP',
    or None when the line does not start with a timestamp. Every field other
    than the timestamp is None when its pattern occurs nowhere in the line.
    """
    # The timestamp must sit at the very start of the line (re.match);
    # a line without one is treated as unparseable.
    stamp = re.match(regex_patterns["timestamp"], line)
    if stamp is None:
        return None

    def first_hit(pattern_name):
        # The remaining fields may appear anywhere in the line (re.search).
        found = re.search(regex_patterns[pattern_name], line)
        return found.group() if found else None

    return {
        'Timestamp': stamp.group(),
        'Event_Type': first_hit("event_type"),
        'User': first_hit("user"),
        'IP': first_hit("ip"),
    }

# Read the text file and parse each line.
# The encoding is stated explicitly so the result does not depend on the
# platform default, and errors='replace' keeps one undecodable byte in the
# log from aborting the whole load.
with open('auth.log', 'r', encoding='utf-8', errors='replace') as file:
    lines = file.readlines()

# parse_line() returns None for lines that do not start with a timestamp;
# those are dropped here in a single pass.
parsed_data = [record for record in map(parse_line, lines) if record is not None]

# Create DataFrame. The columns are named explicitly so the frame keeps its
# schema even when no line could be parsed (otherwise it would have no columns
# and every later column lookup would raise KeyError).
df = pd.DataFrame(parsed_data, columns=['Timestamp', 'Event_Type', 'User', 'IP'])

# Display DataFrame (first rows only)
df.head()


In [52]:
# Rewrite every 192.168.1.144 entry of the IP column as 192.168.70.173.
# NOTE(review): the reason for this substitution is not recorded in the notebook - confirm.
df['IP'] = df['IP'].replace({'192.168.1.144': '192.168.70.173'})

In [53]:
# Show df: as the last expression of the cell it is rendered by the notebook.
df

Unnamed: 0,Timestamp,Event_Type,User,IP,Hour
0,2024-05-08 15:08:06.221877+04:00,IncomingRequest,,127.0.0.1,15
1,2024-05-08 15:08:06.222209+04:00,Session,,127.0.0.1,15
2,2024-05-08 15:11:39.291316+04:00,SuccessfulLogin,admin,127.0.0.1,15
3,2024-05-08 15:11:43.046072+04:00,Session,,127.0.0.1,15
4,2024-05-08 15:11:43.046944+04:00,AuthorisedAccess,admin,127.0.0.1,15
...,...,...,...,...,...
12646,2024-05-08 16:40:05.462619+04:00,,phone1,,16
12647,2024-05-08 16:43:32.066490+04:00,IncomingRequest,,192.168.1.250,16
12648,2024-05-08 16:43:32.066861+04:00,Session,,192.168.1.250,16
12649,2024-05-08 16:43:48.566778+04:00,SuccessfulLogin,phone1,192.168.1.250,16


In [54]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.ensemble import IsolationForest
from sklearn.metrics import roc_auc_score

# Assume df is your original DataFrame
# Create a new DataFrame for transformed features
df_transformed = pd.DataFrame()

# Feature Engineering
# Convert categorical features to integer codes using Label Encoding.
# Each column gets its own encoder so its codes can be mapped back later with
# label_encoders[column].inverse_transform(...); one shared encoder only
# remembers the classes of the last column it was fitted on.
label_encoders = {}
for column in ['Event_Type', 'User', 'IP']:
    label_encoders[column] = LabelEncoder()
    df_transformed[column + '_Encoded'] = label_encoders[column].fit_transform(df[column])
# Kept for backward compatibility: the single encoder used previously ended up
# fitted on the 'IP' column.
label_encoder = label_encoders['IP']

# Define features
features = ['Event_Type_Encoded', 'User_Encoded', 'IP_Encoded']
X = df_transformed[features]

# Split BEFORE scaling: the scaler must learn its mean/variance from the
# training rows only, otherwise statistics of the held-out rows leak into the
# preprocessing. Same random_state and row count, so the partition is unchanged.
X_train_raw, X_test_raw = train_test_split(X, test_size=0.2, random_state=42)

# Data Preprocessing
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train_raw)
X_test = scaler.transform(X_test_raw)
# Whole data set on the same (train-fitted) scale, kept under its original name.
X_scaled = scaler.transform(X)

# Model Training
model = IsolationForest(contamination=0.1, random_state=42)
model.fit(X_train)

# Model Evaluation
y_pred_train = model.predict(X_train)
y_pred_test = model.predict(X_test)

# predict() returns 1 for inliers and -1 for outliers; remap to 1 = inlier, 0 = outlier
y_pred_train_binary = [1 if x == 1 else 0 for x in y_pred_train]
y_pred_test_binary = [1 if x == 1 else 0 for x in y_pred_test]

# Anomaly scores from decision_function. No AUC-ROC is computed here:
# roc_auc_score needs ground-truth labels and this cell defines none.
anomaly_scores_train = model.decision_function(X_train)
anomaly_scores_test = model.decision_function(X_test)

# Average anomaly score (the lower, the more anomalous)
avg_anomaly_score_train = anomaly_scores_train.mean()
avg_anomaly_score_test = anomaly_scores_test.mean()

print("Average Anomaly Score (Train):", avg_anomaly_score_train)
print("Average Anomaly Score (Test):", avg_anomaly_score_test)


Average Anomaly Score (Train): 0.00011810188640662953
Average Anomaly Score (Test): 0.0002768694110994298


In [55]:
# Distinct integer codes present in the IP_Encoded column, in order of first appearance.
pd.unique(df_transformed['IP_Encoded'])

array([0, 3, 2, 4, 5, 1])

In [73]:
# 1. Event Type Distribution
event_type_distribution = df['Event_Type'].value_counts()
print("Event Type Distribution:\n", event_type_distribution, "\n")

# 2. User Activity
user_activity = df.groupby('User').size()
print("User Activity:\n", user_activity, "\n")

# 3. IP Analysis
ip_analysis = df['IP'].value_counts()
print("IP Analysis:\n", ip_analysis, "\n")

# 4. Temporal Patterns
# parse_line() stores the timestamp as the matched string, and the .dt accessor
# only works on datetime values. pd.to_datetime converts the strings and leaves
# an already-converted column unchanged, so this cell is safe to re-run.
timestamps = pd.to_datetime(df['Timestamp'])
df['Hour'] = timestamps.dt.hour
hourly_activity = df.groupby('Hour').size()
print("Hourly Activity:\n", hourly_activity, "\n")

# 5. Event Correlation (Example: Successful Login followed by Authorised Access)
# Number of rows between a login and the access it is paired with. In the
# displayed data a Session row sits between the two, hence 2.
LOGIN_TO_ACCESS_OFFSET = 2

# shift() works on row position, so this stays correct whatever the index
# labels are; adding 2 to index labels and looking them up with .loc is only
# valid for a default 0..n-1 index.
event_types = df['Event_Type']
follows_login = event_types.shift(LOGIN_TO_ACCESS_OFFSET) == 'SuccessfulLogin'
success_auth_access = df[follows_login & (event_types == 'AuthorisedAccess')]
print("Successful Login followed by Authorised Access:\n", success_auth_access, "\n")



Event Type Distribution:
 Event_Type
Session             6319
IncomingRequest     6314
SuccessfulLogin       14
FailedLogin            2
AuthorisedAccess       1
Name: count, dtype: int64 

User Activity:
 User
admin      11
phone1      2
user_dl     3
user_hp     2
dtype: int64 

IP Analysis:
 IP
192.168.70.173    12612
127.0.0.1            17
192.168.1.253        10
192.168.1.53          7
192.168.1.250         4
Name: count, dtype: int64 

Hourly Activity:
 Hour
15    12637
16       14
dtype: int64 

Successful Login followed by Authorised Access:
                          Timestamp        Event_Type   User         IP  Hour
4 2024-05-08 15:11:43.046944+04:00  AuthorisedAccess  admin  127.0.0.1    15 

