In [1]:
import re
import pandas as pd
from datetime import datetime

# Re-formatted entries, one quoted string per successfully processed input line
list_of_log = []

# Read the whole log file up front; `logs` keeps the raw, unmodified lines
# (a later cell in this notebook parses `logs` again with a regex).
log_file_path = "augmented_challenging_validate_logs.log"
with open(log_file_path, 'r') as f:
    logs = f.readlines()

# NOTE(review): the field names below (date / time / log level / event number)
# describe an error-log layout. The output recorded for this cell shows
# access-log lines, where the inserted '#1:' marker ends up inside the
# bracketed timestamp -- confirm this reformatting is what is wanted.
for log in logs:
    log_parts = log.split()
    try:
        # The first three whitespace-separated tokens are mandatory;
        # a shorter line raises IndexError and is reported as malformed.
        date, time, log_level = log_parts[0], log_parts[1], log_parts[2]

        if "#1:" in log:
            # Marker present: the 4th token before the first marker is the
            # event number, the text up to the next marker is the message.
            segments = log.split("#1:")
            event_number = segments[0].split()[3]
            message = segments[1].strip()
        else:
            # Marker absent: fall back to positional tokens, using "N/A"
            # for whatever the line is too short to provide.
            event_number = "N/A"
            message = "N/A"
            if len(log_parts) > 3:
                event_number = log_parts[3]
            if len(log_parts) > 4:
                message = ' '.join(log_parts[4:])

        # Quoted entry with a trailing comma, with the '#1:' marker always present
        structured_log = f'"{date} {time} {log_level} {event_number} #1: {message}",'
        list_of_log.append(structured_log)
    except IndexError:
        # Too few tokens for the expected layout: report the line and move on
        print(f"Skipping malformed log entry: {log}")

# Emit every structured entry on its own line (nothing is printed when the list is empty)
if list_of_log:
    print("\n".join(list_of_log))



"1.118.224.122 - - [15/Oct/2024:13:53:54 #1: +0000] "PUT /admin HTTP/1.1" 302 320 "-" "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:107) Gecko/20100101 Firefox/3875"",
"1.162.132.158 - - [15/Oct/2024:13:46:20 #1: +0000] "GET /vulnerable_login HTTP/1.1" 302 223 "-" "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/101.0.5906.62 Safari/537.36"",
"1.196.201.240 - - [15/Oct/2024:13:46:22 #1: +0000] "POST /profile?user_id=3&extra_param=92 HTTP/1.1" 200 620 "-" "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/101 Safari/605.1.15"",
"1.57.69.80 - - [22/Jan/2019:03:57:23 #1: +0000] "GET /image/61474/productModel/200x200 HTTP/1.1" 503 5379 "https://www.zanbil.ir/m/filter/b113" "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/118.0.4979.14 Safari/537.36"",
"10.0.0.15 - - [15/Oct/2024:14:57:02 #1: +0000] "POST /vulnerable_login HTTP/1.1" 302 250 "-" "curl/7.68.0"",
"10.0.0

In [2]:
import re
import pandas as pd
from datetime import datetime

# Split one access-log line into its named fields
def parse_log_line(line):
    """Match `line` against the access-log layout and return its fields.

    The expected layout is
    ``<ip> - - [<timestamp>] "<METHOD> <url> HTTP/<ver>" <status> <size> "<referrer>" "<user_agent>"``
    anchored at the start of the line; anything after the user agent is ignored.

    Returns a dict with the keys ``ip``, ``timestamp``, ``method``, ``url``,
    ``status``, ``size``, ``referrer`` and ``user_agent`` (all values are
    strings), or ``None`` when the line does not match.
    """
    access_log_re = re.compile(
        r'(?P<ip>[\d.]+) - - \[(?P<timestamp>[^\]]+)\] '
        r'"(?P<method>[A-Z]+) (?P<url>[^ ]+) HTTP/[^"]+" '
        r'(?P<status>\d{3}) (?P<size>\d+) "(?P<referrer>[^"]*)" "(?P<user_agent>[^"]+)"'
    )
    found = access_log_re.match(line)
    # Non-matching lines are signalled with None so callers can filter them out
    return None if found is None else found.groupdict()

# Turn an access-log timestamp such as '15/Oct/2024:13:53:54 +0000' into a datetime
def parse_timestamp(ts):
    """Parse `ts` using the layout ``%d/%b/%Y:%H:%M:%S %z``.

    Returns a timezone-aware ``datetime`` on success, or ``None`` when the
    string does not follow that layout.
    """
    layout = '%d/%b/%Y:%H:%M:%S %z'
    try:
        parsed = datetime.strptime(ts, layout)
    except ValueError:
        # Wrong layout: report "no timestamp" instead of raising
        parsed = None
    return parsed

def label_log_entries(entries, login_url='/vulnerable_login', max_logins_per_minute=10,
                      suspicious_statuses=(500, 403, 404, 405)):
    """Filter and label parsed access-log entries.

    Parameters
    ----------
    entries : pandas.DataFrame
        One row per log line with the columns ``ip``, ``timestamp``,
        ``method``, ``url``, ``status``, ``size`` and ``referrer``.
        ``timestamp`` must already hold datetime values (or ``None``/``NaT``
        for lines whose timestamp could not be parsed). The frame is not
        modified.
    login_url : str
        URL whose per-minute request count identifies a brute-force burst.
    max_logins_per_minute : int
        An (ip, minute) group with more than this many `login_url` requests
        is dropped entirely.
    suspicious_statuses : iterable of int
        HTTP status codes that earn label 1.

    Returns
    -------
    pandas.DataFrame
        A copy of the surviving rows (original index preserved) with the
        extra columns ``time_window``, ``suspicious_status``, ``bypass``,
        ``logs`` and ``label`` (2 = bypass, 1 = suspicious status, 0 = normal).
    """
    df = entries.copy()
    df['status'] = df['status'].astype(int)

    # Force a real datetime dtype (normalised to UTC) so the .dt accessor is
    # always available, then drop rows whose timestamp could not be parsed.
    df['timestamp'] = pd.to_datetime(df['timestamp'], utc=True, errors='coerce')
    df = df[df['timestamp'].notna()].copy()

    # Brute-force filter: count requests to `login_url` per (ip, minute).
    # NOTE: the HTTP method is not checked, and *every* row of an offending
    # (ip, minute) group is removed, not only the login requests.
    df['time_window'] = df['timestamp'].dt.floor('min')  # 'T' is a deprecated alias
    login_hits = df['url'].eq(login_url).astype(int)
    hits_per_window = login_hits.groupby([df['ip'], df['time_window']]).transform('sum')
    df = df[hits_per_window <= max_logins_per_minute].copy()

    # Flag suspicious status codes
    df['suspicious_status'] = df['status'].isin(list(suspicious_statuses))

    # Flag bypass attempts: the referrer is missing ('-') AND the URL contains
    # '/profile?user_id=<digit>' -- both conditions are required.
    df['bypass'] = df['referrer'].eq('-') & df['url'].str.contains(
        r'/profile\?user_id=\d', na=False
    )

    # Combine relevant fields into a single text column
    df['logs'] = (
        df['ip'] + ' ' + df['method'] + ' ' + df['url'] + ' ' +
        df['status'].astype(str) + ' ' + df['size'].astype(str) + ' ' + df['referrer']
    )

    # Labels: 2 -> bypass (takes precedence), 1 -> suspicious status, 0 -> normal
    df['label'] = (
        2 * df['bypass'].astype(int)
        + (df['suspicious_status'] & ~df['bypass']).astype(int)
    )
    return df


# Ensure logs are available before processing (`logs` is loaded by an earlier cell)
if 'logs' in globals() and logs:
    # Parse every raw line and keep only the ones that matched the log layout
    log_entries = [entry for entry in map(parse_log_line, logs) if entry is not None]

    # Create a DataFrame from the parsed log entries
    df = pd.DataFrame(log_entries)

    if not df.empty:
        # Timestamp strings -> datetime (None where the format is wrong)
        df['timestamp'] = df['timestamp'].apply(parse_timestamp)

        # Filter brute-force bursts and attach the flags / labels
        df = label_log_entries(df)

        # Select only the relevant columns for output
        output_df = df[['logs', 'label']]

        # Display the final DataFrame
        print(output_df)

        # Save the DataFrame to a CSV file
        output_df.to_csv('validate_log_analysis_output.csv', index=False)

    else:
        print("No valid log entries found to process.")

else:
    print("Log data is empty or not defined.")


                                                  logs  label
0                   1.118.224.122 PUT /admin 302 320 -      0
1        1.162.132.158 GET /vulnerable_login 302 223 -      0
2    1.196.201.240 POST /profile?user_id=3&extra_pa...      2
3    1.57.69.80 GET /image/61474/productModel/200x2...      0
4           10.0.0.15 POST /vulnerable_login 302 250 -      0
..                                                 ...    ...
743  98.213.82.149 DELETE /image/60844/productModel...      0
744     98.233.47.98 PATCH /vulnerable_login 500 223 -      1
745    99.216.107.229 POST /vulnerable_login 502 223 -      0
746  99.243.84.225 POST /vulnerable_login 503 223 h...      0
747   99.32.122.110 DELETE /vulnerable_login 404 223 -      1

[735 rows x 2 columns]


  df['time_window'] = df['timestamp'].dt.floor('T')  # Round down to the minute


In [3]:
import joblib
import pandas as pd
from sklearn.metrics import classification_report, accuracy_score

# Restore the fitted classifier and text vectorizer from disk.
# Despite the variable name `svm_model`, the artifact loaded here is
# 'fine_tuned_ensemble_model.pkl'; the vectorizer is presumably the TF-IDF
# transformer the model was trained with -- TODO confirm.
# NOTE(review): joblib.load unpickles arbitrary objects; only load trusted files.
svm_model = joblib.load('fine_tuned_ensemble_model.pkl')
vectorizer = joblib.load('vectorizer.pkl')

# Validation set: a 'logs' text column and a 'label' column
# (this is the file name written by the labelling cell above)
df_validation = pd.read_csv('validate_log_analysis_output.csv')

# Ground-truth labels, and the feature matrix built from the log text
y_validation = df_validation['label']
X_validation = vectorizer.transform(df_validation['logs'])

# Predict, then score the predictions against the ground truth
y_pred = svm_model.predict(X_validation)
accuracy = accuracy_score(y_validation, y_pred)
report = classification_report(y_validation, y_pred)

# Report overall accuracy followed by the per-class precision/recall table
print(
    f"Validation Accuracy: {accuracy:.4f}",
    "Classification Report:",
    report,
    sep="\n",
)


Validation Accuracy: 0.9497
Classification Report:
              precision    recall  f1-score   support

           0       0.96      0.99      0.97       546
           1       0.92      1.00      0.96       156
           2       1.00      0.12      0.22        33

    accuracy                           0.95       735
   macro avg       0.96      0.70      0.72       735
weighted avg       0.95      0.95      0.93       735

