In [1]:
import re
import pandas as pd
from datetime import datetime

# Initialize a list to store the structured logs
list_of_log = []

# Open and read the log file
log_file_path = "augmented_train_logs.log"
with open(log_file_path, 'r') as f:
    logs = f.readlines()

# Process each log entry
for log in logs:
    try:
        # Split the log into components
        log_parts = log.split()
        
        # Extract the date, time, and log level (e.g., [notice] or [error])
        date = log_parts[0]
        time = log_parts[1]
        log_level = log_parts[2]
        
        # Check if the log follows the format with '#1:' or with a different pattern
        if "#1:" in log:
            event_number = log.split("#1:")[0].split()[3]  # Extract event number
            message = log.split("#1:")[1].strip()  # The actual log message
        else:
            # Handle cases where there is no '#1:' in the log
            event_number = log_parts[3] if len(log_parts) > 3 else "N/A"
            message = ' '.join(log_parts[4:]) if len(log_parts) > 4 else "N/A"
        
        # Rebuild the structured log entry
        structured_log = f'"{date} {time} {log_level} {event_number} #1: {message}",'
        list_of_log.append(structured_log)
    
    except IndexError:
        # Handle cases where the log format is unexpected
        print(f"Skipping malformed log entry: {log}")
        continue

# Convert the list of structured logs into the desired format
for log in list_of_log:
    print(log)  # Printing each structured log in the format you want



"1.101.24.211 - - [15/Oct/2024:14:45:07 #1: +0000] "PUT /admin?id=98 HTTP/1.1" 200 2526 "-" "Googlebot/2.1 (+http://www.google.com/bot.html)"",
"1.131.225.25 - - [05/Oct/2024:14:15:00 #1: +0000] "GET /profile?id=32 HTTP/1.1" 200 2075 "http://localhost/" "curl/7.64.1"",
"1.141.97.11 - - [05/Oct/2024:17:35:04 #1: +0000] "POST /admin?id=85 HTTP/1.1" 500 4375 "http://localhost/profile" "Mozilla/5.0 (iPhone; CPU iPhone OS 14_2 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.0.2 Mobile/15E148 Safari/604.1"",
"1.185.130.246 - - [15/Oct/2024:16:58:48 #1: +0000] "GET /comments?id=95 HTTP/1.1" 200 3297 "-" "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.82 Safari/537.36"",
"1.2.117.130 - - [05/Oct/2024:16:55:39 #1: +0000] "POST /profile?id=86 HTTP/1.1" 200 727 "http://localhost/" "Googlebot/2.1 (+http://www.google.com/bot.html)"",
"1.45.100.23 - - [05/Oct/2024:17:19:00 #1: +0000] "POST /comments?id=29 HTTP/1.1" 200 386 "http:/

In [2]:
import re
import pandas as pd
from datetime import datetime

# Define a function to parse each log line
def parse_log_line(line):
    pattern = (
        r'(?P<ip>[\d.]+) - - \[(?P<timestamp>[^\]]+)\] '
        r'"(?P<method>[A-Z]+) (?P<url>[^ ]+) HTTP/[^"]+" '
        r'(?P<status>\d{3}) (?P<size>\d+) "(?P<referrer>[^"]*)" "(?P<user_agent>[^"]+)"'
    )
    match = re.match(pattern, line)
    if match:
        return match.groupdict()  # Return matched groups as a dictionary
    return None

# Convert timestamp string to datetime
def parse_timestamp(ts):
    try:
        return datetime.strptime(ts, '%d/%b/%Y:%H:%M:%S %z')
    except ValueError:
        return None  # Handle incorrect timestamp format

# Ensure logs are available before processing
if 'logs' in globals() and logs:
    # Process logs and filter out None values after parsing
    log_entries = [parse_log_line(line) for line in logs]
    log_entries = [entry for entry in log_entries if entry is not None]

    # Create a DataFrame from the parsed log entries
    df = pd.DataFrame(log_entries)

    if not df.empty:
        # Convert 'status' to integer and 'timestamp' to datetime
        df['status'] = df['status'].astype(int)
        df['timestamp'] = df['timestamp'].apply(parse_timestamp)

        # Filter out rows with invalid timestamps
        df = df[df['timestamp'].notna()]

        # Detect brute-force attempts: Same IP posts '/vulnerable_login' >10 times in the same minute
        df['time_window'] = df['timestamp'].dt.floor('T')  # Round down to the minute
        brute_force_mask = df.groupby(['ip', 'time_window'])['url'].transform(
            lambda x: (x == '/vulnerable_login').sum() > 10
        )

        # Drop brute-force attempts using the mask
        df = df[~brute_force_mask]

        # Flag suspicious status codes (500, 403, 404, 405)
        df['suspicious_status'] = df['status'].isin([500, 403, 404, 405])

        # Flag bypass attempts: Missing referrer or accessing /profile?user_id=3
        df['bypass'] = (df['referrer'] == '-') & df['url'].str.contains(r'/profile\?user_id=\d')

        # Combine relevant fields into a structured 'logs' column
        df['logs'] = (
            df['ip'] + ' ' + df['method'] + ' ' + df['url'] + ' ' +
            df['status'].astype(str) + ' ' + df['size'].astype(str) + ' ' + df['referrer']
        )

        # Add labels: 2 -> bypass, 1 -> suspicious_status, 0 -> normal
        df['label'] = df.apply(
            lambda row: 2 if row['bypass'] 
            else 1 if row['suspicious_status'] 
            else 0, axis=1
        )

        # Select only the relevant columns for output
        output_df = df[['logs', 'label']]

        # Display the final DataFrame
        print(output_df)

        # Save the DataFrame to a CSV file
        output_df.to_csv('log_analysis_output.csv', index=False)

    else:
        print("No valid log entries found to process.")

else:
    print("Log data is empty or not defined.")


                                                   logs  label
0              1.101.24.211 PUT /admin?id=98 200 2526 -      0
1     1.131.225.25 GET /profile?id=32 200 2075 http:...      0
2     1.141.97.11 POST /admin?id=85 500 4375 http://...      1
3          1.185.130.246 GET /comments?id=95 200 3297 -      0
4     1.2.117.130 POST /profile?id=86 200 727 http:/...      0
...                                                 ...    ...
4281  98.162.234.132 PATCH /dashboard?id=28 200 4922...      0
4282       98.187.133.205 GET /comments?id=7 200 3467 -      0
4286        99.174.158.92 PUT /profile?id=15 200 4281 -      0
4287  99.204.210.136 POST /profile?id=16 200 2563 ht...      0
4288     99.229.181.160 GET /dashboard?id=15 200 4783 -      0

[3777 rows x 2 columns]


  df['time_window'] = df['timestamp'].dt.floor('T')  # Round down to the minute


In [143]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import StratifiedKFold, cross_val_score
from sklearn.svm import SVC

# Assuming df has already been created from previous steps with 'logs' and 'label' columns

# Step 1: Extract features using TF-IDF
vectorizer = TfidfVectorizer()  # Initialize TF-IDF vectorizer
X = vectorizer.fit_transform(df['logs'])  # Transform the 'logs' column into numerical features
y = df['label']  # Labels (targets)

# Step 2: Apply Stratified K-Fold Cross Validation
k = 4  # Number of folds
svm_model = SVC(kernel='linear', C=10)  # Initialize the SVM model with a linear kernel

# Initialize StratifiedKFold to maintain class distribution across folds
skf = StratifiedKFold(n_splits=k)

# Cross-validation
cv_scores = cross_val_score(svm_model, X, y, cv=skf, scoring='accuracy')

# Print cross-validation results
print(f"Cross-Validation Accuracy Scores for {k} folds: {cv_scores}")
print(f"Mean CV Accuracy: {cv_scores.mean():.4f}")
print(f"Standard Deviation of CV Accuracy: {cv_scores.std():.4f}")

# Step 3: Train the final model on the entire dataset
svm_model.fit(X, y)

# Optionally, you can save the trained model
import joblib
joblib.dump(vectorizer, 'vectorizer.pkl')
joblib.dump(svm_model, 'svm_log_classifier_model_with_kfold.pkl')

Cross-Validation Accuracy Scores for 4 folds: [0.92965368 0.95666306 0.9772481  0.98808234]
Mean CV Accuracy: 0.9629
Standard Deviation of CV Accuracy: 0.0223


['svm_log_classifier_model_with_kfold.pkl']

In [144]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import StratifiedKFold, cross_val_score
from sklearn.svm import SVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import VotingClassifier
import joblib

# Step 1: Extract features using TF-IDF
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(df['logs'])  # Transform 'logs' column to numerical features
y = df['label']  # Labels

# Step 2: Initialize individual models
svm_model = SVC(kernel='linear', C=10, probability=True)  # SVM with probability output
nb_model = MultinomialNB()  # Naive Bayes model

# Step 3: Create a VotingClassifier ensemble
ensemble_model = VotingClassifier(
    estimators=[('svm', svm_model), ('nb', nb_model)],  # List of models
    voting='soft'  # Use soft voting to average probabilities
)

# Step 4: Apply Stratified K-Fold Cross Validation
k = 4  # Number of folds
skf = StratifiedKFold(n_splits=k)

# Cross-validation
cv_scores = cross_val_score(ensemble_model, X, y, cv=skf, scoring='accuracy')

# Print cross-validation results
print(f"VotingClassifier Cross-Validation Accuracy Scores: {cv_scores}")
print(f"Mean CV Accuracy: {cv_scores.mean():.4f}")
print(f"Standard Deviation of CV Accuracy: {cv_scores.std():.4f}")

# Step 5: Train the final ensemble model on the entire dataset
ensemble_model.fit(X, y)

# Step 6: Save the ensemble model and vectorizer -> เอา text ไปทำให้มีค่าทางคณิตศาสตร์
joblib.dump(vectorizer, 'vectorizer.pkl')
joblib.dump(ensemble_model, 'ensemble_log_classifier_model.pkl')

print("Ensemble model and vectorizer saved successfully.")


VotingClassifier Cross-Validation Accuracy Scores: [0.91666667 0.91007584 0.96424702 0.9772481 ]
Mean CV Accuracy: 0.9421
Standard Deviation of CV Accuracy: 0.0291
Ensemble model and vectorizer saved successfully.


In [145]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import VotingClassifier
from sklearn.model_selection import StratifiedKFold, GridSearchCV
import joblib

# Step 1: Extract features using TF-IDF
vectorizer = TfidfVectorizer(ngram_range=(1, 2))  # Using both unigrams and bigrams
X = vectorizer.fit_transform(df['logs'])  # Transform 'logs' column to numerical features
y = df['label']  # Labels

# Step 2: Initialize individual models
svm_model = SVC(probability=True)  # SVM with probability output
nb_model = MultinomialNB()  # Naive Bayes model

# Step 3: Create a VotingClassifier ensemble
ensemble_model = VotingClassifier(
    estimators=[('svm', svm_model), ('nb', nb_model)],
    voting='soft'  # Use soft voting to average probabilities
)

# Step 4: Define the parameter grid for hyperparameter tuning
param_grid = {
    'svm__C': [0.1, 1, 10],           # Regularization parameter for SVM
    'svm__kernel': ['linear', 'rbf'],  # SVM kernel types
    'nb__alpha': [0.1, 1.0, 10.0]      # Smoothing parameter for Naive Bayes
}

# Step 5: Apply Stratified K-Fold Cross Validation
k = 4  # Number of folds
skf = StratifiedKFold(n_splits=k)

# Step 6: Use GridSearchCV for hyperparameter tuning
grid_search = GridSearchCV(
    estimator=ensemble_model, param_grid=param_grid, 
    cv=skf, scoring='accuracy', n_jobs=-1, verbose=1
)

# Perform the grid search
grid_search.fit(X, y)

# Print the best parameters and the corresponding score
print(f"Best Parameters: {grid_search.best_params_}")
print(f"Best Cross-Validation Accuracy: {grid_search.best_score_:.4f}")

# Step 7: Train the final model with the best parameters on the entire dataset
best_model = grid_search.best_estimator_
best_model.fit(X, y)

# Step 8: Save the fine-tuned ensemble model and vectorizer
joblib.dump(vectorizer, 'vectorizer.pkl')
joblib.dump(best_model, 'fine_tuned_ensemble_model.pkl')

print("Fine-tuned ensemble model and vectorizer saved successfully.")

Fitting 4 folds for each of 18 candidates, totalling 72 fits
Best Parameters: {'nb__alpha': 1.0, 'svm__C': 1, 'svm__kernel': 'linear'}
Best Cross-Validation Accuracy: 0.9385
Fine-tuned ensemble model and vectorizer saved successfully.


In [3]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import VotingClassifier
from sklearn.model_selection import GroupKFold, RandomizedSearchCV
import joblib
from tqdm import tqdm  # Progress bar for long operations

# Step 1: Extract features using TF-IDF with progress tracking
print("Extracting features using TF-IDF...")
vectorizer = TfidfVectorizer(
    ngram_range=(1, 2),  # Using both unigrams and bigrams
    max_features=5000,  # Limit the number of features to prevent overfitting
    stop_words='english'  # Remove common stopwords
)

# Use tqdm to track the transformation progress
X = vectorizer.fit_transform(tqdm(df['logs'], desc="Transforming logs"))
y = df['label']  # Labels
groups = df['ip']  # Grouping by IP to avoid data leakage

# Step 2: Initialize individual models
print("Initializing models...")
svm_model = SVC(probability=True, class_weight='balanced')  # Handle class imbalance
nb_model = MultinomialNB()
log_reg_model = LogisticRegression(max_iter=1000, class_weight='balanced')

# Step 3: Create a VotingClassifier ensemble
ensemble_model = VotingClassifier(
    estimators=[('svm', svm_model), ('nb', nb_model), ('lr', log_reg_model)],
    voting='soft'  # Use soft voting to average probabilities
)

# Step 4: Define the parameter grid for hyperparameter tuning
param_grid = {
    'svm__C': [0.1, 1, 10, 50],           # Regularization parameter for SVM
    'svm__kernel': ['linear', 'rbf'],      # SVM kernel types
    'nb__alpha': [0.1, 1.0, 10.0],         # Smoothing parameter for Naive Bayes
    'lr__C': [0.01, 0.1, 1, 10],           # Regularization parameter for Logistic Regression
}

# Step 5: Apply GroupKFold Cross Validation
gkf = GroupKFold(n_splits=4)  # GroupKFold ensures no data leakage

# Step 6: Use RandomizedSearchCV for hyperparameter tuning with progress tracking
print("Performing hyperparameter tuning using RandomizedSearchCV...")
random_search = RandomizedSearchCV(
    estimator=ensemble_model, param_distributions=param_grid, 
    n_iter=20, cv=gkf.split(X, y, groups=groups), scoring='accuracy', n_jobs=-1, verbose=1, random_state=42
)

# Use tqdm to track the fitting process
with tqdm(total=1, desc="Training and hyperparameter tuning") as pbar:
    random_search.fit(X, y)
    pbar.update(1)

# Print the best parameters and the corresponding score
print(f"Best Parameters: {random_search.best_params_}")
print(f"Best Cross-Validation Accuracy: {random_search.best_score_:.4f}")

# Step 7: Train the final model with the best parameters on the entire dataset
print("Training the final model...")
best_model = random_search.best_estimator_

with tqdm(total=1, desc="Fitting final model") as pbar:
    best_model.fit(X, y)
    pbar.update(1)

# Step 8: Save the fine-tuned ensemble model and vectorizer
print("Saving the model and vectorizer...")
joblib.dump(vectorizer, 'vectorizer.pkl')
joblib.dump(best_model, 'fine_tuned_ensemble_model.pkl')

print("Fine-tuned ensemble model and vectorizer saved successfully.")


Extracting features using TF-IDF...


Transforming logs: 100%|██████████| 3777/3777 [00:00<00:00, 117341.22it/s]


Initializing models...
Performing hyperparameter tuning using RandomizedSearchCV...


Training and hyperparameter tuning:   0%|          | 0/1 [00:00<?, ?it/s]

Fitting 4 folds for each of 20 candidates, totalling 80 fits


Training and hyperparameter tuning: 100%|██████████| 1/1 [00:44<00:00, 44.48s/it]


Best Parameters: {'svm__kernel': 'linear', 'svm__C': 50, 'nb__alpha': 0.1, 'lr__C': 0.1}
Best Cross-Validation Accuracy: 0.9983
Training the final model...


Fitting final model: 100%|██████████| 1/1 [00:02<00:00,  2.54s/it]

Saving the model and vectorizer...
Fine-tuned ensemble model and vectorizer saved successfully.





In [176]:
# Add noise by replacing some characters in the logs
import random

def add_noise(log):
    return ''.join([char if random.random() > 0.1 else '*' for char in log])

df_noisy = df.copy()
df_noisy['logs'] = df_noisy['logs'].apply(add_noise)

X_noisy = vectorizer.transform(df_noisy['logs'])
y_noisy = df_noisy['label']
y_pred_noisy = best_model.predict(X_noisy)

print(f"Accuracy with noisy data: {accuracy_score(y_noisy, y_pred_noisy):.4f}")


Accuracy with noisy data: 0.9190
