In [None]:
import re
import pandas as pd
from datetime import datetime

# Initialize a list to store the structured logs
list_of_log = []

# Open and read the log file
log_file_path = "augmented_train_logs.log"
with open(log_file_path, 'r') as f:
    logs = f.readlines()

# Process each log entry
for log in logs:
    try:
        # Split the log into components
        log_parts = log.split()
        
        # Extract the date, time, and log level (e.g., [notice] or [error])
        date = log_parts[0]
        time = log_parts[1]
        log_level = log_parts[2]
        
        # Check if the log follows the format with '#1:' or with a different pattern
        if "#1:" in log:
            event_number = log.split("#1:")[0].split()[3]  # Extract event number
            message = log.split("#1:")[1].strip()  # The actual log message
        else:
            # Handle cases where there is no '#1:' in the log
            event_number = log_parts[3] if len(log_parts) > 3 else "N/A"
            message = ' '.join(log_parts[4:]) if len(log_parts) > 4 else "N/A"
        
        # Rebuild the structured log entry
        structured_log = f'"{date} {time} {log_level} {event_number} #1: {message}",'
        list_of_log.append(structured_log)
    
    except IndexError:
        # Handle cases where the log format is unexpected
        print(f"Skipping malformed log entry: {log}")
        continue

# Convert the list of structured logs into the desired format
for log in list_of_log:
    print(log)  # Printing each structured log in the format you want



In [None]:
import re
import pandas as pd
from datetime import datetime

# Define a function to parse each log line
def parse_log_line(line):
    pattern = (
        r'(?P<ip>[\d.]+) - - \[(?P<timestamp>[^\]]+)\] '
        r'"(?P<method>[A-Z]+) (?P<url>[^ ]+) HTTP/[^"]+" '
        r'(?P<status>\d{3}) (?P<size>\d+) "(?P<referrer>[^"]*)" "(?P<user_agent>[^"]+)"'
    )
    match = re.match(pattern, line)
    if match:
        return match.groupdict()  # Return matched groups as a dictionary
    return None

# Convert timestamp string to datetime
def parse_timestamp(ts):
    try:
        return datetime.strptime(ts, '%d/%b/%Y:%H:%M:%S %z')
    except ValueError:
        return None  # Handle incorrect timestamp format

# Ensure logs are available before processing
if 'logs' in globals() and logs:
    # Process logs and filter out None values after parsing
    log_entries = [parse_log_line(line) for line in logs]
    log_entries = [entry for entry in log_entries if entry is not None]

    # Create a DataFrame from the parsed log entries
    df = pd.DataFrame(log_entries)

    if not df.empty:
        # Convert 'status' to integer and 'timestamp' to datetime
        df['status'] = df['status'].astype(int)
        df['timestamp'] = df['timestamp'].apply(parse_timestamp)

        # Filter out rows with invalid timestamps
        df = df[df['timestamp'].notna()]

        # Detect brute-force attempts: Same IP posts '/vulnerable_login' >10 times in the same minute
        df['time_window'] = df['timestamp'].dt.floor('T')  # Round down to the minute
        brute_force_mask = df.groupby(['ip', 'time_window'])['url'].transform(
            lambda x: (x == '/vulnerable_login').sum() > 10
        )

        # Drop brute-force attempts using the mask
        df = df[~brute_force_mask]

        # Flag suspicious status codes (500, 403, 404, 405)
        df['suspicious_status'] = df['status'].isin([500, 403, 404, 405])

        # Flag bypass attempts: Missing referrer or accessing /profile?user_id=3
        df['bypass'] = (df['referrer'] == '-') & df['url'].str.contains(r'/profile\?user_id=\d')

        # Combine relevant fields into a structured 'logs' column
        df['logs'] = (
            df['ip'] + ' ' + df['method'] + ' ' + df['url'] + ' ' +
            df['status'].astype(str) + ' ' + df['size'].astype(str) + ' ' + df['referrer']
        )

        # Add labels: 2 -> bypass, 1 -> suspicious_status, 0 -> normal
        df['label'] = df.apply(
            lambda row: 2 if row['bypass'] 
            else 1 if row['suspicious_status'] 
            else 0, axis=1
        )

        # Select only the relevant columns for output
        output_df = df[['logs', 'label']]

        # Display the final DataFrame
        print(output_df)

        # Save the DataFrame to a CSV file
        output_df.to_csv('log_analysis_output.csv', index=False)

    else:
        print("No valid log entries found to process.")

else:
    print("Log data is empty or not defined.")


In [57]:
import re
import pandas as pd
from datetime import datetime

# Configure pandas to display all rows and columns
pd.set_option('display.max_rows', None)  # Show all rows
pd.set_option('display.max_columns', None)  # Show all columns
pd.set_option('display.width', None)  # Adjust width for better readability
pd.set_option('display.colheader_justify', 'center')  # Center-align column headers

# Define a function to parse each log line
def parse_log_line(line):
    pattern = (
        r'(?P<ip>[\d.]+) - - \[(?P<timestamp>[^\]]+)\] '
        r'"(?P<method>[A-Z]+) (?P<url>[^ ]+) HTTP/[^"]+" '
        r'(?P<status>\d{3}) (?P<size>\d+) "(?P<referrer>[^"]*)" "(?P<user_agent>[^"]+)"'
    )
    match = re.match(pattern, line)
    if match:
        return match.groupdict()  # Return matched groups as a dictionary
    return None

# Convert timestamp string to datetime
def parse_timestamp(ts):
    try:
        return datetime.strptime(ts, '%d/%b/%Y:%H:%M:%S %z')
    except ValueError:
        return None  # Handle incorrect timestamp format

# Initialize a dictionary to track login attempts for each IP
attempt_count = {}

# Ensure logs are available before processing
if 'logs' in globals() and logs:
    # Process logs and filter out None values after parsing
    log_entries = [parse_log_line(line) for line in logs]
    log_entries = [entry for entry in log_entries if entry is not None]

    # Create a DataFrame from the parsed log entries
    df = pd.DataFrame(log_entries)

    if not df.empty:
        # Convert 'status' to integer and 'timestamp' to datetime
        df['status'] = df['status'].astype(int)
        df['timestamp'] = df['timestamp'].apply(parse_timestamp)

        # Filter out rows with invalid timestamps
        df = df[df['timestamp'].notna()]

        # Initialize a list to store logs with counts and labels
        labeled_logs = []

        # Process each log entry line by line
        for index, row in df.iterrows():
            ip = row['ip']
            url = row['url']
            status = row['status']

            # Increment attempt count only for vulnerable login failures
            if url == '/vulnerable_login' and status == 200:
                if ip not in attempt_count:
                    attempt_count[ip] = 1
                else:
                    attempt_count[ip] += 1
            else:
                attempt_count[ip] = 0  # Reset on other URLs/status

            # Determine the label
            if attempt_count[ip] >= 10:
                label = 3  # Brute force attack
            elif status in [500, 403, 404, 405]:
                label = 1  # Suspicious status
            elif row['referrer'] == '-' and '/profile' in url:
                label = 2  # Bypass attempt
            else:
                label = 0  # Normal

            # Create the structured log entry
            log_entry = (
                f"{row['ip']} {row['method']} {row['url']} "
                f"{row['status']} {row['size']} {row['referrer']}"
            )

            # Store the log entry with its count and label
            labeled_logs.append([log_entry, attempt_count[ip], label])

        # Create a DataFrame from the labeled logs with appropriate column names
        output_df = pd.DataFrame(labeled_logs, columns=['logs', 'count', 'label'])

        # Display the final DataFrame
        print(output_df)

        # Save the DataFrame to a CSV file with column headers
        output_df.to_csv('log_analysis_output2.csv', index=False)

    else:
        print("No valid log entries found to process.")
else:
    print("Log data is empty or not defined.")


                            logs                         count  label
0                      1.106.238.2 GET /terms 404 779 -    0      1  
1                          1.107.70.223 GET / 200 702 -    0      0  
2     1.115.116.71 GET /comments 200 86 http://local...    0      0  
3                        1.126.65.28 DELETE / 403 620 -    0      1  
4     1.148.81.249 DELETE / 200 623 http://localhost...    0      0  
5     1.215.137.168 POST /vulnerable_login 200 96 ht...    1      0  
6     1.218.223.53 GET /profile?user_id=1 301 487 ht...    0      0  
7                           1.51.180.44 GET / 403 573 -    0      1  
8         1.62.221.156 GET /profile?user_id=3 301 483 -    0      2  
9     1.64.230.248 GET /profile?user_id=11 500 510 h...    0      1  
10                         1.68.40.69 PATCH / 200 666 -    0      0  
11    10.132.6.193 POST /comments 200 109 http://loc...    0      0  
12       10.143.129.10 GET /profile?user_id=2 403 481 -    0      1  
13    10.164.235.146

In [None]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import StratifiedKFold, cross_val_score
from sklearn.svm import SVC

# Assuming df has already been created from previous steps with 'logs' and 'label' columns

# Step 1: Extract features using TF-IDF
vectorizer = TfidfVectorizer()  # Initialize TF-IDF vectorizer
X = vectorizer.fit_transform(df['logs'])  # Transform the 'logs' column into numerical features
y = df['label']  # Labels (targets)

# Step 2: Apply Stratified K-Fold Cross Validation
k = 4  # Number of folds
svm_model = SVC(kernel='linear', C=10)  # Initialize the SVM model with a linear kernel

# Initialize StratifiedKFold to maintain class distribution across folds
skf = StratifiedKFold(n_splits=k)

# Cross-validation
cv_scores = cross_val_score(svm_model, X, y, cv=skf, scoring='accuracy')

# Print cross-validation results
print(f"Cross-Validation Accuracy Scores for {k} folds: {cv_scores}")
print(f"Mean CV Accuracy: {cv_scores.mean():.4f}")
print(f"Standard Deviation of CV Accuracy: {cv_scores.std():.4f}")

# Step 3: Train the final model on the entire dataset
svm_model.fit(X, y)

# Optionally, you can save the trained model
import joblib
joblib.dump(vectorizer, 'vectorizer1.pkl')
joblib.dump(svm_model, 'svm_log_classifier_model_with_kfold1.pkl')

In [None]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import StratifiedKFold, cross_val_score
from sklearn.svm import SVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import VotingClassifier
import joblib

# Step 1: Extract features using TF-IDF
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(df['logs'])  # Transform 'logs' column to numerical features
y = df['label']  # Labels

# Step 2: Initialize individual models
svm_model = SVC(kernel='linear', C=10, probability=True)  # SVM with probability output
nb_model = MultinomialNB()  # Naive Bayes model

# Step 3: Create a VotingClassifier ensemble
ensemble_model = VotingClassifier(
    estimators=[('svm', svm_model), ('nb', nb_model)],  # List of models
    voting='soft'  # Use soft voting to average probabilities
)

# Step 4: Apply Stratified K-Fold Cross Validation
k = 4  # Number of folds
skf = StratifiedKFold(n_splits=k)

# Cross-validation
cv_scores = cross_val_score(ensemble_model, X, y, cv=skf, scoring='accuracy')

# Print cross-validation results
print(f"VotingClassifier Cross-Validation Accuracy Scores: {cv_scores}")
print(f"Mean CV Accuracy: {cv_scores.mean():.4f}")
print(f"Standard Deviation of CV Accuracy: {cv_scores.std():.4f}")

# Step 5: Train the final ensemble model on the entire dataset
ensemble_model.fit(X, y)

# Step 6: Save the ensemble model and vectorizer -> เอา text ไปทำให้มีค่าทางคณิตศาสตร์
joblib.dump(vectorizer, 'vectorizer.pkl')
joblib.dump(ensemble_model, 'ensemble_log_classifier_model.pkl')

print("Ensemble model and vectorizer saved successfully.")


In [None]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import VotingClassifier
from sklearn.model_selection import StratifiedKFold, GridSearchCV
import joblib

# Step 1: Extract features using TF-IDF
vectorizer = TfidfVectorizer(ngram_range=(1, 2))  # Using both unigrams and bigrams
X = vectorizer.fit_transform(df['logs'])  # Transform 'logs' column to numerical features
y = df['label']  # Labels

# Step 2: Initialize individual models
svm_model = SVC(probability=True)  # SVM with probability output
nb_model = MultinomialNB()  # Naive Bayes model

# Step 3: Create a VotingClassifier ensemble
ensemble_model = VotingClassifier(
    estimators=[('svm', svm_model), ('nb', nb_model)],
    voting='soft'  # Use soft voting to average probabilities
)

# Step 4: Define the parameter grid for hyperparameter tuning
param_grid = {
    'svm__C': [0.1, 1, 10],           # Regularization parameter for SVM
    'svm__kernel': ['linear', 'rbf'],  # SVM kernel types
    'nb__alpha': [0.1, 1.0, 10.0]      # Smoothing parameter for Naive Bayes
}

# Step 5: Apply Stratified K-Fold Cross Validation
k = 4  # Number of folds
skf = StratifiedKFold(n_splits=k)

# Step 6: Use GridSearchCV for hyperparameter tuning
grid_search = GridSearchCV(
    estimator=ensemble_model, param_grid=param_grid, 
    cv=skf, scoring='accuracy', n_jobs=-1, verbose=1
)

# Perform the grid search
grid_search.fit(X, y)

# Print the best parameters and the corresponding score
print(f"Best Parameters: {grid_search.best_params_}")
print(f"Best Cross-Validation Accuracy: {grid_search.best_score_:.4f}")

# Step 7: Train the final model with the best parameters on the entire dataset
best_model = grid_search.best_estimator_
best_model.fit(X, y)

# Step 8: Save the fine-tuned ensemble model and vectorizer
joblib.dump(vectorizer, 'vectorizer.pkl')
joblib.dump(best_model, 'fine_tuned_ensemble_model.pkl')

print("Fine-tuned ensemble model and vectorizer saved successfully.")

In [None]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import VotingClassifier
from sklearn.model_selection import GroupKFold, RandomizedSearchCV
import joblib
from tqdm import tqdm  # Progress bar for long operations

# Step 1: Extract features using TF-IDF with progress tracking
print("Extracting features using TF-IDF...")
vectorizer = TfidfVectorizer(
    ngram_range=(1, 2),  # Using both unigrams and bigrams
    max_features=5000,  # Limit the number of features to prevent overfitting
    stop_words='english'  # Remove common stopwords
)

# Use tqdm to track the transformation progress
X = vectorizer.fit_transform(tqdm(df['logs'], desc="Transforming logs"))
y = df['label']  # Labels
groups = df['ip']  # Grouping by IP to avoid data leakage

# Step 2: Initialize individual models
print("Initializing models...")
svm_model = SVC(probability=True, class_weight='balanced')  # Handle class imbalance
nb_model = MultinomialNB()
log_reg_model = LogisticRegression(max_iter=1000, class_weight='balanced')

# Step 3: Create a VotingClassifier ensemble
ensemble_model = VotingClassifier(
    estimators=[('svm', svm_model), ('nb', nb_model), ('lr', log_reg_model)],
    voting='soft'  # Use soft voting to average probabilities
)

# Step 4: Define the parameter grid for hyperparameter tuning
param_grid = {
    'svm__C': [0.1, 1, 10, 50],           # Regularization parameter for SVM
    'svm__kernel': ['linear', 'rbf'],      # SVM kernel types
    'nb__alpha': [0.1, 1.0, 10.0],         # Smoothing parameter for Naive Bayes
    'lr__C': [0.01, 0.1, 1, 10],           # Regularization parameter for Logistic Regression
}

# Step 5: Apply GroupKFold Cross Validation
gkf = GroupKFold(n_splits=4)  # GroupKFold ensures no data leakage

# Step 6: Use RandomizedSearchCV for hyperparameter tuning with progress tracking
print("Performing hyperparameter tuning using RandomizedSearchCV...")
random_search = RandomizedSearchCV(
    estimator=ensemble_model, param_distributions=param_grid, 
    n_iter=20, cv=gkf.split(X, y, groups=groups), scoring='accuracy', n_jobs=-1, verbose=1, random_state=42
)

# Use tqdm to track the fitting process
with tqdm(total=1, desc="Training and hyperparameter tuning") as pbar:
    random_search.fit(X, y)
    pbar.update(1)

# Print the best parameters and the corresponding score
print(f"Best Parameters: {random_search.best_params_}")
print(f"Best Cross-Validation Accuracy: {random_search.best_score_:.4f}")

# Step 7: Train the final model with the best parameters on the entire dataset
print("Training the final model...")
best_model = random_search.best_estimator_

with tqdm(total=1, desc="Fitting final model") as pbar:
    best_model.fit(X, y)
    pbar.update(1)

# Step 8: Save the fine-tuned ensemble model and vectorizer
print("Saving the model and vectorizer...")
joblib.dump(vectorizer, 'vectorizer.pkl')
joblib.dump(best_model, 'fine_tuned_ensemble_model.pkl')

print("Fine-tuned ensemble model and vectorizer saved successfully.")


In [71]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import VotingClassifier
from sklearn.model_selection import GroupKFold, RandomizedSearchCV
from scipy.sparse import hstack  # To combine sparse matrices
import joblib
from tqdm import tqdm  # For progress tracking
from sklearn.model_selection import RepeatedStratifiedKFold

# Step 1: Load the dataset
print("Loading and processing the logs...")
df = pd.read_csv('log_analysis_output2.csv')  # Ensure pre-processed logs are available

# No need to reassign the label based on count, since the count feature will be used in training

# Step 2: Extract features using TF-IDF
print("Extracting features using TF-IDF...")
vectorizer = TfidfVectorizer(
    ngram_range=(1, 2),  # Use unigrams and bigrams
    max_features=5000,  # Limit to prevent overfitting
    stop_words='english'  # Remove common stopwords
)

# Transform logs with TF-IDF and extract the count feature
X_text = vectorizer.fit_transform(tqdm(df['logs'], desc="Transforming logs"))
X_count = df[['count']].values  # Extract count as a separate feature

# Combine TF-IDF features with the count feature
X = hstack((X_text, X_count))  # Combine both feature sets
y = df['label']  # Target labels

# Extract IPs for GroupKFold to avoid data leakage across groups
groups = df['logs'].str.extract(r'(\d+\.\d+\.\d+\.\d+)')[0]

# Step 3: Initialize individual models
print("Initializing models...")
svm_model = SVC(probability=True, class_weight='balanced')
nb_model = MultinomialNB()
log_reg_model = LogisticRegression(max_iter=1000, class_weight='balanced')

# Step 4: Create a VotingClassifier ensemble
ensemble_model = VotingClassifier(
    estimators=[('svm', svm_model), ('nb', nb_model), ('lr', log_reg_model)],
    voting='soft'  # Use soft voting to average probabilities
)

# Step 5: Define the hyperparameter grid
param_grid = {
    'svm__C': [0.1, 1, 10, 50],  # SVM regularization
    'svm__kernel': ['linear', 'rbf'],  # SVM kernels
    'nb__alpha': [0.1, 1.0, 10.0],  # Naive Bayes smoothing
    'lr__C': [0.01, 0.1, 1, 10]  # Logistic Regression regularization
}

# Step 6: Apply GroupKFold for cross-validation
gkf = GroupKFold(n_splits=4)

# Step 7: Use RandomizedSearchCV for hyperparameter tuning
print("Performing hyperparameter tuning...")
random_search = RandomizedSearchCV(
    estimator=ensemble_model, param_distributions=param_grid,
    n_iter=20, cv=gkf.split(X, y, groups=groups), scoring='accuracy',
    n_jobs=-1, verbose=1, random_state=42
)

# Fit the model with progress tracking
with tqdm(total=1, desc="Training and hyperparameter tuning") as pbar:
    random_search.fit(X, y)
    pbar.update(1)

# Print the best parameters and score
print(f"Best Parameters: {random_search.best_params_}")
print(f"Best Cross-Validation Accuracy: {random_search.best_score_:.4f}")

# Step 8: Train the final model with the best parameters
print("Training the final model...")
best_model = random_search.best_estimator_

with tqdm(total=1, desc="Fitting final model") as pbar:
    best_model.fit(X, y)
    pbar.update(1)

# Step 9: Save the model and vectorizer
print("Saving the model and vectorizer...")
joblib.dump(vectorizer, 'vectorizer2.pkl')
joblib.dump(best_model, 'fine_tuned_ensemble_model2.pkl')

print("Model and vectorizer saved successfully.")


Loading and processing the logs...
Extracting features using TF-IDF...


Transforming logs: 100%|██████████| 6350/6350 [00:00<00:00, 102346.48it/s]


Initializing models...
Performing hyperparameter tuning...


Training and hyperparameter tuning:   0%|          | 0/1 [00:00<?, ?it/s]

Fitting 4 folds for each of 20 candidates, totalling 80 fits


Training and hyperparameter tuning: 100%|██████████| 1/1 [02:05<00:00, 125.15s/it]


Best Parameters: {'svm__kernel': 'linear', 'svm__C': 50, 'nb__alpha': 10.0, 'lr__C': 10}
Best Cross-Validation Accuracy: 0.9980
Training the final model...


Fitting final model: 100%|██████████| 1/1 [00:06<00:00,  6.78s/it]

Saving the model and vectorizer...
Model and vectorizer saved successfully.





In [61]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import VotingClassifier
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.metrics import classification_report, accuracy_score
from scipy.sparse import hstack  # To combine sparse matrices
import joblib
from tqdm import tqdm  # For progress tracking

# Step 1: Load the dataset
print("Loading and processing the logs...")
df = pd.read_csv('log_analysis_output2.csv')  # Ensure pre-processed logs are available

# Step 2: Extract features using TF-IDF
print("Extracting features using TF-IDF...")
vectorizer = TfidfVectorizer(
    ngram_range=(1, 2),  # Use unigrams and bigrams
    max_features=5000,  # Limit to prevent overfitting
    stop_words='english'  # Remove common stopwords
)

# Transform logs with TF-IDF and extract the count feature
X_text = vectorizer.fit_transform(tqdm(df['logs'], desc="Transforming logs"))
X_count = df[['count']].values  # Extract count as a separate feature

# Combine TF-IDF features with the count feature
X = hstack((X_text, X_count))  # Combine both feature sets
y = df['label']  # Target labels

# Step 3: Split the data into training and testing sets
print("Splitting data into training and testing sets...")
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Step 4: Initialize individual models
print("Initializing models...")
svm_model = SVC(probability=True, class_weight='balanced')
nb_model = MultinomialNB()
log_reg_model = LogisticRegression(max_iter=1000, class_weight='balanced')

# Step 5: Create a VotingClassifier ensemble
ensemble_model = VotingClassifier(
    estimators=[('svm', svm_model), ('nb', nb_model), ('lr', log_reg_model)],
    voting='soft'  # Use soft voting to average probabilities
)

# Step 6: Define the hyperparameter grid
param_grid = {
    'svm__C': [0.1, 1, 10, 50],  # SVM regularization
    'svm__kernel': ['linear', 'rbf'],  # SVM kernels
    'nb__alpha': [0.1, 1.0, 10.0],  # Naive Bayes smoothing
    'lr__C': [0.01, 0.1, 1, 10]  # Logistic Regression regularization
}

# Step 7: Use RandomizedSearchCV for hyperparameter tuning
print("Performing hyperparameter tuning...")
random_search = RandomizedSearchCV(
    estimator=ensemble_model, param_distributions=param_grid,
    n_iter=20, cv=3, scoring='accuracy',  # Perform simple 3-fold CV internally within train data
    n_jobs=-1, verbose=1, random_state=42
)

# Fit the model with progress tracking
with tqdm(total=1, desc="Training and hyperparameter tuning") as pbar:
    random_search.fit(X_train, y_train)
    pbar.update(1)

# Print the best parameters and score
print(f"Best Parameters: {random_search.best_params_}")
print(f"Best Training Accuracy: {random_search.best_score_:.4f}")

# Step 8: Evaluate the final model on the test set
print("Evaluating the final model...")
best_model = random_search.best_estimator_
y_pred = best_model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f"Test Accuracy: {accuracy:.4f}")
print("Classification Report:")
print(classification_report(y_test, y_pred))

# Step 9: Save the model and vectorizer
print("Saving the model and vectorizer...")
joblib.dump(vectorizer, 'vectorizer_no_kfold.pkl')
joblib.dump(best_model, 'fine_tuned_ensemble_model_no_kfold.pkl')

print("Model and vectorizer saved successfully.")


Loading and processing the logs...
Extracting features using TF-IDF...


Transforming logs: 100%|██████████| 6350/6350 [00:00<00:00, 90114.33it/s]


Splitting data into training and testing sets...
Initializing models...
Performing hyperparameter tuning...


Training and hyperparameter tuning:   0%|          | 0/1 [00:00<?, ?it/s]

Fitting 3 folds for each of 20 candidates, totalling 60 fits


Training and hyperparameter tuning: 100%|██████████| 1/1 [02:17<00:00, 137.35s/it]


Best Parameters: {'svm__kernel': 'rbf', 'svm__C': 10, 'nb__alpha': 10.0, 'lr__C': 1}
Best Training Accuracy: 0.9991
Evaluating the final model...
Test Accuracy: 0.9995
Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00      1334
           1       1.00      1.00      1.00       346
           2       1.00      1.00      1.00       211
           3       1.00      1.00      1.00        14

    accuracy                           1.00      1905
   macro avg       1.00      1.00      1.00      1905
weighted avg       1.00      1.00      1.00      1905

Saving the model and vectorizer...
Model and vectorizer saved successfully.


In [60]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from scipy.sparse import hstack  # To combine sparse matrices
import joblib
from tqdm import tqdm  # For progress tracking

# Step 1: Load the dataset
print("Loading and processing the logs...")
df = pd.read_csv('log_analysis_output2.csv')  # Ensure pre-processed logs are available

# Step 2: Extract features using TF-IDF
print("Extracting features using TF-IDF...")
vectorizer = TfidfVectorizer(
    ngram_range=(1, 1),  # Only unigrams
    max_features=100,  # Use a smaller number of features
    stop_words='english'
)

# Transform logs with TF-IDF and extract the count feature
X_text = vectorizer.fit_transform(tqdm(df['logs'], desc="Transforming logs"))
X_count = df[['count']].values  # Extract count as a separate feature

# Combine TF-IDF features with the count feature
X = hstack((X_text, X_count))  # Combine both feature sets
y = df['label']  # Target labels

# Step 3: Split the data into training and testing sets
print("Splitting data into training and testing sets...")
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Step 4: Initialize and train the SVM model
print("Training SVM model...")
svm_model = SVC(kernel='linear', probability=False, class_weight='balanced')  # Simplified linear kernel
with tqdm(total=1, desc="Fitting SVM model") as pbar:
    svm_model.fit(X_train, y_train)
    pbar.update(1)

# Step 5: Evaluate the model
print("Evaluating the model...")
accuracy = svm_model.score(X_test, y_test)
print(f"Validation Accuracy: {accuracy:.4f}")

# Step 6: Save the model and vectorizer
print("Saving the SVM model and vectorizer...")
joblib.dump(vectorizer, 'vectorizer_svm.pkl')
joblib.dump(svm_model, 'svm_model.pkl')

print("Simplified SVM model and vectorizer saved successfully.")


Loading and processing the logs...
Extracting features using TF-IDF...


Transforming logs: 100%|██████████| 6350/6350 [00:00<00:00, 144248.91it/s]


Splitting data into training and testing sets...
Training SVM model...


Fitting SVM model: 100%|██████████| 1/1 [00:00<00:00,  4.76it/s]

Evaluating the model...
Validation Accuracy: 0.9927
Saving the SVM model and vectorizer...
Simplified SVM model and vectorizer saved successfully.





In [59]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix
from scipy.sparse import hstack  # To combine sparse matrices
import joblib
from tqdm import tqdm  # For progress tracking

# Step 1: Load the dataset
print("Loading and processing the logs...")
df = pd.read_csv('log_analysis_output2.csv')  # Ensure pre-processed logs are available

# Step 2: Extract features using TF-IDF
print("Extracting features using TF-IDF...")
vectorizer = TfidfVectorizer(
    ngram_range=(1, 1),  # Only unigrams
    max_features=100,  # Use a smaller number of features
    stop_words='english'
)

# Transform logs with TF-IDF and extract the count feature
X_text = vectorizer.fit_transform(tqdm(df['logs'], desc="Transforming logs"))
X_count = df[['count']].values  # Extract count as a separate feature

# Combine TF-IDF features with the count feature
X = hstack((X_text, X_count))  # Combine both feature sets
y = df['label']  # Target labels

# Step 3: Split the data into training and testing sets
print("Splitting data into training and testing sets...")
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Step 4: Initialize and train the Naive Bayes model
print("Training Naive Bayes model...")
nb_model = MultinomialNB(alpha=1.0)  # Default smoothing parameter
with tqdm(total=1, desc="Fitting Naive Bayes model") as pbar:
    nb_model.fit(X_train, y_train)
    pbar.update(1)

# Step 5: Evaluate the model
print("Evaluating the model...")
y_pred = nb_model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f"Validation Accuracy: {accuracy:.4f}")

# Print classification report
print("Classification Report:")
print(classification_report(y_test, y_pred))

# Print confusion matrix
print("Confusion Matrix:")
conf_matrix = confusion_matrix(y_test, y_pred)
print(pd.DataFrame(conf_matrix, 
                   columns=['Pred_Normal', 'Pred_Suspicious', 'Pred_Bypass', 'Pred_Brute_Force'],
                   index=['True_Normal', 'True_Suspicious', 'True_Bypass', 'True_Brute_Force']))

# Step 6: Save the model and vectorizer
print("Saving the Naive Bayes model and vectorizer...")
joblib.dump(vectorizer, 'vectorizer_nb.pkl')
joblib.dump(nb_model, 'naive_bayes_model.pkl')

print("Naive Bayes model and vectorizer saved successfully.")

Loading and processing the logs...
Extracting features using TF-IDF...


Transforming logs: 100%|██████████| 6350/6350 [00:00<00:00, 55354.07it/s]

Splitting data into training and testing sets...





Training Naive Bayes model...


Fitting Naive Bayes model: 100%|██████████| 1/1 [00:00<00:00, 162.32it/s]

Evaluating the model...
Validation Accuracy: 0.8798
Classification Report:
              precision    recall  f1-score   support

           0       0.87      0.98      0.92      1334
           1       0.99      0.97      0.98       346
           2       1.00      0.09      0.16       211
           3       0.36      1.00      0.53        14

    accuracy                           0.88      1905
   macro avg       0.80      0.76      0.65      1905
weighted avg       0.90      0.88      0.84      1905

Confusion Matrix:
                  Pred_Normal  Pred_Suspicious  Pred_Bypass  Pred_Brute_Force
True_Normal          1307              2             0              25       
True_Suspicious         9            337             0               0       
True_Bypass           190              3            18               0       
True_Brute_Force        0              0             0              14       
Saving the Naive Bayes model and vectorizer...
Naive Bayes model and vectorizer sa




In [64]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix
from scipy.sparse import hstack  # To combine sparse matrices
import joblib
from tqdm import tqdm  # For progress tracking

# Step 1: Load the dataset
print("Loading and processing the logs...")
df = pd.read_csv('log_analysis_output2.csv')  # Ensure pre-processed logs are available

# Step 2: Extract features using TF-IDF
print("Extracting features using TF-IDF...")
vectorizer = TfidfVectorizer(
    ngram_range=(1, 2),  # Only unigrams
    max_features=5000,  # Use a smaller number of features
    stop_words='english'
)

# Transform logs with TF-IDF and extract the count feature
X_text = vectorizer.fit_transform(tqdm(df['logs'], desc="Transforming logs"))
X_count = df[['count']].values  # Extract count as a separate feature

# Combine TF-IDF features with the count feature
X = hstack((X_text, X_count))  # Combine both feature sets
y = df['label']  # Target labels

# Step 3: Split the data into training and testing sets
print("Splitting data into training and testing sets...")
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Step 4: Initialize and train the Logistic Regression model
print("Training Logistic Regression model...")
lr_model = LogisticRegression(max_iter=1000, class_weight='balanced', solver='lbfgs')  # Balanced class weights
with tqdm(total=1, desc="Fitting Logistic Regression model") as pbar:
    lr_model.fit(X_train, y_train)
    pbar.update(1)

# Step 5: Evaluate the model
print("Evaluating the model...")
y_pred = lr_model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f"Validation Accuracy: {accuracy:.4f}")

# Print classification report
print("Classification Report:")
print(classification_report(y_test, y_pred))

# Print confusion matrix
print("Confusion Matrix:")
conf_matrix = confusion_matrix(y_test, y_pred)
print(pd.DataFrame(conf_matrix, 
                   columns=['Pred_Normal', 'Pred_Suspicious', 'Pred_Bypass', 'Pred_Brute_Force'],
                   index=['True_Normal', 'True_Suspicious', 'True_Bypass', 'True_Brute_Force']))

# Step 6: Save the model and vectorizer
print("Saving the Logistic Regression model and vectorizer...")
joblib.dump(vectorizer, 'vectorizer_lr.pkl')
joblib.dump(lr_model, 'logistic_regression_model.pkl')

print("Logistic Regression model and vectorizer saved successfully.")


Loading and processing the logs...
Extracting features using TF-IDF...


Transforming logs: 100%|██████████| 6350/6350 [00:00<00:00, 51602.15it/s]


Splitting data into training and testing sets...
Training Logistic Regression model...


Fitting Logistic Regression model: 100%|██████████| 1/1 [00:00<00:00,  4.11it/s]

Evaluating the model...
Validation Accuracy: 0.9932
Classification Report:
              precision    recall  f1-score   support

           0       1.00      0.99      1.00      1334
           1       0.99      1.00      1.00       346
           2       0.99      1.00      0.99       211
           3       0.64      1.00      0.78        14

    accuracy                           0.99      1905
   macro avg       0.90      1.00      0.94      1905
weighted avg       0.99      0.99      0.99      1905

Confusion Matrix:
                  Pred_Normal  Pred_Suspicious  Pred_Bypass  Pred_Brute_Force
True_Normal          1321              2              3              8       
True_Suspicious         0            346              0              0       
True_Bypass             0              0            211              0       
True_Brute_Force        0              0              0             14       
Saving the Logistic Regression model and vectorizer...
Logistic Regression model a




In [67]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix
from scipy.sparse import hstack, csr_matrix
import numpy as np
from tqdm import tqdm
import joblib

# Step 1: Load the dataset
print("Loading and processing the logs...")
df = pd.read_csv('log_analysis_output2.csv')  # Ensure pre-processed logs are available

# Step 2: Extract features using TF-IDF
print("Extracting features using TF-IDF...")
vectorizer = TfidfVectorizer(
    ngram_range=(1, 2),  # Unigrams and bigrams
    max_features=5000,  # Limit the number of features
    stop_words='english'  # Remove common stopwords
)

# Transform logs with TF-IDF and extract the count feature
X_text = vectorizer.fit_transform(tqdm(df['logs'], desc="Transforming logs"))
X_count = df[['count']].values  # Extract count as a separate feature
X = hstack((X_text, X_count)).tocsr()  # Combine and convert to CSR format
y = df['label']  # Target labels

# Step 3: Initialize the Random Forest model
print("Initializing Random Forest model...")
rf_model = RandomForestClassifier(n_estimators=100, class_weight='balanced', random_state=42)  # Balanced class weights

# Step 4: Perform K-Fold Cross-Validation
print("Performing K-Fold Cross-Validation...")
kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

accuracy_scores = []
conf_matrices = []

for fold, (train_idx, test_idx) in enumerate(kfold.split(X, y)):
    print(f"Fold {fold + 1}:")
    # Split the data
    X_train, X_test = X[train_idx], X[test_idx]
    y_train, y_test = y[train_idx], y[test_idx]
    
    # Train the model
    rf_model.fit(X_train, y_train)
    
    # Evaluate the model
    y_pred = rf_model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    accuracy_scores.append(accuracy)
    
    # Print results for the current fold
    print(f"Fold {fold + 1} Accuracy: {accuracy:.4f}")
    print("Classification Report:")
    print(classification_report(y_test, y_pred))
    
    # Confusion matrix for this fold
    conf_matrix = confusion_matrix(y_test, y_pred)
    conf_matrices.append(conf_matrix)

# Step 5: Compute Average Accuracy and Confusion Matrix
average_accuracy = np.mean(accuracy_scores)
print(f"\nAverage Validation Accuracy across folds: {average_accuracy:.4f}")

# Combine confusion matrices
final_conf_matrix = np.sum(conf_matrices, axis=0)
print("\nFinal Confusion Matrix (aggregated over all folds):")
print(pd.DataFrame(final_conf_matrix, 
                   columns=['Pred_Normal', 'Pred_Suspicious', 'Pred_Bypass', 'Pred_Brute_Force'],
                   index=['True_Normal', 'True_Suspicious', 'True_Bypass', 'True_Brute_Force']))

# Step 6: Save the model and vectorizer
print("Saving the Random Forest model and vectorizer...")
joblib.dump(vectorizer, 'vectorizer_kfold_rf.pkl')
joblib.dump(rf_model, 'random_forest_kfold_model.pkl')

print("Random Forest model and vectorizer saved successfully.")


Loading and processing the logs...
Extracting features using TF-IDF...


Transforming logs: 100%|██████████| 6350/6350 [00:00<00:00, 70037.24it/s]


Initializing Random Forest model...
Performing K-Fold Cross-Validation...
Fold 1:
Fold 1 Accuracy: 0.9921
Classification Report:
              precision    recall  f1-score   support

           0       0.99      1.00      0.99       886
           1       1.00      0.99      1.00       249
           2       1.00      1.00      1.00       127
           3       0.50      0.12      0.20         8

    accuracy                           0.99      1270
   macro avg       0.87      0.78      0.80      1270
weighted avg       0.99      0.99      0.99      1270

Fold 2:
Fold 2 Accuracy: 0.9937
Classification Report:
              precision    recall  f1-score   support

           0       0.99      1.00      1.00       887
           1       1.00      1.00      1.00       248
           2       1.00      1.00      1.00       127
           3       0.67      0.25      0.36         8

    accuracy                           0.99      1270
   macro avg       0.91      0.81      0.84      1270
w