# In [None]:
import nltk
import pandas as pd
import re
import time
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import IsolationForest
from nltk.corpus import stopwords

# Download required NLTK resources.
# 'stopwords' backs stopwords.words('english'). nltk.word_tokenize additionally
# needs the Punkt tokenizer models, which are packaged as 'punkt' on older NLTK
# releases and as 'punkt_tab' on newer ones, so both are requested. By default
# nltk.download reports an id it cannot find and returns False instead of
# raising, so the id that does not apply to the installed release is harmless.
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('punkt_tab')

# Sample log messages used as the demo input for main().
# (In a real scenario, these would be read from log files.)
log_data = [
    "User login failed due to invalid credentials",
    "System rebooted at 12:15 AM",
    "Error: Database connection timeout",
    "User admin login successful",
    "Error: Insufficient disk space on server",
    "Unauthorized access attempt detected",
    "User login from unknown IP address",
    "Server memory usage exceeded threshold",
    "System rebooted at 03:30 AM",
    "Database connection successful",
    "User login failed multiple times",
    "Suspicious activity detected: Multiple failed login attempts",
    "Server restarted unexpectedly"
]

# Preprocess the log data: tokenization, removing stop words, and cleaning the text
def preprocess_logs(log_data):
    """Normalize raw log lines into stop-word-free, space-joined token strings.

    Each entry is lower-cased, stripped of every character that is not an
    ASCII letter, a digit, or whitespace, tokenized with
    ``nltk.word_tokenize``, and filtered against NLTK's English stop-word
    list.

    Args:
        log_data: Iterable of log message strings.

    Returns:
        A list with one cleaned string per input entry, in input order.
    """
    # Build the stop-word set once so each membership test is a set lookup.
    english_stop_words = set(stopwords.words('english'))

    def _clean(entry):
        # Lower-case first, then drop punctuation and other symbols.
        normalized = re.sub(r'[^A-Za-z0-9\s]', '', entry.lower())
        kept = (
            token
            for token in nltk.word_tokenize(normalized)
            if token not in english_stop_words
        )
        return ' '.join(kept)

    return [_clean(entry) for entry in log_data]

# Extract features using TF-IDF Vectorization
def extract_features(cleaned_logs):
    """Build a TF-IDF feature matrix from preprocessed log strings.

    Args:
        cleaned_logs: Iterable of cleaned log strings, one document each.

    Returns:
        The document-term matrix obtained by fitting a default
        ``TfidfVectorizer`` on ``cleaned_logs`` and transforming them.
    """
    # The fitted vectorizer is discarded; only the transformed matrix is returned.
    tfidf_matrix = TfidfVectorizer().fit_transform(cleaned_logs)
    return tfidf_matrix

# Anomaly detection using Isolation Forest
def detect_anomalies(X, contamination=0.2, random_state=None):
    """Label each row of ``X`` as inlier or outlier with an Isolation Forest.

    The model is fitted on ``X`` and then used to predict on the same ``X``.

    Args:
        X: Feature matrix accepted by ``IsolationForest.fit`` (for example the
            TF-IDF matrix returned by ``extract_features``).
        contamination: Expected proportion of outliers, forwarded to
            ``IsolationForest``. Defaults to 0.2 (20%), the value this
            function previously hard-coded.
        random_state: Seed forwarded to ``IsolationForest``. The default
            ``None`` keeps the previous unseeded behaviour, so results may
            differ between runs; pass an int for reproducible predictions.

    Returns:
        The array returned by ``IsolationForest.predict``: -1 marks an
        outlier, 1 marks an inlier.
    """
    model = IsolationForest(contamination=contamination, random_state=random_state)
    model.fit(X)
    return model.predict(X)

# Flagging anomalous logs
def flag_anomalous_logs(log_data, predictions):
    """Return the log entries whose prediction marks them as anomalous.

    Args:
        log_data: Sequence of log messages, indexable by position.
        predictions: Iterable of labels aligned with ``log_data``; a label
            equal to -1 marks the entry at the same position as an anomaly.

    Returns:
        A list of the flagged entries from ``log_data``, in original order.
    """
    # Index into log_data by position so the pairing follows predictions' order.
    return [
        log_data[position]
        for position, label in enumerate(predictions)
        if label == -1
    ]

# Main function to process logs, extract features, and detect anomalies
def main():
    """Run the sample pipeline end to end and print the flagged log lines.

    Cleans the module-level ``log_data``, vectorizes it, runs anomaly
    detection, and prints either the flagged messages or a notice that
    nothing was flagged.
    """
    # Preprocess -> TF-IDF features -> per-entry inlier/outlier labels.
    features = extract_features(preprocess_logs(log_data))
    labels = detect_anomalies(features)

    # Report against the original (uncleaned) messages.
    anomalous_logs = flag_anomalous_logs(log_data, labels)

    if not anomalous_logs:
        print("No anomalous logs detected.")
        return

    print("Anomalous logs detected:")
    for log in anomalous_logs:
        print(f"- {log}")

# Run the log analyzer only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()