Preprocessing Without Extraction 

In [1]:
pip install chardet

Collecting chardet
  Using cached chardet-5.2.0-py3-none-any.whl.metadata (3.4 kB)
Using cached chardet-5.2.0-py3-none-any.whl (199 kB)
Installing collected packages: chardet
Successfully installed chardet-5.2.0
Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 24.3.1 -> 25.0.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [4]:
import pandas as pd
import re
import os

# Define the path for input and output files
input_file = r'E:\Sanskar\Fourth Year\Final Project\FyI Project\Preprocessing\Log_Extraction\Extracted_Log_Files\SystemLog_2025-02-14_00-36-47.csv'  # Replace with the path to your log file
output_folder = r'E:\Sanskar\Fourth Year\Final Project\FyI Project\Preprocessing\Log_Extraction\Preprocessed Log Files'  # Replace with the path to your desired output folder
output_file = os.path.join(output_folder, 'modified_log_file.csv')

# Rows whose 'Level' is listed here are discarded before any other processing.
EXCLUDED_LEVELS = ['Information', 'Warning']

# Define keywords and create a regex pattern
keywords = [
    'success', 'successful', 'status success', 'operation successful',
    'operation success', 'completed successfully', 'successfully completed',
    'task success', 'process success', 'execution success',
    'execution successful', 'transaction success'
]
pattern = '|'.join(keywords)

# Threshold written for each level; any level not listed gets an empty value.
LEVEL_THRESHOLDS = {'Error': 0.8, 'Critical': 0.6}

# Runs of non-ASCII characters, replaced by a single space.
NON_ASCII_RE = re.compile(r'[^\x00-\x7F]+')
# Punctuation replaced by a space. The hyphen is escaped on purpose: written as
# '-_ inside a character class it is a range (0x27-0x5F) that also matches
# every digit and every uppercase letter.
SPECIAL_CHARS_RE = re.compile(r'[:;\'\-_"<>./?\\|]')
# Any run of whitespace, collapsed to one space.
WHITESPACE_RE = re.compile(r'\s+')


def _clean_message(value):
    """Return one message with non-ASCII runs and listed punctuation blanked out.

    Missing values are returned unchanged so they stay missing instead of
    becoming the literal text 'nan'.
    """
    if pd.isna(value):
        return value
    text = NON_ASCII_RE.sub(' ', str(value))
    text = SPECIAL_CHARS_RE.sub(' ', text)
    return WHITESPACE_RE.sub(' ', text).strip()


def clean_log_frame(raw_df):
    """Return a cleaned copy of a log DataFrame; the input is not modified.

    Steps, in order:
      1. strip surrounding whitespace from column names;
      2. drop the 'Task Category' column when present;
      3. drop rows whose 'Level' is in EXCLUDED_LEVELS (skipped, with a
         printed notice, when there is no 'Level' column);
      4. drop 'Information' rows whose message matches `pattern`;
      5. normalise 'Message' text (see `_clean_message`);
      6. drop rows with a duplicate 'Message', keeping the first;
      7. add a 'Threshold' column from LEVEL_THRESHOLDS when 'Level' exists.

    Raises:
        KeyError: if the frame has no 'Message' column.
    """
    cleaned = raw_df.copy()
    cleaned.columns = cleaned.columns.str.strip()
    cleaned = cleaned.drop(columns=['Task Category'], errors='ignore')

    if 'Message' not in cleaned.columns:
        raise KeyError("'Message' column not found in the CSV file.")

    has_level = 'Level' in cleaned.columns
    if has_level:
        cleaned = cleaned[~cleaned['Level'].isin(EXCLUDED_LEVELS)]
        # With 'Information' in EXCLUDED_LEVELS this filter cannot match any
        # row; it only takes effect if 'Information' is removed from that list.
        success_info = (cleaned['Level'] == 'Information') & (
            cleaned['Message'].astype(str).str.contains(pattern, case=False, na=False)
        )
        cleaned = cleaned[~success_info]
    else:
        print("'Level' column not found in the CSV file.")

    # assign() builds a new frame, so nothing is written into a filtered slice.
    cleaned = cleaned.assign(Message=cleaned['Message'].apply(_clean_message))
    cleaned = cleaned.drop_duplicates(subset=['Message'], keep='first')

    if has_level:
        cleaned = cleaned.assign(Threshold=cleaned['Level'].map(LEVEL_THRESHOLDS))
    return cleaned


# A notebook kernel runs cells as __main__, so this section executes normally
# there; the guard only keeps the helpers above importable without touching
# the filesystem.
if __name__ == "__main__":
    # Ensure the output folder exists
    os.makedirs(output_folder, exist_ok=True)

    # Read the CSV log file
    raw_df = pd.read_csv(input_file)

    # Print column names to check if 'Level' exists
    print("Columns in the CSV file:", raw_df.columns.str.strip())

    df = clean_log_frame(raw_df)

    # Save the processed dataframe to the output CSV file
    df.to_csv(output_file, index=False)

    # Print the path of the saved output file
    print(f"Processed CSV file saved to: {output_file}")


Columns in the CSV file: Index(['Level', 'Date and Time', 'Source', 'Event ID', 'Task Category',
       'Message'],
      dtype='object')
Processed CSV file saved to: E:\Sanskar\Fourth Year\Final Project\FyI Project\Preprocessing\Log_Extraction\Preprocessed Log Files\modified_log_file.csv


Preprocessing With Extraction

In [None]:
import pandas as pd
import re
import os
import time
from datetime import datetime

# Define the folders for input and output files
log_folder = r'E:\Sanskar\Fourth Year\Final Project\FyI Project\Preprocessing\Log_Extraction\Extracted_Log_Files'  # Log extraction folder
output_folder = r'E:\Sanskar\Fourth Year\Final Project\FyI Project\Preprocessing\Log_Extraction\Preprocessed Log Files'  # Output folder

# Seconds to wait between two scans of the log folder (600 seconds = 10 minutes).
POLL_INTERVAL_SECONDS = 600

# Rows whose 'Level' is listed here are discarded before any other processing.
EXCLUDED_LEVELS = ['Information', 'Warning']

# Define keywords and create a regex pattern
keywords = [
    'success', 'successful', 'status success', 'operation successful',
    'operation success', 'completed successfully', 'successfully completed',
    'task success', 'process success', 'execution success',
    'execution successful', 'transaction success'
]
pattern = '|'.join(keywords)

# Threshold written for each level; any level not listed gets an empty value.
LEVEL_THRESHOLDS = {'Error': 0.8, 'Critical': 0.6}

# Runs of non-ASCII characters, replaced by a single space.
NON_ASCII_RE = re.compile(r'[^\x00-\x7F]+')
# Punctuation replaced by a space. The hyphen is escaped on purpose: written as
# '-_ inside a character class it is a range (0x27-0x5F) that also matches
# every digit and every uppercase letter.
SPECIAL_CHARS_RE = re.compile(r'[:;\'\-_"<>./?\\|]')
# Any run of whitespace, collapsed to one space.
WHITESPACE_RE = re.compile(r'\s+')


def _clean_message(value):
    """Return one message with non-ASCII runs and listed punctuation blanked out.

    Missing values are returned unchanged so they stay missing instead of
    becoming the literal text 'nan'.
    """
    if pd.isna(value):
        return value
    text = NON_ASCII_RE.sub(' ', str(value))
    text = SPECIAL_CHARS_RE.sub(' ', text)
    return WHITESPACE_RE.sub(' ', text).strip()


def clean_log_frame(raw_df):
    """Return a cleaned copy of a log DataFrame; the input is not modified.

    Steps, in order:
      1. strip surrounding whitespace from column names;
      2. drop the 'Task Category' column when present;
      3. drop rows whose 'Level' is in EXCLUDED_LEVELS (skipped, with a
         printed notice, when there is no 'Level' column);
      4. drop 'Information' rows whose message matches `pattern`;
      5. normalise 'Message' text (see `_clean_message`);
      6. drop rows with a duplicate 'Message', keeping the first;
      7. add a 'Threshold' column from LEVEL_THRESHOLDS when 'Level' exists.

    Raises:
        KeyError: if the frame has no 'Message' column.
    """
    cleaned = raw_df.copy()
    cleaned.columns = cleaned.columns.str.strip()
    cleaned = cleaned.drop(columns=['Task Category'], errors='ignore')

    if 'Message' not in cleaned.columns:
        raise KeyError("'Message' column not found in the CSV file.")

    has_level = 'Level' in cleaned.columns
    if has_level:
        cleaned = cleaned[~cleaned['Level'].isin(EXCLUDED_LEVELS)]
        # With 'Information' in EXCLUDED_LEVELS this filter cannot match any
        # row; it only takes effect if 'Information' is removed from that list.
        success_info = (cleaned['Level'] == 'Information') & (
            cleaned['Message'].astype(str).str.contains(pattern, case=False, na=False)
        )
        cleaned = cleaned[~success_info]
    else:
        print("'Level' column not found in the CSV file.")

    # assign() builds a new frame, so nothing is written into a filtered slice.
    cleaned = cleaned.assign(Message=cleaned['Message'].apply(_clean_message))
    cleaned = cleaned.drop_duplicates(subset=['Message'], keep='first')

    if has_level:
        cleaned = cleaned.assign(Threshold=cleaned['Level'].map(LEVEL_THRESHOLDS))
    return cleaned


def find_latest_log(folder, prefix="SystemLog"):
    """Return the path of the most recently modified file in `folder` whose
    name starts with `prefix`, or None when there is no such file.

    Only regular files are considered, so a sub-folder with a matching name is
    never handed to the CSV reader.
    """
    candidates = [
        os.path.join(folder, name)
        for name in os.listdir(folder)
        if name.startswith(prefix) and os.path.isfile(os.path.join(folder, name))
    ]
    if not candidates:
        return None
    return max(candidates, key=os.path.getmtime)


# A notebook kernel runs cells as __main__, so this section executes normally
# there; the guard only keeps the helpers above importable without starting
# the polling loop.
if __name__ == "__main__":
    # Ensure the output folder exists
    os.makedirs(output_folder, exist_ok=True)

    # (path, modification time) of the last file that was processed; used to
    # avoid writing a new identical output every cycle for an unchanged input.
    last_processed = None

    while True:
        # Get the most recent log file generated by the first script
        input_file = find_latest_log(log_folder)
        if input_file is None:
            print("No log files found.")
            break

        signature = (input_file, os.path.getmtime(input_file))
        if signature == last_processed:
            print(f"No new log file since the last run; skipped: {input_file}")
        else:
            timestamp = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
            output_file = os.path.join(output_folder, f"Modified_{timestamp}.csv")

            # Read, clean and save the log file
            df = clean_log_frame(pd.read_csv(input_file))
            df.to_csv(output_file, index=False)

            # Print the path of the saved output file
            print(f"Processed CSV file saved to: {output_file}")
            last_processed = signature

        # Wait before scanning the folder again
        time.sleep(POLL_INTERVAL_SECONDS)


Processed CSV file saved to: E:\Sanskar\Fourth Year\Final Project\FyI Project\Preprocessing\Log_Extraction\Preprocessed Log Files\Modified_2025-02-14_01-10-48.csv
