In [1]:
pip install nltk

Note: you may need to restart the kernel to use updated packages.


In [None]:
import pandas as pd
import re

# Log-cleaning cell: load the exported log CSV, drop rows that are not of
# interest, normalise the message text, add a per-level threshold and save the
# result as a new CSV.

# List of keywords related to success or status success
keywords = ['success', 'successful', 'status success', 'operation successful', 'operation success',
            'completed successfully', 'successfully completed', 'task success', 'process success', 
            'execution success', 'execution successful', 'transaction success']

# Create a regular expression pattern from the keywords
pattern = '|'.join(keywords)

# Characters replaced by a single space in 'Original Message':  : , ; ' - _ " < > . / ? \ |
# The hyphen is escaped on purpose: written unescaped between ' and _ it forms
# the range U+0027..U+005F, which also matches every digit and every uppercase
# ASCII letter and therefore destroyed most of the message text.
SPECIAL_CHARS_RE = re.compile(r'[:,;\'\-_"<>./?\\|]')

# Threshold assigned to each 'Level' value; any other level gets a missing value.
LEVEL_THRESHOLDS = {'Error': 0.8, 'Critical': 0.6}


def clean_log_frame(df):
    """Return a cleaned copy of a raw log DataFrame.

    Steps, in order:
      1. Rename the last column to 'Original Message'.
      2. Drop the 'Task Category' column if present.
      3. Drop rows whose 'Level' is "Information" or "Warning".
      4. Drop "Information" rows whose message mentions a success keyword.
      5. Replace the characters in SPECIAL_CHARS_RE with a space (one space
         per character; runs are not collapsed).
      6. Drop duplicate messages, keeping the first occurrence.
      7. Add a 'Threshold' column derived from 'Level'.

    Steps that need a missing column are skipped with a printed message.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw log table with at least one column. It is not modified.

    Returns
    -------
    pandas.DataFrame
        The cleaned table.
    """
    # Relabel by position on a copy. Writing into `df.columns.values` mutates
    # the Index's backing array in place, which pandas does not support.
    df = df.copy()
    new_columns = list(df.columns)
    new_columns[-1] = 'Original Message'
    df.columns = new_columns

    # Drop the 'Task Category' column if it exists
    if 'Task Category' in df.columns:
        df = df.drop(columns=['Task Category'])
    else:
        print("'Task Category' column not found. Skipping drop operation.")

    # Drop rows with 'Level' as "Information" or "Warning"
    if 'Level' in df.columns:
        df = df[~df['Level'].isin(['Information', 'Warning'])]
    else:
        print("Error: 'Level' column not found. Skipping row filtering operation.")

    # Remove rows where 'Level' is "Information" and the message contains any
    # of the keywords (case insensitive).
    # NOTE(review): the filter above already removed every "Information" row,
    # so this step cannot match anything; kept to preserve the original flow.
    if 'Level' in df.columns and 'Original Message' in df.columns:
        mentions_success = (
            df['Original Message'].astype(str).str.contains(pattern, case=False, na=False)
        )
        df = df[~((df['Level'] == 'Information') & mentions_success)]
    else:
        print("Error: 'Level' or 'Original Message' column not found. Skipping row filtering operation.")

    # Own the filtered rows before assigning columns, so the assignments below
    # never act on a slice of another frame.
    df = df.copy()

    # Replace special characters with a single space in 'Original Message'
    df['Original Message'] = df['Original Message'].map(
        lambda value: SPECIAL_CHARS_RE.sub(' ', str(value))
    )

    # Remove duplicate messages while keeping the first occurrence
    df = df.drop_duplicates(subset=['Original Message'], keep='first')

    # Add a 'Threshold' column based on the 'Level' values
    if 'Level' in df.columns:
        df['Threshold'] = df['Level'].map(LEVEL_THRESHOLDS)
    else:
        print("Error: 'Level' column not found. Skipping 'threshold' column creation.")

    return df


# In a notebook kernel __name__ is "__main__", so this section runs whenever
# the cell is executed; the guard only keeps the helper above importable
# without touching the file system.
if __name__ == "__main__":
    # Read the CSV file with ISO-8859-1 encoding
    file_path = r'C:\Users\nikha\OneDrive\Desktop\newtry\system.csv'
    try:
        df = pd.read_csv(file_path, encoding='ISO-8859-1')
    except FileNotFoundError:
        print(f"Error: The file at {file_path} was not found.")
        # Re-raise so the cell stops here instead of calling exit().
        raise

    df = clean_log_frame(df)

    # Save the modified dataframe to a new CSV file
    output_path = r'C:\Users\nikha\OneDrive\Desktop\newtry\modified_system.csv'
    try:
        df.to_csv(output_path, index=False)
        print(f"The modified CSV file has been saved successfully to: {output_path}")
    except Exception as e:
        print(f"Error while saving the file: {e}")

'Task Category' column not found. Skipping drop operation.
Error: 'Level' column not found. Skipping row filtering operation.
Error: 'Level' or 'Original Message' column not found. Skipping row filtering operation.
Error: 'Level' column not found. Skipping 'threshold' column creation.
The modified CSV file has been saved successfully to: C:\Users\nikha\OneDrive\Desktop\newtry\modified_system.csv


In [4]:
import os
import csv

# Chunking cell: split the preprocessed log CSV into several smaller CSV files,
# each carrying the header row, so that every file holds roughly `chunk_size`
# bytes of row data.


def _write_chunk(output_dir, file_number, header, rows):
    """Write the header plus `rows` to log_chunk_<file_number>.csv and return its path."""
    chunk_file_path = os.path.join(output_dir, f"log_chunk_{file_number}.csv")
    with open(chunk_file_path, "w", newline="", encoding="utf-8") as chunk_file:
        writer = csv.writer(chunk_file)
        writer.writerow(header)  # Write the header to maintain structure
        writer.writerows(rows)  # Write the rows of this chunk
    print(f"Chunk {file_number} saved as {chunk_file_path}")
    return chunk_file_path


def split_csv_into_chunks(input_path, output_dir, max_chunk_bytes):
    """Split a UTF-8 CSV file into numbered chunk files.

    Rows are accumulated until adding the next one would push the estimated
    size past `max_chunk_bytes`; the accumulated rows are then written to
    `log_chunk_<n>.csv` (n starts at 1) together with the header row.

    The size estimate is the UTF-8 length of each field plus one byte per
    field for the separators; the header and any CSV quoting are not counted,
    so files on disk can be somewhat larger than `max_chunk_bytes`. A single
    row larger than the limit is written as a chunk of its own.

    Parameters
    ----------
    input_path : str
        CSV file to split; its first row is treated as the header.
    output_dir : str
        Folder for the chunk files; created if it does not exist.
    max_chunk_bytes : int
        Desired upper bound on the estimated row data per chunk.

    Returns
    -------
    list of str
        Paths of the chunk files written, in order. Empty when the input has
        no header row or no data rows.
    """
    os.makedirs(output_dir, exist_ok=True)  # Create the folder if it doesn't exist
    written_paths = []

    # newline="" lets the csv module handle line endings itself, as its
    # documentation requires for files passed to csv.reader.
    with open(input_path, "r", newline="", encoding="utf-8") as csv_file:
        reader = csv.reader(csv_file)
        header = next(reader, None)  # Read the header row
        if header is None:
            # Completely empty input: nothing to split.
            return written_paths

        chunk_rows = []  # Rows collected for the current chunk
        current_chunk_size = 0

        for row in reader:
            # Estimated size of the row: field bytes plus one separator per field
            row_size = sum(len(str(item).encode("utf-8")) for item in row) + len(row)

            # Flush only a non-empty buffer; otherwise a row that is larger
            # than the limit on its own would produce a header-only file.
            if chunk_rows and current_chunk_size + row_size > max_chunk_bytes:
                written_paths.append(
                    _write_chunk(output_dir, len(written_paths) + 1, header, chunk_rows)
                )
                chunk_rows = []
                current_chunk_size = 0

            chunk_rows.append(row)
            current_chunk_size += row_size

        # Write the remaining rows as the last chunk
        if chunk_rows:
            written_paths.append(
                _write_chunk(output_dir, len(written_paths) + 1, header, chunk_rows)
            )

    return written_paths


# In a notebook kernel __name__ is "__main__", so this section runs whenever
# the cell is executed; the guard only keeps the helpers above importable
# without touching the file system.
if __name__ == "__main__":
    # File path for the preprocessed log file
    # NOTE(review): the cleaning cell saves modified_system.csv under
    # Desktop/newtry, while this path points to a different folder - confirm
    # that this is the file meant to be split.
    file_path = r"C:\Users\nikha\OneDrive\Desktop\FyI Project\Preprocessing\Updated Methods\modified_system.csv"

    # Desired chunk size in bytes (1 KB = 1024 bytes)
    chunk_size = 1024

    # Output directory for chunks
    output_folder = r"C:\Users\nikha\OneDrive\Desktop\FyI Project\Preprocessing\newtry\Chunk_Output"

    split_csv_into_chunks(file_path, output_folder, chunk_size)


Chunk 1 saved as C:\Users\nikha\OneDrive\Desktop\FyI Project\Preprocessing\newtry\Chunk_Output\log_chunk_1.csv
Chunk 2 saved as C:\Users\nikha\OneDrive\Desktop\FyI Project\Preprocessing\newtry\Chunk_Output\log_chunk_2.csv
Chunk 3 saved as C:\Users\nikha\OneDrive\Desktop\FyI Project\Preprocessing\newtry\Chunk_Output\log_chunk_3.csv
Chunk 4 saved as C:\Users\nikha\OneDrive\Desktop\FyI Project\Preprocessing\newtry\Chunk_Output\log_chunk_4.csv
Chunk 5 saved as C:\Users\nikha\OneDrive\Desktop\FyI Project\Preprocessing\newtry\Chunk_Output\log_chunk_5.csv
Chunk 6 saved as C:\Users\nikha\OneDrive\Desktop\FyI Project\Preprocessing\newtry\Chunk_Output\log_chunk_6.csv
Chunk 7 saved as C:\Users\nikha\OneDrive\Desktop\FyI Project\Preprocessing\newtry\Chunk_Output\log_chunk_7.csv
