In [1]:
import pandas as pd
import re
import os
from datetime import datetime

class LogTransform:
    def __init__(self, source_path):
        """
        Initialize the LogTransform with source paths.

        :param source_path: Path to the raw log file
        """
        self.source_path = source_path
        self.log_pattern = None

    def read_log_file(self, regex_exp):
        """
        Read and parse the log file based on the defined pattern.

        :param regex_exp: Regex expression of logs
        :return: List of parsed log entries as dictionaries
        """
        # Define the log pattern for parsing
        self.log_pattern = regex_exp

        with open(self.source_path, 'r') as file:
            log_lines = file.readlines()
            parsed_logs = [
                re.match(self.log_pattern, line).groupdict()
                for line in log_lines if re.match(self.log_pattern, line)
            ]
        print(f"Parsed {len(parsed_logs)} log entries.")
        return pd.DataFrame(parsed_logs)

    def transform(self, parsed_logs):
        """
        Transform parsed logs into a structured DataFrame and preprocess the data.

        :param parsed_logs: List of parsed log entries as dictionaries
        :return: Preprocessed DataFrame
        """
        df = parsed_logs
        df['datetime'] = df['datetime'].apply(lambda x: datetime.strptime(x, "%d/%b/%Y:%H:%M:%S %z"))
        df['method'] = df['method'].replace({'GET': 0, 'POST': 1})
        df['bytes_sent'] = pd.to_numeric(df['bytes_sent'], errors='coerce')
        df['status'] = pd.to_numeric(df['status'], errors='coerce')
        df['day'] = df['datetime'].dt.day
        df['hour'] = df['datetime'].dt.hour
        df['month'] = df['datetime'].dt.month
        return df


# Example usage in Colab
if __name__ == "__main__":
    # Upload your log file to Colab
    log_files = [file for file in os.listdir(r'/Users/phamthiphuongthuy/Desktop/Intern/server_log/log/kong_log') if file.endswith('000Z')]
    dir_path = r'/Users/phamthiphuongthuy/Desktop/Intern/server_log/log/kong_log'
    export_path = r'/Users/phamthiphuongthuy/Desktop/Intern/server_log/test_data'
    for filename in log_files:
      print(filename)
      source_path = os.path.join(dir_path, filename)  # This will prompt you to upload your log file

      destination_path = os.path.join(export_path, filename)

      # Define the regex pattern
      log_pattern = r'^(?P<remote_address>\d+\.\d+\.\d+\.\d+) - - \[(?P<datetime>[^\]]+)\] \"(?P<method>\w+) (?P<path>[^\s]+) (?P<header>[^\"]+)\" (?P<status>\d+) (?P<bytes_sent>\d+) \"(?P<referer>[^\"]*)\" \"(?P<user_agent>[^\"]*)\" kong_request_id: \"(?P<kong_request_id>[a-f0-9]+)\"'

      # Instantiate and process logs
      logtransform = LogTransform(source_path)
      df = logtransform.read_log_file(log_pattern)
      df = logtransform.transform(df)

      # Save the parsed logs to a CSV file
      df.to_csv(destination_path, index=False)
      print(f"Parsed logs saved to {destination_path}")

kong-logs-acesss.2024-12-15T15_00_00.000Z-2024-12-15T15_15_00.000Z
Parsed 1950281 log entries.
Parsed logs saved to /Users/phamthiphuongthuy/Desktop/Intern/server_log/test_data/kong-logs-acesss.2024-12-15T15_00_00.000Z-2024-12-15T15_15_00.000Z
kong-logs-acesss.2024-12-15T12_30_00.000Z-2024-12-15T12_45_00.000Z
Parsed 1586962 log entries.
Parsed logs saved to /Users/phamthiphuongthuy/Desktop/Intern/server_log/test_data/kong-logs-acesss.2024-12-15T12_30_00.000Z-2024-12-15T12_45_00.000Z
kong-logs-acesss.2024-12-15T14_45_00.000Z-2024-12-15T15_00_00.000Z
Parsed 2140274 log entries.
Parsed logs saved to /Users/phamthiphuongthuy/Desktop/Intern/server_log/test_data/kong-logs-acesss.2024-12-15T14_45_00.000Z-2024-12-15T15_00_00.000Z
kong-logs-acesss.2024-12-15T12_15_00.000Z-2024-12-15T12_30_00.000Z
Parsed 1693991 log entries.
Parsed logs saved to /Users/phamthiphuongthuy/Desktop/Intern/server_log/test_data/kong-logs-acesss.2024-12-15T12_15_00.000Z-2024-12-15T12_30_00.000Z
kong-logs-acesss.2024-12

  df['method'] = df['method'].replace({'GET': 0, 'POST': 1})


Parsed logs saved to /Users/phamthiphuongthuy/Desktop/Intern/server_log/test_data/kong-logs-acesss.2024-12-15T12_00_00.000Z-2024-12-15T12_15_00.000Z
kong-logs-acesss.2024-12-15T16_00_00.000Z-2024-12-15T16_15_00.000Z
Parsed 1796996 log entries.
Parsed logs saved to /Users/phamthiphuongthuy/Desktop/Intern/server_log/test_data/kong-logs-acesss.2024-12-15T16_00_00.000Z-2024-12-15T16_15_00.000Z
kong-logs-acesss.2024-12-15T00_00_00.000Z-2024-12-15T00_15_00.000Z
Parsed 1952084 log entries.
Parsed logs saved to /Users/phamthiphuongthuy/Desktop/Intern/server_log/test_data/kong-logs-acesss.2024-12-15T00_00_00.000Z-2024-12-15T00_15_00.000Z
kong-logs-acesss.2024-12-15T01_00_00.000Z-2024-12-15T01_15_00.000Z
Parsed 1915265 log entries.
Parsed logs saved to /Users/phamthiphuongthuy/Desktop/Intern/server_log/test_data/kong-logs-acesss.2024-12-15T01_00_00.000Z-2024-12-15T01_15_00.000Z
kong-logs-acesss.2024-12-15T17_00_00.000Z-2024-12-15T17_15_00.000Z
Parsed 1714790 log entries.
Parsed logs saved to /Us