In [1]:
import re
from datetime import datetime
from pathlib import Path

import pandas as pd

In [3]:
# Relative path to the raw access log that the parsing cell opens
log_path = '../data/raw_logs/promjetDec2021.log'
log_path  # echo the path as the cell output

'../data/raw_logs/promjetDec2021.log'

In [4]:
# Regular expression for Combined Log Format:
#   host ident authuser [timestamp] "method url protocol" status bytes "referrer" "user_agent"
# The ident and authuser fields are matched with \S+ (and not captured) rather
# than a literal "- -", so requests made by authenticated users are not
# discarded; the set of named groups is unchanged.
# Lines whose request field is not "<method> <url> <protocol>" still do not match.
log_pattern = re.compile(
    r'(?P<ip>\S+) \S+ \S+ \[(?P<timestamp>.*?)\] "(?P<method>\S+) (?P<url>\S+) \S+" '
    r'(?P<status>\d{3}) (?P<bytes>\S+) "(?P<referrer>.*?)" "(?P<user_agent>.*?)"'
)

In [5]:
# Load and parse log lines.
# Every line that matches log_pattern becomes one dict in `records`; lines that
# do not match, or whose bytes/timestamp fields cannot be converted, are counted
# in `skipped_lines` instead of being dropped silently.
records = []
skipped_lines = 0
with open(log_path, 'r', encoding='utf-8', errors='ignore') as f:
    for line in f:
        match = log_pattern.match(line)
        if match is None:
            skipped_lines += 1
            continue

        data = match.groupdict()
        try:
            # A "-" bytes field is stored as 0
            data['bytes'] = 0 if data['bytes'] == '-' else int(data['bytes'])

            # Parse timestamp to a timezone-aware datetime.
            # NOTE: %b matches month abbreviations of the current locale.
            data['timestamp'] = datetime.strptime(data['timestamp'], "%d/%b/%Y:%H:%M:%S %z")
        except ValueError:
            # Only conversion failures are expected here; anything else should surface.
            skipped_lines += 1
            continue

        records.append(data)

print(f"Parsed {len(records)} records; skipped {skipped_lines} unparseable lines")

In [6]:
# Build the DataFrame in one shot from the list of parsed record dicts
df = pd.DataFrame(data=records)

In [7]:
# Derive calendar features from the parsed timestamp column
timestamp_parts = df['timestamp'].dt
df['hour'] = timestamp_parts.hour
df['day'] = timestamp_parts.day          # day of the month
df['weekday'] = timestamp_parts.weekday  # Monday=0 ... Sunday=6

In [8]:
# Request length (URL length), computed with the built-in string accessor
# instead of a per-element Python callback
df['url_length'] = df['url'].str.len()

In [9]:
# Preview the first rows of the parsed frame, including the derived columns
df.head()

Unnamed: 0,ip,timestamp,method,url,status,bytes,referrer,user_agent,hour,day,weekday,url_length
0,63.143.42.249,2021-11-30 15:08:14+03:00,GET,/,200,18648,http://promjet.ru,Mozilla/5.0+(compatible; UptimeRobot/2.0; http...,15,30,1,1
1,185.103.167.218,2021-11-30 15:10:19+03:00,GET,/jet/company/875.html,404,70695,-,Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/53...,15,30,1,21
2,63.143.42.247,2021-11-30 15:10:34+03:00,HEAD,/,200,0,http://promjet.ru,Mozilla/5.0+(compatible; UptimeRobot/2.0; http...,15,30,1,1
3,93.84.69.87,2021-11-30 15:12:47+03:00,GET,/favicon.ico,200,0,-,Mozilla/5.0 (Windows NT 5.1; rv:52.0) Gecko/20...,15,30,1,12
4,93.84.69.87,2021-11-30 15:12:49+03:00,GET,/favicon.ico,200,0,-,Mozilla/5.0 (Windows NT 5.1; rv:52.0) Gecko/20...,15,30,1,12


In [11]:
# Save parsed logs to CSV for further use.
# to_csv does not create missing directories, so make sure the target
# directory exists before writing.
output_path = Path('../data/processed/parsed_logs.csv')
output_path.parent.mkdir(parents=True, exist_ok=True)
df.to_csv(output_path, index=False)