Creating a simple dataset.

In [1]:
%%writefile weblogs.txt
# Date, Time, IP, Method, URL, Status, ResponseSize
2025-10-10,12:01:32,192.168.1.2,GET,/index.html,200,1024
2025-10-10,12:01:33,192.168.1.3,GET,/products.html,200,850
2025-10-10,12:01:35,192.168.1.4,GET,/contact.html,404,512
2025-10-10,12:01:38,192.168.1.5,POST,/checkout,500,128
2025-10-10,12:01:41,192.168.1.6,GET,/index.html,200,1024
2025-10-10,12:01:45,192.168.1.7,GET,/images/logo.png,200,256
2025-10-10,12:01:48,192.168.1.8,GET,/about.html,404,512
2025-10-10,12:01:53,192.168.1.9,POST,/login,403,64
2025-10-10,12:02:01,192.168.1.10,GET,/index.html,200,1024
2025-10-10,12:02:07,192.168.1.11,POST,/checkout,500,128
2025-10-10,12:02:12,192.168.1.12,GET,/contact.html,404,512
2025-10-10,12:02:15,192.168.1.13,GET,/index.html,200,1024
2025-10-10,12:02:21,192.168.1.14,GET,/products.html,200,850
2025-10-10,12:02:23,192.168.1.15,GET,/about.html,404,512
2025-10-10,12:02:29,192.168.1.16,POST,/checkout,500,128
2025-10-10,12:02:31,192.168.1.17,GET,/images/logo.png,200,256
2025-10-10,12:02:34,192.168.1.18,GET,/contact.html,404,512
2025-10-10,12:02:38,192.168.1.19,POST,/login,403,64
2025-10-10,12:02:41,192.168.1.20,GET,/index.html,200,1024
2025-10-10,12:02:47,192.168.1.21,GET,/products.html,200,850


Writing weblogs.txt


Implement the Mapper

In [2]:
def mapper(line):
    """Map one raw log line to a ``(status, 1)`` pair.

    Args:
        line: A single line of text, expected to hold seven
            comma-separated fields with the status in the sixth position.

    Returns:
        A one-element list ``[(status, 1)]``, or an empty list when the line
        does not split into exactly seven fields or its first field starts
        with ``#`` (the header/comment row).
    """
    columns = line.strip().split(',')
    is_comment = columns[0].startswith('#')
    if is_comment or len(columns) != 7:
        return []
    return [(columns[5].strip(), 1)]

Shuffle Phase

In [3]:
from collections import defaultdict

def shuffle(mapped_data):
    """Group mapped ``(key, value)`` pairs by key.

    Args:
        mapped_data: An iterable of two-item ``(key, value)`` pairs.

    Returns:
        A ``defaultdict(list)`` mapping each key to the list of its values,
        in the order the pairs were encountered.
    """
    buckets = defaultdict(list)
    for emitted_key, emitted_value in mapped_data:
        bucket = buckets[emitted_key]
        bucket.append(emitted_value)
    return buckets


Reducer Phase

In [4]:
def reducer(grouped_data):
    """Collapse each key's list of values into their sum.

    Args:
        grouped_data: A mapping from key to an iterable of numeric values.

    Returns:
        A plain dict mapping every key to ``sum`` of its values, in the same
        key order as ``grouped_data``.
    """
    totals = {}
    for key, values in grouped_data.items():
        totals[key] = sum(values)
    return totals


Combine the Phases

In [5]:
# Map: feed every line of the log through mapper and flatten the emitted pairs.
with open("weblogs.txt", "r") as log_file:
    mapped = [pair for record in log_file for pair in mapper(record)]

# Shuffle the pairs by status code, then reduce each group to a count.
grouped = shuffle(mapped)
reduced = reducer(grouped)

# Report in ascending numeric order of the status code.
for code in sorted(reduced, key=int):
    count = reduced[code]
    print(f"HTTP {code}: {count} requests")


HTTP 200: 10 requests
HTTP 403: 2 requests
HTTP 404: 5 requests
HTTP 500: 3 requests


In [6]:
def mapper_url(line):
    """Map one raw log line to a ``(url, 1)`` pair.

    Args:
        line: A single line of text, expected to hold seven
            comma-separated fields with the URL in the fifth position.

    Returns:
        A one-element list ``[(url, 1)]``, or an empty list when the line
        does not split into exactly seven fields or its first field starts
        with ``#``.
    """
    parts = line.strip().split(',')
    well_formed = len(parts) == 7 and not parts[0].startswith('#')
    if well_formed:
        return [(parts[4].strip(), 1)]
    return []

# Map: feed every line of the log through mapper_url and flatten the pairs.
with open("weblogs.txt", "r") as log_file:
    mapped_url = [pair for record in log_file for pair in mapper_url(record)]

# Shuffle the pairs by URL, then reduce each group to a request count.
grouped_url = shuffle(mapped_url)
reduced_url = reducer(grouped_url)

# Most-requested URLs first; the sort is stable, so URLs with equal counts
# stay in the order they first appeared in the log.
ranked_urls = sorted(reduced_url.items(), key=lambda entry: -entry[1])
for url, count in ranked_urls:
    print(f"{url}: {count} requests")

/index.html: 5 requests
/products.html: 3 requests
/contact.html: 3 requests
/checkout: 3 requests
/images/logo.png: 2 requests
/about.html: 2 requests
/login: 2 requests


In [7]:
def mapper_size(line):
    """Map one raw log line to a ``(status, response_size)`` pair.

    Args:
        line: A single line of text, expected to hold seven
            comma-separated fields with the status in the sixth position and
            the response size in the seventh.

    Returns:
        A one-element list ``[(status, size)]`` where ``size`` is an int, or
        an empty list when the line does not split into exactly seven fields
        or its first field starts with ``#``. A size that cannot be parsed as
        an integer is counted as 0 rather than dropping the record.
    """
    fields = line.strip().split(',')
    if len(fields) != 7 or fields[0].startswith('#'):
        return []
    status = fields[5].strip()
    try:
        size = int(fields[6].strip())
    except ValueError:
        # Only a malformed number is expected here; catching ValueError
        # (instead of a bare except) lets KeyboardInterrupt and genuine
        # programming errors propagate.
        size = 0
    return [(status, size)]

# Map: feed every line of the log through mapper_size and flatten the pairs.
with open("weblogs.txt", "r") as log_file:
    mapped_size = [pair for record in log_file for pair in mapper_size(record)]

# Shuffle the sizes by status code, then reduce each group to its total.
grouped_size = shuffle(mapped_size)
reduced_size = reducer(grouped_size)

# Report in ascending numeric order of the status code.
for status in sorted(reduced_size, key=int):
    total_size = reduced_size[status]
    print(f"HTTP {status}: total response size = {total_size}")

HTTP 200: total response size = 8182
HTTP 403: total response size = 128
HTTP 404: total response size = 2560
HTTP 500: total response size = 384


In [None]:
def mapper_errors(line):
    """Map one raw log line to a ``(status, 1)`` pair if it is an HTTP error.

    Only client (4xx) and server (5xx) error statuses are emitted. Successful
    (2xx), redirect (3xx) and informational (1xx) responses are not errors
    and are skipped, not just the literal status ``200``.

    Args:
        line: A single line of text, expected to hold seven
            comma-separated fields with the status in the sixth position.

    Returns:
        A one-element list ``[(status, 1)]`` for a 4xx/5xx status, otherwise
        an empty list. Lines that do not split into exactly seven fields,
        whose first field starts with ``#``, or whose status is not an
        integer are skipped.
    """
    fields = line.strip().split(',')
    if len(fields) != 7 or fields[0].startswith('#'):
        return []
    status = fields[5].strip()
    try:
        code = int(status)
    except ValueError:
        # A non-numeric status is a malformed record; skip it like any other
        # malformed line instead of reporting it as an error response.
        return []
    if not 400 <= code <= 599:
        return []
    return [(status, 1)]

# Map: feed every line of the log through mapper_errors and flatten the pairs.
with open("weblogs.txt", "r") as log_file:
    mapped_errors = [pair for record in log_file for pair in mapper_errors(record)]

# Shuffle the emitted pairs by status code, then reduce each group to a count.
grouped_errors = shuffle(mapped_errors)
reduced_errors = reducer(grouped_errors)

# Report in ascending numeric order of the status code.
for status in sorted(reduced_errors, key=int):
    count = reduced_errors[status]
    print(f"HTTP {status}: {count} error requests")



HTTP 403: 2 error requests
HTTP 404: 5 error requests
HTTP 500: 3 error requests
