In [1]:
!pip install faker



In [2]:
import csv
from faker import Faker
import random

fake = Faker()

num_records = 100000

http_methods = ['GET', 'POST', 'PUT', 'DELETE']
response_codes = [200, 301, 404, 500]

file_path = "web_server_logs.csv"

with open(file_path, mode='w', newline='') as file:
    writer = csv.writer(file)
    writer.writerow(['ip', 'timestamp', 'method', 'url', 'response_code', 'response_size'])

    for _ in range(num_records):
        ip = fake.ipv4()
        timestamp = fake.date_time_this_year().isoformat()
        method = random.choice(http_methods)
        url = fake.uri_path()
        response_code = random.choice(response_codes)
        response_size = random.randint(100, 10000)

        writer.writerow([ip, timestamp, method, url, response_code, response_size])

print(f"Сгенерировано {num_records} записей и сохранено в {file_path}")

Сгенерировано 100000 записей и сохранено в web_server_logs.csv


In [3]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, count, sum as _sum

# Создаем SparkSession
spark = SparkSession.builder.appName("WebServerLogAnalysis").getOrCreate()

# Загружаем CSV
logs = spark.read.csv("web_server_logs.csv", header=True, inferSchema=True)

# 1. Top 10 active IP addresses
top_ips = (
    logs.groupBy("ip")
    .agg(count("ip").alias("request_count"))
    .orderBy(col("request_count").desc())
    .limit(10)
)

print("Top 10 active IP addresses:")
top_ips.show()

# 2. Request count by HTTP method
method_count = logs.groupBy("method").agg(count("method").alias("method_count"))

print("Request count by HTTP method:")
method_count.show()

# 3. Number of 404 response codes
count_404 = logs.filter(col("response_code") == 404).count()

print(f"Number of 404 response codes: {count_404}")

# 4. Total response size by day
response_size_by_day = (
    logs.withColumn("date", col("timestamp").substr(1, 10))
    .groupBy("date")
    .agg(_sum("response_size").alias("total_response_size"))
    .orderBy("date")
)

print("Total response size by day:")
response_size_by_day.show()

Top 10 active IP addresses:
+---------------+-------------+
|             ip|request_count|
+---------------+-------------+
|  63.119.98.151|            2|
|  100.156.45.89|            1|
|   186.66.74.46|            1|
| 173.158.234.82|            1|
|  55.216.39.119|            1|
|   77.61.168.79|            1|
|  196.119.38.32|            1|
| 46.199.250.238|            1|
|152.167.203.215|            1|
|  29.98.184.231|            1|
+---------------+-------------+

Request count by HTTP method:
+------+------------+
|method|method_count|
+------+------------+
|  POST|       25176|
|DELETE|       24870|
|   PUT|       24958|
|   GET|       24996|
+------+------------+

Number of 404 response codes: 25026
Total response size by day:
+----------+-------------------+
|      date|total_response_size|
+----------+-------------------+
|2024-01-01|            1558613|
|2024-01-02|            1338162|
|2024-01-03|            1416670|
|2024-01-04|            1493858|
|2024-01-05|         