In [39]:
import csv
from faker import Faker
import random

fake = Faker()

num_records = 100000

http_methods = ['GET', 'POST', 'PUT', 'DELETE']
response_codes = [200, 301, 404, 500]

file_path = "web_server_logs.csv"

with open(file_path, mode='w', newline='') as file:
    writer = csv.writer(file)
    writer.writerow(['ip', 'timestamp', 'method', 'url', 'response_code', 'response_size'])
    
    for _ in range(num_records):
        ip = fake.ipv4()
        timestamp = fake.date_time_this_year().isoformat()
        method = random.choice(http_methods)
        url = fake.uri_path()
        response_code = random.choice(response_codes)
        response_size = random.randint(100, 10000)
        
        writer.writerow([ip, timestamp, method, url, response_code, response_size])

print(f"Сгенерировано {num_records} записей и сохранено в {file_path}")

Сгенерировано 100000 записей и сохранено в web_server_logs.csv


In [40]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, to_date, sum

spark = SparkSession.builder \
    .appName("WeatherAnalysis") \
    .getOrCreate()

logs_df = spark.read.csv("web_server_logs.csv", header=True, inferSchema=True)
logs_df = logs_df.withColumn("timestamp", to_date(col("timestamp"), "yyyy-MM-dd"))
logs_df.show()

+---------------+----------+------+--------------------+-------------+-------------+
|             ip| timestamp|method|                 url|response_code|response_size|
+---------------+----------+------+--------------------+-------------+-------------+
| 183.183.13.151|2024-01-23|   PUT|           blog/tags|          404|         8076|
|159.109.154.147|2024-04-27|DELETE| category/wp-content|          500|         8897|
| 133.72.171.216|2024-05-09|  POST|   wp-content/search|          404|         9550|
|   113.48.3.172|2024-04-01|   PUT|      app/blog/posts|          200|         2872|
| 56.195.118.103|2024-04-22|   PUT|     blog/app/search|          301|         5033|
|  84.43.197.203|2024-07-13|  POST|                tags|          200|         8134|
|205.203.138.131|2024-04-15|  POST|   tags/blog/explore|          200|         9562|
|    30.4.115.99|2024-03-29|   GET|blog/categories/blog|          404|         2021|
|  158.14.22.130|2024-05-10|DELETE|       blog/tags/tag|         

                                                                                

In [41]:
ip_count = logs_df.groupBy("ip").count().orderBy(col("count").desc()).limit(10)
print("Top 10 active IP addrsses:")
ip_count.show()

Top 10 active IP addrsses:
+---------------+-----+
|             ip|count|
+---------------+-----+
|   23.110.62.61|    2|
|  14.107.19.211|    1|
|    128.38.3.30|    1|
|  146.55.174.43|    1|
|118.163.195.241|    1|
|   72.42.178.65|    1|
|109.103.244.129|    1|
|   79.186.23.83|    1|
|113.137.238.112|    1|
|   99.49.39.103|    1|
+---------------+-----+



In [42]:
method_count = logs_df.groupBy("method").count()
print("Request count by HTTP method:")
method_count.show()

Request count by HTTP method:
+------+-----+
|method|count|
+------+-----+
|  POST|25019|
|DELETE|25025|
|   PUT|24927|
|   GET|25029|
+------+-----+



In [43]:
number_of_404 = logs_df.filter(col("response_code") == "404").count()
print(f"Number of 404 response codes: {number_of_404}")

Number of 404 response codes: 24991


In [45]:
total_response_size = logs_df.groupBy("timestamp").agg(sum("response_size").alias("total_response_size")).orderBy(col("timestamp").asc())
print("Total response size by day:")
total_response_size.show()

Total response size by day:
+----------+-------------------+
| timestamp|total_response_size|
+----------+-------------------+
|2024-01-01|            1789809|
|2024-01-02|            1757065|
|2024-01-03|            1881556|
|2024-01-04|            1783238|
|2024-01-05|            1855899|
|2024-01-06|            1713404|
|2024-01-07|            1841000|
|2024-01-08|            1723272|
|2024-01-09|            1853559|
|2024-01-10|            1785170|
|2024-01-11|            1760211|
|2024-01-12|            1966099|
|2024-01-13|            1840408|
|2024-01-14|            1613810|
|2024-01-15|            1810126|
|2024-01-16|            1782394|
|2024-01-17|            1826700|
|2024-01-18|            1787218|
|2024-01-19|            2049212|
|2024-01-20|            1906105|
+----------+-------------------+
only showing top 20 rows



In [46]:
spark.stop()