In [2]:
from pyspark.sql import SparkSession
from pyspark.sql import functions as F

spark = (
        SparkSession.builder
        .appName("Final_task")
        .getOrCreate()
        )

spark.sparkContext.setLogLevel("ERROR")

data_path = ("/home/ilya/github/DE_course_repo/"
             "python_projects/PySpark/data/web_server_logs.csv")

log_df = spark.read.csv(data_path,
                        header=True,
                        inferSchema=True)

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/04/21 15:57:55 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
                                                                                

In [3]:
log_df.show(10)
log_df.printSchema()

                                                                                

+--------------+--------------------+------+--------------------+-------------+-------------+
|            ip|           timestamp|method|                 url|response_code|response_size|
+--------------+--------------------+------+--------------------+-------------+-------------+
| 92.111.222.15|2025-03-28 12:40:...|  POST|                blog|          200|         3002|
|122.94.103.244|2025-04-09 02:59:...|   PUT|    explore/list/tag|          404|          273|
| 106.41.49.227|2025-01-25 15:21:...|DELETE|                 app|          200|         7593|
|  17.114.11.73|2025-02-12 00:59:...|DELETE|                list|          404|         5827|
|41.161.233.219|2025-02-07 13:50:...|   GET|                blog|          200|         5258|
| 83.137.101.86|2025-04-15 07:02:...|  POST|search/categories...|          500|         7511|
|46.170.211.136|2025-03-25 05:09:...|   PUT|                tags|          500|         3407|
| 212.16.89.164|2025-01-24 06:11:...|DELETE|      posts/cate

#### Задание 1
Сгруппируйте данные по IP и посчитайте количество запросов для каждого IP, выводим 10 самых активных IP.

In [4]:
top_10_ip = (log_df.groupBy("ip").agg(F.count("method").alias("request_count"))
             .orderBy(F.desc("request_count")).limit(10))

top_10_ip.show()

                                                                                

+---------------+-------------+
|             ip|request_count|
+---------------+-------------+
| 175.32.115.101|            2|
|   129.42.51.11|            1|
|  200.126.67.24|            1|
|   123.38.37.85|            1|
|165.254.201.100|            1|
|    1.138.24.93|            1|
| 44.189.175.133|            1|
|102.155.100.203|            1|
|  28.10.138.224|            1|
|    9.42.115.60|            1|
+---------------+-------------+



#### Задание 2
Сгруппируйте данные по HTTP-методу и посчитайте количество запросов для каждого метода.

In [5]:
method_count = log_df.groupBy("method").agg(F.count("ip").alias("method_count"))

method_count.show()



+------+------------+
|method|method_count|
+------+------------+
|  POST|       24937|
|DELETE|       24985|
|   PUT|       25105|
|   GET|       24973|
+------+------------+



                                                                                

#### Задание 3
Профильтруйте и посчитайте количество запросов с кодом ответа 404.

In [7]:
response_404 = log_df.filter(log_df["response_code"] == 404).count()

print(f"Number of 404 response codes: {response_404}")



Number of 404 response codes: 25047


                                                                                

#### Задание 4
Сгруппируйте данные по дате и просуммируйте размер ответов, сортируйте по дате.

In [8]:
resp_size = (log_df.withColumn("date", F.to_date("timestamp"))
             .groupBy("date")
             .agg(F.sum("response_size").alias("total_response_size"))
             .orderBy("date"))

resp_size.show()



+----------+-------------------+
|      date|total_response_size|
+----------+-------------------+
|2025-01-01|            4099640|
|2025-01-02|            4540705|
|2025-01-03|            4444770|
|2025-01-04|            4899352|
|2025-01-05|            4687473|
|2025-01-06|            4290588|
|2025-01-07|            4427677|
|2025-01-08|            4617443|
|2025-01-09|            4658200|
|2025-01-10|            4469333|
|2025-01-11|            4652802|
|2025-01-12|            4438437|
|2025-01-13|            4488098|
|2025-01-14|            4502569|
|2025-01-15|            4522624|
|2025-01-16|            4364922|
|2025-01-17|            4620184|
|2025-01-18|            4637608|
|2025-01-19|            4720734|
|2025-01-20|            4491932|
+----------+-------------------+
only showing top 20 rows



                                                                                

#### Вывод

In [12]:
print("Top 10 active IP addresses:")
top_10_ip.show()

print("Request count by HTTP method:")
method_count.show()

print(f"Number of 404 response codes: {response_404}\n")

print("Total response size by day:")
resp_size.show()

Top 10 active IP addresses:


                                                                                

+---------------+-------------+
|             ip|request_count|
+---------------+-------------+
| 175.32.115.101|            2|
|   129.42.51.11|            1|
|  200.126.67.24|            1|
|   123.38.37.85|            1|
|165.254.201.100|            1|
|    1.138.24.93|            1|
| 44.189.175.133|            1|
|102.155.100.203|            1|
|  28.10.138.224|            1|
|    9.42.115.60|            1|
+---------------+-------------+

Request count by HTTP method:
+------+------------+
|method|method_count|
+------+------------+
|  POST|       24937|
|DELETE|       24985|
|   PUT|       25105|
|   GET|       24973|
+------+------------+

Number of 404 response codes: 25047

Total response size by day:




+----------+-------------------+
|      date|total_response_size|
+----------+-------------------+
|2025-01-01|            4099640|
|2025-01-02|            4540705|
|2025-01-03|            4444770|
|2025-01-04|            4899352|
|2025-01-05|            4687473|
|2025-01-06|            4290588|
|2025-01-07|            4427677|
|2025-01-08|            4617443|
|2025-01-09|            4658200|
|2025-01-10|            4469333|
|2025-01-11|            4652802|
|2025-01-12|            4438437|
|2025-01-13|            4488098|
|2025-01-14|            4502569|
|2025-01-15|            4522624|
|2025-01-16|            4364922|
|2025-01-17|            4620184|
|2025-01-18|            4637608|
|2025-01-19|            4720734|
|2025-01-20|            4491932|
+----------+-------------------+
only showing top 20 rows



                                                                                