In [None]:
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, TimestampType

# Obtain (or reuse) the Spark session for this analysis.
spark = SparkSession.builder.appName("test").getOrCreate()

# Explicit schema for the web-server log CSV; every column is nullable.
schema = (
    StructType()
    .add("ip", StringType(), True)
    .add("timestamp", TimestampType(), True)
    .add("http", StringType(), True)
    .add("URL", StringType(), True)
    .add("code", IntegerType(), True)
    .add("size", IntegerType(), True)
)

# Read the comma-delimited file, treating its first line as a header row,
# and apply the schema above instead of inferring column types.
csv_reader = spark.read.options(header="true", delimiter=",").schema(schema)
df = csv_reader.csv("web_server_logs.csv")

# Expose the DataFrame to Spark SQL under the name used by the queries.
df.createOrReplaceTempView("temp_table")
# Top 10 client IPs by number of requests. The secondary sort on ip makes the
# LIMIT cut-off reproducible when several addresses share the same count.
request = spark.sql("""
    SELECT ip, count(*) AS request_count
    FROM temp_table
    GROUP BY ip
    ORDER BY request_count DESC, ip
    LIMIT 10""")

# Number of requests per value of the "http" column (reported as the HTTP
# method), in ascending order of count; ties are ordered by the method value
# so repeated runs list rows in the same order.
request2 = spark.sql("""
    SELECT http, count(*) AS method_count
    FROM temp_table
    GROUP BY http
    ORDER BY method_count, http""")

# Number of rows whose status code is 404. count() is an action, so this
# runs a Spark job right here rather than when the value is printed.
count_404 = df.filter(df.code == 404).count()

# Sum of the "size" column per calendar date of the timestamp, oldest first.
request3 = spark.sql("""
    SELECT date(timestamp) AS date, sum(size) AS total_response_size
    FROM temp_table
    GROUP BY date(timestamp)
    ORDER BY date""")
# Print each result under its heading. show() executes the underlying query
# and prints the leading rows as a text table (its default row limit applies).
for heading, frame in (
    ("Top 10 active IP addresses:", request),
    ("Request count by HTTP method:", request2),
):
    print(heading)
    frame.show()

# The 404 count was already computed as a plain integer, so it is printed inline.
print(f'Number of 404 codes: {count_404}')

print("Total response size by day:")
request3.show()