In [2]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import split, regexp_replace

In [3]:
# Reuse the active Spark session if there is one; otherwise start a new one
# with the default configuration.
spark = (
    SparkSession.builder
    .getOrCreate()
)

In [4]:
from pyspark.sql.types import StructField, StructType, StringType, IntegerType

# Explicit schema for the log file: one nullable field per column, listed in
# file order. Each entry pairs the column name with the Spark type class that
# is instantiated for it.
schema = StructType(
    [
        StructField(column_name, column_type(), True)
        for column_name, column_type in (
            ("ip", StringType),
            ("col2", StringType),
            ("col3", StringType),
            ("date_time", StringType),
            ("request", StringType),
            ("status_code", IntegerType),
            ("col7", IntegerType),
            ("url", StringType),
            ("user_agent", StringType),
            ("col10", IntegerType),
        )
    ]
)

# Read the file as CSV with a single space as the field separator, applying
# the schema above instead of inferring one.
df_logs = spark.read.schema(schema).option("sep", " ").csv("logs.csv")

# Preview the first 10 rows without truncating long cell values.
df_logs.show(10, False)

+-------------+----+----+----------------------+------------------------------------------------------------------------------------------------------+-----------+----+-------------------------------------------------------------------------------------------------------------+----------------------------------------------------------------------------------------------------------+-----+
|ip           |col2|col3|date_time             |request                                                                                               |status_code|col7|url                                                                                                          |user_agent                                                                                                |col10|
+-------------+----+----+----------------------+------------------------------------------------------------------------------------------------------+-----------+----+------------------------------------------------

In [5]:
# Count the rows whose status_code equals 404.
print(
    "Count of 404 status_code:",
    df_logs.filter(df_logs["status_code"] == 404).count(),
)

Count of 404 status_code: 244


In [6]:
# Number of distinct values in the url column.
print(
    "Unique urls count:",
    df_logs.select("url").distinct().count(),
)

Unique urls count: 926


In [7]:
# Split date_time at its first ":" into two new columns, then drop the
# original column:
#   date -> everything before the first ":"
#   time -> everything after the first ":" (limit=2 keeps the rest in one piece)
# Judging by the displayed output, date_time presumably looks like
# "[03/Mar/2016:18:22:16]" -- confirm against the raw file.
df = (
    df_logs
    .withColumn("date", split(df_logs.date_time, ":").getItem(0))
    .withColumn("time", split(df_logs.date_time, ":", 2).getItem(1))
    .drop("date_time")
)

# Strip the leading "[" from date and the trailing "]" from time for display.
# The patterns are raw strings: "\[" and "\]" in ordinary string literals are
# invalid escape sequences and trigger a warning on recent Python versions.
# NOTE(review): the cleaned frame is only shown, not assigned, so `df` itself
# still carries the brackets -- confirm that this is intended.
(
    df
    .withColumn("date", regexp_replace(df.date, r"\[", ""))
    .withColumn("time", regexp_replace(df.time, r"\]", ""))
    .show()
)

+--------------+----+----+--------------------+-----------+----+--------------------+--------------------+-----+-----------+--------+
|            ip|col2|col3|             request|status_code|col7|                 url|          user_agent|col10|       date|    time|
+--------------+----+----+--------------------+-----------+----+--------------------+--------------------+-----+-----------+--------+
| 209.160.24.63|   -|   -|GET /product.scre...|        200|3878|http://www.google...|Mozilla/5.0 (Wind...|  349|03/Mar/2016|18:22:16|
| 209.160.24.63|   -|   -|GET /oldlink?item...|        200|1748|http://www.explor...|Mozilla/5.0 (Wind...|  731|03/Mar/2016|18:22:16|
| 209.160.24.63|   -|   -|GET /product.scre...|        200|2550|http://www.explor...|Mozilla/5.0 (Wind...|  422|03/Mar/2016|18:22:17|
| 209.160.24.63|   -|   -|POST /category.sc...|        200| 407|http://www.explor...|Mozilla/5.0 (Wind...|  211|03/Mar/2016|18:22:19|
| 209.160.24.63|   -|   -|GET /product.scre...|        200|204

In [8]:
# Number of distinct values in the status_code column.
print(
    "Status code unique count:",
    df_logs.select("status_code").distinct().count(),
)

Status code unique count: 8
