In [0]:
# creating a spark session

In [0]:
from pyspark.sql import SparkSession

In [0]:
# Obtain the active Spark session, or start one named "Log_Analysis".
spark = (
    SparkSession.builder
    .appName("Log_Analysis")
    .getOrCreate()
)

In [0]:
# writing code to get the data from azure storage account into databricks

In [0]:
# Connection settings for the Azure Blob Storage container that holds the log data.
storage_account_name = "myproject1storage1055"
# SECURITY: never store the storage account key in the notebook. It is read at
# run time from a Databricks secret scope instead. The key that was previously
# hard-coded here must be treated as compromised and rotated in Azure.
# NOTE(review): the scope and key names below are placeholders - replace them
# with the secret scope/key created in your workspace.
storage_account_key = dbutils.secrets.get(scope="log-analysis", key="storage-account-key")
container = "mynewcontainer"

In [0]:
# Register the account key in the Spark configuration under the per-account
# setting name; the wasbs:// reads in later cells target this same account.
account_key_setting = f"fs.azure.account.key.{storage_account_name}.blob.core.windows.net"
spark.conf.set(account_key_setting, storage_account_key)

In [0]:
# List what is stored at the root of the container.
container_root = f"wasbs://{container}@{storage_account_name}.blob.core.windows.net/"
dbutils.fs.ls(container_root)

[FileInfo(path='wasbs://mynewcontainer@myproject1storage1055.blob.core.windows.net/log_data.csv', name='log_data.csv', size=67110, modificationTime=1708837389000)]

In [0]:
# loading the data into dataframe

In [0]:
# Load the log file into a DataFrame, treating the first row as the header and
# letting Spark infer the column types.
# Point at log_data.csv explicitly rather than the container root, so that any
# other file later added to the container is not silently read into the same
# DataFrame (this also matches the mounted-path read further down).
df = spark.read.csv(
    f"wasbs://{container}@{storage_account_name}.blob.core.windows.net/log_data.csv",
    header=True,
    inferSchema=True,
)

In [0]:
# Preview the first six rows of the loaded data.
df.show(n=6)

+--------------------+---------------+--------------+-----------+----------+
|           Timestamp|     IP Address|    User Agent|HTTP Status|Bytes Sent|
+--------------------+---------------+--------------+-----------+----------+
|2023-08-11 17:48:...| 192.168.165.48|UserAgent-5288|        404|      4447|
|2023-09-15 03:49:...|192.168.156.225|UserAgent-6865|        404|      4975|
|2023-04-06 19:26:...|192.168.137.242|UserAgent-8620|        404|      9168|
|2023-10-26 16:08:...|192.168.116.143|UserAgent-8896|        200|      3665|
|2024-01-27 08:56:...|192.168.233.122|UserAgent-9211|        200|      6421|
|2023-08-23 17:32:...| 192.168.164.33|UserAgent-7927|        200|      9650|
+--------------------+---------------+--------------+-----------+----------+
only showing top 6 rows



In [0]:
# Instead of spelling out the full wasbs:// URL every time we access the data, we can mount the container and use a short path

In [0]:
# Mount the blob container at /mnt/storage so later cells can use a short path.
# dbutils.fs.mount fails if the mount point is already in use, so only mount
# when it is missing; this keeps the cell safe to re-run (Restart & Run All).
mount_point = "/mnt/storage"
already_mounted = any(m.mountPoint == mount_point for m in dbutils.fs.mounts())
if not already_mounted:
    dbutils.fs.mount(
        source=f"wasbs://{container}@{storage_account_name}.blob.core.windows.net/",
        mount_point=mount_point,
        extra_configs={f"fs.azure.account.key.{storage_account_name}.blob.core.windows.net": storage_account_key}
    )

True

In [0]:
# List every current mount point to confirm that /mnt/storage is present.
dbutils.fs.mounts()

[MountInfo(mountPoint='/mnt/storage', source='wasbs://mynewcontainer@myproject1storage1055.blob.core.windows.net/', encryptionType=''),
 MountInfo(mountPoint='/databricks-datasets', source='databricks-datasets', encryptionType=''),
 MountInfo(mountPoint='/Volumes', source='UnityCatalogVolumes', encryptionType=''),
 MountInfo(mountPoint='/databricks/mlflow-tracking', source='databricks/mlflow-tracking', encryptionType=''),
 MountInfo(mountPoint='/databricks-results', source='databricks-results', encryptionType=''),
 MountInfo(mountPoint='/databricks/mlflow-registry', source='databricks/mlflow-registry', encryptionType=''),
 MountInfo(mountPoint='/Volume', source='DbfsReserved', encryptionType=''),
 MountInfo(mountPoint='/volumes', source='DbfsReserved', encryptionType=''),
 MountInfo(mountPoint='/', source='DatabricksRoot', encryptionType=''),
 MountInfo(mountPoint='/volume', source='DbfsReserved', encryptionType='')]

In [0]:
# Re-load the log file, this time through the mounted path; the first row is
# the header and column types are inferred by Spark.
log_path = "/mnt/storage/log_data.csv"
df = spark.read.csv(
    log_path,
    header=True,
    inferSchema=True,
)

In [0]:
# Preview the first seven rows read through the mount.
df.show(n=7)

+--------------------+---------------+--------------+-----------+----------+
|           Timestamp|     IP Address|    User Agent|HTTP Status|Bytes Sent|
+--------------------+---------------+--------------+-----------+----------+
|2023-08-11 17:48:...| 192.168.165.48|UserAgent-5288|        404|      4447|
|2023-09-15 03:49:...|192.168.156.225|UserAgent-6865|        404|      4975|
|2023-04-06 19:26:...|192.168.137.242|UserAgent-8620|        404|      9168|
|2023-10-26 16:08:...|192.168.116.143|UserAgent-8896|        200|      3665|
|2024-01-27 08:56:...|192.168.233.122|UserAgent-9211|        200|      6421|
|2023-08-23 17:32:...| 192.168.164.33|UserAgent-7927|        200|      9650|
|2023-05-22 15:25:...|   192.168.28.3|UserAgent-4801|        500|      1154|
+--------------------+---------------+--------------+-----------+----------+
only showing top 7 rows



In [0]:
# Print the column names and the types Spark inferred while reading the CSV.
df.printSchema()

root
 |-- Timestamp: timestamp (nullable = true)
 |-- IP Address: string (nullable = true)
 |-- User Agent: string (nullable = true)
 |-- HTTP Status: integer (nullable = true)
 |-- Bytes Sent: integer (nullable = true)



In [0]:
# Render the DataFrame with the Databricks display helper (table/visualization view).
display(df)

Timestamp,IP Address,User Agent,HTTP Status,Bytes Sent
2023-08-11T17:48:25.348734Z,192.168.165.48,UserAgent-5288,404,4447
2023-09-15T03:49:34.348734Z,192.168.156.225,UserAgent-6865,404,4975
2023-04-06T19:26:37.348734Z,192.168.137.242,UserAgent-8620,404,9168
2023-10-26T16:08:19.348734Z,192.168.116.143,UserAgent-8896,200,3665
2024-01-27T08:56:08.348734Z,192.168.233.122,UserAgent-9211,200,6421
2023-08-23T17:32:56.348734Z,192.168.164.33,UserAgent-7927,200,9650
2023-05-22T15:25:44.348734Z,192.168.28.3,UserAgent-4801,500,1154
2023-09-11T21:38:44.348734Z,192.168.18.244,UserAgent-3672,404,4897
2023-10-11T07:21:07.348734Z,192.168.208.203,UserAgent-6127,200,8765
2023-05-08T05:54:30.348734Z,192.168.74.120,UserAgent-2420,500,3170


Databricks visualization. Run in Databricks to view.

In [0]:
# Handling missing values: drop every row that contains a null in any column

In [0]:
# Discard rows that have a null in any column ("any" is the default policy,
# spelled out here for readability).
df = df.na.drop(how="any")

In [0]:
# Let's compute basic summary statistics (describe() covers the numeric columns and also reports count/min/max for string columns)

In [0]:
# Build the summary-statistics table, then print it.
summary_stats = df.describe()
summary_stats.show()

+-------+-------------+--------------+------------------+------------------+
|summary|   IP Address|    User Agent|       HTTP Status|        Bytes Sent|
+-------+-------------+--------------+------------------+------------------+
|  count|         1000|          1000|              1000|              1000|
|   mean|         NULL|          NULL|           361.244|          5044.013|
| stddev|         NULL|          NULL|123.77022069759556|2899.8028483657986|
|    min|192.168.0.175|UserAgent-1005|               200|               108|
|    max| 192.168.99.6|UserAgent-9991|               500|              9999|
+-------+-------------+--------------+------------------+------------------+



In [0]:
# Finding error rates

In [0]:
# Total number of log rows remaining after the null rows were dropped;
# used as the denominator of the error rate.
total_records = df.count()

In [0]:
from pyspark.sql.functions import col

# Count the requests whose status code is 404 or 500.
status = col('HTTP Status')
is_not_found = status == 404
is_server_error = status == 500
error_records = df.filter(is_not_found | is_server_error).count()

In [0]:
# Share of requests that returned an error status, as a percentage.
# Guard against an empty DataFrame (for example, if every row was dropped for
# containing nulls) so the cell reports 0.0 instead of raising ZeroDivisionError.
error_rate = (error_records / total_records) * 100 if total_records else 0.0
error_rate

65.3

In [0]:
# Analyzing trends
# We can analyze trends by grouping the data by calendar date (one bucket per day)

In [0]:
# Daily request counts per HTTP status: truncate the timestamp to a date,
# count rows per (date, status) pair, and sort chronologically.
request_date = df.Timestamp.cast("date").alias("Date")
trends = (
    df.groupBy(request_date, "HTTP Status")
    .count()
    .orderBy("Date")
)

In [0]:
# HTTP status counts per day (first 20 rows, the default for show()).
trends.show(n=20)

+----------+-----------+-----+
|      Date|HTTP Status|count|
+----------+-----------+-----+
|2023-02-24|        200|    3|
|2023-02-24|        500|    1|
|2023-02-24|        404|    2|
|2023-02-25|        200|    1|
|2023-02-25|        500|    1|
|2023-02-26|        200|    1|
|2023-02-26|        500|    1|
|2023-02-27|        200|    1|
|2023-02-28|        200|    1|
|2023-03-01|        500|    1|
|2023-03-02|        200|    1|
|2023-03-03|        404|    1|
|2023-03-03|        200|    1|
|2023-03-04|        200|    1|
|2023-03-05|        500|    1|
|2023-03-05|        404|    1|
|2023-03-06|        500|    1|
|2023-03-06|        404|    1|
|2023-03-06|        200|    1|
|2023-03-07|        404|    3|
+----------+-----------+-----+
only showing top 20 rows



In [0]:
# Daily request counts per client IP: truncate the timestamp to a date,
# count rows per (date, IP address) pair, and sort chronologically.
request_date = df.Timestamp.cast("date").alias("Date")
ip_trends = (
    df.groupBy(request_date, "IP Address")
    .count()
    .orderBy("Date")
)

In [0]:
# Requests per IP address per day (first 20 rows, the default for show()).
ip_trends.show(n=20)

+----------+---------------+-----+
|      Date|     IP Address|count|
+----------+---------------+-----+
|2023-02-24| 192.168.29.163|    1|
|2023-02-24| 192.168.231.10|    1|
|2023-02-24|192.168.230.228|    1|
|2023-02-24|192.168.252.251|    1|
|2023-02-24|  192.168.46.90|    1|
|2023-02-24|192.168.181.238|    1|
|2023-02-25| 192.168.167.73|    1|
|2023-02-25|  192.168.13.73|    1|
|2023-02-26| 192.168.76.213|    1|
|2023-02-26| 192.168.161.72|    1|
|2023-02-27|  192.168.94.65|    1|
|2023-02-28|192.168.201.229|    1|
|2023-03-01|192.168.171.228|    1|
|2023-03-02| 192.168.162.18|    1|
|2023-03-03| 192.168.66.147|    1|
|2023-03-03|   192.168.13.2|    1|
|2023-03-04| 192.168.88.211|    1|
|2023-03-05|  192.168.227.9|    1|
|2023-03-05| 192.168.133.61|    1|
|2023-03-06|  192.168.38.78|    1|
+----------+---------------+-----+
only showing top 20 rows

