In [None]:
import os
from pyspark.sql import SparkSession
from pyspark.sql.functions import split, col, hour, count

In [None]:
# Create the Spark session for this notebook, or reuse the one already
# running in the kernel (getOrCreate makes this cell safe to re-run).
spark = (
    SparkSession.builder
    .appName("PipelineLogs")
    .getOrCreate()
)

In [None]:
# Load the raw web logs; the first CSV line is used as the column names.
# No schema is supplied and schema inference is not enabled on this read.
dataframe_logs = spark.read.csv("./data/logs_web.csv", header=True)

# Preview the first five rows as a sanity check on the load.
dataframe_logs.show(5)

In [None]:
# Add a "url_path" column holding the element at index 1 of the URL split on "/".
# NOTE(review): presumably URLs start with "/", so index 1 would be the first
# path segment — confirm against the actual data.
url_segments = split(col("url"), "/")
dataframe_logs = dataframe_logs.withColumn("url_path", url_segments.getItem(1))

In [None]:
# Count visits per (hour of the timestamp, user_id) pair.
# NOTE(review): the grouping key is the hour extracted from "timestamp", yet it
# is aliased "date". The alias is kept as-is because it becomes a column name
# of the resulting DataFrame — confirm before renaming.
visit_hour = hour(col("timestamp")).alias("date")
dataframe_stats = (
    dataframe_logs
    .groupBy(visit_hour, col("user_id"))
    .agg(count("*").alias("visits"))
)
dataframe_stats.show()

In [None]:
# Count log rows for each distinct user_agent value.
dataframe_by_browser = (
    dataframe_logs
    .groupBy(col("user_agent"))
    .agg(count("*").alias("visits_per_user_agent"))
)
dataframe_by_browser.show()

In [None]:
# Step 1: count log rows per URL.
url_visit_counts = (
    dataframe_logs
    .groupBy(col("url"))
    .agg(count("*").alias("visits_per_url"))
)

# Step 2: keep the ten URLs with the highest counts, most visited first.
dataframe_top_ten_ulrs = (
    url_visit_counts
    .orderBy(col("visits_per_url").desc())
    .limit(10)
)
dataframe_top_ten_ulrs.show()

In [None]:
# Report the size of the log DataFrame: row count (with thousands separators)
# followed by the number of partitions of its underlying RDD.
total_records = dataframe_logs.count()
print(f"Nombre total d'enregistrements : {total_records:,}")

partition_count = dataframe_logs.rdd.getNumPartitions()
print(f"Nombre de partitions RDD       : {partition_count}")

In [None]:
# Print, for each aggregated DataFrame, its row count and the number of
# partitions of its underlying RDD. The three reports share one loop instead
# of three copy-pasted stanzas; the printed text is unchanged.
for section_title, aggregate in (
    ("Statistiques horaires (par heure et user_id):", dataframe_stats),
    ("Statistiques par navigateur:", dataframe_by_browser),
    ("Top 10 URLs:", dataframe_top_ten_ulrs),
):
    # A blank line separates each section from the previous output.
    print(f"\n{section_title}")
    print(f"   - Enregistrements : {aggregate.count()}")
    print(f"   - Partitions      : {aggregate.rdd.getNumPartitions()}")

In [None]:
# Persist each aggregate as Parquet. For every output the parent directory is
# created first if it is missing, then the DataFrame is written in overwrite
# mode so that re-running the cell replaces the previous result.
outputs_to_save = (
    # Hourly statistics (per hour and user_id)
    (dataframe_stats,
     "./data/output/logs_hourly/",
     "./data/output/logs_hourly/logs_hourly.parquet"),
    # Statistics per browser (user agent)
    (dataframe_by_browser,
     "./data/output/logs_by_browser/",
     "./data/output/logs_by_browser/logs_by_browser.parquet"),
    # The 10 most visited URLs
    (dataframe_top_ten_ulrs,
     "./data/output/logs_top_urls/",
     "./data/output/logs_top_urls/logs_top_urls.parquet"),
)

for aggregate, output_dir, parquet_path in outputs_to_save:
    os.makedirs(output_dir, exist_ok=True)
    aggregate.write.mode("overwrite").parquet(parquet_path)