# Load Data:

Load the dataset into a PySpark DataFrame. You can use the spark.read.csv method if your data is in a CSV file.

In [None]:
from pyspark.sql import SparkSession

# Get the active Spark session, or create one under this application name.
spark = (
    SparkSession.builder
    .appName("UserActivityAnalysis")
    .getOrCreate()
)

# Replace 'path/to/your/dataset.csv' with the actual path to your dataset.
# header=True reads column names from the first row; inferSchema=True asks
# Spark to derive column types from the data instead of reading all strings.
df = spark.read.options(header=True, inferSchema=True).csv('path/to/your/dataset.csv')


# Explore the Data:

Check the schema of the DataFrame.
Display the first few rows to understand the structure of the data.

# Print the column names and types, then preview the first 5 rows.
df.printSchema()
df.show(n=5)

# Data Transformation:

Perform any necessary data transformations. For example, you might want to cast the `timestamp` column to Spark's `TimestampType` so that it is handled as a timestamp rather than as a string.

In [None]:
from pyspark.sql.functions import col
from pyspark.sql.types import TimestampType

# Replace the 'timestamp' column (same name) with its value cast to TimestampType.
df = df.withColumn('timestamp', df['timestamp'].cast(TimestampType()))

# Analysis:

Calculate the total duration spent by each user on the website.
Identify the most visited pages.
Find the average duration of sessions.

In [None]:
# Import the functions module under an alias rather than
# `from pyspark.sql.functions import sum, avg`: that form rebinds Python's
# built-in `sum` to the Spark column function for the rest of the session.
from pyspark.sql import functions as F

# Sum of the 'duration' column for each user_id.
total_duration_per_user = df.groupBy('user_id').agg(
    F.sum('duration').alias('total_duration')
)

# Number of rows per page, ordered from most to least visited.
most_visited_pages = (
    df.groupBy('page_visited')
    .count()
    .orderBy(F.col('count').desc())
)

# Mean of 'duration' across all rows, returned as a single-row DataFrame.
# df.agg(...) is shorthand for df.groupBy().agg(...).
avg_duration = df.agg(F.avg('duration').alias('average_duration'))

# Output/Save Results:

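You can save the results or insights to a file or a database for future reference. The example below writes only the per-user totals to CSV; the other result DataFrames can be saved the same way.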

# Write the per-user totals as CSV, including a header row with column names.
total_duration_per_user.write.option('header', True).csv('path/to/save/total_duration_per_user')