In [1]:
import pandas as pd

In [2]:
data = [
    {"id": 1, "timestamp": 1562007679},
    {"id": 1, "timestamp": 1562007710},
    {"id": 1, "timestamp": 1562007720},
    {"id": 1, "timestamp": 1562007750},
    {"id": 2, "timestamp": 1564682430},
    {"id": 2, "timestamp": 1564682450},
    {"id": 2, "timestamp": 1564682480},
]

df = pd.DataFrame(data)
df.to_csv('df1.csv', index=False)
df

Unnamed: 0,id,timestamp
0,1,1562007679
1,1,1562007710
2,1,1562007720
3,1,1562007750
4,2,1564682430
5,2,1564682450
6,2,1564682480


In [3]:
from pyspark.sql import SparkSession, Window
from pyspark.sql.functions import *

In [4]:
spark = SparkSession.builder \
  .master("local[1]") \
  .appName("SessionLength") \
  .config("spark.executor.memory", "10g")\
  .config("spark.executor.cores", 5) \
  .config("spark.dynamicAllocation.enabled", "true") \
  .config("spark.dynamicAllocation.maxExecutors", 5) \
  .config("spark.shuffle.service.enabled", "true") \
  .getOrCreate()

In [5]:
df1 = spark.read.format("csv").option("header", True).load("df1.csv")
df1.show()

+---+----------+
| id| timestamp|
+---+----------+
|  1|1562007679|
|  1|1562007710|
|  1|1562007720|
|  1|1562007750|
|  2|1564682430|
|  2|1564682450|
|  2|1564682480|
+---+----------+



In [6]:
w = Window.partitionBy("id").orderBy("timestamp")

df_sessions = df1.select("*", min("timestamp").over(w).alias("start_time"), 
                                 max("timestamp").over(w).alias("end_time"))
df_sessions.show()


+---+----------+----------+----------+
| id| timestamp|start_time|  end_time|
+---+----------+----------+----------+
|  1|1562007679|1562007679|1562007679|
|  1|1562007710|1562007679|1562007710|
|  1|1562007720|1562007679|1562007720|
|  1|1562007750|1562007679|1562007750|
|  2|1564682430|1564682430|1564682430|
|  2|1564682450|1564682430|1564682450|
|  2|1564682480|1564682430|1564682480|
+---+----------+----------+----------+



In [7]:
df_sessions = df_sessions.withColumn("day", to_date(from_unixtime("timestamp")))
df_sessions.show()

+---+----------+----------+----------+----------+
| id| timestamp|start_time|  end_time|       day|
+---+----------+----------+----------+----------+
|  1|1562007679|1562007679|1562007679|2019-07-02|
|  1|1562007710|1562007679|1562007710|2019-07-02|
|  1|1562007720|1562007679|1562007720|2019-07-02|
|  1|1562007750|1562007679|1562007750|2019-07-02|
|  2|1564682430|1564682430|1564682430|2019-08-01|
|  2|1564682450|1564682430|1564682450|2019-08-01|
|  2|1564682480|1564682430|1564682480|2019-08-01|
+---+----------+----------+----------+----------+



In [8]:
w = Window.partitionBy("id", "day").orderBy("timestamp")
df_sessions_avg = df_sessions.select("*",
                  (max("end_time").over(w) - min("start_time").over(w)).alias("session_length"))
df_sessions_avg = df_sessions_avg.groupBy("id", "day")\
                  .agg(round(avg("session_length"), 2).alias("avg_session_length")).show()

+---+----------+------------------+
| id|       day|avg_session_length|
+---+----------+------------------+
|  1|2019-07-02|             35.75|
|  2|2019-08-01|             23.33|
+---+----------+------------------+



In [9]:
spark.stop

<bound method SparkSession.stop of <pyspark.sql.session.SparkSession object at 0x000002A37CC76150>>