In [241]:
import datetime

from pyspark.sql.functions import avg, col, hour, lit, minute, second

In [242]:
# Read the raw user-log CSV from HDFS as an RDD of text lines, then break
# every line into a list of its comma-separated fields.
USER_LOG_CSV_PATH = "hdfs://localhost:54310/user_log_data/user_log_data.csv"
raw_log_lines = sc.textFile(USER_LOG_CSV_PATH)
user_log_data = raw_log_lines.map(lambda line: line.split(","))

In [243]:
# Build a DataFrame from the parsed CSV rows.
#
# The header row supplies the column names. It is read with first(), which
# only needs the start of the file, instead of filtering the whole RDD and
# collecting the matches just to take element 0.
# NOTE(review): assumes the header is the first line of the file -- confirm.
header = user_log_data.first()
header_key = header[0]

# Every row whose first field equals the header's first field is the header
# itself and is dropped; the remaining rows are the data.
user_log_dataframe = sqlContext.createDataFrame(
    data=user_log_data.filter(lambda row: row[0] != header_key),
    schema=header,
)

In [258]:
# Add a `seconds` column holding each row's idle time as seconds since
# midnight: hours * 3600 + minutes * 60 + seconds of the `idle_time` value,
# which is a "YYYY-MM-DD HH:MM:SS" string.
#
# The value is computed per row on the cluster. The previous approach
# collected every row to the driver, parsed it in a Python loop, rebuilt a
# second DataFrame and joined it back on user_name; because that join matched
# on user_name only, a user appearing on k rows came back as k * k rows, which
# skews every average and count derived from `joined_dataframe`.
#
# NOTE(review): an idle_time that cannot be parsed now produces a null
# `seconds` (ignored by avg and dropped by comparisons) instead of raising
# ValueError -- confirm that is acceptable for this data set.
idle_timestamp = col("idle_time").cast("timestamp")
joined_dataframe = user_log_dataframe.withColumn(
    "seconds",
    hour(idle_timestamp) * 3600
    + minute(idle_timestamp) * 60
    + second(idle_timestamp),
)

In [251]:
# Mean of the `seconds` column over all rows, returned as a one-row DataFrame.
average_object = joined_dataframe.groupBy().agg(avg("seconds"))

In [252]:
# Pull the scalar mean out of the single row produced by the aggregation.
average_row = average_object.first()
average_value = average_row[0]

In [253]:
# Keep only the rows whose idle seconds are strictly above the overall mean.
is_above_average = col("seconds") > average_value
highest_idle_time = joined_dataframe.where(is_above_average)

In [254]:
# Number of rows whose idle time is above the average.
# NOTE(review): count() counts rows, not distinct users -- a user_name that
# appears on several rows is counted once per row.
highest_idle_time.count()

27

In [257]:
# Print the user_name of rows with above-average idle time.
# show() with no arguments prints only the first 20 rows; no ordering is
# applied here, so these are not necessarily the 20 largest idle times.
highest_idle_time.select("user_name").show()

+--------------------+
|           user_name|
+--------------------+
|       nikitapawar17|
|“shivnajalisangal...|
|gaikwadr576@gmail...|
|magadum.iranna@gm...|
| dileep.bs@yahoo.com|
|puruissimple@gmai...|
|hakepratiksha55@g...|
|tekina.makin@gmai...|
| addyp1911@gmail.com|
| blsonalib@gmail.com|
|20150773@dbatu.ac.in|
|farooqbassam4@gma...|
|ruchikachile30199...|
|polelaxman001@gma...|
|ayush.saraf47@gma...|
|surajpj7852@gmail...|
|     you@example.com|
|vishnu23kumar@gma...|
|  sahil24c@gmail.com|
|sargampandey27oct...|
+--------------------+
only showing top 20 rows

