In [1]:
pip install pyspark

Collecting pyspark
  Downloading pyspark-3.5.2.tar.gz (317.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m317.3/317.3 MB[0m [31m4.6 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25l[?25hdone
  Created wheel for pyspark: filename=pyspark-3.5.2-py2.py3-none-any.whl size=317812365 sha256=3ec085ce62e882e794d0d1bff3180d6a9d4ca74f9089c4759e76f04c62c21026
  Stored in directory: /root/.cache/pip/wheels/34/34/bd/03944534c44b677cd5859f248090daa9fb27b3c8f8e5f49574
Successfully built pyspark
Installing collected packages: pyspark
Successfully installed pyspark-3.5.2


In [11]:
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql.functions import sum, avg, max
from pyspark.sql.window import Window
# Start the Spark session for this notebook (an already-running one is reused)
session_builder = SparkSession.builder.appName('FitnessTracker')
spark = session_builder.getOrCreate()

# Load the CSV: the first row supplies column names and column types are inferred
csv_path = "/content/pyspark_exe.csv"
data = spark.read.csv(csv_path, header=True, inferSchema=True)


In [26]:
# 1. Total steps per user: add up "steps" within each user_id group.
# Note: `sum` is pyspark.sql.functions.sum (imported at the top of the
# notebook), not the Python builtin.
steps_sum = sum("steps").alias("total_steps")
total_steps = data.groupBy("user_id").agg(steps_sum)
print("Total steps of each user")
total_steps.show()

# 2. Keep only the rows whose "calories" value is strictly greater than 500
calorie_threshold = 500
exceeds_calorie_threshold = data["calories"] > calorie_threshold
burned_500_calories = data.filter(exceeds_calorie_threshold)
print("Days where user burned more than 500 calories")
burned_500_calories.show()

# 3. Mean of "distance_km" within each user_id group
mean_distance = avg("distance_km").alias("avg_distance")
average_distance = data.groupBy("user_id").agg(mean_distance)
print("Average distance traveled by each user")
average_distance.show()

# 4. Identify the Day with the Maximum Steps for Each User
# Grouping by (user_id, date) alone yields one row per user per day, i.e. it
# never singles out the best day. So: take the per-day maximum first, then rank
# each user's days by that value and keep only the top-ranked day(s).
daily_max_steps = data.groupBy("user_id", "date").agg(F.max("steps").alias("max_steps"))
steps_rank_window = Window.partitionBy("user_id").orderBy(F.col("max_steps").desc())
max_steps_per_user = (
    daily_max_steps
    .withColumn("steps_rank", F.rank().over(steps_rank_window))
    # rank() gives tied days the same rank, so ties for the maximum are all kept
    .filter(F.col("steps_rank") == 1)
    # drop the helper column so the result keeps the user_id/date/max_steps schema
    .drop("steps_rank")
)
print("Day with maximum steps: ")
max_steps_per_user.show()

# 5. Rows where "active_minutes" is strictly greater than 100.
# Every qualifying row is kept, so the same user_id can appear more than once.
active_minutes_threshold = 100
is_highly_active = data["active_minutes"] > active_minutes_threshold
active_users = data.filter(is_highly_active)
print("Users who were active for more than 100 minutes on any day")
active_users.show()

# 6. Sum of "calories" across all users for each date
calories_sum = sum("calories").alias("total_calories")
total_calories_per_day = data.groupBy("date").agg(calories_sum)
print("Total calories burned per day")
total_calories_per_day.show()

# 7. Mean of "steps" across all users for each date
mean_steps = avg("steps").alias("avg_steps")
average_steps_per_day = data.groupBy("date").agg(mean_steps)
print("Average steps per day")
average_steps_per_day.show()

# 8. Rank users by total distance travelled (largest total receives rank 1)
from pyspark.sql import functions as F
from pyspark.sql.window import Window

# Per-user total of "distance_km"
distance_sum = F.sum("distance_km").alias("total_distance")
total_distance = data.groupBy("user_id").agg(distance_sum)

# The window has no partitioning: one descending ordering over all users
window_spec = Window.orderBy(F.col("total_distance").desc())
distance_rank = F.rank().over(window_spec)
ranked_users = total_distance.withColumn("rank", distance_rank)

print("Rank of the users based on distance travelled: ")
ranked_users.show()

# 9. Most active user: total "active_minutes" per user, sorted largest first
active_minutes_sum = F.sum("active_minutes").alias("total_active_minutes")
active_minutes_by_user = data.groupBy("user_id").agg(active_minutes_sum)
most_active_user = active_minutes_by_user.orderBy(F.col("total_active_minutes").desc())
print("Most active user by total active minutes: ")
# The frame still holds every user; only its first (top) row is displayed
most_active_user.show(1)

# 10. Create a New Column for Calories Burned per Kilometer
# Guard the division: when spark.sql.ansi.enabled is on, dividing by a zero
# distance raises an error instead of yielding NULL. `when` without an
# `otherwise` produces NULL for rows that fail the condition, which matches
# what the plain division returns when ANSI mode is off.
has_nonzero_distance = data["distance_km"] != 0
calories_per_km = F.when(has_nonzero_distance, data["calories"] / data["distance_km"])
data = data.withColumn("calories_per_km", calories_per_km)
print("New column for calories burned per kilometer")
data.show()

Total steps of each user
+-------+-----------+
|user_id|total_steps|
+-------+-----------+
|      1|      33000|
|      3|      44000|
|      2|      24000|
+-------+-----------+

Days where user burned more than 500 calories
+-------+----------+-----+--------+-----------+--------------+
|user_id|      date|steps|calories|distance_km|active_minutes|
+-------+----------+-----+--------+-----------+--------------+
|      3|2023-07-01|15000|     600|       10.2|           120|
|      3|2023-07-02|13000|     520|        9.0|           100|
|      3|2023-07-03|16000|     620|       11.0|           130|
+-------+----------+-----+--------+-----------+--------------+

Average distance traveled by each user
+-------+------------------+
|user_id|      avg_distance|
+-------+------------------+
|      1| 7.833333333333333|
|      3|10.066666666666666|
|      2| 5.566666666666667|
+-------+------------------+

Day with maximum steps: 
+-------+----------+---------+
|user_id|      date|max_steps|
+-