In [2]:
! pip install pyspark



In [3]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import sum,avg,max
# Initialize spark session
spark= SparkSession.builder.appName('FitnessTracker').getOrCreate()

data = spark.read.csv("/content/fitness_data.csv", header=True, inferSchema=True)


In [4]:
# 1.Calculate total steps for each user
total_steps = data.groupBy("user_id").agg(sum("steps").alias("total_steps"))
print("Total steps of each user")
total_steps.show()

Total steps of each user
+-------+-----------+
|user_id|total_steps|
+-------+-----------+
|      1|      33000|
|      3|      44000|
|      2|      24000|
+-------+-----------+



In [5]:
# 2.Filter Days Where a User Burned More Than 500 Calories
burned_500_calories = data.filter(data["calories"] > 500)
print("Days where user burned more than 500 calories")
burned_500_calories.show()

Days where user burned more than 500 calories
+-------+----------+-----+--------+-----------+--------------+
|user_id|      date|steps|calories|distance_km|active_minutes|
+-------+----------+-----+--------+-----------+--------------+
|      3|2023-07-01|15000|     600|       10.2|           120|
|      3|2023-07-02|13000|     520|        9.0|           100|
|      3|2023-07-03|16000|     620|       11.0|           130|
+-------+----------+-----+--------+-----------+--------------+



In [6]:
# 3. Calculate the Average Distance Traveled by Each User
average_distance = data.groupBy("user_id").agg(avg("distance_km").alias("avg_distance"))
print("Average distance traveled by each user")
average_distance.show()

Average distance traveled by each user
+-------+------------------+
|user_id|      avg_distance|
+-------+------------------+
|      1| 7.833333333333333|
|      3|10.066666666666666|
|      2| 5.566666666666667|
+-------+------------------+



In [7]:
# 4. Identify the Day with the Maximum Steps for Each User
max_steps_per_user = data.groupBy("user_id", "date").agg(max("steps").alias("max_steps"))
print("Day with maximum steps: ")
max_steps_per_user.show()

Day with maximum steps: 
+-------+----------+---------+
|user_id|      date|max_steps|
+-------+----------+---------+
|      2|2023-07-03|     7000|
|      1|2023-07-02|    11000|
|      3|2023-07-02|    13000|
|      2|2023-07-01|     8000|
|      3|2023-07-01|    15000|
|      3|2023-07-03|    16000|
|      2|2023-07-02|     9000|
|      1|2023-07-01|    12000|
|      1|2023-07-03|    10000|
+-------+----------+---------+



In [8]:
# 5. Find Users Who Were Active for More Than 100 Minutes on Any Day
active_users = data.filter(data["active_minutes"] > 100)
print("Users who were active for more than 100 minutes on any day")
active_users.show()

Users who were active for more than 100 minutes on any day
+-------+----------+-----+--------+-----------+--------------+
|user_id|      date|steps|calories|distance_km|active_minutes|
+-------+----------+-----+--------+-----------+--------------+
|      3|2023-07-01|15000|     600|       10.2|           120|
|      3|2023-07-03|16000|     620|       11.0|           130|
+-------+----------+-----+--------+-----------+--------------+



In [9]:
# 6.  Calculate the Total Calories Burned per Day
total_calories_per_day = data.groupBy("date").agg(sum("calories").alias("total_calories"))
print("Total calories burned per day")
total_calories_per_day.show()

Total calories burned per day
+----------+--------------+
|      date|total_calories|
+----------+--------------+
|2023-07-02|          1400|
|2023-07-03|          1390|
|2023-07-01|          1450|
+----------+--------------+



In [10]:
# 7. Calculate the Average Steps per Day
average_steps_per_day = data.groupBy("date").agg(avg("steps").alias("avg_steps"))
print("Average steps per day")
average_steps_per_day.show()

Average steps per day
+----------+------------------+
|      date|         avg_steps|
+----------+------------------+
|2023-07-02|           11000.0|
|2023-07-03|           11000.0|
|2023-07-01|11666.666666666666|
+----------+------------------+



In [11]:
# 8.Rank Users by Total Distance Travelled
from pyspark.sql import functions as F
from pyspark.sql.window import Window
total_distance = data.groupBy("user_id").agg(F.sum("distance_km").alias("total_distance"))
window_spec = Window.orderBy(F.col("total_distance").desc())
ranked_users = total_distance.withColumn("rank", F.rank().over(window_spec))
print("Rank of the users based on distance travelled: ")
ranked_users.show()

Rank of the users based on distance travelled: 
+-------+------------------+----+
|user_id|    total_distance|rank|
+-------+------------------+----+
|      3|              30.2|   1|
|      1|              23.5|   2|
|      2|16.700000000000003|   3|
+-------+------------------+----+



In [12]:
# 9. Find the Most Active User by Total Active Minutes
most_active_user = data.groupBy("user_id").agg(F.sum("active_minutes").alias("total_active_minutes"))
most_active_user = most_active_user.orderBy(F.col("total_active_minutes").desc())
print("Most active user by total active minutes: ")
most_active_user.show(1)

Most active user by total active minutes: 
+-------+--------------------+
|user_id|total_active_minutes|
+-------+--------------------+
|      3|                 350|
+-------+--------------------+
only showing top 1 row



In [13]:
# 10. Create a New Column for Calories Burned per Kilometer
data = data.withColumn("calories_per_km", data["calories"] / data["distance_km"])
print("New column for calories burned per kilometer")
data.show()

New column for calories burned per kilometer
+-------+----------+-----+--------+-----------+--------------+-----------------+
|user_id|      date|steps|calories|distance_km|active_minutes|  calories_per_km|
+-------+----------+-----+--------+-----------+--------------+-----------------+
|      1|2023-07-01|12000|     500|        8.5|            90| 58.8235294117647|
|      2|2023-07-01| 8000|     350|        5.6|            60|62.50000000000001|
|      3|2023-07-01|15000|     600|       10.2|           120|58.82352941176471|
|      1|2023-07-02|11000|     480|        7.9|            85|60.75949367088607|
|      2|2023-07-02| 9000|     400|        6.2|            70|64.51612903225806|
|      3|2023-07-02|13000|     520|        9.0|           100|57.77777777777778|
|      1|2023-07-03|10000|     450|        7.1|            80|63.38028169014085|
|      2|2023-07-03| 7000|     320|        4.9|            55| 65.3061224489796|
|      3|2023-07-03|16000|     620|       11.0|           130|56