In [1]:
! pip install pyspark 

Collecting pyspark
  Downloading pyspark-3.5.2.tar.gz (317.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m317.3/317.3 MB[0m [31m1.5 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25l[?25hdone
  Created wheel for pyspark: filename=pyspark-3.5.2-py2.py3-none-any.whl size=317812365 sha256=0f37b541bda9b024ce45bf920435021a2739c43d45c93450bd4242b07f3e4b67
  Stored in directory: /root/.cache/pip/wheels/34/34/bd/03944534c44b677cd5859f248090daa9fb27b3c8f8e5f49574
Successfully built pyspark
Installing collected packages: pyspark
Successfully installed pyspark-3.5.2


In [2]:
from pyspark.sql import SparkSession

# getOrCreate() hands back the already-running session when one exists, otherwise it
# starts a new one under the application name given here.
spark = (
    SparkSession.builder
    .appName("Health and Fitness")
    .getOrCreate()
)

In [3]:
# 1. Total steps taken by each user.
# NOTE: `sum` here is Spark's column aggregate; after this import it shadows Python's
# built-in `sum` for the rest of the notebook.
from pyspark.sql.functions import col, sum

# Load the fitness log: the first CSV row supplies the column names and Spark infers
# each column's type from the data.
df_fitness = (
    spark.read
    .format("csv")
    .option("header", "true")
    .option("inferSchema", "true")
    .load("/content/sample_data/health_data.csv")
)

# One output row per user_id, carrying the sum of that user's `steps` values.
total_steps_per_user = (
    df_fitness
    .groupBy("user_id")
    .agg(sum("steps").alias("total_steps"))
)
total_steps_per_user.show()

+-------+-----------+
|user_id|total_steps|
+-------+-----------+
|      1|      35000|
|      3|      45000|
|      2|      29500|
+-------+-----------+



In [4]:
# 2. Rows (user-days) with more than 10,000 steps.
# The comparison is strict, so a day with exactly 10,000 steps is not kept.
df_high_steps = (
    df_fitness
    .filter(col("steps") > 10000)
)
df_high_steps.show()

+-------+----------+-----+---------------+--------------+------------+
|user_id|      date|steps|calories_burned|hours_of_sleep|workout_type|
+-------+----------+-----+---------------+--------------+------------+
|      1|2023-09-01|12000|            500|           7.0|      Cardio|
|      3|2023-09-01|15000|            650|           8.0|        Yoga|
|      3|2023-09-02|14000|            600|           7.5|    Strength|
|      1|2023-09-03|13000|            550|           8.0|        Yoga|
|      2|2023-09-03|12000|            520|           6.5|        Yoga|
|      3|2023-09-03|16000|            700|           7.0|      Cardio|
+-------+----------+-----+---------------+--------------+------------+



In [5]:
# 3. Mean calories burned for each workout type.
from pyspark.sql.functions import avg

# One output row per workout_type with the average of its `calories_burned` values.
average_calories_by_workout = (
    df_fitness
    .groupBy("workout_type")
    .agg(avg("calories_burned").alias("average_calories"))
)
average_calories_by_workout.show()

+------------+-----------------+
|workout_type| average_calories|
+------------+-----------------+
|    Strength|            500.0|
|        Yoga|573.3333333333334|
|      Cardio|            537.5|
+------------+-----------------+



In [8]:
# 4. For each user, the day (or days) on which they logged their highest step count.
# NOTE: `max` here is Spark's column aggregate and shadows Python's built-in `max`.
from pyspark.sql.functions import col, max

# Step 1: each user's highest single-day step count.
max_steps_per_user = (
    df_fitness
    .groupBy("user_id")
    .agg(max("steps").alias("max_steps"))
)

# Step 2: join the per-user maximum back onto the daily rows and keep only the rows
# whose user_id and steps both match that maximum, then project the daily columns.
# If a user reached the same maximum on several days, every such day is returned.
# The alias "max_steps" names the aggregated DataFrame; "max_steps.max_steps" is the
# aggregated column inside it.
most_steps_per_day = (
    df_fitness.alias("df")
    .join(
        max_steps_per_user.alias("max_steps"),
        (col("df.user_id") == col("max_steps.user_id"))
        & (col("df.steps") == col("max_steps.max_steps")),
    )
    .select(col("df.user_id"), col("df.date"), col("df.steps"))
)
most_steps_per_day.show()

+-------+----------+-----+
|user_id|      date|steps|
+-------+----------+-----+
|      1|2023-09-03|13000|
|      2|2023-09-03|12000|
|      3|2023-09-03|16000|
+-------+----------+-----+



In [9]:
# 5. Users with at least one day on which they burned more than 600 calories.
# distinct() collapses repeat qualifying days so each user_id appears once.
users_high_calories = (
    df_fitness
    .filter(col("calories_burned") > 600)
    .select("user_id")
    .distinct()
)
users_high_calories.show()

+-------+
|user_id|
+-------+
|      3|
+-------+



In [10]:
# 6. Mean hours of sleep for each user.
average_sleep_per_user = (
    df_fitness
    .groupBy("user_id")
    .agg(avg("hours_of_sleep").alias("average_sleep"))
)
average_sleep_per_user.show()

+-------+-----------------+
|user_id|    average_sleep|
+-------+-----------------+
|      1|              7.0|
|      3|              7.5|
|      2|6.666666666666667|
+-------+-----------------+



In [11]:
# 7. Calories burned per calendar date, summed across all users.
# `sum` is the Spark aggregate imported from pyspark.sql.functions, not the built-in.
total_calories_per_day = (
    df_fitness
    .groupBy("date")
    .agg(sum("calories_burned").alias("total_calories"))
)
total_calories_per_day.show()

+----------+--------------+
|      date|total_calories|
+----------+--------------+
|2023-09-03|          1770|
|2023-09-01|          1550|
|2023-09-02|          1550|
+----------+--------------+



In [12]:
# 8. Users who logged more than one distinct workout type.
from pyspark.sql.functions import countDistinct

# Number of different workout_type values seen for each user.
distinct_workouts_per_user = (
    df_fitness
    .groupBy("user_id")
    .agg(countDistinct("workout_type").alias("distinct_workouts"))
)

# Keep only the users whose distinct-type count exceeds one.
users_multiple_workouts = (
    distinct_workouts_per_user
    .filter(col("distinct_workouts") > 1)
)
users_multiple_workouts.show()

+-------+-----------------+
|user_id|distinct_workouts|
+-------+-----------------+
|      1|                2|
|      3|                3|
|      2|                3|
+-------+-----------------+



In [13]:
# 9. Total number of workout rows logged per user.
# groupBy().count() always names its result column "count". DataFrame.alias() only
# gives the DataFrame itself a name (for qualifying columns in joins) and leaves the
# column names untouched, so the count column is renamed explicitly here.
workouts_per_user = (
    df_fitness
    .groupBy("user_id")
    .count()
    .withColumnRenamed("count", "total_workouts")
)
workouts_per_user.show()

+-------+-----+
|user_id|count|
+-------+-----+
|      1|    3|
|      3|    3|
|      2|    3|
+-------+-----+



In [14]:
# 10. Add an `active_day` label column to the fitness log.
from pyspark.sql.functions import when

# A day is "Active" only when steps is strictly greater than 10,000; every other row,
# including one with exactly 10,000 steps, gets "Inactive".
# The result is bound back to `df_fitness`, so from here on the frame carries the
# extra column.
df_fitness = df_fitness.withColumn(
    "active_day",
    when(col("steps") > 10000, "Active").otherwise("Inactive"),
)
df_fitness.show()

+-------+----------+-----+---------------+--------------+------------+----------+
|user_id|      date|steps|calories_burned|hours_of_sleep|workout_type|active_day|
+-------+----------+-----+---------------+--------------+------------+----------+
|      1|2023-09-01|12000|            500|           7.0|      Cardio|    Active|
|      2|2023-09-01| 8000|            400|           6.5|    Strength|  Inactive|
|      3|2023-09-01|15000|            650|           8.0|        Yoga|    Active|
|      1|2023-09-02|10000|            450|           6.0|      Cardio|  Inactive|
|      2|2023-09-02| 9500|            500|           7.0|      Cardio|  Inactive|
|      3|2023-09-02|14000|            600|           7.5|    Strength|    Active|
|      1|2023-09-03|13000|            550|           8.0|        Yoga|    Active|
|      2|2023-09-03|12000|            520|           6.5|        Yoga|    Active|
|      3|2023-09-03|16000|            700|           7.0|      Cardio|    Active|
+-------+-------