In [2]:
from pyspark.sql import SparkSession

In [3]:
from pyspark.sql import SparkSession

spark = SparkSession.builder \
    .appName("parq_data") \
    .getOrCreate()


In [4]:
df=spark.read \
    .option('header','true') \
    .option('inferSchema','true') \
    .csv('myFiles/Mental_Health_and_Social_Media_Balance_Dataset.csv')

In [5]:
df.count()

500

In [6]:
import os

os.listdir("myFiles")


['.ipynb_checkpoints',
 'Mental_Health_and_Social_Media_Balance_Dataset.csv',
 'Mental_Health.parquet']

In [7]:
df = spark.read.csv("myFiles/Mental_Health_and_Social_Media_Balance_Dataset.csv", header=True, inferSchema=True)


In [8]:
df.write.parquet('myFiles/Mental_Health.parquet', mode='overwrite')

In [9]:
df.count()

500

In [10]:
df = spark.read.parquet("myFiles/Mental_Health.parquet")
df.show(5)


+-------+---+------+----------------------+-------------------+------------------+-------------------------+------------------------+---------------------+---------------------+
|User_ID|Age|Gender|Daily_Screen_Time(hrs)|Sleep_Quality(1-10)|Stress_Level(1-10)|Days_Without_Social_Media|Exercise_Frequency(week)|Social_Media_Platform|Happiness_Index(1-10)|
+-------+---+------+----------------------+-------------------+------------------+-------------------------+------------------------+---------------------+---------------------+
|   U001| 44|  Male|                   3.1|                7.0|               6.0|                      2.0|                     5.0|             Facebook|                 10.0|
|   U002| 30| Other|                   5.1|                7.0|               8.0|                      5.0|                     3.0|             LinkedIn|                 10.0|
|   U003| 23| Other|                   7.4|                6.0|               7.0|                      1.0|  

In [11]:
df = df.withColumnRenamed("Daily_Screen_Time(hrs)", "Daily_Screen_Time") \
       .withColumnRenamed("Sleep_Quality(1-10)", "Sleep_Quality") \
       .withColumnRenamed("Stress_Level(1-10)", "Stress_Level") \
       .withColumnRenamed("Exercise_Frequency(week)", "Exercise_Frequency") \
       .withColumnRenamed("Happiness_Index(1-10)", "Happiness_Index")
       
from pyspark.sql import Row

new_row = spark.createDataFrame([Row(
    User_ID="U501",
    Age=25,
    Gender="Male",
    Daily_Screen_Time=2,
    Sleep_Quality=8,
    Stress_Level=7,
    Days_Without_Social_Media=1.0,
    Exercise_Frequency=1,
    Social_Media_Platform="Twitter",
    Happiness_Index=1
)])

df_appended = df.union(new_row)
df_appended.show()


+-------+---+------+-----------------+-------------+------------+-------------------------+------------------+---------------------+---------------+
|User_ID|Age|Gender|Daily_Screen_Time|Sleep_Quality|Stress_Level|Days_Without_Social_Media|Exercise_Frequency|Social_Media_Platform|Happiness_Index|
+-------+---+------+-----------------+-------------+------------+-------------------------+------------------+---------------------+---------------+
|   U001| 44|  Male|              3.1|          7.0|         6.0|                      2.0|               5.0|             Facebook|           10.0|
|   U002| 30| Other|              5.1|          7.0|         8.0|                      5.0|               3.0|             LinkedIn|           10.0|
|   U003| 23| Other|              7.4|          6.0|         7.0|                      1.0|               3.0|              YouTube|            6.0|
|   U004| 36|Female|              5.7|          7.0|         8.0|                      1.0|               

In [12]:
df_appended.select("User_ID","Age").show(501)

+-------+---+
|User_ID|Age|
+-------+---+
|   U001| 44|
|   U002| 30|
|   U003| 23|
|   U004| 36|
|   U005| 34|
|   U006| 38|
|   U007| 26|
|   U008| 26|
|   U009| 39|
|   U010| 39|
|   U011| 18|
|   U012| 37|
|   U013| 17|
|   U014| 39|
|   U015| 45|
|   U016| 17|
|   U017| 36|
|   U018| 48|
|   U019| 27|
|   U020| 37|
|   U021| 40|
|   U022| 42|
|   U023| 43|
|   U024| 31|
|   U025| 30|
|   U026| 18|
|   U027| 22|
|   U028| 36|
|   U029| 24|
|   U030| 33|
|   U031| 19|
|   U032| 40|
|   U033| 29|
|   U034| 24|
|   U035| 41|
|   U036| 17|
|   U037| 35|
|   U038| 43|
|   U039| 22|
|   U040| 23|
|   U041| 29|
|   U042| 32|
|   U043| 19|
|   U044| 17|
|   U045| 21|
|   U046| 19|
|   U047| 44|
|   U048| 33|
|   U049| 41|
|   U050| 49|
|   U051| 25|
|   U052| 29|
|   U053| 46|
|   U054| 30|
|   U055| 23|
|   U056| 29|
|   U057| 38|
|   U058| 36|
|   U059| 31|
|   U060| 33|
|   U061| 39|
|   U062| 41|
|   U063| 40|
|   U064| 44|
|   U065| 30|
|   U066| 16|
|   U067| 40|
|   U068| 22|
|   U0

In [14]:
from pyspark.sql.functions import col

# Example: users with screen time > 5 hrs
df_appended.filter(col("Daily_Screen_Time") > 5).show()


+-------+---+------+-----------------+-------------+------------+-------------------------+------------------+---------------------+---------------+
|User_ID|Age|Gender|Daily_Screen_Time|Sleep_Quality|Stress_Level|Days_Without_Social_Media|Exercise_Frequency|Social_Media_Platform|Happiness_Index|
+-------+---+------+-----------------+-------------+------------+-------------------------+------------------+---------------------+---------------+
|   U002| 30| Other|              5.1|          7.0|         8.0|                      5.0|               3.0|             LinkedIn|           10.0|
|   U003| 23| Other|              7.4|          6.0|         7.0|                      1.0|               3.0|              YouTube|            6.0|
|   U004| 36|Female|              5.7|          7.0|         8.0|                      1.0|               1.0|               TikTok|            8.0|
|   U005| 34|Female|              7.0|          4.0|         7.0|                      5.0|               

In [15]:
from pyspark.sql.functions import avg

df_appended.groupBy("Gender").agg(avg("Daily_Screen_Time")).show()


+------+----------------------+
|Gender|avg(Daily_Screen_Time)|
+------+----------------------+
|Female|     5.512227074235809|
| Other|     5.065217391304347|
|  Male|     5.575100401606428|
+------+----------------------+



In [18]:
from pyspark.sql.functions import when, col

df_updated = df.withColumn(
    "Happiness_Index",
    when(col("Social_Media_Platform") == "X (Twitter)", col("Happiness_Index") + 1)
    .otherwise(col("Happiness_Index"))
)

df_updated.show()


+-------+---+------+-----------------+-------------+------------+-------------------------+------------------+---------------------+---------------+
|User_ID|Age|Gender|Daily_Screen_Time|Sleep_Quality|Stress_Level|Days_Without_Social_Media|Exercise_Frequency|Social_Media_Platform|Happiness_Index|
+-------+---+------+-----------------+-------------+------------+-------------------------+------------------+---------------------+---------------+
|   U001| 44|  Male|              3.1|          7.0|         6.0|                      2.0|               5.0|             Facebook|           10.0|
|   U002| 30| Other|              5.1|          7.0|         8.0|                      5.0|               3.0|             LinkedIn|           10.0|
|   U003| 23| Other|              7.4|          6.0|         7.0|                      1.0|               3.0|              YouTube|            6.0|
|   U004| 36|Female|              5.7|          7.0|         8.0|                      1.0|               

In [19]:
df_deleted = df.filter(col("Age") >= 30)
df_deleted.show()


+-------+---+------+-----------------+-------------+------------+-------------------------+------------------+---------------------+---------------+
|User_ID|Age|Gender|Daily_Screen_Time|Sleep_Quality|Stress_Level|Days_Without_Social_Media|Exercise_Frequency|Social_Media_Platform|Happiness_Index|
+-------+---+------+-----------------+-------------+------------+-------------------------+------------------+---------------------+---------------+
|   U001| 44|  Male|              3.1|          7.0|         6.0|                      2.0|               5.0|             Facebook|           10.0|
|   U002| 30| Other|              5.1|          7.0|         8.0|                      5.0|               3.0|             LinkedIn|           10.0|
|   U004| 36|Female|              5.7|          7.0|         8.0|                      1.0|               1.0|               TikTok|            8.0|
|   U005| 34|Female|              7.0|          4.0|         7.0|                      5.0|               

In [20]:
df_updated.write.mode("overwrite").parquet("mental_health.parquet")


In [21]:
df_updated.count()

500

In [22]:
df.count()

500

In [23]:
df_deleted.count()

306

In [47]:
## df.orderBy("Age").show()
df.orderBy(col("Age").desc()).show()


+-------+---+------+-----------------+-------------+------------+-------------------------+------------------+---------------------+---------------+
|User_ID|Age|Gender|Daily_Screen_Time|Sleep_Quality|Stress_Level|Days_Without_Social_Media|Exercise_Frequency|Social_Media_Platform|Happiness_Index|
+-------+---+------+-----------------+-------------+------------+-------------------------+------------------+---------------------+---------------+
|   U050| 49|  Male|              3.6|          7.0|         5.0|                      4.0|               1.0|               TikTok|            9.0|
|   U084| 49|  Male|              7.5|          4.0|         7.0|                      2.0|               4.0|          X (Twitter)|            8.0|
|   U161| 49|  Male|              5.1|          5.0|         7.0|                      6.0|               3.0|              YouTube|            8.0|
|   U233| 49|  Male|              6.2|          6.0|         8.0|                      1.0|               

In [25]:
df.filter((col("Age") > 25) & (col("Gender") == "Male")).show()


+-------+---+------+-----------------+-------------+------------+-------------------------+------------------+---------------------+---------------+
|User_ID|Age|Gender|Daily_Screen_Time|Sleep_Quality|Stress_Level|Days_Without_Social_Media|Exercise_Frequency|Social_Media_Platform|Happiness_Index|
+-------+---+------+-----------------+-------------+------------+-------------------------+------------------+---------------------+---------------+
|   U001| 44|  Male|              3.1|          7.0|         6.0|                      2.0|               5.0|             Facebook|           10.0|
|   U006| 38|  Male|              6.6|          5.0|         7.0|                      4.0|               3.0|             LinkedIn|            8.0|
|   U009| 39|  Male|              4.7|          7.0|         7.0|                      6.0|               1.0|              YouTube|            9.0|
|   U015| 45|  Male|              6.3|          7.0|         7.0|                      4.0|               

In [26]:
from pyspark.sql.functions import avg, max, min

df.groupBy("Gender").agg(
    avg("Daily_Screen_Time"),
    max("Sleep_Quality"),
    min("Stress_Level")
).show()


+------+----------------------+------------------+-----------------+
|Gender|avg(Daily_Screen_Time)|max(Sleep_Quality)|min(Stress_Level)|
+------+----------------------+------------------+-----------------+
|Female|     5.512227074235809|              10.0|              2.0|
| Other|     5.065217391304347|               9.0|              3.0|
|  Male|      5.58951612903226|              10.0|              3.0|
+------+----------------------+------------------+-----------------+



In [27]:
from pyspark.sql.window import Window
from pyspark.sql.functions import rank

w = Window.partitionBy(col("Gender")).orderBy(col("Daily_Screen_Time").desc())

df_ranked = df.withColumn("Rank", rank().over(w))
df_ranked.show()


+-------+---+------+-----------------+-------------+------------+-------------------------+------------------+---------------------+---------------+----+
|User_ID|Age|Gender|Daily_Screen_Time|Sleep_Quality|Stress_Level|Days_Without_Social_Media|Exercise_Frequency|Social_Media_Platform|Happiness_Index|Rank|
+-------+---+------+-----------------+-------------+------------+-------------------------+------------------+---------------------+---------------+----+
|   U249| 46|Female|             10.8|          5.0|        10.0|                      2.0|               3.0|            Instagram|            4.0|   1|
|   U203| 48|Female|             10.0|          3.0|        10.0|                      3.0|               2.0|               TikTok|            4.0|   2|
|   U213| 17|Female|              9.8|          4.0|         9.0|                      5.0|               0.0|             LinkedIn|            6.0|   3|
|   U057| 38|Female|              9.7|          3.0|         9.0|           

In [28]:
df.groupBy(col("Gender")).pivot("Social_Media_Platform").avg("Stress_Level").show()


+------+-----------------+------------------+------------------+------------------+-----------------+-----------------+
|Gender|         Facebook|         Instagram|          LinkedIn|            TikTok|      X (Twitter)|          YouTube|
+------+-----------------+------------------+------------------+------------------+-----------------+-----------------+
|Female|6.630434782608695| 6.777777777777778|6.4772727272727275|6.5227272727272725|6.583333333333333|          6.71875|
| Other|              6.0| 7.666666666666667|               7.5| 7.333333333333333|              5.5|6.333333333333333|
|  Male|6.735294117647059|7.0227272727272725| 6.333333333333333| 6.604166666666667|6.478260869565218|6.621621621621622|
+------+-----------------+------------------+------------------+------------------+-----------------+-----------------+

