### Reading the dataset

In [0]:
# Load the screen-time table into `df`, which the cells below query.
SOURCE_TABLE = "workspace.default.indian_kids_screen_time"
df = spark.read.table(SOURCE_TABLE)

# Render the loaded rows in the notebook's table view.
df.display()


Age,Gender,Avg_Daily_Screen_Time_hr,Primary_Device,Exceeded_Recommended_Limit,Educational_to_Recreational_Ratio,Health_Impacts,Urban_or_Rural
14,Male,3.99,Smartphone,True,0.42,"Poor Sleep, Eye Strain",Urban
11,Female,4.61,Laptop,True,0.3,Poor Sleep,Urban
18,Female,3.73,TV,True,0.32,Poor Sleep,Urban
15,Female,1.21,Laptop,False,0.39,,Urban
12,Female,5.89,Smartphone,True,0.49,"Poor Sleep, Anxiety",Urban
14,Female,4.88,Smartphone,True,0.44,Poor Sleep,Urban
17,Male,2.97,TV,False,0.48,,Rural
10,Male,2.74,TV,True,0.54,,Urban
14,Male,4.61,Laptop,True,0.36,"Poor Sleep, Anxiety",Rural
18,Male,3.24,Tablet,True,0.48,"Poor Sleep, Obesity Risk",Urban


### Read the schema

In [0]:

# `df` was already loaded from the table by the first cell, so inspect its
# schema directly instead of reading the same table a second time and
# rebinding `df`.
df.printSchema()


root
 |-- Age: integer (nullable = true)
 |-- Gender: string (nullable = true)
 |-- Avg_Daily_Screen_Time_hr: double (nullable = true)
 |-- Primary_Device: string (nullable = true)
 |-- Exceeded_Recommended_Limit: boolean (nullable = true)
 |-- Educational_to_Recreational_Ratio: double (nullable = true)
 |-- Health_Impacts: string (nullable = true)
 |-- Urban_or_Rural: string (nullable = true)



### Summary Statistics for numeric columns

In [0]:

# Baseline statistics (count, mean, stddev, min, max) as reported by
# describe() for the numeric and string columns.
overall_stats = df.describe()
overall_stats.display()

# The same statistics without the count, restricted to the numeric columns.
numeric_cols = ["Age", "Avg_Daily_Screen_Time_hr", "Educational_to_Recreational_Ratio"]
numeric_stats = df.select(*numeric_cols).summary("mean", "stddev", "min", "max")
numeric_stats.display()


summary,Age,Gender,Avg_Daily_Screen_Time_hr,Primary_Device,Educational_to_Recreational_Ratio,Health_Impacts,Urban_or_Rural
count,9712.0,9712,9712.0,9712,9712.0,9712,9712
mean,12.979200988467875,,4.352836696869851,,0.4272261120263516,,
stddev,3.16243719631936,,1.7182324590516287,,0.0732212412185449,,
min,8.0,Female,0.0,Laptop,0.3,Anxiety,Rural
max,18.0,Male,13.89,Tablet,0.6,"Poor Sleep, Obesity Risk",Urban


summary,Age,Avg_Daily_Screen_Time_hr,Educational_to_Recreational_Ratio
mean,12.979200988467875,4.352836696869851,0.4272261120263615
stddev,3.16243719631936,1.7182324590516287,0.0732212412185449
min,8.0,0.0,0.3
max,18.0,13.89,0.6


### Count null values in columns

In [0]:
from pyspark.sql.functions import col, count, when

# One aggregate expression per column: count() skips nulls, and when() only
# yields a (non-null) literal for rows where the column itself IS null, so
# each expression evaluates to that column's number of null rows.
null_count_exprs = [
    count(when(col(name).isNull(), name)).alias(name)
    for name in df.columns
]

# Single-row frame holding the null count of every column.
null_counts = df.select(*null_count_exprs)
null_counts.display()


Age,Gender,Avg_Daily_Screen_Time_hr,Primary_Device,Exceeded_Recommended_Limit,Educational_to_Recreational_Ratio,Health_Impacts,Urban_or_Rural
0,0,0,0,0,0,0,0


### Daily Screen Time Distribution by Gender and Location (Urban/Rural)

In [0]:
# Import the Spark SQL functions under a namespace. Importing `min` and `max`
# by bare name rebinds Python's built-in min()/max() for every later cell in
# the notebook session.
from pyspark.sql import functions as F

# For each (gender, location) group: number of rows plus the mean, standard
# deviation, minimum and maximum of the daily screen time, sorted by the
# grouping keys so the printed table has a stable order.
df.groupBy("Gender", "Urban_or_Rural").agg(
    F.count("*").alias("Count"),
    F.avg("Avg_Daily_Screen_Time_hr").alias("Mean_Screen_Time"),
    F.stddev("Avg_Daily_Screen_Time_hr").alias("StdDev_Screen_Time"),
    F.min("Avg_Daily_Screen_Time_hr").alias("Min_Screen_Time"),
    F.max("Avg_Daily_Screen_Time_hr").alias("Max_Screen_Time"),
).orderBy("Gender", "Urban_or_Rural").show()


+------+--------------+-----+------------------+------------------+---------------+---------------+
|Gender|Urban_or_Rural|Count|  Mean_Screen_Time|StdDev_Screen_Time|Min_Screen_Time|Max_Screen_Time|
+------+--------------+-----+------------------+------------------+---------------+---------------+
|Female|         Rural| 1410|4.3664397163120565|1.7718825776467186|            0.0|          11.26|
|Female|         Urban| 3360| 4.310479166666675|1.7324343814815126|            0.0|          13.89|
|  Male|         Rural| 1451| 4.380758097863537|1.7034191105745486|            0.0|           12.8|
|  Male|         Urban| 3491| 4.376505299341169|1.6883336730607552|            0.0|           12.4|
+------+--------------+-----+------------------+------------------+---------------+---------------+



In [0]:
# Show the unmodified DataFrame again.
# NOTE(review): this cell appears to exist only to host the Databricks
# visualization attached to its output — confirm before removing it.
df.display()



Age,Gender,Avg_Daily_Screen_Time_hr,Primary_Device,Exceeded_Recommended_Limit,Educational_to_Recreational_Ratio,Health_Impacts,Urban_or_Rural
14,Male,3.99,Smartphone,True,0.42,"Poor Sleep, Eye Strain",Urban
11,Female,4.61,Laptop,True,0.3,Poor Sleep,Urban
18,Female,3.73,TV,True,0.32,Poor Sleep,Urban
15,Female,1.21,Laptop,False,0.39,,Urban
12,Female,5.89,Smartphone,True,0.49,"Poor Sleep, Anxiety",Urban
14,Female,4.88,Smartphone,True,0.44,Poor Sleep,Urban
17,Male,2.97,TV,False,0.48,,Rural
10,Male,2.74,TV,True,0.54,,Urban
14,Male,4.61,Laptop,True,0.36,"Poor Sleep, Anxiety",Rural
18,Male,3.24,Tablet,True,0.48,"Poor Sleep, Obesity Risk",Urban


Databricks visualization. Run in Databricks to view.

### User count for each primary device

In [0]:
from pyspark.sql.functions import count

# Number of rows (users) per primary device.
device_counts = df.groupBy("Primary_Device").agg(
    count("*").alias("User_Count")
)

# Most-used device first.
device_counts.orderBy("User_Count", ascending=False).display()


Primary_Device,User_Count
Smartphone,4568
TV,2487
Laptop,1433
Tablet,1224


Databricks visualization. Run in Databricks to view.

### Age-wise User Distribution

In [0]:
# Number of rows for each distinct age, youngest first.
age_counts = df.select("Age").groupBy("Age").count()
age_counts.orderBy("Age").display()



Age,count
8,912
9,885
10,877
11,866
12,867
13,910
14,896
15,864
16,876
17,919


Databricks visualization. Run in Databricks to view.

### Screen time distribution (rounded)

In [0]:

# Import the Spark SQL functions under a namespace. Importing `round` by bare
# name rebinds Python's built-in round() for every later cell in the
# notebook session.
from pyspark.sql import functions as F

# Round the daily screen time to zero decimal places, then count how many
# rows fall on each rounded value, ordered from lowest to highest.
(
    df.withColumn("Rounded_Screen_Time", F.round("Avg_Daily_Screen_Time_hr", 0))
    .groupBy("Rounded_Screen_Time")
    .count()
    .orderBy("Rounded_Screen_Time")
    .display()
)


Rounded_Screen_Time,count
0.0,326
1.0,251
2.0,627
3.0,1385
4.0,2415
5.0,2496
6.0,1393
7.0,523
8.0,179
9.0,68


Databricks visualization. Run in Databricks to view.

### Correlation between screen time and educational-to-recreational ratio

In [0]:

# Correlation coefficient between daily screen time and the
# educational-to-recreational ratio, computed via DataFrame.stat.corr.
corr_val = df.stat.corr(
    "Avg_Daily_Screen_Time_hr",
    "Educational_to_Recreational_Ratio",
)
print(f"Correlation between screen time and edu/recreational ratio: {corr_val}")


Correlation between screen time and edu/recreational ratio: -0.08755222315597079


### Average Screen Time by Gender

In [0]:
from pyspark.sql.functions import avg

# Mean daily screen time for each gender.
gender_avg = df.groupBy("Gender").agg(
    avg("Avg_Daily_Screen_Time_hr").alias("Avg_Screen_Time")
)

# Alphabetical by gender so the row order is stable.
gender_avg.orderBy("Gender").display()


Gender,Avg_Screen_Time
Female,4.327020964360584
Male,4.377753945770932


Databricks visualization. Run in Databricks to view.

### Average Educational to Recreational Screen Time Ratio by Area Type (Urban vs Rural)

In [0]:
# Mean educational-to-recreational ratio for each area type.
area_ratio = df.groupBy("Urban_or_Rural").agg(
    avg("Educational_to_Recreational_Ratio").alias("Avg_EdRec_Ratio")
)

# Alphabetical by area type so the row order is stable.
area_ratio.orderBy("Urban_or_Rural").display()


Urban_or_Rural,Avg_EdRec_Ratio
Rural,0.4278643830828416
Urban,0.4269595679462812


### Screen Time Statistics by Primary Device Used

In [0]:
# Import the Spark SQL functions under a namespace. This avoids rebinding
# Python's built-in min()/max(), and it makes the cell self-contained: the
# original import list omitted `count`, so the cell only worked because an
# earlier cell had already imported it.
from pyspark.sql import functions as F

# For each primary device: number of rows plus the mean, standard deviation,
# minimum and maximum of the daily screen time, most-used device first.
df.groupBy("Primary_Device").agg(
    F.count("*").alias("Total_Users"),
    F.avg("Avg_Daily_Screen_Time_hr").alias("Avg_Screen_Time"),
    F.stddev("Avg_Daily_Screen_Time_hr").alias("StdDev_Screen_Time"),
    F.min("Avg_Daily_Screen_Time_hr").alias("Min_Screen_Time"),
    F.max("Avg_Daily_Screen_Time_hr").alias("Max_Screen_Time"),
).orderBy("Total_Users", ascending=False).show()


+--------------+-----------+-----------------+------------------+---------------+---------------+
|Primary_Device|Total_Users|  Avg_Screen_Time|StdDev_Screen_Time|Min_Screen_Time|Max_Screen_Time|
+--------------+-----------+-----------------+------------------+---------------+---------------+
|    Smartphone|       4568|  4.3889251313485|1.6518449467596912|            0.0|          13.89|
|            TV|       2487| 4.28775231202252| 1.923680596559661|            0.0|           12.8|
|        Laptop|       1433| 4.45908583391486|1.2050619789927763|           0.32|           8.27|
|        Tablet|       1224|4.226004901960775| 1.996952658798013|            0.0|          12.09|
+--------------+-----------+-----------------+------------------+---------------+---------------+

