In [1]:
from pyspark.sql import SparkSession
spark = SparkSession.builder \
    .appName("CSV to Table") \
    .getOrCreate()

print("âœ… Fresh Spark session created")

âœ… Fresh Spark session created


In [2]:
csv_file = "myFiles/Mental_Health_and_Social_Media_Balance_Dataset.csv"

print(f"ðŸ“‚ Reading: {csv_file}")

# Read CSV directly with Spark
df = spark.read \
    .option("header", "true") \
    .option("inferSchema", "true") \
    .csv(csv_file)

print(f"âœ… CSV read successfully!")
print(f"Rows: {df.count()}, Columns: {len(df.columns)}")

# Show what we have
print("\nðŸ“‹ Original column names:")
for col in df.columns:
    print(f"  - {col}")

print("\nðŸ‘€ First 3 rows:")
df.show(3, truncate=False)

ðŸ“‚ Reading: myFiles/Mental_Health_and_Social_Media_Balance_Dataset.csv
âœ… CSV read successfully!
Rows: 500, Columns: 10

ðŸ“‹ Original column names:
  - User_ID
  - Age
  - Gender
  - Daily_Screen_Time(hrs)
  - Sleep_Quality(1-10)
  - Stress_Level(1-10)
  - Days_Without_Social_Media
  - Exercise_Frequency(week)
  - Social_Media_Platform
  - Happiness_Index(1-10)

ðŸ‘€ First 3 rows:
+-------+---+------+----------------------+-------------------+------------------+-------------------------+------------------------+---------------------+---------------------+
|User_ID|Age|Gender|Daily_Screen_Time(hrs)|Sleep_Quality(1-10)|Stress_Level(1-10)|Days_Without_Social_Media|Exercise_Frequency(week)|Social_Media_Platform|Happiness_Index(1-10)|
+-------+---+------+----------------------+-------------------+------------------+-------------------------+------------------------+---------------------+---------------------+
|U001   |44 |Male  |3.1                   |7.0                |6.0            

In [3]:
print("\nðŸ§¹ Cleaning column names...")

# Clean columns with parentheses
column_mapping = {
    "Daily_Screen_Time(hrs)": "daily_screen_time_hrs",
    "Sleep_Quality(1-10)": "sleep_quality",
    "Stress_Level(1-10)": "stress_level",
    "Exercise_Frequency(week)": "exercise_frequency",
    "Happiness_Index(1-10)": "happiness_index"
}


ðŸ§¹ Cleaning column names...


In [4]:
for old_name, new_name in column_mapping.items():
    if old_name in df.columns:
        df = df.withColumnRenamed(old_name, new_name)
        print(f"  âœ“ {old_name} â†’ {new_name}")

print("\nâœ… Final column names:")
print(df.columns)

  âœ“ Daily_Screen_Time(hrs) â†’ daily_screen_time_hrs
  âœ“ Sleep_Quality(1-10) â†’ sleep_quality
  âœ“ Stress_Level(1-10) â†’ stress_level
  âœ“ Exercise_Frequency(week) â†’ exercise_frequency
  âœ“ Happiness_Index(1-10) â†’ happiness_index

âœ… Final column names:
['User_ID', 'Age', 'Gender', 'daily_screen_time_hrs', 'sleep_quality', 'stress_level', 'Days_Without_Social_Media', 'exercise_frequency', 'Social_Media_Platform', 'happiness_index']


In [5]:
table_name = "social_media_analysis"

print(f"\nðŸ’¾ Creating table: {table_name}")

# Method 1: Direct saveAsTable (uses default format)
df.write \
    .mode("overwrite") \
    .saveAsTable(table_name)

print(f"âœ… Table '{table_name}' created successfully!")


ðŸ’¾ Creating table: social_media_analysis
âœ… Table 'social_media_analysis' created successfully!


In [6]:
print("\n" + "="*50)
print("VERIFICATION")
print("="*50)



VERIFICATION


In [7]:
# 1. Show tables
print("\n1. All tables in database:")
spark.sql("SHOW TABLES").show()


1. All tables in database:
+---------+--------------------+-----------+
|namespace|           tableName|isTemporary|
+---------+--------------------+-----------+
|  default|social_media_anal...|      false|
+---------+--------------------+-----------+



In [8]:
print(f"\n2. Record count in '{table_name}':")
spark.sql(f"SELECT COUNT(*) as total_rows FROM {table_name}").show()


2. Record count in 'social_media_analysis':
+----------+
|total_rows|
+----------+
|       500|
+----------+



In [9]:
print(f"\n3. Sample data from '{table_name}':")
spark.sql(f"SELECT * FROM {table_name} LIMIT 5").show()


3. Sample data from 'social_media_analysis':
+-------+---+------+---------------------+-------------+------------+-------------------------+------------------+---------------------+---------------+
|User_ID|Age|Gender|daily_screen_time_hrs|sleep_quality|stress_level|Days_Without_Social_Media|exercise_frequency|Social_Media_Platform|happiness_index|
+-------+---+------+---------------------+-------------+------------+-------------------------+------------------+---------------------+---------------+
|   U001| 44|  Male|                  3.1|          7.0|         6.0|                      2.0|               5.0|             Facebook|           10.0|
|   U002| 30| Other|                  5.1|          7.0|         8.0|                      5.0|               3.0|             LinkedIn|           10.0|
|   U003| 23| Other|                  7.4|          6.0|         7.0|                      1.0|               3.0|              YouTube|            6.0|
|   U004| 36|Female|                

In [10]:
print(f"\n4. Table schema:")
spark.sql(f"DESCRIBE {table_name}").show()


4. Table schema:
+--------------------+---------+-------+
|            col_name|data_type|comment|
+--------------------+---------+-------+
|             User_ID|   string|   NULL|
|                 Age|      int|   NULL|
|              Gender|   string|   NULL|
|daily_screen_time...|   double|   NULL|
|       sleep_quality|   double|   NULL|
|        stress_level|   double|   NULL|
|Days_Without_Soci...|   double|   NULL|
|  exercise_frequency|   double|   NULL|
|Social_Media_Plat...|   string|   NULL|
|     happiness_index|   double|   NULL|
+--------------------+---------+-------+



In [11]:
print(f"\n5. Analysis query:")
spark.sql(f"""
    SELECT 
        Gender,
        Social_Media_Platform,
        COUNT(*) as user_count,
        ROUND(AVG(daily_screen_time_hrs), 2) as avg_screen_time,
        ROUND(AVG(happiness_index), 2) as avg_happiness
    FROM {table_name}
    GROUP BY Gender, Social_Media_Platform
    ORDER BY avg_happiness DESC
    LIMIT 10
""").show()

print("\nðŸŽ‰ SUCCESS! Your CSV is now a Spark table ready for queries!")


5. Analysis query:
+------+---------------------+----------+---------------+-------------+
|Gender|Social_Media_Platform|user_count|avg_screen_time|avg_happiness|
+------+---------------------+----------+---------------+-------------+
| Other|          X (Twitter)|         6|           3.78|         9.33|
| Other|             Facebook|         1|            4.6|          9.0|
| Other|             LinkedIn|         4|           5.68|          9.0|
| Other|              YouTube|         6|           4.82|         8.83|
|Female|          X (Twitter)|        36|           5.12|         8.67|
|  Male|             Facebook|        34|           5.52|         8.56|
|Female|             LinkedIn|        44|           5.16|         8.55|
|  Male|          X (Twitter)|        46|           5.65|         8.54|
|  Male|               TikTok|        48|           5.46|         8.48|
|  Male|             LinkedIn|        39|           5.41|         8.44|
+------+---------------------+----------+---