In [7]:
import pandas as pd
import numpy as np

# Load the dataset (replace the path with your file path)
df = pd.read_csv('social_media_entertainment_data.csv')

# Display the first few rows
print(df.head())

# Get an overview of the dataset (column names, non-null counts, dtypes, memory use).
# DataFrame.info() writes its report to stdout itself and returns None, so it is
# called bare: wrapping it in print() would append a stray "None" line.
df.info()

   User ID  Age  Gender  Country  Daily Social Media Time (hrs)  \
0        1   32   Other  Germany                           4.35   
1        2   62   Other    India                           4.96   
2        3   51  Female      USA                           6.78   
3        4   44  Female    India                           5.06   
4        5   21   Other  Germany                           2.57   

   Daily Entertainment Time (hrs)  Social Media Platforms Used  \
0                            4.08                            5   
1                            4.21                            2   
2                            1.77                            4   
3                            9.21                            3   
4                            1.30                            4   

  Primary Platform  Daily Messaging Time (hrs)  \
0           TikTok                        0.35   
1          YouTube                        2.55   
2         Facebook                        2.09   


In [3]:
# Step 1: Data-quality audit -- null counts per column and repeated rows

# Per-column tally of null entries (isna is the same check as isnull)
missing_values = df.isna().sum()

# Rows that exactly repeat an earlier row; the first occurrence is not flagged
duplicates = df.loc[df.duplicated(keep='first')]

# Report both findings as plain text
print(f"Missing Values:\n{missing_values}")
print(f"Duplicate Rows:\n{duplicates}")

Missing Values:
User ID                                       0
Age                                           0
Gender                                        0
Country                                       0
Daily Social Media Time (hrs)                 0
Daily Entertainment Time (hrs)                0
Social Media Platforms Used                   0
Primary Platform                              0
Daily Messaging Time (hrs)                    0
Daily Video Content Time (hrs)                0
Daily Gaming Time (hrs)                       0
Occupation                                    0
Marital Status                                0
Monthly Income (USD)                          0
Device Type                                   0
Internet Speed (Mbps)                         0
Subscription Platforms                        0
Average Sleep Time (hrs)                      0
Physical Activity Time (hrs)                  0
Reading Time (hrs)                            0
Work/Study Time (hrs)   

In [11]:
# Step 2: Rename columns for consistency
#
# Each raw header (spaces, units in parentheses, slashes) is mapped to an
# underscore-separated identifier. The entries are grouped by theme for
# readability only; the order of the mapping has no effect on the result.
# The frame is modified in place.
df.rename(
    {
        # Identity, household and money
        'User ID': 'User_ID',
        'Marital Status': 'Marital_Status',
        'Parental Status': 'Parental_Status',
        'Monthly Income (USD)': 'Monthly_Income',
        'Monthly Expenditure on Entertainment (USD)': 'Monthly_Expenditure_Entertainment',

        # Time budgets (hours)
        'Daily Social Media Time (hrs)': 'Daily_Social_Media_Time',
        'Daily Entertainment Time (hrs)': 'Daily_Entertainment_Time',
        'Daily Messaging Time (hrs)': 'Daily_Messaging_Time',
        'Daily Video Content Time (hrs)': 'Daily_Video_Content_Time',
        'Daily Gaming Time (hrs)': 'Daily_Gaming_Time',
        'Daily Music Listening Time (hrs)': 'Daily_Music_Listening_Time',
        'Screen Time (hrs)': 'Screen_Time',
        'Average Sleep Time (hrs)': 'Average_Sleep_Time',
        'Physical Activity Time (hrs)': 'Physical_Activity_Time',
        'Reading Time (hrs)': 'Reading_Time',
        'Work/Study Time (hrs)': 'Work_Study_Time',
        'Time on Educational Platforms (hrs)': 'Time_Educational_Platforms',
        'Time Spent in Online Communities (hrs)': 'Time_Online_Communities',
        'News Consumption Time (hrs)': 'News_Consumption_Time',

        # Platforms, devices and connectivity
        'Social Media Platforms Used': 'Social_Media_Platforms_Used',
        'Primary Platform': 'Primary_Platform',
        'Subscription Platforms': 'Subscription_Platforms',
        'Preferred Entertainment Platform': 'Preferred_Entertainment_Platform',
        'Preferred Content Type': 'Preferred_Content_Type',
        'Preferred Device for Entertainment': 'Preferred_Device_Entertainment',
        'Device Type': 'Device_Type',
        'Internet Speed (Mbps)': 'Internet_Speed',
        'Data Plan Used': 'Data_Plan_Used',

        # Engagement
        'Primary Social Media Goal': 'Primary_Social_Media_Goal',
        'Notifications Received Daily': 'Notifications_Received',
        'Ad Interaction Count': 'Ad_Interaction_Count',

        # Scale-based ratings and well-being
        'Social Media Fatigue Level (scale 1-10)': 'Social_Media_Fatigue',
        'Tech Savviness Level (scale 1-10)': 'Tech_Savviness_Level',
        'Social Isolation Feeling (scale 1-10)': 'Social_Isolation_Feelings',
        'Sleep Quality (scale 1-10)': 'Sleep_Quality',
        'Digital Well-being Awareness': 'Digital_Well_Being_Awareness',
    },
    axis='columns',
    inplace=True,
)

In [13]:
# Print the first five rows as plain text to inspect the current column headers
print(df.head(n=5))

   User_ID  Age  Gender  Country  Daily_Social_Media_Time  \
0        1   32   Other  Germany                     4.35   
1        2   62   Other    India                     4.96   
2        3   51  Female      USA                     6.78   
3        4   44  Female    India                     5.06   
4        5   21   Other  Germany                     2.57   

   Daily_Entertainment_Time  Social_Media_Platforms_Used Primary_Platform  \
0                      4.08                            5           TikTok   
1                      4.21                            2          YouTube   
2                      1.77                            4         Facebook   
3                      9.21                            3          YouTube   
4                      1.30                            4           TikTok   

   Daily_Messaging_Time  Daily_Video_Content_Time  ...  Ad_Interaction_Count  \
0                  0.35                      5.43  ...                    20   
1         

In [15]:
# Step 3: Check and validate data types

# One entry per column with its pandas dtype; this cell only inspects the
# dtypes and converts nothing.
dtype_overview = df.dtypes
print(f"Original Data Types:\n{dtype_overview}")


Original Data Types:
User_ID                                int64
Age                                    int64
Gender                                object
Country                               object
Daily_Social_Media_Time              float64
Daily_Entertainment_Time             float64
Social_Media_Platforms_Used            int64
Primary_Platform                      object
Daily_Messaging_Time                 float64
Daily_Video_Content_Time             float64
Daily_Gaming_Time                    float64
Occupation                            object
Marital_Status                        object
Monthly_Income                       float64
Device_Type                           object
Internet_Speed                       float64
Subscription_Platforms                 int64
Average_Sleep_Time                   float64
Physical_Activity_Time               float64
Reading_Time                         float64
Work_Study_Time                      float64
Screen_Time                       

In [17]:
# Step 4: Exploratory Data Analysis (EDA)

# Number of distinct values in each object-dtype (text) column
unique_values = df.select_dtypes(include='object').nunique()

# count / mean / std / min / quartiles / max for the numeric columns
summary_stats = df.describe()

# Report the numeric summary first, then the categorical cardinalities
print(f"Summary Statistics:\n{summary_stats}")
print(f"Unique Values in Categorical Columns:\n{unique_values}")

Summary Statistics:
             User_ID            Age  Daily_Social_Media_Time  \
count  300000.000000  300000.000000            300000.000000   
mean   150000.500000      38.530547                 4.254808   
std     86602.684716      15.005038                 2.165604   
min         1.000000      13.000000                 0.500000   
25%     75000.750000      26.000000                 2.380000   
50%    150000.500000      39.000000                 4.260000   
75%    225000.250000      52.000000                 6.130000   
max    300000.000000      65.000000                 8.000000   

       Daily_Entertainment_Time  Social_Media_Platforms_Used  \
count             300000.000000                300000.000000   
mean                   5.244080                     3.000143   
std                    2.741804                     1.413682   
min                    0.500000                     1.000000   
25%                    2.870000                     2.000000   
50%                

In [19]:
# Step 5: Create new columns for easier analysis
#
# Both ratios below divide by a time column. A zero denominator is mapped to NaN
# first, so the result is NaN rather than +/-inf; infinities would not be counted
# by a later isnull() check, whereas NaN is.

# 1. Online Activity Share: video + gaming + messaging time as a percentage of Screen_Time.
#    NOTE(review): the printed sample shows values well above 100, so these activities
#    are evidently not a subset of Screen_Time -- confirm the intended denominator
#    before interpreting this column as a "share".
df['Online_Activity_Share'] = (
    (df['Daily_Video_Content_Time'] + df['Daily_Gaming_Time'] + df['Daily_Messaging_Time'])
    / df['Screen_Time'].replace(0, np.nan)
    * 100
)

# 2. High/Low Tech-Savviness: scores of 6 or more are 'High', everything else
#    (including missing scores) is 'Low'. Vectorised with np.where instead of a
#    per-row Python lambda; the labels produced are the same.
df['High_Low_Tech_Savviness'] = np.where(df['Tech_Savviness_Level'] >= 6, 'High', 'Low')

# 3. Work-to-Entertainment Ratio: work/study hours per hour of entertainment
df['Work_to_Entertainment_Ratio'] = (
    df['Work_Study_Time'] / df['Daily_Entertainment_Time'].replace(0, np.nan)
)

# View the new columns added
print(f"New Columns:\n{df[['Online_Activity_Share', 'High_Low_Tech_Savviness', 'Work_to_Entertainment_Ratio']].head()}")

New Columns:
   Online_Activity_Share High_Low_Tech_Savviness  Work_to_Entertainment_Ratio
0             142.701228                    High                     0.492647
1             169.739953                    High                     0.681710
2             169.887640                    High                     2.209040
3             115.839861                    High                     0.206298
4             155.602241                    High                     2.053846


In [21]:
# Remove the Online_Activity_Share column from the working frame.
# errors='ignore' makes the cell safe to re-run: once the column is gone, a second
# execution is a no-op instead of raising KeyError. Assigning the result back
# (rather than inplace=True) follows the usual pandas idiom.
df = df.drop(columns=['Online_Activity_Share'], errors='ignore')

In [23]:
# Print the first five rows as plain text to inspect the frame's current columns
print(df.head(n=5))

   User_ID  Age  Gender  Country  Daily_Social_Media_Time  \
0        1   32   Other  Germany                     4.35   
1        2   62   Other    India                     4.96   
2        3   51  Female      USA                     6.78   
3        4   44  Female    India                     5.06   
4        5   21   Other  Germany                     2.57   

   Daily_Entertainment_Time  Social_Media_Platforms_Used Primary_Platform  \
0                      4.08                            5           TikTok   
1                      4.21                            2          YouTube   
2                      1.77                            4         Facebook   
3                      9.21                            3          YouTube   
4                      1.30                            4           TikTok   

   Daily_Messaging_Time  Daily_Video_Content_Time  ...  Parental_Status  \
0                  0.35                      5.43  ...              Yes   
1                  2

In [25]:
# Step 6: Re-check for missing values after the transformations

# Per-column count of null entries in the frame as it stands now
missing_values_after_transformation = df.isna().sum()

print(f"Missing Values After Transformation:\n{missing_values_after_transformation}")

Missing Values After Transformation:
User_ID                              0
Age                                  0
Gender                               0
Country                              0
Daily_Social_Media_Time              0
Daily_Entertainment_Time             0
Social_Media_Platforms_Used          0
Primary_Platform                     0
Daily_Messaging_Time                 0
Daily_Video_Content_Time             0
Daily_Gaming_Time                    0
Occupation                           0
Marital_Status                       0
Monthly_Income                       0
Device_Type                          0
Internet_Speed                       0
Subscription_Platforms               0
Average_Sleep_Time                   0
Physical_Activity_Time               0
Reading_Time                         0
Work_Study_Time                      0
Screen_Time                          0
Notifications_Received               0
Daily_Music_Listening_Time           0
Preferred_Content_Type     

In [27]:
# Write the cleaned frame to CSV in the working directory;
# index=False leaves the row index out of the file.
df.to_csv(path_or_buf='cleaned_social_media_dataset.csv', index=False)