<a href="https://colab.research.google.com/github/SuvoneathSvay/Mental-Wellness-vs-Screentime/blob/main/Mental_Wellness_vs_Screen_Time.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import numpy as np
import pandas as pd

import os
# Print the full path of every file found under the Kaggle input directory.
# os.walk yields nothing for a missing directory, so this is silent elsewhere.
kaggle_paths = (
    os.path.join(root, name)
    for root, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for path in kaggle_paths:
    print(path)



In [None]:
# Import modeling, metrics and plotting libraries
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error, classification_report, roc_auc_score, accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score
from sklearn.impute import SimpleImputer



In [None]:
# Load data
# Location of the dataset CSV that the next cell reads with pd.read_csv.
# NOTE(review): this is a hard-coded absolute path under /content — presumably
# the Colab working directory; it must be changed when running anywhere else.
DATA_PATH = '/content/ScreenTime vs MentalWellness.csv'

In [None]:
# Read the dataset from DATA_PATH and report its dimensions and schema.
df = pd.read_csv(DATA_PATH)
print('Rows, Columns:', df.shape)
# DataFrame.info() writes its summary to stdout itself and returns None, so it
# must not be wrapped in print() (that would emit a stray "None" line).
df.info()

Rows, Columns: (400, 16)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 400 entries, 0 to 399
Data columns (total 16 columns):
 #   Column                       Non-Null Count  Dtype  
---  ------                       --------------  -----  
 0   user_id                      400 non-null    object 
 1   age                          400 non-null    int64  
 2   gender                       400 non-null    object 
 3   occupation                   400 non-null    object 
 4   work_mode                    400 non-null    object 
 5   screen_time_hours            400 non-null    float64
 6   work_screen_hours            400 non-null    float64
 7   leisure_screen_hours         400 non-null    float64
 8   sleep_hours                  400 non-null    float64
 9   sleep_quality_1_5            400 non-null    int64  
 10  stress_level_0_10            400 non-null    float64
 11  productivity_0_100           400 non-null    float64
 12  exercise_minutes_per_week    400 non-null    int64  


In [None]:
# Cell 3: cleaning
# Convert the three text columns to pandas categoricals, then discard any
# column that contains no values at all.
df = (
    df.astype({name: 'category' for name in ('gender', 'occupation', 'work_mode')})
      .dropna(axis=1, how='all')
)

print('\nMissing values per column:')
print(df.isna().sum())

# Column name lists by dtype family (float64/int64 vs category/object).
num_cols = list(df.select_dtypes(include=['float64', 'int64']).columns)
cat_cols = list(df.select_dtypes(include=['category', 'object']).columns)

# Median-impute the numeric columns, but only when at least one value is missing.
imp_num = SimpleImputer(strategy='median')
numeric_has_gaps = df[num_cols].isna().any().any()
if numeric_has_gaps:
    df[num_cols] = imp_num.fit_transform(df[num_cols])



In [None]:
# Cell 4: feature engineering

# Binary target: 1 when stress_level_0_10 is strictly greater than 6, else 0.
df['high_stress'] = (df['stress_level_0_10'] > 6).astype(int)

# Bucket daily screen time into four labelled ranges.
# pd.cut intervals are left-open/right-closed by default, so without
# include_lowest=True a value of exactly 0 hours would fall outside the first
# bin and become NaN. Values above 24 still map to NaN.
bins = [0,4,7,10,24]
labels = ['Low','Moderate','High','Very High']
df['screen_time_cat'] = pd.cut(
    df['screen_time_hours'], bins=bins, labels=labels, include_lowest=True
)

# Composite score: exercise minutes and social hours are each scaled by their
# observed maximum (weights 40 and 20), and sleep quality is scaled by 5
# (weight 40).
df['lifestyle_score'] = (
    (df['exercise_minutes_per_week'] / df['exercise_minutes_per_week'].max()) * 40 +
    (df['social_hours_per_week'] / df['social_hours_per_week'].max()) * 20 +
    (df['sleep_quality_1_5'] / 5.0) * 40
)

In [None]:
# Cell 5: visualizations

# Side-by-side histograms (with KDE overlay) of screen time and wellness index.
fig, (ax_screen, ax_wellness) = plt.subplots(1, 2, figsize=(10, 4))
sns.histplot(df['screen_time_hours'], kde=True, ax=ax_screen)
ax_screen.set_title('Screen time (hours/day)')
sns.histplot(df['mental_wellness_index_0_100'], kde=True, ax=ax_wellness)
ax_wellness.set_title('Mental Wellness Index')
fig.tight_layout()
plt.show()

# Boxplot of screen time for each occupation.
fig, ax_occupation = plt.subplots(figsize=(10, 5))
sns.boxplot(data=df, x='occupation', y='screen_time_hours', ax=ax_occupation)
ax_occupation.set_title('Screen time by Occupation')
plt.show()


In [None]:
# Cell 6: relationship plots

# Wellness index against screen time, coloured by gender.
fig, ax_wellness = plt.subplots(figsize=(6, 5))
sns.scatterplot(
    data=df,
    x='screen_time_hours',
    y='mental_wellness_index_0_100',
    hue='gender',
    ax=ax_wellness,
)
ax_wellness.set_title('Screen time vs Mental Wellness')
plt.show()

# Sleep hours against screen time.
fig, ax_sleep = plt.subplots(figsize=(6, 5))
sns.scatterplot(data=df, x='screen_time_hours', y='sleep_hours', ax=ax_sleep)
ax_sleep.set_title('Screen time vs Sleep hours')
plt.show()

# Correlation heatmap over the float64/int64 columns only.
num_df = df.select_dtypes(include=['float64', 'int64'])
cm = num_df.corr()
fig, ax_corr = plt.subplots(figsize=(10, 8))
sns.heatmap(cm, annot=True, fmt='.2f', cmap='vlag', ax=ax_corr)
ax_corr.set_title('Correlation matrix')
plt.show()

In [None]:
# Cell 7: statistical checks
# Correlation between screen time and mental wellness.

from scipy.stats import pearsonr, spearmanr

screen_time = df['screen_time_hours']
wellness = df['mental_wellness_index_0_100']

# Each call returns a result object holding the coefficient and its p-value;
# the whole object is printed.
pearson = pearsonr(screen_time, wellness)
spearman = spearmanr(screen_time, wellness)
print('Pearson r:', pearson)
print('Spearman rho:', spearman)

# Wellness distribution within each screen-time bucket, shown in the order
# given by `labels`.
fig, ax_category = plt.subplots(figsize=(8, 5))
sns.boxplot(
    data=df,
    x='screen_time_cat',
    y='mental_wellness_index_0_100',
    order=labels,
    ax=ax_category,
)
ax_category.set_title('Wellness by Screen Time Category')
plt.show()

In [None]:
# Cell 8: clustering

from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans

cluster_features = ['screen_time_hours','sleep_hours','stress_level_0_10','mental_wellness_index_0_100','productivity_0_100']

# Cluster only rows with no missing values, after standardising each feature.
Xc = df[cluster_features].dropna()
scaler = StandardScaler()
Xc_s = scaler.fit_transform(Xc)

# Elbow Method: record the inertia for k = 1..6.
# n_init is passed explicitly because scikit-learn changed its default from 10
# to 'auto' in version 1.4; pinning it keeps results the same across versions
# and avoids the FutureWarning emitted by 1.2/1.3.
inertia = []
K = range(1,7)
for k in K:
    km = KMeans(n_clusters=k, n_init=10, random_state=42).fit(Xc_s)
    inertia.append(km.inertia_)

plt.figure(figsize=(6,4))
plt.plot(K, inertia, '-o')
plt.xlabel('k')
plt.ylabel('inertia')
plt.title('Elbow Method for KMeans')
plt.show()

# Fit KMeans with k=3
kmeans = KMeans(n_clusters=3, n_init=10, random_state=42).fit(Xc_s)
# Predict on the already-scaled complete rows and align the labels back to df
# by index. Predicting on df[cluster_features] directly would raise if any row
# had a missing value, because the model was fitted on the dropna() subset.
# Rows excluded by dropna() (if any) receive NaN as their cluster.
df['cluster'] = pd.Series(kmeans.predict(Xc_s), index=Xc.index)
plt.figure(figsize=(8,6))
sns.scatterplot(x='screen_time_hours', y='mental_wellness_index_0_100', hue='cluster', data=df, palette='deep')
plt.title('Clusters: Screen time vs Wellness')
plt.show()

# Cluster centers converted back from standardised units to original units.
centers = scaler.inverse_transform(kmeans.cluster_centers_)
centers_df = pd.DataFrame(centers, columns=cluster_features)
print('Cluster centers:')
print(centers_df)

In [None]:
# Cell 9: regression


def _print_regression_metrics(heading, y_true, y_pred):
    """Print `heading` followed by RMSE, MAE and R2 for the given predictions."""
    print(heading)
    print('RMSE:', np.sqrt(mean_squared_error(y_true, y_pred)))
    print('MAE:', mean_absolute_error(y_true, y_pred))
    print('R2:', r2_score(y_true, y_pred))


# Predictors and target for the wellness-index regression.
features = [
    'age',
    'screen_time_hours',
    'sleep_hours',
    'sleep_quality_1_5',
    'stress_level_0_10',
    'productivity_0_100',
    'exercise_minutes_per_week',
    'social_hours_per_week',
]
X = df[features]
y = df['mental_wellness_index_0_100']

# Hold out 20% of the rows for evaluation (fixed seed for a repeatable split).
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Model 1: ordinary least-squares linear regression.
lr = LinearRegression()
lr.fit(X_train, y_train)
y_pred_lr = lr.predict(X_test)
_print_regression_metrics('\nLinear Regression Metrics:', y_test, y_pred_lr)

# Model 2: random forest regressor with 200 trees.
rf = RandomForestRegressor(n_estimators=200, random_state=42)
rf.fit(X_train, y_train)
y_pred_rf = rf.predict(X_test)
_print_regression_metrics('\nRandom Forest Metrics:', y_test, y_pred_rf)

# Model 3: TODO - not chosen yet.

# Random-forest feature importances, largest first.
importances = pd.Series(rf.feature_importances_, index=features)
fi = importances.sort_values(ascending=False)
print('\nFeature importances (RF):')
print(fi)

fig, ax_importance = plt.subplots(figsize=(8, 4))
fi.plot(kind='bar', ax=ax_importance)
ax_importance.set_title('Feature importances')
plt.show()

In [None]:
# Cell 10: baseline classification - predict the high_stress flag

# Predictors for the classifier; stress_level_0_10 is not in this list (the
# target high_stress is derived from it).
clf_features = ['age','screen_time_hours','sleep_hours','sleep_quality_1_5','productivity_0_100','exercise_minutes_per_week']
# NOTE: this rebinds Xc, which the clustering cell also uses for its feature matrix.
Xc = df[clf_features]
yc = df['high_stress']
# Hold out 20% of the rows for evaluation (fixed seed for a repeatable split).
Xtr, Xte, ytr, yte = train_test_split(Xc, yc, test_size=0.2, random_state=42)

# Random Forest Classifier
rfc = RandomForestClassifier(n_estimators=200, random_state=42)
rfc.fit(Xtr, ytr)
yp = rfc.predict(Xte)
# Second column of predict_proba: probability of the second class in rfc.classes_.
yp_prob = rfc.predict_proba(Xte)[:,1]

print('\nRandom Forest Classifier Report:')
print(classification_report(yte, yp))
print('Accuracy:', accuracy_score(yte, yp))
try:
    print('ROC AUC:', roc_auc_score(yte, yp_prob))
except ValueError as err:
    # roc_auc_score can raise ValueError (e.g. when the test labels contain a
    # single class). Report the reason instead of silently skipping the metric;
    # any other exception type is left to propagate.
    print('ROC AUC could not be computed:', err)

# Feature importance for classifier, largest first.
fi_clf = pd.Series(rfc.feature_importances_, index=clf_features).sort_values(ascending=False)
print('\nClassifier Feature importances:')
print(fi_clf)

plt.figure(figsize=(8,4))
fi_clf.plot(kind='bar')
plt.title('Classifier Feature importances')
plt.show()