In [None]:
# Import libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans

In [None]:
# Read the cleaned student social-media addiction dataset into a DataFrame
df_ml = pd.read_csv(
    filepath_or_buffer='../data/cleaned_data/student_social_media_addiction_cleaned.csv'
)

In [None]:
# Inspect the available column names so that each feature can be assigned to
# either the numerical or the categorical group used by the preprocessing step
df_ml.columns

Index(['new_student_id', 'age', 'gender', 'relationship_status',
       'academic_level', 'country', 'continents', 'most_used_platform',
       'average_daily_usage_minutes', 'average_daily_use_by_hour',
       'sleep_per_night_minutes', 'sleep_hours_per_night',
       'conflicts_over_social_media', 'affects_academic_performance',
       'addicted_score', 'mental_health_score'],
      dtype='object')

In [None]:
# Feature groups fed to the preprocessing step.
# Numerical features (to be scaled)
num_cols = [
    'age',
    'average_daily_usage_minutes',
    'sleep_per_night_minutes',
    'conflicts_over_social_media',
    'addicted_score',
    'mental_health_score',
]

# Categorical features (to be one-hot encoded)
cat_cols = [
    'gender',
    'relationship_status',
    'academic_level',
    'country',
    'continents',
    'most_used_platform',
    'affects_academic_performance',
]

In [22]:
# Column-wise preprocessing: standardise the numerical features and one-hot
# encode the categorical ones (first level of each category dropped, dense output)
preprocessor = ColumnTransformer(
    transformers=[
        ('numeric', StandardScaler(), num_cols),
        ('categorical', OneHotEncoder(drop='first', sparse_output=False), cat_cols),
    ]
)

In [None]:
# Assemble the clustering workflow: preprocessing, then PCA, then K-Means

cluster_pipeline = Pipeline(
    steps=[
        ('preprocessor', preprocessor),
        # Placeholder component count; to be revised based on explained variance
        ('pca', PCA(n_components=10, random_state=42)),
        # Placeholder cluster count; to be revised based on Elbow/Silhouette
        ('kmeans', KMeans(n_clusters=10, random_state=42)),
    ]
)

In [39]:
# Fit the preprocessing (scaling + one-hot encoding) and apply it to the full dataset
X_processed = preprocessor.fit_transform(df_ml)

# Number of principal components to evaluate
n_components = 10

# Seed the PCA the same way as the PCA step of `cluster_pipeline` (random_state=42)
# so that the result is reproducible if a randomized SVD solver gets selected
pca_full = PCA(n_components=n_components, random_state=42).fit(X_processed)
x_PCA = pca_full.transform(X_processed)

# Per-component share of the total variance in percent (rounded for display only)
ComponentsList = [f"Component {i + 1}" for i in range(n_components)]
dfExplVarRatio = pd.DataFrame(
    data=np.round(100 * pca_full.explained_variance_ratio_, 3),
    index=ComponentsList,
    columns=['Explained Variance Ratio (%)']
)

# Sum the unrounded ratios so per-component rounding errors do not accumulate in the total
PercentageOfDataExplained = 100 * pca_full.explained_variance_ratio_.sum()

print(f"* The {n_components} components explain {round(PercentageOfDataExplained,2)}% of the data \n")
dfExplVarRatio

* The 10 components explain 86.14% of the data 



Unnamed: 0,Explained Variance Ratio (%)
Component 1,48.785
Component 2,13.448
Component 3,5.994
Component 4,4.498
Component 5,2.989
Component 6,2.76
Component 7,2.47
Component 8,2.207
Component 9,1.645
Component 10,1.346


In [40]:
# NOTE(review): this cell repeats the 10-component explained-variance cell with a
# different n_components; consider factoring both into one parameterised helper.

# Fit the preprocessing (scaling + one-hot encoding) and apply it to the full dataset
X_processed = preprocessor.fit_transform(df_ml)

# Number of principal components to evaluate
n_components = 13

# Seed the PCA the same way as the PCA step of `cluster_pipeline` (random_state=42)
# so that the result is reproducible if a randomized SVD solver gets selected
pca_full = PCA(n_components=n_components, random_state=42).fit(X_processed)
x_PCA = pca_full.transform(X_processed)

# Per-component share of the total variance in percent (rounded for display only)
ComponentsList = [f"Component {i + 1}" for i in range(n_components)]
dfExplVarRatio = pd.DataFrame(
    data=np.round(100 * pca_full.explained_variance_ratio_, 3),
    index=ComponentsList,
    columns=['Explained Variance Ratio (%)']
)

# Sum the unrounded ratios so per-component rounding errors do not accumulate in the total
PercentageOfDataExplained = 100 * pca_full.explained_variance_ratio_.sum()

print(f"* The {n_components} components explain {round(PercentageOfDataExplained,2)}% of the data \n")
dfExplVarRatio

* The 13 components explain 88.77% of the data 



Unnamed: 0,Explained Variance Ratio (%)
Component 1,48.785
Component 2,13.448
Component 3,5.994
Component 4,4.498
Component 5,2.989
Component 6,2.76
Component 7,2.47
Component 8,2.207
Component 9,1.645
Component 10,1.346


In [41]:
# Rebuild the cluster pipeline with the PCA step set to 13 components

cluster_pipeline = Pipeline(
    steps=[
        ('preprocessor', preprocessor),
        # 13 components, following the explained-variance check in the preceding cell
        ('pca', PCA(n_components=13, random_state=42)),
        # Placeholder cluster count; to be revised based on Elbow/Silhouette
        ('kmeans', KMeans(n_clusters=10, random_state=42)),
    ]
)