In [103]:

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import classification_report, confusion_matrix

In [63]:
data_df = pd.read_csv('../data/cleaned/South_East_Asia_Social_Media_MentalHealth_cleaned.csv')

Experiment 1: Logistic Regression

In [88]:
x = data_df[['Daily SM Usage (hrs)', 'Peer Comparison Frequency (1-10)']]
y = data_df['Gender']

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)

model = LogisticRegression(max_iter=1000)
model.fit(x_train, y_train)

y_pred = model.predict(x_test)

print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

      Female       0.49      0.50      0.50     33802
        Male       0.50      0.50      0.50     34382

    accuracy                           0.50     68184
   macro avg       0.50      0.50      0.50     68184
weighted avg       0.50      0.50      0.50     68184



In [94]:
x = data_df[['Daily SM Usage (hrs)', 'Peer Comparison Frequency (1-10)']]
y = data_df['Gender']

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)

model = RandomForestClassifier(n_estimators=42)
model.fit(x_train, y_train)

y_pred = model.predict(x_test)

print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

      Female       0.50      0.50      0.50     33802
        Male       0.51      0.51      0.51     34382

    accuracy                           0.50     68184
   macro avg       0.50      0.50      0.50     68184
weighted avg       0.50      0.50      0.50     68184



The input data used for the prediction task were daily social media usage and peer comparison frequency. The output data used for the prediction task was the gender of the user. The model did not perform well. The model had a 50% accuracy. The model does not seem to be underfitting or overfitting. The problem could be address by using the random forest classifier method. The change in model did not help and the accuracy stayed the same. The potentially change on the data side could be to add age group to the input data.

Experiment 2: Linear Regression

In [106]:
x = data_df[['Peer Comparison Frequency (1-10)', 'Social Anxiety Level (1-10)']]
y = data_df['Self Confidence Impact (1-10)']

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)

model = LinearRegression()

model.fit(x_train, y_train)

y_pred = model.predict(x_test)

mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f"Mean Squared Error: {mse}")
print(f"R-squared: {r2}")


Mean Squared Error: 8.24512919838546
R-squared: -6.621079115975981e-05


In [115]:
x = data_df[['Peer Comparison Frequency (1-10)', 'Social Anxiety Level (1-10)']]
y = data_df['Self Confidence Impact (1-10)']

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)

knn = KNeighborsRegressor(n_neighbors = 8)
knn.fit(x_train, y_train)

y_pred = knn.predict(x_test)

mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f"Mean Squared Error: {mse}")
print(f"R-squared: {r2}")



Mean Squared Error: 9.214921893332747
R-squared: -0.11769407111368735


The input data used for the prediction task were peer comparison frequency and social anxiety level. The output data used for the prediction task  self confidence impact. The model did not perform well as the mean sqaured error was 8.245 and the r-sqaured value was -6.621e-05. The model does not seem to be underfitting or overfitting. The problem could be address by using the KNN Regression method. The change in model did not help as the mean squared error and r-sqaured value increased. The potentially change on the data side could be to add gender and sleep quality impact to the input data.

Experiment 3: Random Forest Regression

In [54]:
x = data_df[['Daily SM Usage (hrs)', 'Peer Comparison Frequency (1-10)', 'Social Anxiety Level (1-10)']]
y = data_df['Sleep Quality Impact (1-10)']

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)

model = RandomForestRegressor(n_estimators=53)
model.fit(x_train, y_train)

y_pred = model.predict(x_test)

mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f"Mean Squared Error: {mse}")
print(f"R-squared: {r2}")

Mean Squared Error: 11.010449272606937
R-squared: -0.3311826558974773


In [109]:
x = data_df[['Daily SM Usage (hrs)', 'Peer Comparison Frequency (1-10)', 'Social Anxiety Level (1-10)']]
y = data_df['Sleep Quality Impact (1-10)']

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)

model = LinearRegression()
model.fit(x_train, y_train)

y_pred = model.predict(x_test)

mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f"Mean Squared Error: {mse}")
print(f"R-squared: {r2}")

Mean Squared Error: 8.27123164566551
R-squared: -6.433616974499401e-06


The input data used for the prediction task were daily social media usage, peer comparison frequency, and social anxiety level. The output data used for the prediction task was the sleep quality impact. The model did not perform well. The model had a mean sqaured error of 11.010 and the r-quared value of -0.331. The model does not seem to be underfitting or overfitting. The problem could be address by using the linear regression method. The change in model did help as the mean sqaured error changed to 8.271 and r-sqaured value changed to -6.434e-06. The model performance did improve but even after the change the model did not perform well. The potentially change on the data side could be to add the frequnecy of social media use to the input data.

Presentation

In [None]:
x = data_df[['Daily SM Usage (hrs)', 'Peer Comparison Frequency (1-10)', 'Likes Received (per post)', 'Comments Received (per post)']]
y = data_df['Cyberbullying Experience (1-10)']

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)

model = LinearRegression()

model.fit(x_train, y_train)

y_pred = model.predict(x_test)

mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f"Mean Squared Error: {mse}")
print(f"R-squared: {r2}")


Mean Squared Error: 8.284408831219354
R-squared: -1.0975440024596494e-05


In [None]:
x = data_df[['Daily SM Usage (hrs)', 'Peer Comparison Frequency (1-10)', 'Likes Received (per post)', 'Comments Received (per post)']]
y = data_df['Cyberbullying Experience (1-10)']

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)

model = RandomForestRegressor(n_estimators=42)
model.fit(x_train, y_train)

y_pred = model.predict(x_test)

print(f"Mean Squared Error: {mse}")
print(f"R-squared: {r2}")


Mean Squared Error: 8.284408831219354
R-squared: -1.0975440024596494e-05
